In [None]:
import matplotlib.pyplot as plt
from matplotlib import text
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import OrdinalEncoder
import warnings
from scipy.stats import sem
import numpy.random as rd
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

In [None]:
# NOTE(review): read_csv() is called without a file path, which raises a
# TypeError as written - presumably the path was stripped from this copy of
# the notebook; supply the dataset location before running.
own_data = pd.read_csv()
# Distinct trip identifiers found in the 'boat_trip_id' column.
boats = own_data['boat_trip_id'].unique()

In [None]:
# Uses moving average
def average_speed(n, data, boats):
    '''
        Smooth the 'Speed' column with a centred moving average, trip by trip.

        For each trip id in `boats`, the rows of `data` whose 'boat_trip_id'
        equals that id are taken in row order. Every point i that has n
        neighbours on both sides is replaced by the mean of speed[i-n:i+n+1]
        (a window of 2*n + 1 points); the first and last n points of the trip
        keep their raw speed.

        n: how many points on each side of the current one enter the average
        data: own_data (needs the 'boat_trip_id' and 'Speed' columns)
        boats: boat trip ids

        Returns a float numpy array with one value per row belonging to one
        of the requested trips, ordered like the rows of `data`, so it can be
        inserted as a column even when the rows of different trips are
        interleaved. An empty `boats` yields an empty array.
    '''
    trip_ids = data['boat_trip_id']
    speeds = np.asarray(data['Speed'])

    row_chunks = []
    value_chunks = []
    for boat in boats:
        # Positional row numbers of this trip inside `data`.
        rows = np.flatnonzero(np.asarray(trip_ids == boat))
        speed = speeds[rows]

        new_speed = np.zeros(len(speed))
        # Edges: no full window available, keep the raw values.
        new_speed[:n] = speed[:n]
        for i in range(n, len(speed) - n):
            new_speed[i] = np.mean(speed[i - n:i + n + 1])
        new_speed[-n:] = speed[-n:]

        row_chunks.append(rows)
        value_chunks.append(new_speed)

    if not row_chunks:
        return np.zeros(0)

    # Concatenate once, then put the values back into the row order of `data`
    # so the result lines up with the frame it was computed from.
    all_rows = np.concatenate(row_chunks)
    all_values = np.concatenate(value_chunks)
    return all_values[np.argsort(all_rows, kind='stable')]

# Add one smoothed-speed column per window size: MovingAverage1 .. MovingAverage20.
for i in range(1, 21):
    column_name = 'MovingAverage' + str(i)
    avs = average_speed(i, own_data, boats)
    if column_name in own_data.columns:
        # The cell was already run: refresh the column in place, because
        # DataFrame.insert raises when the column name already exists.
        own_data[column_name] = avs
    else:
        own_data.insert(5, column_name, avs)
    



In [None]:
# Speed cols: the raw speed followed by its 20 moving-average variants
columns_mas = np.hstack((['Speed'],['MovingAverage'+str(i) for i in range(1,21)]))

# Table columns
cols = ['MovingAverage','Model','AccTrain','AccTest','PreTrain','PreTest','RecTrain','RecTest']

# Empty results table, one row per (speed column, model) once filled
table = pd.DataFrame(columns=cols)

# Target: encode the labels of the 'target' column as numbers
# NOTE(review): the binary objective and metrics used in this notebook
# presume exactly two classes - confirm against the data.
y = own_data['target']
enconder = OrdinalEncoder()
y = enconder.fit_transform(np.array(y).reshape(-1,1)).ravel()

# Define Models
rf_cl = RandomForestClassifier(random_state=2)
xgb_cl = XGBClassifier()

# Search space for XGBoost
param_grid = {'booster':['gbtree'],
              'eta':[0.001,0.01,0.1,0.2,0.3,0.002],
              'max_depth':[2,3,4,5,6,7,8,9,10],
              'gamma':[0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
              'subsample':[0.5,0.6,0.8,0.9,1],
              'colsample_bytree':[0.5,0.6,0.8,0.9,1],
              'objective':['binary:logistic']
}
rscv = RandomizedSearchCV(xgb_cl,param_grid,cv = 3, n_iter=5,random_state=2)

# Search space for the random forest.
# Integer min_samples_split must be >= 2 and integer min_samples_leaf must be
# >= 1; smaller values make the fit fail, so they are left out of the grid.
param_grid = [
        {'n_estimators': [10,20,30,40,50,60,70,80,90,100,150,200],
         'criterion': ['gini','entropy'],
         'max_features': [None, 'sqrt', 'log2'],
         'min_samples_split' : [2,3,4,5,6,7,8,9,10],
         'min_samples_leaf': [1,2,3,4,5,6,7,8,9,10]},
        {'bootstrap': [True]},
]
rscv2 = RandomizedSearchCV(rf_cl, param_grid, cv = 3, n_iter=5,random_state=2)


In [None]:
def _score_model(model, label, col_mas, xtrain, xtest, ytrain, ytest):
    '''
        Fit `model` on the training split and return one results row (a dict)
        with accuracy, precision and recall on both the train and test split.
    '''
    model.fit(xtrain, ytrain)
    ypred_tr = model.predict(xtrain)
    ypred = model.predict(xtest)

    # scikit-learn metrics take (y_true, y_pred) in that order; swapping them
    # silently exchanges precision and recall.
    return {'MovingAverage': col_mas, 'Model': label,
            'AccTrain': accuracy_score(ytrain, ypred_tr),
            'AccTest': accuracy_score(ytest, ypred),
            'PreTrain': precision_score(ytrain, ypred_tr),
            'PreTest': precision_score(ytest, ypred),
            'RecTrain': recall_score(ytrain, ypred_tr),
            'RecTest': recall_score(ytest, ypred)}


def evaluate_models(model1,model2,model3,table):
    '''
        Train three classifiers once per speed column and collect their scores.

        For every column name in the notebook-level `columns_mas`, the
        features ['Latitude', 'Longitude', <speed column>, 'Hours', 'Month']
        are read from the notebook-level `own_data`, split 60/40 together
        with the notebook-level target `y` (shuffled, random_state=2), and
        each model is fitted and scored on that split.

        model1: estimator reported as 'RaFo'
        model2: estimator reported as 'XGBo'
        model3: estimator reported as 'GrBo'
        table: DataFrame the new result rows are appended to

        Returns a new DataFrame: the rows of `table` followed by one row per
        (speed column, model) pair. `table` itself is not modified.
    '''
    labelled_models = (('RaFo', model1), ('XGBo', model2), ('GrBo', model3))
    records = []

    # Trains the models with the different speed columns
    for col_mas in columns_mas:

        columns_train = ['Latitude','Longitude',col_mas,'Hours','Month']
        X = own_data[columns_train]

        xtrain, xtest, ytrain, ytest = train_test_split(X,y,shuffle=True,random_state=2,train_size=0.6)

        for label, model in labelled_models:
            records.append(_score_model(model, label, col_mas, xtrain, xtest, ytrain, ytest))

        # Progress indicator: one line per finished speed column
        print(col_mas)

    # Build the new rows in one go; DataFrame.append no longer exists in
    # pandas >= 2.0 and growing a frame row by row is quadratic.
    return pd.concat([table, pd.DataFrame(records)], ignore_index=True)
    


In [None]:
# Start from a fresh, empty results frame so that re-running this cell
# replaces the previous results instead of appending a second copy of every
# row (the plotting cell expects exactly one row per speed column and model).
# The gradient boosting model is seeded like the random forest so repeated
# runs give the same numbers.
table = evaluate_models(
    RandomForestClassifier(random_state=2),
    XGBClassifier(),
    GradientBoostingClassifier(random_state=2),
    pd.DataFrame(columns=cols),
)

# Results visualization

`tmp2` refers to the sample-by-boat approach. It is not considered here for the sake of simplicity, so the code that uses it has been left commented out.

In [None]:
# Test accuracy, precision and recall of each model against the speed column
# used for training (x = 0 is the raw 'Speed', x = k is 'MovingAverage<k>').
fig = plt.figure(figsize = (10,7))
x = np.arange(0,20+1,1)
color = [(43/255,57/255,136/255),(42/255,196/255,244/255),(251/255,174/255,39/255)]
mod = ['GrBo','RaFo','XGBo']
markers = ['o','^','s']
tick_positions = np.arange(0,21,5)

# (subplot position in the 3x2 grid, table column, y label, hide x tick labels?)
# Only the left-hand column (positions 1, 3, 5) is used; the right-hand one
# is kept free for the sample-by-boat results (tmp2 / tabletrips), which are
# not plotted here.
panels = [
    (1, 'AccTest', 'Accuracy (%)', True),
    (3, 'PreTest', 'Precision (%)', True),
    (5, 'RecTest', 'Recall (%)', False),
]

for position, metric, ylabel, hide_tick_labels in panels:
    ax = fig.add_subplot(3, 2, position)
    if position == 1:
        ax.set_title('Sample unit by point')
    ax.set_ylabel(ylabel)

    # One dashed line per model, drawn in the order of `mod` so the legend
    # labels below match the lines.
    for model, marker, line_color in zip(mod, markers, color):
        tmp1 = table[table['Model']==model]
        #tmp2 = tabletrips[tabletrips['Model']==model]
        ax.plot(x, tmp1[metric]*100, marker = marker, color = line_color, linestyle = '--')

    ax.set_xticks(tick_positions)
    if hide_tick_labels:
        # White labels are invisible on the white background, so only the
        # bottom panel shows the x tick values.
        ax.set_xticklabels(tick_positions, color = 'white')
    else:
        ax.set_xticklabels(tick_positions)
        ax.set_xlabel('Moving Average')

# `ax` is the bottom (recall) panel here; the anchor places the legend
# relative to that panel's axes.
ax.legend(mod, bbox_to_anchor = (1.3,3.45))
plt.show()
