In [None]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
import time
from scipy.stats import sem
from sklearn.metrics import accuracy_score, precision_score, recall_score
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

In [None]:
# Splitting percentages: three small training fractions followed by an evenly
# spaced grid from 10% to 95% (18 points, i.e. steps of 5%).
more = np.linspace(0.10, 0.95, 18)
train = np.concatenate(([0.02, 0.05, 0.07], more))

# Empty results table for the analysis; the training loop fills it with one
# row per fitted model.
table = pd.DataFrame(
    columns=[
        'models', 'train %', 'fit time', 'pred time',
        'acc_train', 'pre_train', 'rec_train',
        'acc_test', 'pre_test', 'rec_test',
    ]
)

# Models to be compared (seeded wherever a random_state was given originally)
models = [
    GradientBoostingClassifier(random_state=2),
    KNeighborsClassifier(),
    LinearSVC(random_state=5),
    LogisticRegression(random_state=2),
    RandomForestClassifier(random_state=2),
    RidgeClassifier(random_state=2),
    XGBClassifier(),
]

# Own data
# NOTE(review): read_csv() is called without a file path, which raises a
# TypeError as written -- supply the path of the dataset before running.
own_data = pd.read_csv()

In [None]:
# Select cols to use to train the model
X = own_data[['Latitude','Longitude','Speed','Hours','Month']]

# Target variable, encoded as numeric class codes.
# OrdinalEncoder only accepts 2-D input and returns an (n, 1) array, so the
# column is reshaped on the way in and flattened on the way out: estimators
# and metric functions expect a 1-D target, and a column vector makes sklearn
# emit a DataConversionWarning (silenced by the global warnings filter).
y = own_data['target']
enconder = OrdinalEncoder()
y = enconder.fit_transform(np.array(y).reshape(-1,1)).ravel()

In [None]:
# For each training percentage, draw 10 random train/test splits and, on each
# split, fit every model and score it on both the training and the test part.

# Rows are gathered in a plain list and converted to a DataFrame once at the
# end: DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
# row by row copies it on every iteration.
records = []

# `train_frac` instead of `set`, which would shadow the builtin.
for train_frac in train:

    for i in range(10):

        # The repetition index seeds the split so a re-run reproduces the
        # same 10 partitions for every training fraction.
        xtrain, xtest, ytrain, ytest = train_test_split(
            X, y, train_size=train_frac, shuffle=True, random_state=i)

        for model in models:

            # Time the fit on its own.
            start = time.perf_counter()
            model.fit(xtrain, ytrain)
            fit_time = time.perf_counter() - start

            # Metrics on the data the model was trained on.
            ypredt = model.predict(xtrain)
            acc_train = accuracy_score(ytrain, ypredt)
            pre_train = precision_score(ytrain, ypredt)
            rec_train = recall_score(ytrain, ypredt)

            # Time only the prediction on the held-out part; the metric
            # computations below are deliberately outside the timed region.
            start = time.perf_counter()
            ypred = model.predict(xtest)
            pred_time = time.perf_counter() - start

            acc_test = accuracy_score(ytest, ypred)
            pre_test = precision_score(ytest, ypred)
            rec_test = recall_score(ytest, ypred)

            records.append({
                'models': model.__class__.__name__, 'train %': train_frac,
                'fit time': fit_time, 'pred time': pred_time,
                'acc_train': acc_train, 'pre_train': pre_train, 'rec_train': rec_train,
                'acc_test': acc_test, 'pre_test': pre_test, 'rec_test': rec_test,
            })

# Rebuild the results table with the column layout it was declared with;
# building it from scratch keeps this cell idempotent when it is re-run.
table = pd.DataFrame(records, columns=table.columns)
table

# Estimate the mean and the standard error of the mean (SEM) of each metric, for every model and training percentage


In [None]:
# Summarise the repeated runs: for every (model, training percentage) pair,
# compute the mean and the standard error of the mean (SEM) of the fit time,
# the prediction time and the three test-set metrics.

# Short metric name used in the summary columns -> source column in `table`.
metric_sources = [
    ('fit', 'fit time'),
    ('pred', 'pred time'),
    ('acc', 'acc_test'),
    ('pre', 'pre_test'),
    ('rec', 'rec_test'),
]

# Rows are accumulated in a list and converted once at the end, because
# DataFrame.append was removed in pandas 2.0.
summary_rows = []

for model in pd.unique(table['models']):

    new_data = table[table['models']==model]

    # `train_frac` instead of `set`, which would shadow the builtin.
    for train_frac in pd.unique(new_data['train %']):

        set_data = new_data[new_data['train %'] == train_frac]

        results = {'model':model,'train %':train_frac}
        for short, source in metric_sources:
            results['mean_' + short] = np.mean(set_data[source])
            results['sem_' + short] = sem(set_data[source])

        summary_rows.append(results)

# The explicit column list fixes the column order (and keeps the columns
# present even when there are no rows).
table_means = pd.DataFrame(summary_rows, columns=['model','train %','mean_fit','mean_pred','mean_acc','mean_pre','mean_rec','sem_fit','sem_pred','sem_acc','sem_pre','sem_rec'])



# Data Visualization

`datasplitByBoat` refers to the sample-by-boat approach. It is not considered here for the sake of simplicity, so the plotting code that uses it has been left commented out.

In [None]:
# Mean test accuracy, precision and recall (in %, with SEM error bars) against
# the percentage of training data, one series per tree-ensemble model.
# Only the left column (odd positions) of a 5x2 subplot grid is drawn.
plt.figure(figsize=(10,15))
ftms = ['o','^','s','v','x','H','*']
colors = [(43/255,57/255,136/255),(42/255,196/255,244/255),(251/255,174/255,39/255)]
size = 4
label = ['GrBo','RaFo','XGBo']
lab = np.array(pd.unique(table_means['train %']),dtype=float)*100

# Pieces that do not change between models.
window = slice(3, -1)                 # skips the first three and the last entry
x_pct = np.round(lab, 2)[window]
tick_pos = np.arange(10, 100, 10)
plotted = ['GradientBoostingClassifier', 'RandomForestClassifier', 'XGBClassifier']

for i, model in enumerate(plotted):

    # Summary rows of this model and the errorbar styling shared by its panels.
    rows = table_means[table_means['model'] == model]
    look = dict(capsize=3, fmt=ftms[i], color=colors[i], markersize=size)

    # Accuracy panel (tick labels drawn in white, i.e. hidden).
    plt.subplot(5,2,1)
    plt.title('Sample unit by point')
    plt.ylabel('Accuracy (%)')
    plt.xticks(tick_pos, color='w')
    plt.errorbar(x_pct, rows['mean_acc'][window]*100,
                 yerr=rows['sem_acc'][window]*100, **look)
    #plt.yticks(ticks=[96,97.5,99.0,100])

    #plt.subplot(5,2,2)
    #plt.title('Sample unit by boat')
    #plt.xticks(np.arange(10,100,10),color = 'w')
    #plt.errorbar(np.round(lab,2)[3:-1],datasplitByBoat[datasplitByBoat['model']==model]['mean_acc'][3:-1]*100
    #            ,yerr=datasplitByBoat[datasplitByBoat['model']==model]['sem_acc'][3:-1]*100,capsize=3,fmt=ftms[i], color = colors[i] ,markersize = size)
    #plt.yticks(ticks=[80,85.0,90.0,95.0])

    # Precision panel (tick labels hidden as well).
    plt.subplot(5,2,3)
    plt.ylabel('Precision (%)')
    plt.xticks(tick_pos, color='w')
    plt.errorbar(x_pct, rows['mean_pre'][window]*100,
                 yerr=rows['sem_pre'][window]*100, **look)
    #plt.yticks(ticks=[97,98,99,100])

    #plt.subplot(5,2,4)
    #plt.xticks(np.arange(10,100,10),color = 'w')
    #plt.errorbar(np.round(lab,2)[3:-1],datasplitByBoat[datasplitByBoat['model']==model]['mean_pre'][3:-1]*100
    #            ,yerr=datasplitByBoat[datasplitByBoat['model']==model]['sem_pre'][3:-1]*100,capsize=3,fmt=ftms[i],color = colors[i] , markersize = size)
    #plt.yticks(ticks=[91,93,95,97])

    # Recall panel: the last one drawn, so it carries the visible x ticks and
    # the x-axis label.
    plt.subplot(5,2,5)
    plt.ylabel('Recall (%)')
    plt.xticks(tick_pos)
    plt.errorbar(x_pct, rows['mean_rec'][window]*100,
                 yerr=rows['sem_rec'][window]*100, **look)

    #plt.subplot(5,2,6)
    #plt.xticks(np.arange(10,100,10))
    #plt.errorbar(np.round(lab,2)[3:-1],datasplitByBoat[datasplitByBoat['model']==model]['mean_rec'][3:-1]*100
    #            ,yerr=datasplitByBoat[datasplitByBoat['model']==model]['sem_rec'][3:-1]*100,capsize=3,fmt=ftms[i],color = colors[i] , markersize = size)
    plt.xlabel('Percentage of training data (%)',{'x':-0.1,'y':0})

    # The string below is disabled plotting code (timing panels 7-10); it is a
    # bare string expression and has no effect when executed.
    ''' plt.subplot(5,2,7)
    plt.ylabel('Training time (seconds)')
    plt.xticks(np.arange(10,100,10),color = 'w')
    plt.errorbar(np.round(lab,2)[3:-1],datasplitByPoint[datasplitByPoint['model']==model]['mean_fit'][3:-1]
                ,yerr=datasplitByPoint[datasplitByPoint['model']==model]['sem_fit'][3:-1],capsize=3,fmt=ftms[i], color = colors[i] ,markersize = size)
    plt.yticks(ticks=[0,10,20,30])

    plt.subplot(5,2,8)
    plt.xticks(np.arange(10,100,10),color = 'w')
    plt.errorbar(np.round(lab,2)[3:-1],datasplitByBoat[datasplitByBoat['model']==model]['mean_fit'][3:-1]
                ,yerr=datasplitByBoat[datasplitByBoat['model']==model]['sem_fit'][3:-1],capsize=3,fmt=ftms[i],color = colors[i] , markersize = size)
    plt.yticks(ticks=[0,10,20,30])

    plt.subplot(5,2,9)
    plt.ylabel('Predict time (seconds)')
    plt.errorbar(np.round(lab,2)[3:-1],datasplitByPoint[datasplitByPoint['model']==model]['mean_pred'][3:-1]
                ,yerr=datasplitByPoint[datasplitByPoint['model']==model]['sem_pred'][3:-1],capsize=3,fmt=ftms[i], color = colors[i] ,markersize = size)
    plt.xticks(np.arange(10,100,10))
    plt.yticks(ticks=[0,0.75,1.75,2.5])
    #plt.xlabel('Data train set')
    
    plt.subplot(5,2,10)
    plt.errorbar(np.round(lab,2)[3:-1],datasplitByBoat[datasplitByBoat['model']==model]['mean_pred'][3:-1]
            ,yerr=datasplitByBoat[datasplitByBoat['model']==model]['sem_pred'][3:-1],capsize=3,fmt=ftms[i],color = colors[i] , markersize = size)
    plt.xticks(np.arange(10,100,10))
    plt.yticks(ticks=[0,0.75,1.75,2.5])
    plt.xlabel('Data train set',{'x':-0.1,'y':0})'''

# The legend is attached to the current (recall) axes and positioned through
# bbox_to_anchor.
plt.legend(label, bbox_to_anchor = (1.3,3.45),ncol = 1)