In [None]:
import pandas as pd
import numpy as np
import pickle
import plotly.express as px
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from itertools import combinations
import xgboost as xgb
import warnings
from scipy.stats import sem
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [None]:
# Path of the CSV file holding the data set.
# The training cell reads the 'target' column, and the 'boat_trip_id' column when
# sampling by boat, so the file is expected to provide them.
DATA_PATH = None  # TODO: set to the location of the input CSV

# pd.read_csv() cannot be called without a source; fail with an explicit message instead
if DATA_PATH is None:
    raise ValueError("Set DATA_PATH to the input CSV file before running this cell.")

own_data = pd.read_csv(DATA_PATH)

# Define columns to train the model
# (feature combinations start at size 2, so at least two column names are needed)
columns_to_train = []

In [None]:
# Columns of the results table: one row per (model, feature combination).
# 'num_variables' is declared here because every row stores it and the summary cell filters on it.
result_columns = ['models','num_variables','variables','accuracy_train','precision_train','recall_train',
                  'accuracy_test','precision_test','recall_test','ytrue','ypred','ytruetrain','ypredtrain',
                  'idx_train','idx_test']

# Best run so far: [model name, feature combination, test accuracy, test precision, test recall]
best_comb = ['','',-1,-1,-1]
enconder = OrdinalEncoder()

# Training Percentage (fraction of points for 'p', fraction of boat trips for 'b')
train = 0.6

# Choose the approach 'p' by point, 'b' by boat
approach = 'p'

models = [GradientBoostingClassifier(random_state=2),KNeighborsClassifier(),LinearSVC(random_state=5),
         LogisticRegression(random_state=2),RandomForestClassifier(random_state=2), RidgeClassifier(random_state=2),xgb.XGBClassifier()]

names = ['GrBo','Knei','LSVC','LogR','RaFo','RidC','XGBo']

# Encode the target once on the full data set, so train and test share a single label mapping.
# OrdinalEncoder needs 2-D input, hence the one-column frame.
y = enconder.fit_transform(own_data[['target']]).ravel()

# The split does not depend on the feature combination, so it is computed once up front.
print('Divisão dos dados...\n')
if approach == 'p':
    # Sample unit = single point
    data_train, data_test, ytrain, ytest = train_test_split(own_data, y, train_size=train, shuffle=True,
                                                            random_state=2)
elif approach == 'b':
    # Sample unit = boat trip: all points of a trip fall on the same side of the split
    boats = np.array(pd.unique(own_data['boat_trip_id']))
    np.random.RandomState(2).shuffle(boats)
    n_train_boats = int(len(boats) * train)
    in_train = own_data['boat_trip_id'].isin(boats[:n_train_boats]).to_numpy()
    data_train, data_test = own_data[in_train], own_data[~in_train]
    ytrain, ytest = y[in_train], y[~in_train]
else:
    raise ValueError("approach must be 'p' (by point) or 'b' (by boat), got " + repr(approach))

idxtrain = data_train.index
idxtest = data_test.index

# Rows are collected in a list and turned into a DataFrame once at the end
# (DataFrame.append no longer exists in pandas 2.x and was quadratic anyway).
records = []

# Defines the number of combinations to do (of 2 variables, 3 variables, 4 variables and so on)
for i in range(2,len(columns_to_train)+1):

    # All feature combinations
    comb = list(combinations(columns_to_train,i))

    # Prints how many combinations are possible with i
    print('Combinations:' + str(len(comb)) + '\n')

    for c in comb:

        c = list(c)

        # Select only the features of this combination (never the target or the trip id)
        xtrain = data_train[c]
        xtest = data_test[c]

        # Fits the models and checks their performances
        for name, clf in zip(names, models):
            print('A testar um modelo')
            clf.fit(xtrain,ytrain)
            y_fit = clf.predict(xtrain)
            y_pred = clf.predict(xtest)

            # precision/recall use scikit-learn's default averaging;
            # assumes a two-class target - TODO confirm
            acc_train = accuracy_score(ytrain,y_fit)
            pre_train = precision_score(ytrain,y_fit)
            rec_train = recall_score(ytrain,y_fit)
            acc_test = accuracy_score(ytest,y_pred)
            pre_test = precision_score(ytest,y_pred)
            rec_test = recall_score(ytest,y_pred)

            # Keys match result_columns (the train metrics were previously stored under
            # '*_treino', leaving the declared '*_train' columns empty)
            records.append({'models':approach + '.' + name, 'num_variables':len(c), 'variables':str(c)[1:-1],
                            'accuracy_train':acc_train,'precision_train':pre_train,'recall_train':rec_train,
                            'accuracy_test':acc_test,'precision_test':pre_test,'recall_test':rec_test,
                            'ytrue':ytest,'ypred':y_pred,'ytruetrain':ytrain,'ypredtrain':y_fit,
                            'idx_train':idxtrain,'idx_test':idxtest})

            # Track the run with the highest test accuracy
            if best_comb[2] < acc_test:
                best_comb[:] = [name, c, acc_test, pre_test, rec_test]

table = pd.DataFrame(records, columns=result_columns)

# Estimate the mean and the standard error of the mean (SEM) of each metric, per model and number of variables

`model2` refers to the sampling-by-boat approach. It is not considered here for the sake of simplicity, so the related code is kept only as comments.

In [None]:
# Per-run rows carry a model tag in 'models'; the summary rows added below leave it empty.
# Working from the per-run rows only makes this cell safe to re-run.
runs = table[table['models'].notna()]

# (column holding the per-run metric, suffix used in the summary column names)
metrics = [('accuracy_test','acc'),('precision_test','pre'),('recall_test','rec')]
summary_columns = ['model','num_variables','mean_acc','mean_pre','mean_rec',
                   'sderror_acc','sderror_pre','sderror_rec']

summary_rows = []
for mod in pd.unique(runs['models']):
    model = runs[runs['models'] == mod]

    # By-boat variant (disabled):
    #model2 = variableByBoat[variableByBoat['models'] == mod]

    # Use the feature counts actually present instead of a hard-coded 2..5
    for i in sorted(model['num_variables'].dropna().unique()):
        subset = model[model['num_variables'] == i]
        result = {'model':mod,'num_variables':i}
        for column, suffix in metrics:
            values = subset[column].astype(float)
            # Average the metric column only; the frame also holds array-valued columns
            result['mean_' + suffix] = values.mean()
            # The SEM is undefined for a single run (e.g. the one combination that uses
            # every feature), so report 0 in that case
            result['sderror_' + suffix] = sem(values.to_numpy()) if len(values) > 1 else 0
        summary_rows.append(result)

# Summary rows are stored in the same table (under 'model'), which the plotting cell reads
table = pd.concat([runs, pd.DataFrame(summary_rows, columns=summary_columns)], ignore_index=True)

# Visualize the results

`model2` refers to the sampling-by-boat approach. It is not considered here for the sake of simplicity, so the related code is kept only as comments.

In [None]:
# One colour/marker per number of variables (index 0 -> 2 variables, 1 -> 3 variables, ...)
colors = [(43/255,57/255,136/255),(42/255,196/255,244/255),(251/255,174/255,39/255),(129/255,168/255,147/255)]
markers = ['o','*','s','x']
feature_counts = range(2, 2 + len(markers))

# (position in the 3x2 grid, suffix of the summary columns, y-axis label).
# Only the left column is drawn; the right column (positions 2, 4, 6) was meant for the
# by-boat results, which are disabled.
panels = [(1,'acc','Accuracy (%)'),(3,'pre','Precision (%)'),(5,'rec','Recall (%)')]

fig = plt.figure(figsize=(10,5))

# Create and decorate each panel once instead of re-selecting it on every iteration
axes = {}
for position, suffix, ylabel in panels:
    ax = fig.add_subplot(3,2,position)
    ax.set_ylabel(ylabel)
    ax.set_ylim((0.5,1))
    ax.set_yticks([0.5,0.7,0.9,1.0])
    ax.set_yticklabels([0.5*100,0.7*100,0.9*100,''])
    axes[position] = ax
axes[1].set_title('Sample unit by point')
# The two upper panels keep their x tick labels but draw them in white
axes[1].tick_params(axis='x', labelcolor='w')
axes[3].tick_params(axis='x', labelcolor='w')

# Only summary rows have 'model' set; per-run rows hold NaN there and must not become a group
summary = table[table['model'].notna()]

j = 0
group_centers = []
group_labels = []
labelled = set()
for mod in pd.unique(summary['model']):
    # Scale the running x position so consecutive model groups are separated by a gap
    j = j*1.1
    model = summary[summary['model'] == mod]
    group_centers.append(j + (len(feature_counts) - 1) / 2)
    group_labels.append(str(mod))

    # By-boat variant (disabled):
    #model2 = table2[table2['model'] == mod]

    for k, i in enumerate(feature_counts):
        rows = model[model['num_variables']==i]

        # Skip feature counts that were not evaluated, but keep their slot on the x axis
        if not rows.empty:
            stats = rows.iloc[0]
            for position, suffix, ylabel in panels:
                ax = axes[position]

                # Label each feature count once, on the bottom panel, which owns the legend;
                # labels starting with '_' are ignored by the legend
                label = '_nolegend_'
                if position == 5 and i not in labelled:
                    label = str(i)
                    labelled.add(i)

                ax.errorbar(x = j,y = float(stats['mean_' + suffix]),yerr = float(stats['sderror_' + suffix]),
                            fmt=markers[k],capsize=3, color = colors[k], label = label)
                # Redrawn for every point: the very low alpha adds up over the overlays
                ax.axhline( y = 0.95, color = 'black',alpha = 0.02, label = None, linestyle = '--' )

        j+=1

# Name the model groups on the bottom axis, one tick at the centre of each group
axes[5].set_xticks(group_centers)
axes[5].set_xticklabels(group_labels)

plt.tight_layout(pad=-1.7)
axes[5].legend(ncol = 1, bbox_to_anchor = (1.15,3.1))
        
