### PyCaret documentation to be found at https://pycaret.readthedocs.io/en/latest/api/classification.html

# Setting up environment

In [1]:
import pandas as pd, numpy as np
%matplotlib inline

#setting to displaying all columns in pandas df
pd.set_option("display.max_columns", None)

## Load data

In [2]:
# Data generated in '2.0-random_forest'
dataset = pd.read_csv('../results/2.0-random_forest_train_test.csv', index_col=0)

In [3]:
dataset.head()

Unnamed: 0,AGE,RAVLT_immediate,AVDEL30MIN_neuro,AVDELTOT_neuro,TRAASCOR_neuro,TRABSCOR_neuro,CATANIMSC_neuro,GDTOTAL_gds,ANARTERR_neuro,LRHHC_n_long,Apoe4_,Subgroup_,Subgroup_num_,Usage_,PTGENDER,Gender_num_
13,80.0,36.0,1.0,8.0,47.0,139.0,16.0,1.0,4.0,0.003458,0.0,cAD,1,train,Female,1
26,76.8,27.0,5.0,10.0,61.0,300.0,19.0,4.0,23.0,0.005251,0.0,sMCI,0,train,Male,0
29,70.9,22.0,0.0,0.0,18.0,145.0,18.0,2.0,12.0,0.003026,1.0,cAD,1,train,Female,1
37,72.8,29.0,1.0,8.0,40.0,101.0,17.0,1.0,6.0,0.002767,0.0,cAD,1,train,Male,0
55,77.6,30.0,2.0,14.0,150.0,300.0,12.0,0.0,9.0,0.002814,0.0,cAD,1,train,Male,0


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 678 entries, 13 to 6309
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   AGE               678 non-null    float64
 1   RAVLT_immediate   678 non-null    float64
 2   AVDEL30MIN_neuro  678 non-null    float64
 3   AVDELTOT_neuro    678 non-null    float64
 4   TRAASCOR_neuro    678 non-null    float64
 5   TRABSCOR_neuro    678 non-null    float64
 6   CATANIMSC_neuro   678 non-null    float64
 7   GDTOTAL_gds       678 non-null    float64
 8   ANARTERR_neuro    678 non-null    float64
 9   LRHHC_n_long      678 non-null    float64
 10  Apoe4_            678 non-null    float64
 11  Subgroup_         678 non-null    object 
 12  Subgroup_num_     678 non-null    int64  
 13  Usage_            678 non-null    object 
 14  PTGENDER          678 non-null    object 
 15  Gender_num_       678 non-null    int64  
dtypes: float64(11), int64(2), object(3)
memory

> 678 instances, no missing values

In [5]:
dataset.Usage_.value_counts()

train    539
test     139
Name: Usage_, dtype: int64

539 instances for training, 139 for testing

In [6]:
dataset['Subgroup_'].value_counts()

sMCI    357
cAD     321
Name: Subgroup_, dtype: int64

# Prepare data

**ToDo's:**
1. Make sure the cross-validation on the train set is exactly the same as the one used for the RandomForest
2. Run `setup` with the correct validation folds and the specified test data (defined in script X and selected by `Usage_`)

## Split into training and test sets

Specified in `Usage_`-column 

In [7]:
# Split on the predefined `Usage_` column. `.copy()` yields independent frames, so
# adding columns to `data` later on does not trigger pandas' SettingWithCopyWarning
# (or silently write to a temporary slice of `dataset`).
data = dataset.loc[dataset.Usage_ == 'train'].copy()
data_unseen = dataset.loc[dataset.Usage_ == 'test'].copy()

In [8]:
data_unseen.head()

Unnamed: 0,AGE,RAVLT_immediate,AVDEL30MIN_neuro,AVDELTOT_neuro,TRAASCOR_neuro,TRABSCOR_neuro,CATANIMSC_neuro,GDTOTAL_gds,ANARTERR_neuro,LRHHC_n_long,Apoe4_,Subgroup_,Subgroup_num_,Usage_,PTGENDER,Gender_num_
6,80.4,30.0,1.0,7.0,49.0,168.0,13.0,0.0,17.0,0.003638,0.0,sMCI,0,test,Female,1
82,77.3,29.0,0.0,11.0,122.0,151.0,17.0,2.0,3.0,0.003343,1.0,cAD,1,test,Male,0
184,77.5,35.0,1.0,10.0,27.0,69.0,24.0,2.0,22.0,0.003149,1.0,cAD,1,test,Female,1
359,71.1,24.0,0.0,2.0,50.0,85.0,13.0,2.0,7.0,0.003729,0.0,cAD,1,test,Female,1
384,83.6,30.0,2.0,9.0,22.0,76.0,18.0,0.0,9.0,0.0037,0.0,cAD,1,test,Female,1


In [9]:
# Report the (rows, columns) shape of the modelling set and of the held-out set
print(f"Data for Modeling: {data.shape}")
print(f"Unseen Data For Predictions: {data_unseen.shape}")

Data for Modeling: (539, 16)
Unseen Data For Predictions: (139, 16)


## Setting up the cross-validation specified in the `2.0-random_forest` notebook

> NB! Using the exact same cross-validation folds as specified in the previous notebook.

***Load cross validation folds specified in notebook `2.0-random_forest.ipynb`***

In [10]:
cv_folds = pd.read_csv('../results/2.0-random_forest_kfolds.csv', index_col=0)

In [11]:
cv_folds.head()

Unnamed: 0,SKF_CV10_F00_,SKF_CV10_F01_,SKF_CV10_F02_,SKF_CV10_F03_,SKF_CV10_F04_,SKF_CV10_F05_,SKF_CV10_F06_,SKF_CV10_F07_,SKF_CV10_F08_,SKF_CV10_F09_,SKF_CV20_F00_,SKF_CV20_F01_,SKF_CV20_F02_,SKF_CV20_F03_,SKF_CV20_F04_,SKF_CV20_F05_,SKF_CV20_F06_,SKF_CV20_F07_,SKF_CV20_F08_,SKF_CV20_F09_,SKF_CV20_F10_,SKF_CV20_F11_,SKF_CV20_F12_,SKF_CV20_F13_,SKF_CV20_F14_,SKF_CV20_F15_,SKF_CV20_F16_,SKF_CV20_F17_,SKF_CV20_F18_,SKF_CV20_F19_,SKF_CV50_F00_,SKF_CV50_F01_,SKF_CV50_F02_,SKF_CV50_F03_,SKF_CV50_F04_,SKF_CV50_F05_,SKF_CV50_F06_,SKF_CV50_F07_,SKF_CV50_F08_,SKF_CV50_F09_,SKF_CV50_F10_,SKF_CV50_F11_,SKF_CV50_F12_,SKF_CV50_F13_,SKF_CV50_F14_,SKF_CV50_F15_,SKF_CV50_F16_,SKF_CV50_F17_,SKF_CV50_F18_,SKF_CV50_F19_,SKF_CV50_F20_,SKF_CV50_F21_,SKF_CV50_F22_,SKF_CV50_F23_,SKF_CV50_F24_,SKF_CV50_F25_,SKF_CV50_F26_,SKF_CV50_F27_,SKF_CV50_F28_,SKF_CV50_F29_,SKF_CV50_F30_,SKF_CV50_F31_,SKF_CV50_F32_,SKF_CV50_F33_,SKF_CV50_F34_,SKF_CV50_F35_,SKF_CV50_F36_,SKF_CV50_F37_,SKF_CV50_F38_,SKF_CV50_F39_,SKF_CV50_F40_,SKF_CV50_F41_,SKF_CV50_F42_,SKF_CV50_F43_,SKF_CV50_F44_,SKF_CV50_F45_,SKF_CV50_F46_,SKF_CV50_F47_,SKF_CV50_F48_,SKF_CV50_F49_
13,val,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
26,val,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
29,val,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
37,val,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
55,val,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train


In [12]:
len(cv_folds)

539

**Check that the indices in this file are the same as the indices in `data` (i.e. the train set).**

In [13]:
# The fold table must describe exactly the training instances, in the same order.
# `Index.equals` compares the elements and their order, and returns False (rather
# than raising) when the two indices differ in length.
cv_folds.index.equals(data.index)

True

In [14]:
# Saveing names of the different folds as we have both 10, 20 and 50 (i.e. SKF_CV10_F00_ etc.)
fold_names = list(cv_folds.columns)

In [15]:
cv_setup = 'CV50'

In [16]:
# Keep only the fold columns whose name contains the chosen setup (e.g. 'CV50')
val_folds = cv_folds.filter(like=cv_setup, axis='columns')

In [17]:
val_folds.head()

Unnamed: 0,SKF_CV50_F00_,SKF_CV50_F01_,SKF_CV50_F02_,SKF_CV50_F03_,SKF_CV50_F04_,SKF_CV50_F05_,SKF_CV50_F06_,SKF_CV50_F07_,SKF_CV50_F08_,SKF_CV50_F09_,SKF_CV50_F10_,SKF_CV50_F11_,SKF_CV50_F12_,SKF_CV50_F13_,SKF_CV50_F14_,SKF_CV50_F15_,SKF_CV50_F16_,SKF_CV50_F17_,SKF_CV50_F18_,SKF_CV50_F19_,SKF_CV50_F20_,SKF_CV50_F21_,SKF_CV50_F22_,SKF_CV50_F23_,SKF_CV50_F24_,SKF_CV50_F25_,SKF_CV50_F26_,SKF_CV50_F27_,SKF_CV50_F28_,SKF_CV50_F29_,SKF_CV50_F30_,SKF_CV50_F31_,SKF_CV50_F32_,SKF_CV50_F33_,SKF_CV50_F34_,SKF_CV50_F35_,SKF_CV50_F36_,SKF_CV50_F37_,SKF_CV50_F38_,SKF_CV50_F39_,SKF_CV50_F40_,SKF_CV50_F41_,SKF_CV50_F42_,SKF_CV50_F43_,SKF_CV50_F44_,SKF_CV50_F45_,SKF_CV50_F46_,SKF_CV50_F47_,SKF_CV50_F48_,SKF_CV50_F49_
13,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
26,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
29,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
37,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
55,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train


In [18]:
for idx, row in val_folds.iterrows():
    # Turn the row into a list of booleans: True where the instance is marked 'val'
    bool_row = [s == 'val' for s in row.values]

    # Every instance must be a validation instance in exactly one fold
    if np.sum(bool_row) != 1:
        # Report the instance id (`idx`); `row.index` would print the fold column names
        print(f"{idx} is not in exactly one fold")

OK.

### Tell PyCaret's `setup` which folds to use
We are using our predefined splits, and thus need to import scikit-learn's `PredefinedSplit` class.

In [19]:
from pycaret.classification import *
from sklearn.model_selection import PredefinedSplit

In [20]:
val_folds.head()

Unnamed: 0,SKF_CV50_F00_,SKF_CV50_F01_,SKF_CV50_F02_,SKF_CV50_F03_,SKF_CV50_F04_,SKF_CV50_F05_,SKF_CV50_F06_,SKF_CV50_F07_,SKF_CV50_F08_,SKF_CV50_F09_,SKF_CV50_F10_,SKF_CV50_F11_,SKF_CV50_F12_,SKF_CV50_F13_,SKF_CV50_F14_,SKF_CV50_F15_,SKF_CV50_F16_,SKF_CV50_F17_,SKF_CV50_F18_,SKF_CV50_F19_,SKF_CV50_F20_,SKF_CV50_F21_,SKF_CV50_F22_,SKF_CV50_F23_,SKF_CV50_F24_,SKF_CV50_F25_,SKF_CV50_F26_,SKF_CV50_F27_,SKF_CV50_F28_,SKF_CV50_F29_,SKF_CV50_F30_,SKF_CV50_F31_,SKF_CV50_F32_,SKF_CV50_F33_,SKF_CV50_F34_,SKF_CV50_F35_,SKF_CV50_F36_,SKF_CV50_F37_,SKF_CV50_F38_,SKF_CV50_F39_,SKF_CV50_F40_,SKF_CV50_F41_,SKF_CV50_F42_,SKF_CV50_F43_,SKF_CV50_F44_,SKF_CV50_F45_,SKF_CV50_F46_,SKF_CV50_F47_,SKF_CV50_F48_,SKF_CV50_F49_
13,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
26,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
29,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
37,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
55,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train


Add the validation-fold number as a column in the dataframe:

In [21]:
# Import the package explicitly: the earlier star import from `pycaret.classification`
# does not guarantee that the name `pycaret` itself is bound in the notebook namespace.
import pycaret

print(pycaret.__version__)

2.3.4


In [22]:
data.head()

Unnamed: 0,AGE,RAVLT_immediate,AVDEL30MIN_neuro,AVDELTOT_neuro,TRAASCOR_neuro,TRABSCOR_neuro,CATANIMSC_neuro,GDTOTAL_gds,ANARTERR_neuro,LRHHC_n_long,Apoe4_,Subgroup_,Subgroup_num_,Usage_,PTGENDER,Gender_num_
13,80.0,36.0,1.0,8.0,47.0,139.0,16.0,1.0,4.0,0.003458,0.0,cAD,1,train,Female,1
26,76.8,27.0,5.0,10.0,61.0,300.0,19.0,4.0,23.0,0.005251,0.0,sMCI,0,train,Male,0
29,70.9,22.0,0.0,0.0,18.0,145.0,18.0,2.0,12.0,0.003026,1.0,cAD,1,train,Female,1
37,72.8,29.0,1.0,8.0,40.0,101.0,17.0,1.0,6.0,0.002767,0.0,cAD,1,train,Male,0
55,77.6,30.0,2.0,14.0,150.0,300.0,12.0,0.0,9.0,0.002814,0.0,cAD,1,train,Male,0


In [23]:
def get_fold_nb(idx, folds=None):
    """
    Return the fold number in which an instance is used for validation.

    Input:
        idx: an index label (i.e. instance id) present in the fold table
        folds: optional DataFrame with one row per instance and one column per
            fold, holding 'val' in the fold where the instance is validated.
            Defaults to the notebook-level `val_folds` dataframe.
    Return: the 0-based position of the first column marked 'val' in that row
    Raises: ValueError if the row contains no 'val' entry
    """
    # Fall back to the global table so existing one-argument calls keep working
    table = val_folds if folds is None else folds

    labels = list(table.loc[idx].values)
    try:
        return labels.index('val')
    except ValueError:
        raise ValueError(f"instance {idx!r} is not marked 'val' in any fold") from None

Adding this as new column in our `data` dataframe

In [24]:
# One fold number per training instance, looked up by its index label
data['fold_nb'] = list(map(get_fold_nb, data.index))

Look at some arbitrary instances to check that it is correct:

In [25]:
data[120:130]

Unnamed: 0,AGE,RAVLT_immediate,AVDEL30MIN_neuro,AVDELTOT_neuro,TRAASCOR_neuro,TRABSCOR_neuro,CATANIMSC_neuro,GDTOTAL_gds,ANARTERR_neuro,LRHHC_n_long,Apoe4_,Subgroup_,Subgroup_num_,Usage_,PTGENDER,Gender_num_,fold_nb
1476,80.4,46.0,9.0,15.0,29.0,82.0,13.0,1.0,7.0,0.004664,0.0,sMCI,0,train,Female,1,8
1491,80.9,29.0,2.0,11.0,73.0,84.0,21.0,2.0,2.0,0.003177,1.0,cAD,1,train,Male,0,13
1497,69.2,33.0,3.0,14.0,48.0,230.0,11.0,3.0,15.0,0.003481,1.0,cAD,1,train,Female,1,14
1504,65.1,38.0,0.0,5.0,25.0,73.0,15.0,1.0,17.0,0.004641,1.0,cAD,1,train,Female,1,14
1521,82.8,24.0,2.0,12.0,69.0,118.0,14.0,3.0,17.0,0.002981,1.0,sMCI,0,train,Male,0,8
1540,79.6,24.0,1.0,4.0,43.0,143.0,19.0,2.0,13.0,0.003408,1.0,sMCI,0,train,Male,0,8
1546,75.6,36.0,0.0,15.0,61.0,205.0,11.0,1.0,21.0,0.004062,1.0,cAD,1,train,Female,1,14
1552,82.0,29.0,0.0,1.0,80.0,204.0,18.0,2.0,4.0,0.002994,1.0,cAD,1,train,Female,1,14
1579,71.2,51.0,13.0,15.0,43.0,106.0,21.0,0.0,2.0,0.004881,0.0,sMCI,0,train,Male,0,9
1590,74.1,42.0,8.0,11.0,34.0,55.0,18.0,0.0,3.0,0.004407,0.0,sMCI,0,train,Female,1,9


In [26]:
val_folds[120:130]

Unnamed: 0,SKF_CV50_F00_,SKF_CV50_F01_,SKF_CV50_F02_,SKF_CV50_F03_,SKF_CV50_F04_,SKF_CV50_F05_,SKF_CV50_F06_,SKF_CV50_F07_,SKF_CV50_F08_,SKF_CV50_F09_,SKF_CV50_F10_,SKF_CV50_F11_,SKF_CV50_F12_,SKF_CV50_F13_,SKF_CV50_F14_,SKF_CV50_F15_,SKF_CV50_F16_,SKF_CV50_F17_,SKF_CV50_F18_,SKF_CV50_F19_,SKF_CV50_F20_,SKF_CV50_F21_,SKF_CV50_F22_,SKF_CV50_F23_,SKF_CV50_F24_,SKF_CV50_F25_,SKF_CV50_F26_,SKF_CV50_F27_,SKF_CV50_F28_,SKF_CV50_F29_,SKF_CV50_F30_,SKF_CV50_F31_,SKF_CV50_F32_,SKF_CV50_F33_,SKF_CV50_F34_,SKF_CV50_F35_,SKF_CV50_F36_,SKF_CV50_F37_,SKF_CV50_F38_,SKF_CV50_F39_,SKF_CV50_F40_,SKF_CV50_F41_,SKF_CV50_F42_,SKF_CV50_F43_,SKF_CV50_F44_,SKF_CV50_F45_,SKF_CV50_F46_,SKF_CV50_F47_,SKF_CV50_F48_,SKF_CV50_F49_
1476,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
1491,train,train,train,train,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
1497,train,train,train,train,train,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
1504,train,train,train,train,train,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
1521,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
1540,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
1546,train,train,train,train,train,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
1552,train,train,train,train,train,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
1579,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train
1590,train,train,train,train,train,train,train,train,train,val,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train,train


In [27]:
ps = PredefinedSplit(test_fold=data['fold_nb'])

In [28]:
ps

PredefinedSplit(test_fold=array([ 0,  0, ..., 49, 49]))

Checking that the folds contain the correct instances:

These are the rows that should belong to the first fold according to our predefined split. First there is an array with the rows for training, followed by an array with those in the validation fold:

In [29]:
list(ps.split())[0]

(array([  8,   9,  11,  12,  15,  16,  17,  18,  19,  20,  21,  22,  23,
         24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,
         37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,
         50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,
         63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
         76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
         89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101,
        102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
        115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
        141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166,
        167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
        180, 181, 182, 183, 184, 185, 186, 187, 188

This corresponds to the following instances being in the validation fold:

In [30]:
# This mean that these instances should be in validation:
list(ps.split())[0][1]

array([ 0,  1,  2,  3,  4,  5,  6,  7, 10, 13, 14])

In [31]:
len(data.iloc[list(ps.split())[0][1]])

11

In [32]:
data.iloc[list(ps.split())[0][1]]

Unnamed: 0,AGE,RAVLT_immediate,AVDEL30MIN_neuro,AVDELTOT_neuro,TRAASCOR_neuro,TRABSCOR_neuro,CATANIMSC_neuro,GDTOTAL_gds,ANARTERR_neuro,LRHHC_n_long,Apoe4_,Subgroup_,Subgroup_num_,Usage_,PTGENDER,Gender_num_,fold_nb
13,80.0,36.0,1.0,8.0,47.0,139.0,16.0,1.0,4.0,0.003458,0.0,cAD,1,train,Female,1,0
26,76.8,27.0,5.0,10.0,61.0,300.0,19.0,4.0,23.0,0.005251,0.0,sMCI,0,train,Male,0,0
29,70.9,22.0,0.0,0.0,18.0,145.0,18.0,2.0,12.0,0.003026,1.0,cAD,1,train,Female,1,0
37,72.8,29.0,1.0,8.0,40.0,101.0,17.0,1.0,6.0,0.002767,0.0,cAD,1,train,Male,0,0
55,77.6,30.0,2.0,14.0,150.0,300.0,12.0,0.0,9.0,0.002814,0.0,cAD,1,train,Male,0,0
62,66.5,29.0,3.0,10.0,27.0,94.0,9.0,1.0,32.0,0.003679,1.0,cAD,1,train,Male,0,0
75,81.0,26.0,0.0,7.0,38.0,201.0,13.0,2.0,11.0,0.003298,0.0,cAD,1,train,Female,1,0
89,70.0,32.0,4.0,8.0,31.0,61.0,13.0,1.0,35.0,0.004214,1.0,sMCI,0,train,Male,0,0
108,75.1,23.0,0.0,6.0,26.0,54.0,23.0,1.0,10.0,0.003028,0.0,sMCI,0,train,Male,0,0
130,70.3,22.0,2.0,13.0,60.0,170.0,5.0,2.0,42.0,0.004205,1.0,sMCI,0,train,Female,1,0


Comparing to our `fold_nb`-column from above:

In [33]:
len(data.loc[data['fold_nb']==0])

11

In [34]:
data.loc[data['fold_nb']==0]

Unnamed: 0,AGE,RAVLT_immediate,AVDEL30MIN_neuro,AVDELTOT_neuro,TRAASCOR_neuro,TRABSCOR_neuro,CATANIMSC_neuro,GDTOTAL_gds,ANARTERR_neuro,LRHHC_n_long,Apoe4_,Subgroup_,Subgroup_num_,Usage_,PTGENDER,Gender_num_,fold_nb
13,80.0,36.0,1.0,8.0,47.0,139.0,16.0,1.0,4.0,0.003458,0.0,cAD,1,train,Female,1,0
26,76.8,27.0,5.0,10.0,61.0,300.0,19.0,4.0,23.0,0.005251,0.0,sMCI,0,train,Male,0,0
29,70.9,22.0,0.0,0.0,18.0,145.0,18.0,2.0,12.0,0.003026,1.0,cAD,1,train,Female,1,0
37,72.8,29.0,1.0,8.0,40.0,101.0,17.0,1.0,6.0,0.002767,0.0,cAD,1,train,Male,0,0
55,77.6,30.0,2.0,14.0,150.0,300.0,12.0,0.0,9.0,0.002814,0.0,cAD,1,train,Male,0,0
62,66.5,29.0,3.0,10.0,27.0,94.0,9.0,1.0,32.0,0.003679,1.0,cAD,1,train,Male,0,0
75,81.0,26.0,0.0,7.0,38.0,201.0,13.0,2.0,11.0,0.003298,0.0,cAD,1,train,Female,1,0
89,70.0,32.0,4.0,8.0,31.0,61.0,13.0,1.0,35.0,0.004214,1.0,sMCI,0,train,Male,0,0
108,75.1,23.0,0.0,6.0,26.0,54.0,23.0,1.0,10.0,0.003028,0.0,sMCI,0,train,Male,0,0
130,70.3,22.0,2.0,13.0,60.0,170.0,5.0,2.0,42.0,0.004205,1.0,sMCI,0,train,Female,1,0


## Pycaret setup

Setting up PyCaret experiment
- correct train and test data split specified in `2.0-random_forest`-notebook
- correct target (i.e. `Subgroup_num_`), 
- ignoring `Subgroup_`, `fold_nb`, `Usage_` and `PTGENDER`, 
- using the predefined cross-validation setup, and 
- specifying a `session_id`

In [35]:
# Initialise the PyCaret experiment: train on `data`, hold out `data_unseen`,
# cross-validate with the predefined split `ps`, fixed session id.
clf = setup(
    data,
    target='Subgroup_num_',
    test_data=data_unseen,
    ignore_features=['Subgroup_', 'fold_nb', 'Usage_', 'PTGENDER'],
    fold_strategy=ps,
    session_id=1138,
)

Unnamed: 0,Description,Value
0,session_id,1138
1,Target,Subgroup_num_
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(539, 17)"
5,Missing Values,False
6,Numeric Features,10
7,Categorical Features,2
8,Ordinal Features,False
9,High Cardinality Features,False


# Construct models

In [36]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


Selecting the top 5 best performing:

In [37]:
top5 = compare_models(n_select=5, sort='Accuracy')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7336,0.7877,0.726,0.7403,0.7115,0.4653,0.4859,0.2092
rf,Random Forest Classifier,0.7222,0.7872,0.7387,0.7159,0.7093,0.4441,0.4606,0.265
lda,Linear Discriminant Analysis,0.7207,0.7951,0.734,0.7166,0.7101,0.4425,0.4573,0.0126
ada,Ada Boost Classifier,0.712,0.7384,0.7193,0.7153,0.7,0.4239,0.4381,0.1834
lr,Logistic Regression,0.7065,0.7676,0.7267,0.6856,0.6949,0.4119,0.419,0.2016
nb,Naive Bayes,0.7027,0.7528,0.77,0.6865,0.7085,0.4085,0.4304,0.0118
ridge,Ridge Classifier,0.7025,0.0,0.742,0.6931,0.7008,0.4061,0.4188,0.0088
lightgbm,Light Gradient Boosting Machine,0.6822,0.7545,0.6753,0.6677,0.6585,0.3624,0.3732,0.0432
qda,Quadratic Discriminant Analysis,0.6805,0.748,0.674,0.6648,0.6537,0.3571,0.3735,0.014
gbc,Gradient Boosting Classifier,0.6724,0.7323,0.692,0.6541,0.6566,0.3439,0.3566,0.1094


Tuning the selected models:

In [38]:
top5

[ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='auto',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=1138, verbose=0,
                      warm_start=False),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=-

In [39]:
# Tune each of the selected models in turn
tuned_top5 = [tune_model(estimator) for estimator in top5]

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7273,0.8,0.8333,0.7143,0.7692,0.4407,0.4485
1,0.7273,0.8,0.8333,0.7143,0.7692,0.4407,0.4485
2,0.5455,0.6,0.6667,0.5714,0.6154,0.0678,0.069
3,0.8182,0.9,1.0,0.75,0.8571,0.6207,0.6708
4,0.7273,0.7333,0.6667,0.8,0.7273,0.459,0.4667
5,0.3636,0.2667,0.3333,0.4,0.3636,-0.2623,-0.2667
6,0.8182,0.8667,1.0,0.7143,0.8333,0.6452,0.6901
7,0.7273,0.8333,0.6,0.75,0.6667,0.4407,0.4485
8,0.6364,0.6667,1.0,0.5556,0.7143,0.3125,0.4303
9,0.6364,0.8,0.6,0.6,0.6,0.2667,0.2667


In [40]:
tuned_top5

[ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                      class_weight='balanced_subsample', criterion='entropy',
                      max_depth=10, max_features=1.0, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.002,
                      min_impurity_split=None, min_samples_leaf=2,
                      min_samples_split=5, min_weight_fraction_leaf=0.0,
                      n_estimators=260, n_jobs=-1, oob_score=False,
                      random_state=1138, verbose=0, warm_start=False),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={},
                        criterion='gini', max_depth=11, max_features='log2',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0001, min_impurity_split=None,
                        min_samples_leaf=6, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=190,
                 

In [41]:
# Ensemble each *tuned* model. Stored under its own name: the next cell assigns
# `bagged_top5` (ensembles of the untuned models), which previously overwrote this
# result and left the tuned ensembles unreachable.
bagged_tuned_top5 = [ensemble_model(estimator) for estimator in tuned_top5]

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7273,0.7667,0.8333,0.7143,0.7692,0.4407,0.4485
1,0.7273,0.8,0.8333,0.7143,0.7692,0.4407,0.4485
2,0.5455,0.6333,0.6667,0.5714,0.6154,0.0678,0.069
3,0.8182,0.9,1.0,0.75,0.8571,0.6207,0.6708
4,0.7273,0.7667,0.6667,0.8,0.7273,0.459,0.4667
5,0.3636,0.2667,0.3333,0.4,0.3636,-0.2623,-0.2667
6,0.8182,0.8667,1.0,0.7143,0.8333,0.6452,0.6901
7,0.8182,0.8333,0.8,0.8,0.8,0.6333,0.6333
8,0.4545,0.6667,0.6,0.4286,0.5,-0.0645,-0.069
9,0.7273,0.8,0.6,0.75,0.6667,0.4407,0.4485


In [42]:
# Ensemble each of the untuned top-5 models
bagged_top5 = [ensemble_model(estimator) for estimator in top5]

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7273,0.7667,0.8333,0.7143,0.7692,0.4407,0.4485
1,0.7273,0.8,0.8333,0.7143,0.7692,0.4407,0.4485
2,0.5455,0.6333,0.6667,0.5714,0.6154,0.0678,0.069
3,0.8182,0.9,1.0,0.75,0.8571,0.6207,0.6708
4,0.7273,0.7667,0.6667,0.8,0.7273,0.459,0.4667
5,0.3636,0.2667,0.3333,0.4,0.3636,-0.2623,-0.2667
6,0.8182,0.8667,1.0,0.7143,0.8333,0.6452,0.6901
7,0.8182,0.8333,0.8,0.8,0.8,0.6333,0.6333
8,0.5455,0.6667,0.8,0.5,0.6154,0.127,0.1491
9,0.7273,0.8,0.6,0.75,0.6667,0.4407,0.4485


In [43]:
nontuned_blender = blend_models(estimator_list = top5) 

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8182,0.8667,1.0,0.75,0.8571,0.6207,0.6708
1,0.8182,0.9,0.8333,0.8333,0.8333,0.6333,0.6333
2,0.5455,0.5667,0.6667,0.5714,0.6154,0.0678,0.069
3,0.8182,1.0,1.0,0.75,0.8571,0.6207,0.6708
4,0.7273,0.8,0.6667,0.8,0.7273,0.459,0.4667
5,0.3636,0.3333,0.3333,0.4,0.3636,-0.2623,-0.2667
6,0.9091,0.9333,1.0,0.8333,0.9091,0.8197,0.8333
7,0.6364,0.7333,0.6,0.6,0.6,0.2667,0.2667
8,0.5455,0.6,0.8,0.5,0.6154,0.127,0.1491
9,0.7273,0.8667,0.6,0.75,0.6667,0.4407,0.4485


In [44]:
tuned_blender = blend_models(estimator_list = tuned_top5) 

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8182,0.8667,1.0,0.75,0.8571,0.6207,0.6708
1,0.8182,0.8333,0.8333,0.8333,0.8333,0.6333,0.6333
2,0.5455,0.6,0.6667,0.5714,0.6154,0.0678,0.069
3,0.8182,0.9667,1.0,0.75,0.8571,0.6207,0.6708
4,0.7273,0.8,0.6667,0.8,0.7273,0.459,0.4667
5,0.3636,0.3333,0.3333,0.4,0.3636,-0.2623,-0.2667
6,0.9091,0.9667,1.0,0.8333,0.9091,0.8197,0.8333
7,0.7273,0.7333,0.8,0.6667,0.7273,0.459,0.4667
8,0.5455,0.6667,0.8,0.5,0.6154,0.127,0.1491
9,0.7273,0.8333,0.6,0.75,0.6667,0.4407,0.4485


In [45]:
bagged_blender = blend_models(estimator_list = bagged_top5)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8182,0.8667,1.0,0.75,0.8571,0.6207,0.6708
1,0.8182,0.8667,0.8333,0.8333,0.8333,0.6333,0.6333
2,0.5455,0.5667,0.6667,0.5714,0.6154,0.0678,0.069
3,0.9091,0.9333,1.0,0.8571,0.9231,0.8136,0.8281
4,0.7273,0.8,0.6667,0.8,0.7273,0.459,0.4667
5,0.3636,0.3,0.3333,0.4,0.3636,-0.2623,-0.2667
6,0.9091,0.9667,1.0,0.8333,0.9091,0.8197,0.8333
7,0.7273,0.7333,0.8,0.6667,0.7273,0.459,0.4667
8,0.6364,0.6,0.8,0.5714,0.6667,0.2903,0.3105
9,0.7273,0.8667,0.6,0.75,0.6667,0.4407,0.4485


Optimizing for accuracy, we use the `automl` function to select the best-performing model:

In [46]:
best = automl(optimize='Accuracy')

In [47]:
best

VotingClassifier(estimators=[('Bagging',
                              BaggingClassifier(base_estimator=ExtraTreesClassifier(bootstrap=False,
                                                                                    ccp_alpha=0.0,
                                                                                    class_weight=None,
                                                                                    criterion='gini',
                                                                                    max_depth=None,
                                                                                    max_features='auto',
                                                                                    max_leaf_nodes=None,
                                                                                    max_samples=None,
                                                                                    min_impurity_decrease=0.0,
                               

In [48]:
# Fetch the most recent score grid as a DataFrame
test = pull()
# Bare last expression: rendered as a rich table instead of a plain-text print
test

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
0       0.8182  0.8667  1.0000  0.7500  0.8571  0.6207  0.6708
1       0.8182  0.8667  0.8333  0.8333  0.8333  0.6333  0.6333
2       0.5455  0.5667  0.6667  0.5714  0.6154  0.0678  0.0690
3       0.9091  0.9333  1.0000  0.8571  0.9231  0.8136  0.8281
4       0.7273  0.8000  0.6667  0.8000  0.7273  0.4590  0.4667
5       0.3636  0.3000  0.3333  0.4000  0.3636 -0.2623 -0.2667
6       0.9091  0.9667  1.0000  0.8333  0.9091  0.8197  0.8333
7       0.7273  0.7333  0.8000  0.6667  0.7273  0.4590  0.4667
8       0.6364  0.6000  0.8000  0.5714  0.6667  0.2903  0.3105
9       0.7273  0.8667  0.6000  0.7500  0.6667  0.4407  0.4485
10      0.4545  0.4333  0.6000  0.4286  0.5000 -0.0645 -0.0690
11      0.7273  0.9333  0.8000  0.6667  0.7273  0.4590  0.4667
12      0.8182  0.8667  1.0000  0.7143  0.8333  0.6452  0.6901
13      0.8182  0.9000  0.8000  0.8000  0.8000  0.6333  0.6333
14      0.9091  1.0000  1.0000  0.8333  0.9091  0.8197 

# Export best performing model to results directory 

> The bagged blender was the best performing model, save this:

In [49]:
save_model(bagged_blender, '../results/3.0-251121_best_blended_accuracy_top5')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['Subgroup_', 'fold_nb',
                                                        'Usage_', 'PTGENDER'],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[],
                                       target='Subgroup_num_',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categori...
                                                                                                    n_jobs=None,
                                                                                            

In [50]:
save_model(best, '../results/3.0-251121_best_model_accuracy_top5')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['Subgroup_', 'fold_nb',
                                                        'Usage_', 'PTGENDER'],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[],
                                       target='Subgroup_num_',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categori...
                                                                                                    n_jobs=None,
                                                                                            