# Random forest

## Create and train the model

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
# Load the dataset for training (with labels)
df = pd.read_csv('../pre_processing_and_viz/df_train.csv')
df

Unnamed: 0,ID,Age,Gender,MMSE,Site,label,label_id,site_id,Delta-1:4,Theta-4:8,Alpha-8:13,Beta-13:30,Gamma-30:40,gender_id
0,hokuto_dementia1,60,M,13,A,dementia,2,0,1.527738e+03,1.400718e+03,1.223784e+03,3.281070e+03,1.531131e+03,1
1,hokuto_dementia2,64,M,17,B,dementia,2,1,0.000000e+00,1.470078e+06,0.000000e+00,2.326050e+06,1.262857e+06,1
2,hokuto_dementia3,69,F,9,A,dementia,2,0,1.278396e+03,1.214818e+03,1.133075e+03,3.814643e+03,1.928315e+03,0
3,hokuto_dementia4,70,M,22,B,dementia,2,1,0.000000e+00,3.975737e+05,0.000000e+00,6.295413e+05,3.441654e+05,1
4,hokuto_dementia5,73,M,18,A,dementia,2,0,9.715404e+03,8.743269e+03,7.358837e+03,1.555337e+04,3.254478e+03,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,hokuto_control96,80,M,27,B,control,0,1,0.000000e+00,3.654249e+04,0.000000e+00,5.970315e+04,3.707580e+04,1
140,hokuto_control97,81,M,28,B,control,0,1,0.000000e+00,5.084338e+05,0.000000e+00,8.046457e+05,4.396953e+05,1
141,hokuto_control98,81,M,27,A,control,0,0,1.813328e+06,1.743510e+06,1.624769e+06,4.545834e+06,1.305148e+06,1
142,hokuto_control99,85,M,28,B,control,0,1,0.000000e+00,1.776107e+05,0.000000e+00,2.819723e+05,1.563449e+05,1


In [26]:

# drop mmse because this feature is not given in the df_test. Drop Site because its only implies a sample freq different which is already taken into account for the features extraction


features = df.drop(['ID', 'label', 'label_id', 'Site', 'Gender', 'MMSE'], axis=1)

features.head()

features



Unnamed: 0,Age,site_id,Delta-1:4,Theta-4:8,Alpha-8:13,Beta-13:30,Gamma-30:40,gender_id
0,60,0,1.527738e+03,1.400718e+03,1.223784e+03,3.281070e+03,1.531131e+03,1
1,64,1,0.000000e+00,1.470078e+06,0.000000e+00,2.326050e+06,1.262857e+06,1
2,69,0,1.278396e+03,1.214818e+03,1.133075e+03,3.814643e+03,1.928315e+03,0
3,70,1,0.000000e+00,3.975737e+05,0.000000e+00,6.295413e+05,3.441654e+05,1
4,73,0,9.715404e+03,8.743269e+03,7.358837e+03,1.555337e+04,3.254478e+03,1
...,...,...,...,...,...,...,...,...
139,80,1,0.000000e+00,3.654249e+04,0.000000e+00,5.970315e+04,3.707580e+04,1
140,81,1,0.000000e+00,5.084338e+05,0.000000e+00,8.046457e+05,4.396953e+05,1
141,81,0,1.813328e+06,1.743510e+06,1.624769e+06,4.545834e+06,1.305148e+06,1
142,85,1,0.000000e+00,1.776107e+05,0.000000e+00,2.819723e+05,1.563449e+05,1


In [27]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Préparation des caractéristiques et des étiquettes
features = df.drop(
    ['ID', 'label', 'label_id', 'Site', 'Gender', 'MMSE'], axis=1)
labels = df['label_id']

# Normalisation des caractéristiques
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Application de PCA pour réduire la dimensionnalité
pca = PCA(n_components=0.9)
features_reduced = pca.fit_transform(features_scaled)

# Configuration du modèle Random Forest
rf = RandomForestClassifier(random_state=42)

# Configuration de la validation croisée Stratified K-Fold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Définition de la grille d'hyperparamètres à tester
param_grid = {
    'n_estimators': [10, 20, 35, 50],
    'max_depth': [None,1, 2, 5],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 4, 5],
}

# Initialisation de GridSearchCV
grid_search = GridSearchCV(rf, param_grid, cv=kf,
                           scoring='accuracy', n_jobs=-1)

# Recherche des meilleurs hyperparamètres
grid_search.fit(features_reduced, labels)

# Affichage des meilleurs paramètres et du meilleur score
print("Meilleurs paramètres:", grid_search.best_params_)
print("Meilleur score:", grid_search.best_score_)

# Utilisation du meilleur modèle trouvé pour entraîner sur l'ensemble des données
rf_final = grid_search.best_estimator_



Meilleurs paramètres: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 10}
Meilleur score: 0.888423645320197


## Use the trained model to predict the test data 

In [36]:
df_test = pd.read_csv('../pre_processing_and_viz/df_test.csv')
df_test.head()



Unnamed: 0,ID,Age,Gender,Site,gender_id,site_id,label,label_id,Delta-1:4,Theta-4:8,Alpha-8:13,Beta-13:30,Gamma-30:40
0,hokuto_test1,64,M,A,1,0,,,25648300.0,25225210.0,24518520.0,85506270.0,44289850.0
1,hokuto_test2,70,F,A,0,1,,,2838.066,2804.487,2718.521,8439.152,2834.072
2,hokuto_test3,61,F,B,0,0,,,0.0,15253.85,0.0,26201.35,18670.67
3,hokuto_test4,83,F,B,0,1,,,0.0,20930.9,0.0,36859.6,26635.69
4,hokuto_test5,71,M,A,1,0,,,336.8586,346.8851,362.9394,1620.425,1151.548


In [37]:
X_test = df_test.drop(
    ['ID', 'label', 'label_id', 'Site', 'Gender', 'label', 'label_id'], axis=1)
display(X_test.head())
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)
pca = PCA(n_components=0.9)
X_test_reduced = pca.fit_transform(X_test)
X_test_reduced.shape

Unnamed: 0,Age,gender_id,site_id,Delta-1:4,Theta-4:8,Alpha-8:13,Beta-13:30,Gamma-30:40
0,64,1,0,25648300.0,25225210.0,24518520.0,85506270.0,44289850.0
1,70,0,1,2838.066,2804.487,2718.521,8439.152,2834.072
2,61,0,0,0.0,15253.85,0.0,26201.35,18670.67
3,83,0,1,0.0,20930.9,0.0,36859.6,26635.69
4,71,1,0,336.8586,346.8851,362.9394,1620.425,1151.548


(42, 4)

In [38]:
# Prédire les classes pour les données de test
predicted_labels = rf_final.predict(X_test_reduced)

# Obtenir les probabilités des classes prédites
predicted_probabilities = rf_final.predict_proba(X_test_reduced)

# Pour chaque instance de test, sélectionner la probabilité de la classe prédite
plausibilities = predicted_probabilities.max(axis=1)

df_test["Estimated diagnoses"] = predicted_labels
df_test['Estimated diagnoses'] = df_test['Estimated diagnoses'].map({
                                                                        0: "control", 1: "mci", 2: "dementia"})
df_test["Plasibility"] = plausibilities
df_test.head()

Unnamed: 0,ID,Age,Gender,Site,gender_id,site_id,label,label_id,Delta-1:4,Theta-4:8,Alpha-8:13,Beta-13:30,Gamma-30:40,Estimated diagnoses,Plasibility
0,hokuto_test1,64,M,A,1,0,,,25648300.0,25225210.0,24518520.0,85506270.0,44289850.0,control,1.0
1,hokuto_test2,70,F,A,0,1,,,2838.066,2804.487,2718.521,8439.152,2834.072,control,0.525
2,hokuto_test3,61,F,B,0,0,,,0.0,15253.85,0.0,26201.35,18670.67,control,0.516667
3,hokuto_test4,83,F,B,0,1,,,0.0,20930.9,0.0,36859.6,26635.69,dementia,0.525
4,hokuto_test5,71,M,A,1,0,,,336.8586,346.8851,362.9394,1620.425,1151.548,mci,0.733333


In [46]:
#Store the result in the excel answer sheet

df_answer = pd.read_excel('../answer_sheet.xlsx')

df_answer = df_answer[["Test data ID"]].merge(
    df_test[['ID', 'Estimated diagnoses', 'Plasibility']], left_on='Test data ID', right_on='ID').drop('ID', axis=1)

df_answer.to_excel('../answer_sheet.xlsx', index=False)