# Random forest

## Create and train the model

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
# Load the dataset for training (with labels)
df = pd.read_csv('../pre_processing_and_viz/df_train.csv')
df

Unnamed: 0,ID,Age,Gender,MMSE,Site,label,label_id,site_id,Delta-1:4,Theta-4:8,Alpha-8:13,Beta-13:30,Gamma-30:40,gender_id
0,hokuto_dementia1,60,M,13,A,dementia,2,0,1.527738e+03,1.400718e+03,1.223784e+03,3.281070e+03,1.531131e+03,1
1,hokuto_dementia2,64,M,17,B,dementia,2,1,0.000000e+00,1.470078e+06,0.000000e+00,2.326050e+06,1.262857e+06,1
2,hokuto_dementia3,69,F,9,A,dementia,2,0,1.278396e+03,1.214818e+03,1.133075e+03,3.814643e+03,1.928315e+03,0
3,hokuto_dementia4,70,M,22,B,dementia,2,1,0.000000e+00,3.975737e+05,0.000000e+00,6.295413e+05,3.441654e+05,1
4,hokuto_dementia5,73,M,18,A,dementia,2,0,9.715404e+03,8.743269e+03,7.358837e+03,1.555337e+04,3.254478e+03,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,hokuto_control96,80,M,27,B,control,0,1,0.000000e+00,3.654249e+04,0.000000e+00,5.970315e+04,3.707580e+04,1
140,hokuto_control97,81,M,28,B,control,0,1,0.000000e+00,5.084338e+05,0.000000e+00,8.046457e+05,4.396953e+05,1
141,hokuto_control98,81,M,27,A,control,0,0,1.813328e+06,1.743510e+06,1.624769e+06,4.545834e+06,1.305148e+06,1
142,hokuto_control99,85,M,28,B,control,0,1,0.000000e+00,1.776107e+05,0.000000e+00,2.819723e+05,1.563449e+05,1


In [28]:

# drop mmse because this feature is not given in the df_test. Drop Site because its only implies a sample freq different which is already taken into account for the features extraction


features = df.drop(['ID', 'label', 'label_id', 'Site', 'Gender', 'MMSE'], axis=1)

features.head()

features



Unnamed: 0,Age,site_id,Delta-1:4,Theta-4:8,Alpha-8:13,Beta-13:30,Gamma-30:40,gender_id
0,60,0,1.527738e+03,1.400718e+03,1.223784e+03,3.281070e+03,1.531131e+03,1
1,64,1,0.000000e+00,1.470078e+06,0.000000e+00,2.326050e+06,1.262857e+06,1
2,69,0,1.278396e+03,1.214818e+03,1.133075e+03,3.814643e+03,1.928315e+03,0
3,70,1,0.000000e+00,3.975737e+05,0.000000e+00,6.295413e+05,3.441654e+05,1
4,73,0,9.715404e+03,8.743269e+03,7.358837e+03,1.555337e+04,3.254478e+03,1
...,...,...,...,...,...,...,...,...
139,80,1,0.000000e+00,3.654249e+04,0.000000e+00,5.970315e+04,3.707580e+04,1
140,81,1,0.000000e+00,5.084338e+05,0.000000e+00,8.046457e+05,4.396953e+05,1
141,81,0,1.813328e+06,1.743510e+06,1.624769e+06,4.545834e+06,1.305148e+06,1
142,85,1,0.000000e+00,1.776107e+05,0.000000e+00,2.819723e+05,1.563449e+05,1


In [89]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Préparation des caractéristiques et des étiquettes
features = df.drop(
    ['ID', 'label', 'label_id', 'Site', 'Gender', 'MMSE'], axis=1)
labels = df['label_id']

# Normalisation des caractéristiques
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Application de PCA pour réduire la dimensionnalité
pca = PCA(n_components=0.9)
features_reduced = pca.fit_transform(features_scaled)

# Initialisation du classificateur Random Forest
rf = RandomForestClassifier(n_estimators=50)

# Stratified K-Fold pour maintenir la proportion de chaque classe dans chaque pli
kf = StratifiedKFold(n_splits=10, shuffle=True)

# Utiliser cross_val_score pour effectuer la validation croisée
scores = cross_val_score(rf, features_reduced, labels,
                         cv=kf, scoring='accuracy')

print(f"Cross-validation scores: {scores}")
print(f"Mean accuracy: {scores.mean()}")


Cross-validation scores: [0.93333333 0.66666667 0.66666667 0.86666667 0.85714286 0.92857143
 1.         0.85714286 0.78571429 0.92857143]
Mean accuracy: 0.849047619047619


## Use the trained model to predict the test data 

In [30]:
df_test = pd.read_csv('../pre_processing_and_viz/df_test.csv')
df_test.head()



Unnamed: 0,ID,Age,Gender,Site,gender_id,site_id,label,label_id,Delta-1:4,Theta-4:8,Alpha-8:13,Beta-13:30,Gamma-30:40
0,hokuto_test1,64,M,A,1,0,,,25648300.0,25225210.0,24518520.0,85506270.0,44289850.0
1,hokuto_test2,70,F,A,0,1,,,2838.066,2804.487,2718.521,8439.152,2834.072
2,hokuto_test3,61,F,B,0,0,,,0.0,15253.85,0.0,26201.35,18670.67
3,hokuto_test4,83,F,B,0,1,,,0.0,20930.9,0.0,36859.6,26635.69
4,hokuto_test5,71,M,A,1,0,,,336.8586,346.8851,362.9394,1620.425,1151.548


In [31]:
X_test = df_test.drop(
    ['ID', 'label', 'label_id', 'Site', 'Gender', 'label', 'label_id'], axis=1)
display(X_test.head())
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)
pca = PCA(n_components=0.9)
X_test_reduced = pca.fit_transform(X_test)

X_test_reduced

Unnamed: 0,Age,gender_id,site_id,Delta-1:4,Theta-4:8,Alpha-8:13,Beta-13:30,Gamma-30:40
0,64,1,0,25648300.0,25225210.0,24518520.0,85506270.0,44289850.0
1,70,0,1,2838.066,2804.487,2718.521,8439.152,2834.072
2,61,0,0,0.0,15253.85,0.0,26201.35,18670.67
3,83,0,1,0.0,20930.9,0.0,36859.6,26635.69
4,71,1,0,336.8586,346.8851,362.9394,1620.425,1151.548


array([[ 1.43896839e+01,  6.19821414e-02, -4.96615323e-01,
        -6.31214539e-02],
       [-5.42511903e-01,  1.03105080e+00, -5.91980194e-01,
        -1.68252887e-01],
       [-3.12837580e-01,  1.87153659e-01, -5.39467069e-02,
         1.73785946e+00],
       [-6.27165270e-01,  2.17879131e-01, -1.05021614e+00,
        -9.05098087e-01],
       [-1.26613759e-01, -8.53499751e-01,  1.48033437e+00,
         4.26448394e-01],
       [-5.53116596e-01,  7.81528258e-01, -7.34909661e-01,
        -3.96367075e-01],
       [-5.21355461e-01,  1.21876978e+00, -4.86455995e-01,
         1.66151650e-03],
       [-2.78760347e-01,  2.42867277e-01,  1.07680571e+00,
        -1.25753901e+00],
       [-4.11406112e-01,  2.21985146e+00,  7.68043665e-02,
         9.08031024e-01],
       [-2.87746680e-01,  4.37305749e-01,  8.71382545e-02,
         1.96469732e+00],
       [-4.94042125e-01,  5.35033915e-01, -8.86711419e-01,
        -6.30619385e-01],
       [-5.06778082e-01, -1.56482977e+00, -1.03951763e+00,
      

In [32]:
# Faire des prédictions
predicted_labels = rf.predict(X_test_reduced)
display(predicted_labels)

ValueError: X has 4 features, but RandomForestClassifier is expecting 1 features as input.

# TEST SMOTE technique


In [None]:
X = features 
y = df['label_id']


0      2
1      2
2      2
3      2
4      2
      ..
139    0
140    0
141    0
142    0
143    0
Name: label_id, Length: 144, dtype: int64