In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification # To test if RandomForestClassifier is imported correctly
from sklearn.metrics import f1_score
from data_cleaning_utils import filter_columns

In [None]:
# Load the full dataset from the CSV file into a DataFrame.
DATA_PATH = './data/full_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Print a summary of the loaded DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Smoke test: run this to make sure the scikit-learn imports work.
# The demo_* names are deliberately distinct from X / y / clf so that running
# this cell at any point never replaces the real training data or model with
# the synthetic ones built here.
demo_X, demo_y = make_classification(n_samples=1000, n_features=4,
                                     n_informative=2, n_redundant=0,
                                     random_state=0, shuffle=False)
demo_clf = RandomForestClassifier(max_depth=2, random_state=0)
demo_clf.fit(demo_X, demo_y)
print(demo_clf.predict([[0, 0, 0, 0]]))  # Should display [1]

# Here we prepare our training data
X -> matrix of shape (n_samples, n_features) holding the input features \
y -> array of shape (n_samples,) holding the desired output (here I will try with **grav**)

In [None]:
# Reproducible 70/30 train/test split of `df`, followed by the separation of
# the feature matrix and the target vector for each part.

# Fixed seed so the split (and every score computed from it) is repeatable.
SPLIT_SEED = 0
dftrain = df.sample(frac=0.7, random_state=SPLIT_SEED)

# The test set is the index complement of the training sample. Unlike a
# value-based merge anti-join, this keeps test rows whose values happen to
# duplicate a training row, and it needs no temporary '_merge' column.
dftest = df.drop(index=dftrain.index)
# Fails if the index of `df` is not unique (the complement would be wrong).
assert len(dftrain) + len(dftest) == len(df), "train/test split is not a partition of df"

yColumn = "grav"
# Ordered list (not a set): the feature order is the column order of `df`,
# identical for train and test and stable across kernel restarts.
new_cols = [col for col in df.columns if col != yColumn]


def split_features_target(frame, target, feature_cols):
    """Split ``frame`` into a feature matrix and a target list.

    Parameters:
        frame: DataFrame containing the ``target`` column and ``feature_cols``.
        target: name of the column to predict.
        feature_cols: ordered list of the feature column names.

    Returns:
        (features, labels): a NumPy array built from ``feature_cols`` and a
        list with the values of the ``target`` column.

    Raises:
        ValueError: if the ``target`` column contains missing values.
    """
    if frame[target].isnull().any():
        raise ValueError("missing values in column '" + target + "'")
    return frame[feature_cols].to_numpy(), frame[target].to_list()


# -- Training data --
X, y = split_features_target(dftrain, yColumn, new_cols)

# -- Same for testing data --
Xtest, ytest = split_features_target(dftest, yColumn, new_cols)

In [None]:
# ---- DEPRECATED ----
# Legacy hold-out: keeps only the first two thirds of X / y for training.
# It overwrites X and y in place, so every execution shrinks them further.
# It is therefore disabled by default; set the flag to True only to reproduce
# the old "deprecated sampling" scores.
USE_DEPRECATED_HOLDOUT = False

if USE_DEPRECATED_HOLDOUT:
    x1, x2, x3 = np.array_split(X, 3)
    X = np.concatenate([x1, x2])

    y1, y2, y3 = np.array_split(y, 3)
    y = np.concatenate([y1, y2])

# Number of training samples and labels actually used; both must match.
print(len(X))
print(len(y))

# Training the RFC

In [None]:
# Fit the random forest on the training split.
# random_state is fixed so that the fitted forest, and the scores reported
# from it, are reproducible from one run to the next.
clf = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=10,
    random_state=0,
)
clf.fit(X, y)

In [None]:
# Weighted F1 of the trained forest on the held-out test set.
# Scores recorded from earlier runs:
#   Deprecated sampling   100 trees : 0.643244001988855
#                        1000 trees : 0.6480447511164154
#   Correct sampling      100 trees : 0.6617
#                        1000 trees : 0.6640904014522145
y_pred = clf.predict(Xtest)
f1 = f1_score(y_true=ytest, y_pred=y_pred, average="weighted")
print(f1)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc
import itertools

# Labels shown on the axes. They are taken from the evaluated data (true and
# predicted test labels) and passed explicitly to confusion_matrix, so each
# tick is guaranteed to name the row/column it sits on. Taking them from the
# training labels could give a different number or order of classes.
classes = np.unique(np.concatenate([ytest, y_pred]))

# Compute the confusion matrix (rows: true labels, columns: predictions)
conf_matrix = confusion_matrix(ytest, y_pred, labels=classes)

# Display the confusion matrix
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Matrice de Confusion')
plt.colorbar()

# Add the class labels to the axes
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes)
plt.yticks(tick_marks, classes)

# Write each count in its cell; white text on dark cells, black on light ones
thresh = conf_matrix.max() / 2.
for i, j in itertools.product(range(conf_matrix.shape[0]), range(conf_matrix.shape[1])):
    plt.text(j, i, format(conf_matrix[i, j], 'd'),
             horizontalalignment="center",
             color="white" if conf_matrix[i, j] > thresh else "black")

plt.ylabel('Vraies valeurs')
plt.xlabel('Prédictions')
plt.show()

# Recherche des meilleurs hyperparamètres

In [None]:
import os
from sklearn.model_selection import GridSearchCV

# The original line assigned a Python variable, which has no effect.
# NOTE(review): presumably this was meant to set the environment variable that
# silences the debugger's file-validation warning (also in the worker
# processes started by the parallel search) -- confirm.
os.environ["PYDEVD_DISABLE_FILE_VALIDATION"] = "1"

# Hyperparameters to tune.
# - 'log_loss' is omitted: for random forests it is the same split criterion
#   as 'entropy', so searching both only repeats the same fits.
# - n_jobs is not a hyperparameter; parallelism is left to GridSearchCV below
#   to avoid nesting one full-core pool inside another.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8, 16],
}
# Base estimator; the fixed seed makes every candidate comparable and the
# search reproducible.
clf = RandomForestClassifier(random_state=0)
# Use GridSearchCV to find the best hyperparameters. Candidates are ranked
# with the weighted F1 score, the same metric used to evaluate on the test set.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X, y)
# Display the best hyperparameters
print("Meilleurs hyperparamètres :", grid_search.best_params_)
# Keep the model refitted with the best hyperparameters
best_clf = grid_search.best_estimator_

In [None]:
from sklearn.metrics import classification_report

# Evaluate the tuned forest on the held-out test set: weighted F1 first,
# then the per-class precision / recall / F1 report.
y_pred = best_clf.predict(Xtest)
f1 = f1_score(y_true=ytest, y_pred=y_pred, average="weighted")
print("F1 score of best_clf : " + str(f1))
print(classification_report(y_true=ytest, y_pred=y_pred))