In [None]:
import pandas as pd
import numpy as np
from data_explore_utils import generate_sample_data
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification # To test if RandomForestClassifier is imported correctly
from sklearn.metrics import f1_score
from data_cleaning_utils import filter_columns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc
import itertools

In [None]:
# Load the full dataset from ./data/full_data.csv (path is relative to the
# working directory) into the DataFrame used by the rest of the notebook.
df = pd.read_csv('./data/full_data.csv')

# Here we prepare our training data
We split the dataframe into two groups: 70% of the rows for training and 30% for testing. \
X -> matrix (n_samples, n_features) \
y -> array (n_samples) with the desired output (here we use **grav**)

In [None]:
# Split `df` into a 70% training set and a 30% test set, then build the
# feature matrix / target array pair for each of them.
SPLIT_SEED = 42  # fixed seed so the split is identical on every run

try:
    # Randomly draw 70% of the rows for training.
    dftrain = df.sample(frac=0.7, random_state=SPLIT_SEED)

    # The test set is every row that was NOT drawn. Dropping by index label is
    # exact even when the data contains duplicated rows, which a merge on all
    # columns would wrongly match against the training set.
    dftest = df.drop(dftrain.index)

    # -- Extracting y from dataframe training and creating array X --
    X, y = generate_sample_data(dftrain)

    # -- Same for testing data (built from dftest, never from dftrain) --
    Xtest, ytest = generate_sample_data(dftest)

except Exception as e:
    # Show the message, then re-raise so the cell fails with a full traceback
    # and "Run All" stops here, without shutting the kernel down.
    print(e)
    raise

## Prediction
In this phase we train a random forest classifier and score its predictions.

In [None]:
# Baseline model: fit a random forest on (X, y) and score it on (Xtest, ytest).
clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    n_jobs=-1,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=10,
    random_state=42,  # fixed seed: the forest is stochastic, so without it the score changes on every run
)
clf.fit(X, y)

# Weighted F1: per-class F1 averaged with each class weighted by its support.
y_pred = clf.predict(Xtest)
f1 = f1_score(ytest, y_pred, average="weighted")
print('Score for this prediction', f1)

# Training the RFC

In [None]:


# Class labels shown on the axes: the sorted union of the labels that occur in
# the true and predicted values. The same array is passed to confusion_matrix
# so that row/column k of the matrix is guaranteed to be classes[k].
classes = np.union1d(ytest, y_pred)

# Compute the confusion matrix (rows: true labels, columns: predictions)
conf_matrix = confusion_matrix(ytest, y_pred, labels=classes)

# Display the confusion matrix as a heat map
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Matrice de Confusion')
plt.colorbar()

# Label both axes with the class values
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes)
plt.yticks(tick_marks, classes)

# Write each count inside its cell; switch to white text on dark cells
thresh = conf_matrix.max() / 2.
for i, j in itertools.product(range(conf_matrix.shape[0]), range(conf_matrix.shape[1])):
    plt.text(j, i, format(conf_matrix[i, j], 'd'),
             horizontalalignment="center",
             color="white" if conf_matrix[i, j] > thresh else "black")

plt.ylabel('Vraies valeurs')
plt.xlabel('Prédictions')
plt.show()

## Evaluating feature importances

In [None]:
# Importance score of each input feature, as exposed by the fitted forest.
feature_importances = clf.feature_importances_

# Bar labels: every column of `df` except the target column.
# NOTE(review): this assumes the features in X follow the order of df.columns
# with 'grav' removed — confirm against generate_sample_data.
col_exclu = 'grav'
colonnes_without_grav = list(filter(lambda name: name != col_exclu, df.columns))

# One bar per feature, labelled with the column name.
bar_positions = range(len(feature_importances))
plt.figure(figsize=(12, 6))
plt.bar(bar_positions, feature_importances, tick_label=colonnes_without_grav)
plt.title('Feature Importances')
plt.xlabel('Feature')
plt.ylabel('Feature Importance')
# Vertical tick labels so long column names stay readable.
plt.xticks(rotation=90, ha="right")
# Re-fit the layout so the rotated labels are not clipped.
plt.tight_layout()
plt.show()

### Note: the following code takes a long time to run.

# Searching best hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to explore (2 * 4 * 4 * 5 = 160 combinations).
# 'log_loss' is not listed: scikit-learn treats it as the same criterion as
# 'entropy', so it would only repeat every 'entropy' fit.
# n_jobs is not a hyperparameter and is therefore kept out of the grid; the
# parallelism is handled once, by GridSearchCV itself.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8, 16],
}

# Template estimator cloned by the search. It gets its own name so that the
# forest already fitted as `clf` is not replaced by an unfitted estimator, and
# a fixed seed so that candidates differ only by their hyperparameters.
base_clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search. Candidates are ranked with the weighted
# F1 score, i.e. the same metric the notebook reports for its predictions.
grid_search = GridSearchCV(base_clf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X, y)

# Show the best hyperparameters
print("Meilleurs hyperparamètres :", grid_search.best_params_)

# Model refitted on all of (X, y) with the best hyperparameters
best_clf = grid_search.best_estimator_

In [None]:
from sklearn.metrics import classification_report

# Score the tuned model on (Xtest, ytest): weighted F1 first, then the
# per-class precision / recall / F1 table.
y_pred = best_clf.predict(Xtest)
f1 = f1_score(y_true=ytest, y_pred=y_pred, average="weighted")
print(f"F1 score of best_clf : {f1}")

report = classification_report(ytest, y_pred)
print(report)