# Packages

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from pprint import pprint
from prettytable import PrettyTable
import joblib
import os

In [37]:
# sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC, NuSVC
from scipy.stats import mode
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
import lightgbm as lgb
from catboost import CatBoostClassifier

In [38]:
import warnings 
warnings.filterwarnings("ignore")

# Initialisation

In [39]:
naive_sub= pd.read_csv('forest/naive_submission.csv')
df= pd.read_csv('forest/train.csv').drop('Unnamed: 0',axis=1)
test= pd.read_csv('forest/test.csv').drop('row_ID',axis=1)

# Prétraitement des données

In [40]:
# Variable qualitative ou quantitative
# Toutes les variables sont quantitatives, à part Soil_Type et Wilderness_Area
columns= df.columns
quantitative_vars = columns[:10]
qual_vars = columns[10:]
qualitative_vars = [item.rsplit('_')[1] for item in qual_vars if item != 'Cover_Type']

In [41]:
# Filtre les données en fonction des quantiles calculés pour chaque variable
# Définition d'un nouveau dataframe df_filtered
df_filtered = df.copy()

for col in quantitative_vars:
    Q3 = df_filtered[col].quantile(0.85)
    Q1 = df_filtered[col].quantile(0.15)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_filtered = df_filtered[(df_filtered[col] >= lower_bound) & (df_filtered[col] <= upper_bound)]
df_filtered_long = pd.melt(df_filtered, var_name='Variables', value_name='Values')

In [42]:
# Normalisation des données
ct_s = ColumnTransformer(
    transformers=[
        ('standardize', StandardScaler(), quantitative_vars)
    ],
    remainder='passthrough'
)
df_filtered_ss = ct_s.fit_transform(df_filtered)
df_filtered_ss = pd.DataFrame(df_filtered_ss, columns=df_filtered.columns)

# Feature engineering: À Lancer!!

In [None]:
df_fe = pd.DataFrame()
# Distance to hydrology
df_fe['Distance_To_Hydrology'] = np.sqrt(df_filtered['Horizontal_Distance_To_Hydrology']**2 + df_filtered['Vertical_Distance_To_Hydrology']**2)
df_fe['Distance_To_Hydrology'] = df_fe['Distance_To_Hydrology'].astype(float)

# Elevation to Aspect
df_fe['Elevation_Aspect'] = df_filtered['Elevation']*np.sin(np.radians(df_filtered['Aspect']))
df_fe['Elevation_Aspect'] = df_fe['Elevation_Aspect'].astype(float)

# Entrainement d'un modèle basé sur Soil Type et Wilderness Area

# Encode `Soil_Type` et `Wilderness_Area` comme une catégorie
soil_columns = [col for col in df_filtered.columns if col.startswith('Soil_Type')]
area_columns = [col for col in df_filtered.columns if col.startswith('Wilderness_Area')]
X_sw = df_filtered[soil_columns+area_columns].drop('Soil_Type15',axis=1)
y_sw = df_filtered['Cover_Type']

model_sw = RandomForestClassifier(random_state=42)
model_sw.fit(X_sw, y_sw)

df_fe['Soil_Wilderness_Predict'] = model_sw.predict(X_sw)
df_fe['Soil_Wilderness_Predict'] = df_fe['Soil_Wilderness_Predict'].astype(str)
df_fe['Soil_Wilderness_Predict'] = df_fe['Soil_Wilderness_Predict'].astype('category')

ct_s2 = ColumnTransformer(
    transformers=[
        ('standardize', StandardScaler(), ['Distance_To_Hydrology', 'Elevation_Aspect'])
    ],
    remainder='passthrough'
)

# Normalisation
df_fe_ss = ct_s2.fit_transform(df_fe)
df_fe_ss = pd.DataFrame(df_fe_ss, columns=df_fe.columns)

# Convertir en numérique les colonnes qui devraient l'être
cols_to_numeric = ['Distance_To_Hydrology', 'Elevation_Aspect']
for col in cols_to_numeric:
    df_fe_ss[col] = pd.to_numeric(df_fe_ss[col], errors='coerce')

# Convertir les autres colonnes en type "category" pour les colonnes catégorielles
cols_to_category = ['Soil_Wilderness_Predict']
for col in cols_to_category:
    df_fe_ss[col] = df_fe_ss[col].astype('category')

# Concanténation
df_filtered_ss = df_filtered_ss.join(df_fe_ss)

# Matrice de transformations
def dataframe_fe(df, model_sw, ct_s, ct_s2):
    # Input
    # model_sw : Modele pour predire Cover type a partir de Soil Type et Wilderness Area
    # ct_s : normalisation utilisé sur donnees train
    # ct_s2 : normalisation utilisé sur donnees train feature engineering

    # Output
    # df_filtered_ss: variables one hot encoded
    # df_filtered_ss: variable soil type n'est plus one hot encode

    df_fe = pd.DataFrame()
    df = df.drop('Soil_Type15', axis=1)
    
    # Distance to hydrology
    df_fe['Distance_To_Hydrology'] = np.sqrt(df['Horizontal_Distance_To_Hydrology']**2 + df['Vertical_Distance_To_Hydrology']**2)
    df_fe['Distance_To_Hydrology'] = df_fe['Distance_To_Hydrology'].astype(float)

    # Elevation to Aspect
    df_fe['Elevation_Aspect'] = df['Elevation']*np.sin(np.radians(df['Aspect']))
    df_fe['Elevation_Aspect'] = df_fe['Elevation_Aspect'].astype(float)

    # Entrainement d'un modèle basé sur Soil Type et Wilderness Area
    # Encode `Soil_Type` et `Wilderness_Area` comme des catégories
    soil_columns = [col for col in df.columns if col.startswith('Soil_Type')]
    area_columns = [col for col in df.columns if col.startswith('Wilderness_Area')]
    X_sw = df[soil_columns+area_columns]

    df_fe['Soil_Wilderness_Predict'] = model_sw.predict(X_sw)
    df_fe['Soil_Wilderness_Predict'] = df_fe['Soil_Wilderness_Predict'].astype(str)
    df_fe['Soil_Wilderness_Predict'] = df_fe['Soil_Wilderness_Predict'].astype('category')

    # Normalisation
    df_fe_ss = ct_s2.fit_transform(df_fe)
    df_fe_ss = pd.DataFrame(df_fe_ss, columns=df_fe.columns)

    # Convertir en numérique les colonnes qui devraient l'être
    cols_to_numeric = ['Distance_To_Hydrology', 'Elevation_Aspect']
    for col in cols_to_numeric:
        df_fe_ss[col] = pd.to_numeric(df_fe_ss[col], errors='coerce')

    # Convertir les autres colonnes en type "category" pour les colonnes catégorielles
    cols_to_category = ['Soil_Wilderness_Predict']
    for col in cols_to_category:
        df_fe_ss[col] = df_fe_ss[col].astype('category')

    # Transformé les données
    df_filtered_ss = ct_s.fit_transform(df)
    df_filtered_ss = pd.DataFrame(df_filtered_ss, columns=df.columns)
    df_filtered_ss = df_filtered_ss.join(df_fe_ss)

    ### Décoder Soil Type
    soil_columns = [col for col in df_filtered_ss.columns if col.startswith('Soil_Type')]

    df_filtered_ss['Soil_Type'] = np.argmax(df_filtered_ss[soil_columns].values, axis=1)

    # Type catégorie pour pas confondre avec des entiers
    df_filtered_ss['Soil_Type'] = df_filtered_ss['Soil_Type'].astype('category')

    # Supprimer les anciennes colonnes one-hot encodées
    df_filtered_ss = df_filtered_ss.drop(columns=soil_columns)

    return df_filtered_ss

# Optimisation des hyperparamètre pour une sélection de modèles

In [51]:
# Different models

# Return the model with the best performance in cross-validation after grid search (hyperparameters that achieve best cv score)
def tune_hyperparameters(model, method, param_grid, X_train, y_train, cv_folds, scoring):
    stratified_cv = StratifiedKFold(n_splits=cv_folds, shuffle=True)
    gs_model = RandomizedSearchCV(model, param_grid, cv=stratified_cv, scoring=scoring, n_jobs=-1, verbose=1)

    if method == 'XGBoost':        
        gs_model.fit(X_train, y_train-1)
    else:
        gs_model.fit(X_train, y_train)
    return gs_model.best_estimator_, gs_model.best_params_

models = {
    "LightGBM": lgb.LGBMClassifier(is_unbalance=True),
    "Random Forest Classifier": RandomForestClassifier(class_weight='balanced'),
    "XGBoost": XGBClassifier(enable_categorical=True),
    "Extra Trees Classifier": ExtraTreesClassifier(class_weight='balanced')
}

hyperparameter_grids = {
    "LightGBM": {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [-1, 5, 10, 20],
        'min_child_samples': [5, 10, 20, 50],
        'min_split_gain': [0, 0.1, 0.5, 1],
        'num_leaves': [20, 31, 50, 100],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'reg_alpha': [0, 0.1, 1],
        'reg_lambda': [1, 0.1, 0],
        'verbose': [-1]
    },
    "Random Forest Classifier": {
        'n_estimators': [100, 300, 500],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True]
    },

    "XGBoost": {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 6, 10, 15],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.5, 1, 5],
        'reg_alpha': [0, 0.1, 1],
        'reg_lambda': [1, 0.1, 0]
    },
    "Extra Trees Classifier": {
        'n_estimators': [100, 300, 500],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True]               
    }
}

In [48]:
df_train = dataframe_fe(df_filtered, model_sw, ct_s, ct_s2)

# Les donnees
X = df_train.drop('Cover_Type',axis=1)
y = df_train['Cover_Type']

### Entraînement: objectif Accuracy

In [52]:
cv_folds = 5

tuned_models = {}
best_params = {}

for method,model in models.items():
    print(method)
    tuned_model, params = tune_hyperparameters(model, method, hyperparameter_grids[method], X, y, cv_folds, scoring='accuracy')
    tuned_models[method] = tuned_model
    best_params[method] = params

LightGBM
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Random Forest Classifier
Fitting 5 folds for each of 10 candidates, totalling 50 fits
XGBoost
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Extra Trees Classifier
Fitting 5 folds for each of 10 candidates, totalling 50 fits


### Entraînement: objectif F1 weighted

In [53]:
cv_folds = 5

tuned_models_f1 = {}
best_params_f1 = {}

for method,model in models.items():
    print(method)
    tuned_model, params = tune_hyperparameters(model, method, hyperparameter_grids[method], X, y, cv_folds, scoring='f1_weighted')
    tuned_models_f1[method] = tuned_model
    best_params_f1[method] = params

LightGBM
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Random Forest Classifier
Fitting 5 folds for each of 10 candidates, totalling 50 fits
XGBoost
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Extra Trees Classifier
Fitting 5 folds for each of 10 candidates, totalling 50 fits


# Prédictions: vote majoritaire

In [54]:
def majority_vote_ensemble(data, models):
    # test_set : soil type one hot encode

    # Predictions
    predictions = []
    for method in models:
        print(method)
        if method == 'XGBoost':
            pred = models[method].predict(data) + 1
            predictions.append(list(pred))
        else: 
            pred = models[method].predict(data)
            pred = list(map(int, pred))   
            predictions.append(pred)
    predictions = np.array(predictions).T  # Transpose so that rows are instances
    # Calculer le vote majoritaire pour chaque observation
    majority_vote_predictions, _ = mode(predictions, axis=1)
    
    return majority_vote_predictions.ravel()

In [55]:
X_test = dataframe_fe(test, model_sw, ct_s, ct_s2)

In [56]:
predictions = majority_vote_ensemble(X_test, tuned_models)
predictions2 = majority_vote_ensemble(X_test, tuned_models_f1)

LightGBM
Random Forest Classifier
XGBoost
Extra Trees Classifier
LightGBM
Random Forest Classifier
XGBoost
Extra Trees Classifier


Sauvegarder les résultats

In [None]:
# results = pd.DataFrame()
# # results['Unnamed: 0'] = np.arange(len(test))
# results['row_ID'] = np.arange(len(test))
# results['Cover_Type'] = predictions
# results
# results.to_csv('results_forest.csv', index=False)

## Performance sur les données d'entraînement

In [57]:
predictions_train = majority_vote_ensemble(X, tuned_models)
print('Accuracy score:', accuracy_score(y, predictions_train))
print('F1 score:', f1_score(y, predictions_train,average='weighted'))

LightGBM
Random Forest Classifier
XGBoost
Extra Trees Classifier
Accuracy score: 0.9953856362615486
F1 score: 0.9953874337886743


# Sauvegarder les modèles pour utilisation ultérieure

## Exporter modèles

In [67]:
def save_all_models(tuned_models, save_path="models/"):
    os.makedirs(save_path, exist_ok=True)
    
    for method, model in tuned_models.items():
        # nom du fichier

        filename = f"{save_path}{method.replace(' ', '_')}_tuned_model.joblib"
        
        # save modèle
        joblib.dump(model, filename)

        # Message
        print(f"Modèle '{method}' sauvegardé: '{filename}'")

In [68]:
save_all_models(tuned_models)

Modèle 'LightGBM' sauvegardé: 'models/LightGBM_tuned_model.joblib'
Modèle 'Random Forest Classifier' sauvegardé: 'models/Random_Forest_Classifier_tuned_model.joblib'
Modèle 'XGBoost' sauvegardé: 'models/XGBoost_tuned_model.joblib'
Modèle 'Extra Trees Classifier' sauvegardé: 'models/Extra_Trees_Classifier_tuned_model.joblib'


## Importer modèles

In [69]:
# model = joblib.load("tuned_model.joblib")