# Imports


In [None]:
from prepro import x_train_split, x_test_split, y_train_split, y_test_split, x_test, x_train, y_train, x_val, y_val

import optuna
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier ,RandomForestRegressor, VotingClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso, Ridge
import numpy as np
from tqdm import tqdm

x_train_split: (467084, 28), x_test_split: (100090, 28), x_val: (100090, 28)
y_train_split: (467084, 1), y_test_split: (100090, 1), y_val: (100090, 1)


Nous avons préprocess les données dans le fichier `prepro.py` donnant ainsi les variables `x_train_split`, `x_test_split`, `y_train_split`, `y_test_split`, `x_test`, `x_train`, `y_train`, `x_val`, `y_val`. Nous avons choisi de ne pas utiliser la cross validation afin de pouvoir tester plus de modèles et réduire leur temps d'execution. Nous avons divisé notre ensemble de données en trois: train, test, validation. On validera notre modèle - testé sur y_test et entraîné sur train - sur le set `y_val` par la suite. De cette façon on s'assure que le modèle n'a jamais vu les données du set validation et donc ne donnera pas un résultat biaisé.

Conclusion:

Finalement après beaucoup de tests on remarquera que nos meilleurs modèles sont le random forest classifier ainsi que le xgboost classifier. Les modèles qui considèrent la cible comme une classe ont été meilleurs que les modèles de régressions qui nous donnaient des valeurs décimales. 

# XGB

Nous commençons par optimiser le modèle xgboost

In [None]:
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),  
        'max_depth': trial.suggest_int('max_depth', 5, 50,step=5),  
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),  
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),  
        'gamma': trial.suggest_loguniform('gamma', 0.1, 20),  
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 1),  
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 1),  
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'n_jobs':-1  
    }

    model = XGBRegressor(**param)

    model.fit(x_train_split, y_train_split)

    y_pred = model.predict(x_test_split)
    mae = mean_absolute_error(y_test_split, y_pred)
    
    return mae  

study = optuna.create_study(direction='minimize')  
trials = 200
with tqdm(total=trials, desc="Optimisation") as pbar:
    def callback(study, trial):
        pbar.update(1)  

    study.optimize(objective, n_trials=trials, n_jobs=-1, callbacks=[callback])


print("Meilleurs hyperparamètres:", study.best_params)
print("Meilleure valeur de mae:", study.best_value)

[I 2025-03-12 09:11:52,861] A new study created in memory with name: no-name-ea46c621-293e-46aa-8e8a-e8fd72e6cafe
Optimisation:   0%|          | 0/200 [00:00<?, ?it/s][I 2025-03-12 09:12:18,162] Trial 14 finished with value: 0.869809627532959 and parameters: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.007098887099074291, 'colsample_bytree': 0.8202487656991827, 'gamma': 13.286585982453143, 'reg_alpha': 0.6223812724280458, 'reg_lambda': 0.9249002318192379}. Best is trial 14 with value: 0.869809627532959.
Optimisation:   0%|          | 1/200 [00:25<1:23:54, 25.30s/it][I 2025-03-12 09:12:37,212] Trial 2 finished with value: 0.9014929533004761 and parameters: {'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.00234340357951956, 'colsample_bytree': 0.7420044344438264, 'gamma': 3.6444448198967896, 'reg_alpha': 0.5984282113621451, 'reg_lambda': 0.4135552896974062}. Best is trial 14 with value: 0.869809627532959.
Optimisation:   1%|          | 2/200 [00:44<1:11:22, 21.63s/it

Meilleurs hyperparamètres: {'n_estimators': 1000, 'max_depth': 20, 'learning_rate': 0.00854067488526583, 'colsample_bytree': 0.5457962608849325, 'gamma': 4.066717329025523, 'reg_alpha': 0.6464720853598553, 'reg_lambda': 0.9830613163354874}
Meilleure valeur de mae: 0.7156595587730408





Après 3h30 d'execution, les meilleurs paramètres sont les suivants.

In [3]:
study.best_params

{'n_estimators': 1000,
 'max_depth': 20,
 'learning_rate': 0.00854067488526583,
 'colsample_bytree': 0.5457962608849325,
 'gamma': 4.066717329025523,
 'reg_alpha': 0.6464720853598553,
 'reg_lambda': 0.9830613163354874}

Nous utiliserons ces hyper paramètres pour cross valider. On entraîne le modèle 8 fois et on test 8 fois pour voir la MAE. Ici on a une MAE de 0.7157 en moyenne.

In [None]:
params = {'n_estimators': 1000,
    'max_depth': 20,
    'learning_rate': 0.00854067488526583,
    'colsample_bytree': 0.5457962608849325,
    'gamma': 4.066717329025523,
    'reg_alpha': 0.6464720853598553,
    'reg_lambda': 0.9830613163354874,
    'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'n_jobs':-1}
models=[]

num_folds = 8
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

test_predictions = []

for train_index, val_index in kf.split(x_train_split):
    X_train_fold, X_val_fold = x_train_split.iloc[train_index], x_train_split.iloc[val_index]
    y_train_fold, y_val_fold = y_train_split.iloc[train_index], y_train_split.iloc[val_index]

    model = XGBRegressor(**params)
    model.fit(X_train_fold, y_train_fold)
    models.append(model)

    val_preds = model.predict(X_val_fold)

    fold_mae = mean_absolute_error(y_val_fold, val_preds)
    print(f'MAE for this fold: {fold_mae}')

    test_fold_preds = model.predict(x_test_split)
    test_predictions.append(test_fold_preds)

final_test_predictions = np.mean(np.array(test_predictions), axis=0)

final_mae = mean_absolute_error(y_test_split, final_test_predictions)
print(f'\nFinal MAE on test set: {final_mae}')


MAE for this fold: 0.7157818078994751
MAE for this fold: 0.7274929285049438
MAE for this fold: 0.7191265821456909
MAE for this fold: 0.7268838286399841
MAE for this fold: 0.7197303771972656
MAE for this fold: 0.7186602354049683
MAE for this fold: 0.730207085609436
MAE for this fold: 0.7214305996894836

Final MAE on test set: 0.7157794833183289


En soumettant nos résultats dans le challenge: on obtient une MAE de 0.91 dans le challenge indiquant sûrement un sur apprentissage. Nous poursuivons donc notre recherche du meilleur modèle.


# XGB with more data

On teste ici avec plus de données récoltées (données météo, grêves etc.)

In [None]:
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000, step=100),  
        'max_depth': trial.suggest_int('max_depth', 5, 50,step=5),  
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),  
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),  
        'gamma': trial.suggest_loguniform('gamma', 0.1, 20),  
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 1),  
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 1),  
        'objective':'reg:squarederror',
        'tree_method': 'hist',
        'eval_metric': 'mae',
        'n_jobs':-1  
    }

    model = XGBRegressor(**param)

    model.fit(x_train_split, y_train_split)

    y_pred = model.predict(x_test_split)
    mae = mean_absolute_error(y_test_split, y_pred)
    
    return mae  

study = optuna.create_study(direction='minimize')  
trials = 200
with tqdm(total=trials, desc="Optimisation") as pbar:
    def callback(study, trial):
        pbar.update(1)  

    study.optimize(objective, n_trials=trials, callbacks=[callback])


print("Meilleurs hyperparamètres:", study.best_params)
print("Meilleure valeur de mae:", study.best_value)

[I 2025-03-18 17:20:20,543] A new study created in memory with name: no-name-9293b310-b33f-456e-9cbd-48954a85ca76
Optimisation:   0%|          | 0/200 [00:00<?, ?it/s][I 2025-03-18 17:24:03,529] Trial 0 finished with value: 0.7227921016833702 and parameters: {'n_estimators': 2000, 'max_depth': 35, 'learning_rate': 0.0024096330998685344, 'colsample_bytree': 0.7309116777638804, 'gamma': 1.7725628280233199, 'reg_alpha': 0.35530564372401874, 'reg_lambda': 0.7889072065295287}. Best is trial 0 with value: 0.7227921016833702.
Optimisation:   0%|          | 1/200 [03:42<12:19:34, 222.98s/it][I 2025-03-18 17:24:10,103] Trial 1 finished with value: 0.7260617654655515 and parameters: {'n_estimators': 300, 'max_depth': 30, 'learning_rate': 0.04572568002863998, 'colsample_bytree': 0.6444480843463894, 'gamma': 6.55342120449137, 'reg_alpha': 0.9463283908244944, 'reg_lambda': 0.26022304629933646}. Best is trial 0 with value: 0.7227921016833702.
Optimisation:   1%|          | 2/200 [03:49<5:15:45, 95.6

Meilleurs hyperparamètres: {'n_estimators': 2000, 'max_depth': 45, 'learning_rate': 0.0030617299673514824, 'colsample_bytree': 0.596554092389905, 'gamma': 3.9608786974301395, 'reg_alpha': 0.9967843829432115, 'reg_lambda': 0.7283756578416208}
Meilleure valeur de mae: 0.7138619780449067





In [None]:
study.best_params # avec ces paramètres, on obtient un MAE de 0.713861

{'n_estimators': 2000,
 'max_depth': 45,
 'learning_rate': 0.0030617299673514824,
 'colsample_bytree': 0.596554092389905,
 'gamma': 3.9608786974301395,
 'reg_alpha': 0.9967843829432115,
 'reg_lambda': 0.7283756578416208}

In [None]:
feature_columns = [
                    # Variables contextuelles
                    'train','gare','arret',
                    # Variables passées
                    'p2q0','p3q0','p4q0','p0q2','p0q3','p0q4',
                    ]

best_params = {'n_estimators': 2000,
            'max_depth': 45,
            'learning_rate': 0.0030617299673514824,
            'colsample_bytree': 0.596554092389905,
            'gamma': 3.9608786974301395,
            'reg_alpha': 0.9967843829432115,
            'reg_lambda': 0.7283756578416208}

model = XGBRegressor(**best_params, n_jobs=-1)
model.fit(x_train_split[feature_columns], y_train_split)
y_pred = model.predict(x_val[feature_columns])
mae = mean_absolute_error(y_val,y_pred)
print(mae) 

0.7294944891953586


Après 4h30 d'execution on obtient sur le set validation une MAE de 0.7294 ce qui est moins bon que le xgboost précédent

# XGB Classif

On utilise non plus un régresseur mais un classifieur. On va mapper la cible pour qu'elle soit non plus de -X à Y mais de 0 à Y+X

In [None]:
feature_columns = [
                    # Variables contextuelles
                    'train','gare','arret',
                    # Variables passées
                    'p2q0','p3q0','p4q0','p0q2','p0q3','p0q4',
                    ]

unique_values = np.unique(y_train_split['p0q0'])  # les valeurs uniques de y_train_split

mapping_dict = {v: idx for idx, v in enumerate(unique_values)}
inverse_mapping_dict = {idx: val for val, idx in mapping_dict.items()}

y_train_mapped = np.array([mapping_dict[val] for val in y_train_split['p0q0']])

def objective(trial):
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [50, 100, 200]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, step=0.05),
        "max_depth": trial.suggest_int("max_depth", 3, 9, step=2),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0, step=0.2),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0, step=0.2),
        "n_jobs":-1
    }

    model = XGBClassifier(**params)

    model.fit(x_train_split[feature_columns], y_train_mapped)

    y_pred_mapped = model.predict(x_test_split[feature_columns])
    y_pred_original = np.array([inverse_mapping_dict[val] for val in y_pred_mapped])

    mae = mean_absolute_error(y_test_split, y_pred_original)
    
    return mae  

study = optuna.create_study(direction='minimize')  
trials = 10
with tqdm(total=trials, desc="Optimisation") as pbar:
    def callback(study, trial):
        pbar.update(1)  

    study.optimize(objective, n_trials=trials, callbacks=[callback])


print("Meilleurs hyperparamètres:", study.best_params)
print("Meilleure valeur de mae:", study.best_value)

[I 2025-04-05 18:14:18,700] A new study created in memory with name: no-name-e14bbed8-7b8e-4088-8a8e-7c71cad2fb59
Optimisation:   0%|          | 0/10 [00:00<?, ?it/s][I 2025-04-05 18:15:39,469] Trial 0 finished with value: 0.746108502347887 and parameters: {'n_estimators': 100, 'learning_rate': 0.060000000000000005, 'max_depth': 3, 'subsample': 0.2, 'colsample_bytree': 0.6000000000000001}. Best is trial 0 with value: 0.746108502347887.
Optimisation:  10%|█         | 1/10 [01:20<12:06, 80.77s/it][I 2025-04-05 18:18:32,590] Trial 1 finished with value: 0.7166050554500949 and parameters: {'n_estimators': 200, 'learning_rate': 0.11, 'max_depth': 3, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.8}. Best is trial 1 with value: 0.7166050554500949.
Optimisation:  20%|██        | 2/10 [04:13<18:00, 135.09s/it][I 2025-04-05 18:21:39,511] Trial 2 finished with value: 0.673773603756619 and parameters: {'n_estimators': 200, 'learning_rate': 0.16000000000000003, 'max_depth': 5, 'subsample':

Meilleurs hyperparamètres: {'n_estimators': 200, 'learning_rate': 0.21000000000000002, 'max_depth': 7, 'subsample': 0.8, 'colsample_bytree': 0.8}
Meilleure valeur de mae: 0.6671295833749625
Meilleure valeur de mae: 0.6671295833749625





In [None]:
best_params ={'n_estimators': 200, 'learning_rate': 0.21000000000000002, 'max_depth': 7, 'subsample': 0.8, 'colsample_bytree': 0.8}

model = XGBClassifier(**best_params,n_jobs=-1)

model.fit(x_train_split[feature_columns], y_train_mapped)
y_pred_mapped = model.predict(x_val[feature_columns])

y_pred_original = np.array([inverse_mapping_dict[val] for val in y_pred_mapped])

In [7]:
mae = mean_absolute_error(y_val,y_pred_original)

print(mae)

0.6729143770606454


On a une MAE à 0.67 

# RF

On teste le random Forest

In [None]:
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),  
        'max_depth': trial.suggest_int('max_depth', 5, 50, step=5),  
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20, step=1),  
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20, step=1),  
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),  
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),  
        #'n_jobs': -1  
    }

    model = RandomForestClassifier(**param)

    model.fit(x_train_split, y_train_split)

    y_pred = model.predict(x_test_split)
    mae = mean_absolute_error(y_test_split, y_pred)
    
    return mae  

study = optuna.create_study(direction='minimize')  
trials = 20
with tqdm(total=trials, desc="Optimisation") as pbar:
    def callback(study, trial):
        pbar.update(1)  

    study.optimize(objective, n_trials=trials, callbacks=[callback])


print("Meilleurs hyperparamètres:", study.best_params)
print("Meilleure valeur de mae:", study.best_value)

In [None]:
# on obtient ces paramètres optimaux:
params = {'n_estimators': 800, 'max_depth': 30, 'min_samples_split': 13, 
          'min_samples_leaf': 6, 'max_features': 'log2', 'criterion': 'entropy',
            'random_state':42,
        'n_jobs':-1}

model = RandomForestClassifier(**params)

model.fit(x_train,y_train)
mean_absolute_error(y_test_split,model.predict(x_test_split))

On obtient la MAE suivante: 0.670340

# RF with more data

On test le random Forest avec davantage de données.

In [None]:
model = RandomForestClassifier(n_jobs=-1,verbose=100,random_state=42)
model.fit(x_train_split, y_train_split)
mean_absolute_error(y_test_split,model.predict(x_test_split))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.6s
building tree 18 of 100
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.6s
building tree 19 of 100
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    3.7s
building tree 20 of 100
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    3.7s
building tree 21 of 100building tree 22 of 100
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    3.9s
building tree 23 of 100

[Pa

0.7123706473440088

In [None]:
params = {'n_estimators': 800, 'max_depth': 30, 'min_samples_split': 13, 
          'min_samples_leaf': 6, 'max_features': 'log2', 'criterion': 'entropy',
            'random_state':42, 'verbose':100,
        'n_jobs':-1}
models=[]

num_folds = 3
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

test_predictions = []

for train_index, val_index in kf.split(x_train_split):
    X_train_fold, X_val_fold = x_train_split.iloc[train_index], x_train_split.iloc[val_index]
    y_train_fold, y_val_fold = y_train_split.iloc[train_index], y_train_split.iloc[val_index]

    model = RandomForestClassifier(**params)
    model.fit(X_train_fold, y_train_fold)
    models.append(model)

    val_preds = model.predict(X_val_fold)

    fold_mae = mean_absolute_error(y_val_fold, val_preds)
    print(f'MAE for this fold: {fold_mae}')

    test_fold_preds = model.predict(x_test_split)
    test_predictions.append(test_fold_preds)

final_test_predictions = np.mean(np.array(test_predictions), axis=0)

final_mae = mean_absolute_error(y_test_split, final_test_predictions)
print(f'\nFinal MAE on test set: {final_mae}')

MAE for this fold: 0.7121172100237725
MAE for this fold: 0.7130950842151997
MAE for this fold: 0.7132355833806348

Final MAE on test set: 0.707132848268679


On peut remarquer qu'avec plus de données, ce modèle ne fait pas mieux.

# Modèle Linéaire
On a testé quelques modèles linéaires en optimisant les hyper paramètres. On obtient les MAE suivantes:
- modèle Lasso: 0.86400
- modèle Ridge: 0.86400

Ces modèles linéaires ne sont pas meilleurs.

# Voting Classifier using best models

On utilise le meilleur modèle RF et le meilleur modèle Xgboost Classifier avec un voting classifier. 

In [3]:
# params
best_rf_params = {'n_estimators': 800, 'max_depth': 30, 'min_samples_split': 13, 
          'min_samples_leaf': 6, 'max_features': 'log2', 'criterion': 'entropy'}

best_xgbc_params = {'n_estimators': 1500, 'max_depth': 40, 'learning_rate': 0.012698373632191865,
                'colsample_bytree': 0.8846631804776101, 'gamma': 1.4539984509864856, 'reg_alpha': 0.5009435510185976,
                'reg_lambda': 0.6892591544187344}

feature_columns = [
                    # Variables contextuelles
                    'train','gare','arret',
                    # Variables passées
                    'p2q0','p3q0','p4q0','p0q2','p0q3','p0q4',
                    ]

unique_values = np.unique(y_train_split['p0q0'])  # les valeurs uniques de y_train_split

mapping_dict = {v: idx for idx, v in enumerate(unique_values)}
inverse_mapping_dict = {idx: val for val, idx in mapping_dict.items()}

y_train_mapped = np.array([mapping_dict[val] for val in y_train_split['p0q0']])

# voting models
model_rf = RandomForestClassifier(**best_rf_params, n_jobs=-1, random_state=42)
model_xgbc = XGBClassifier(**best_xgbc_params,n_jobs=-1,random_state=42)  

# Créer le VotingClassifier hard
model_hard = VotingClassifier(estimators=[('rf', model_rf), ('xgb', model_xgbc)], voting='hard')

# Fitting les modèles dans le VotingClassifier
model_hard.fit(x_train_split[feature_columns], y_train_mapped)


In [4]:
y_pred_mapped = model_hard.predict(x_val[feature_columns])

y_pred_original = np.array([inverse_mapping_dict[val] for val in y_pred_mapped])

mae = mean_absolute_error(y_val,y_pred_original)

print(mae)

0.6555799780197822


On voit que cette combinaison de modèle performe au même niveau que nos modèles individuels.

En conclusion, nous nous retrouvons avec une MAE autour de 0.65 minutes soit 39 secondes en moyenne.