# Modélisation #1 XGBoost + HyperOpt

In [18]:
# Importation des packages

# Base
import importlib
import functions
importlib.reload(functions)
from functions import *

# Data Management
import pandas as pd
import numpy as np

# Modélisation
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Optimisation
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [19]:
# Importation des données

train_data = pd.read_csv('../data/Train_Data/7_train_data.csv', index_col=0)
train_scores = pd.read_csv('../data/Y_train_1rknArQ.csv', index_col=0)

sub_data = pd.read_csv('../data/Sub_Data/7_sub_data.csv', index_col=0)

train_data.head(5)

In [3]:
# Je préfère être sûr que les indexs concordent. 

train_scores = train_scores.loc[train_data.index]

train_scores.head(5)

Unnamed: 0_level_0,HOME_WINS,DRAW,AWAY_WINS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,1
1,0,1,0
2,0,0,1
3,1,0,0
4,0,1,0


In [4]:
# Préparation des données pour la prédiction. 

train_scores_1c = train_scores[['HOME_WINS', 'DRAW', 'AWAY_WINS']].idxmax(axis=1)
label_mapping = {'HOME_WINS': 0, 'DRAW': 1, 'AWAY_WINS': 2}
train_scores_1c = train_scores_1c.replace(label_mapping)

train_scores_1c.head(5)

  train_scores_1c = train_scores_1c.replace(label_mapping)


ID
0    2
1    1
2    2
3    0
4    1
dtype: int64

In [5]:
x_train, x_test, y_train, y_test = train_test_split(train_data, train_scores_1c, train_size=0.9, random_state=42)

XGB MODEL

In [6]:
# model = xgb.XGBClassifier(
#     objective='multi:softmax',
#     num_class=3,
# )

HyperOpt

In [7]:
# Je définis l'espace de recherche pour les hyperparamètres de XGBoost

# space = {
#     'max_depth': hp.quniform('max_depth', 3, 15, 1),
#     'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
#     'n_estimators': hp.quniform('n_estimators', 100, 1000, 10),
#     'gamma': hp.uniform('gamma', 0, 0.5),
#     'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
#     'subsample': hp.uniform('subsample', 0.3, 1.0),
#     'reg_alpha': hp.loguniform('reg_alpha', np.log(0.01), np.log(1)),
#     'reg_lambda': hp.loguniform('reg_lambda', np.log(0.01), np.log(1)),
#     'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1)
# }

space = {
    'max_depth': hp.quniform('max_depth', 4, 20, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.2)),
    'n_estimators': hp.quniform('n_estimators', 200, 2000, 100),
    'gamma': hp.uniform('gamma', 0, 0.5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'reg_alpha': hp.loguniform('reg_alpha', np.log(0.001), np.log(1)),
    'reg_lambda': hp.loguniform('reg_lambda', np.log(0.001), np.log(1)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0.1, 10)
}


In [8]:
# La fonction que l'on va minimiser
    

def objective(params):
    # Convertir les valeurs en entiers là où c'est nécessaire
    params['max_depth'] = int(params['max_depth'])
    params['n_estimators'] = int(params['n_estimators'])
    params['min_child_weight'] = int(params['min_child_weight'])

    # Entraîner le modèle XGBoost avec les hyperparamètres courants
    model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=3,
        n_jobs=-1,  # Utilisation de tous les cœurs disponibles
        **params
    )

    model.fit(x_train, y_train)

    # Prédiction
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Retourner 1 - accuracy pour minimiser (on veut maximiser accuracy)
    return {'loss': 1 - accuracy, 'status': STATUS_OK}

In [9]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,  # Utilise la méthode TPE pour la recherche
            max_evals=75,  # Nombre d'évaluations
            trials=trials)

print("Meilleurs hyperparamètres trouvés :", best)

  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]

Parameters: { "scale_pos_weight" } are not used.




100%|██████████| 1/1 [01:31<00:00, 91.08s/trial, best loss: 0.4849715678310317]
Meilleurs hyperparamètres trouvés : {'colsample_bytree': np.float64(0.7058786693412795), 'gamma': np.float64(0.28293626685024964), 'learning_rate': np.float64(0.0022308795179515), 'max_depth': np.float64(6.0), 'min_child_weight': np.float64(9.0), 'n_estimators': np.float64(1100.0), 'reg_alpha': np.float64(0.013289770552295915), 'reg_lambda': np.float64(0.027267662993960103), 'scale_pos_weight': np.float64(9.607230607039273), 'subsample': np.float64(0.7232817087376201)}


In [10]:
# Si c'est pas des entiers ça ne fonctionne pas.

best['max_depth'] = int(best['max_depth'])
best['n_estimators'] = int(best['n_estimators'])
best['min_child_weight'] = int(best['min_child_weight'])

In [11]:
final_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    **best
)

final_model.fit(x_train, y_train)

Parameters: { "scale_pos_weight" } are not used.



In [12]:
y_pred_final = final_model.predict(x_test)

final_accuracy = accuracy_score(y_test, y_pred_final)
print(f"Accuracy du modèle optimisé : {final_accuracy * 100:.2f}%")

Accuracy du modèle optimisé : 51.50%


# Prédictions

In [13]:
y_sub = final_model.predict(sub_data)

In [14]:
# Reconstruction du DataFrame y_sub avec les colonnes 'HOME_WINS', 'DRAW', 'AWAY_WINS'
y_sub_df = pd.DataFrame(y_sub, columns=['PRED'])

# Transformation en one-hot encoding pour obtenir les 3 colonnes
y_sub_df['HOME_WINS'] = (y_sub_df['PRED'] == 0).astype(int)
y_sub_df['DRAW'] = (y_sub_df['PRED'] == 1).astype(int)
y_sub_df['AWAY_WINS'] = (y_sub_df['PRED'] == 2).astype(int)

# Ajout de la colonne 'ID' depuis sub_data
y_sub_df['ID'] = sub_data.index

# Suppression de la colonne 'PRED'
y_sub_df.drop('PRED', axis=1, inplace=True)

# Réorganiser les colonnes pour avoir 'ID' en premier
y_sub_df = y_sub_df[['ID', 'HOME_WINS', 'DRAW', 'AWAY_WINS']]

# Afficher les premières lignes du résultat final
y_sub_df = y_sub_df.set_index('ID')
y_sub_df.head()

Unnamed: 0_level_0,HOME_WINS,DRAW,AWAY_WINS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12303,1,0,0
12304,0,0,1
12305,1,0,0
12306,1,0,0
12307,0,0,1


In [15]:
y_sub_df.to_csv('../data/Sub_Data/15_sub_scores.csv')