## 0 - Démarches préliminaires

### 0.a - Importation des bibliothèques

In [1]:
#general librairies
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import pandas as pd

#sklearn librairies
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model import *
from sklearn.ensemble import * 
from sklearn.svm import *
import statsmodels.api as sm
import xgboost as xgb

#appel a nos fonctions
from fcts_R.general import * 
from fcts_R.dataset_division import *
from fcts_R.combinaison import * 

import yaml



## 1 - Traitement du jeu de données 

In [3]:
# Load the two raw datasets from disk.
data0 = pd.read_csv("datasets_R/data0.csv")
data1 = pd.read_csv("datasets_R/data1.csv")

# Separate the covariates (X) from the variable to predict (y).
X0, y0 = treatment(data0)
X1, y1 = treatment(data1)

# Both datasets use the same hold-out settings: 33% of the rows go to the
# test set and the seed is fixed so the split is reproducible.
split_options = {"test_size": 0.33, "random_state": 2023}

# data0: train / test partitions.
X_tr0, X_te0, y_tr0, y_te0 = train_test_split(X0, y0, **split_options)

# data1: train / test partitions.
X_tr1, X_te1, y_tr1, y_te1 = train_test_split(X1, y1, **split_options)

## 2- Comparaison des performances des modèles 

Meilleurs modèles pour data0

In [4]:
# Candidate regressors for data0. The grid search substitutes each of them,
# in turn, for the 'mod' step of the pipeline defined below.
models_0 = {
    'mod': [
        NuSVR(C=1, gamma='scale', kernel='rbf', nu=0.6),
        xgb.XGBRFRegressor(random_state=50, n_estimators=150),
        ExtraTreesRegressor(random_state=50, max_samples=None, n_estimators=1000),
        BaggingRegressor(n_estimators=2250, random_state=10),
    ]
}

# Pipeline: standardise the features, then fit a regressor.
# LinearRegression is only a placeholder for the 'mod' step; the grid above
# replaces it with every candidate.
pipe_0 = Pipeline(
    steps=[
        ('std', StandardScaler()),
        ('mod', LinearRegression()),
    ]
)

# Compare the candidates with 5-fold cross-validation on the training split.
grid_search_0 = GridSearchCV(estimator=pipe_0, param_grid=models_0, cv=5)
# The target is flattened to a 1-D array before fitting.
grid_search_0.fit(X_tr0, y_tr0.to_numpy().ravel())

# Report the winning candidate and its mean cross-validated score.
print("Best Hyperparameters: ", grid_search_0.best_params_)
print("Best Score: ", grid_search_0.best_score_)

Best Hyperparameters:  {'mod': ExtraTreesRegressor(n_estimators=1000, random_state=50)}
Best Score:  0.4453407130077863


Meilleurs modèles pour data1

In [158]:
# Candidate regressors for data1. The grid search substitutes each of them,
# in turn, for the 'mod' step of the pipeline defined below.
# NOTE(review): XGBRFRegressor is seeded through `seed=` here, whereas the
# data0 candidates use `random_state=` - confirm this difference is intended.
models_1 = {
    'mod': [
        NuSVR(kernel="rbf", gamma="scale", nu=0.5, C=1),
        xgb.XGBRFRegressor(seed=2023, n_estimators=415),
        ExtraTreesRegressor(n_estimators=2750, random_state=50, max_samples=None),
    ]
}

# Pipeline: standardise the features, then fit a regressor.
# LinearRegression is only a placeholder for the 'mod' step; the grid above
# replaces it with every candidate.
pipe_1 = Pipeline(
    steps=[
        ('std', StandardScaler()),
        ('mod', LinearRegression()),
    ]
)

# Compare the candidates with 5-fold cross-validation on the training split.
grid_search_1 = GridSearchCV(estimator=pipe_1, param_grid=models_1, cv=5)
# The target is flattened to a 1-D array before fitting.
grid_search_1.fit(X_tr1, y_tr1.to_numpy().ravel())

# Report the winning candidate and its mean cross-validated score.
print("Best Hyperparameters: ", grid_search_1.best_params_)
print("Best Score: ", grid_search_1.best_score_)

Best Hyperparameters:  {'mod': ExtraTreesRegressor(n_estimators=2750, random_state=50)}
Best Score:  0.44142366751757073


## 3- Choix de la meilleure combinaison  

In [3]:
# Candidate base predictors for data0 (same configurations as the ones
# compared in section 2).
mods_0 = np.array([
    NuSVR(C=1, gamma='scale', kernel='rbf', nu=0.6),
    xgb.XGBRFRegressor(random_state=50, n_estimators=150),
    ExtraTreesRegressor(random_state=50, max_samples=None, n_estimators=1000),
    BaggingRegressor(n_estimators=2250, random_state=10),
])

# Candidate base predictors for data1 (same configurations as the ones
# compared in section 2).
mods_1 = np.array([
    NuSVR(kernel="rbf", gamma="scale", nu=0.5, C=1),
    xgb.XGBRFRegressor(seed=2023, n_estimators=415),
    ExtraTreesRegressor(n_estimators=2750, random_state=50, max_samples=None),
])

In [4]:
# Choose the two base predictors by cross-validation.
# NOTE(review): presumably one predictor is picked from each candidate array
# (mods_0 for data0, mods_1 for data1) - confirm in fcts_R.combinaison.
mod0, mod1 = choix_melange(
    mods_0,
    mods_1,
    data0,
    data1,
)

## 4 - On sauvegarde les modèles dans un fichier YAML 

In [165]:
# Destination file for the selected models.
yaml_file_path = "modeles_choisis.yaml"

# Only the string representation of each selected model is stored.
data = {"model0": str(mod0), "model1": str(mod1)}

# Serialise to block-style YAML first, then write the text to disk.
with open(yaml_file_path, "w") as yaml_file:
    yaml_file.write(yaml.dump(data, default_flow_style=False))
print(f"Variables sauvegardées dans {yaml_file_path}")

Variables sauvegardées dans modeles_choisis.yaml
