## 0 - Démarches préliminaires

### 0.a - Importation des bibliothèques

In [1]:
#general librairies
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import pandas as pd
import yaml 

#sklearn librairies
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model import *
from sklearn.ensemble import * 
from sklearn.svm import *
import statsmodels.api as sm
import xgboost as xgb

#appel a nos fonctions
from fcts_R.general import * 
from fcts_R.dataset_division import *



### 0.b - Chargement du jeu de données d'entraînement

In [2]:
# Read the dataset locations from the YAML configuration file.
# `safe_load` only builds plain Python objects (dict, list, str, ...), which is
# all a file of paths requires, and avoids constructing arbitrary tagged objects.
with open("paths_datasets_r.yaml", "r") as yaml_file:
    dataset_paths = yaml.safe_load(yaml_file)

# Location of the training CSV, stored under the "train" key of the config.
path_train = dataset_paths["train"]

# Load the training set. The parsed config lives in `dataset_paths`, so `data`
# is only ever bound to this DataFrame.
data = pd.read_csv(path_train)

Dans la suite, on va tenter de faire des prédictions sur chaque type de vin.

## 1 - Traitement du jeu de données 

### a- Séparation du jeu de données initial en fonction de 'wine_type'

In [3]:
# Indices of the rows where "wine_type" equals 0 (idx0) and 1 (idx1).
idx0, idx1 = winetype(data)

# Build the two per-wine-type datasets from those indices.
data0, data1 = formal_div(data, idx0, idx1)

# Persist each subset so the following steps can reload it from disk.
# NOTE(review): the DataFrame index is written as well (default `to_csv`
# behaviour), so it comes back as an extra column on `read_csv` -- confirm
# that `treatment` accounts for it.
for subset, csv_path in (
    (data0, "Datasets_R/data0.csv"),
    (data1, "Datasets_R/data1.csv"),
):
    subset.to_csv(csv_path)

### b - Séparation train/test: 
Cette division nous permettra d'évaluer les performances de nos modèles en contrôlant l'erreur test. 

In [4]:
# Reload the per-wine-type subsets from the CSV files on disk.
data0 = pd.read_csv("Datasets_R/data0.csv")
data1 = pd.read_csv("Datasets_R/data1.csv")

# Separate the covariates (X) from the variable to predict (y).
X0, y0 = treatment(data0)
X1, y1 = treatment(data1)

# Identical hold-out settings for both wine types: 33% of the rows go to the
# test split, with a fixed seed so the split is reproducible.
split_kwargs = {"test_size": 0.33, "random_state": 2023}

# data0
X_tr0, X_te0, y_tr0, y_te0 = train_test_split(X0, y0, **split_kwargs)

# data1
X_tr1, X_te1, y_tr1, y_te1 = train_test_split(X1, y1, **split_kwargs)

### c - Normalisation des co-variables
Cette démarche nous permettra d'éviter des biais dus aux différences d'unité et d'échelle entre les covariables de l'étude.

In [5]:
# data0: learn the scaling statistics on the training split only, then apply
# the same transformation to both the training and the test covariates.
stdsc = StandardScaler().fit(X_tr0)
X_tr_0 = stdsc.transform(X_tr0)
X_te_0 = stdsc.transform(X_te0)

# data1: same procedure with a fresh scaler. `stdsc` is rebound here, so after
# this cell it refers to the scaler fitted on the data1 training split.
stdsc = StandardScaler().fit(X_tr1)
X_tr_1 = stdsc.transform(X_tr1)
X_te_1 = stdsc.transform(X_te1)

## 2 - Sélection et entrainement de modèles

### a - Sélection de paramètres et modèles basée sur la cross-validation

#### 1- SVM

Modèle entrainé sur data0.

In [None]:
# Grid-search log for the support-vector regressors tried on data0.
# Each entry: estimator / searched grid -> best hyperparameters, best score
# (presumably the cross-validated score printed by `param_selection`), and the
# value logged as "normal" (presumably the test R^2 -- TODO confirm).
#
# 1) estimator not recorded (the epsilon grid suggests SVR)
#    grid: {'kernel': ["linear","poly","rbf","sigmoid"], 'gamma': ["scale","auto"],
#           'epsilon': [0.05,0.1,0.15], 'C': [0.5,1,1.5]}
#    best: {'C': 1, 'epsilon': 0.15, 'gamma': 'scale', 'kernel': 'rbf'}
#    best score: 0.37465693229921293; normal: 0.40728418982541437
#
# 2) SVR(kernel="poly", gamma="scale", epsilon=0.15)
#    grid: {'degree': [1,2,3,4,5,6]}
#    best: {'degree': 1}
#    best score: 0.26182202519203174; normal: 0.3233410341186739
#
# 3) SVR(kernel="poly", gamma="scale", epsilon=0.15)
#    grid: {'degree': [1,2,3,4,5,6], 'coef0': [0.0, 0.01, 0.1, 0.5, 0.75, 1]}
#    best: {'coef0': 1, 'degree': 2}
#    best score: 0.31889755456529084; normal: 0.26912361991020595
#
# 4) SVR(kernel="rbf", gamma="scale", epsilon=0.15)
#    grid: {'C': [0.75,0.9,1,1.1,1.25]}
#    best: {'C': 1}
#    best score: 0.37465693229921293; normal: 0.40728418982541437
#
# 5) LinearSVR()
#    grid: {'loss': ["epsilon_insensitive", "squared_epsilon_insensitive"],
#           'epsilon': [0,0.01,0.05,0.1,0.2,0.5], 'C': [0.5,1,1.5]}
#    best: {'C': 1.5, 'epsilon': 0, 'loss': 'squared_epsilon_insensitive'}
#    best score: 0.26364596987343536; normal: 0.3142987698900451
#
# 6) NuSVR() -- best score of the log, this is the search kept below
#    best: {'C': 1.5, 'gamma': 'scale', 'kernel': 'rbf', 'nu': 0.6}
#    best score: 0.3825024346626745; normal: 0.4170237776338953
#    NOTE(review): the output stored with this cell reports C=1 and a best
#    score of 0.3692..., so this note may come from an earlier run -- confirm.

params = {
    'kernel': ["linear", "poly", "rbf", "sigmoid"],
    'gamma': ["scale", "auto"],
    'nu': [0.25, 0.4, 0.5, 0.6, 0.75, 1],
    'C': [0.5, 1, 1.5],
}

# The target is converted to a flat 1-D NumPy array before being handed over.
y_tr0_flat = y_tr0.to_numpy().ravel()
pred0 = param_selection(params, NuSVR(), X_tr_0, y_tr0_flat, X_te_0)

# The printed value is the R^2 of the predictions on the held-out split.
print("erreur test: ", r2_score(y_te0, pred0))

Best Hyperparameters:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'nu': 0.6}
Best Score:  0.3692312928254857
erreur test:  0.40566250675181714


Modèle entrainé sur data1.

In [None]:
# Grid-search log for the support-vector regressors tried on data1.
# Each entry was run as `param_selection(grid, estimator, X_tr_1, y_tr1, <test X>)`;
# listed are the estimator, the grid, the best hyperparameters and the best score.
#
# 1) SVR(gamma='auto')
#    grid: {'kernel': ["linear","poly","rbf","sigmoid"], 'gamma': ["scale","auto"],
#           'epsilon': [0.05,0.1,0.15], 'C': [0.5,1,1.5]}
#    best: {'C': 0.5, 'epsilon': 0.15, 'gamma': 'scale', 'kernel': 'rbf'}
#    best score: 0.36646962843446423
#
# 2) SVR(gamma="auto")
#    grid: {'kernel': ["linear","poly","rbf"], 'epsilon': [0.01,0.1,0.2]}
#    best: {'epsilon': 0.2, 'kernel': 'rbf'}
#    best score: 0.37085380472521134
#
# 3) SVR(gamma="auto", kernel="poly", epsilon=0.2)
#    grid: {'degree': [1,2,3,4]}
#    best: {'degree': 1}
#    best score: 0.3486973341420462
#
# 4) LinearSVR()
#    grid: {'loss': ["epsilon_insensitive", "squared_epsilon_insensitive"],
#           'epsilon': [0,0.01,0.05,0.1,0.2,0.5], 'C': [0.5,1,1.5]}
#    best: {'C': 1, 'epsilon': 0.2, 'loss': 'epsilon_insensitive'}
#    best score: 0.34926931350937107 (first run), 0.34948012695417774 (second run)
#
# 5) NuSVR() -- best score of the log
#    grid: {'kernel': ["linear","poly","rbf","sigmoid"], 'gamma': ["scale","auto"],
#           'nu': [0.25,0.4,0.5,0.6,0.75,1], 'C': [0.5,1,1.5]}
#    best: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'nu': 0.5}
#    best score: 0.37920360670150177
#    NOTE(review): the logged call passed `X_te1` (unscaled test covariates)
#    instead of `X_te_1`; this only affects the predictions returned by that
#    run, but should be fixed if the search is re-enabled.

# Refit the retained configuration directly instead of re-running the search.
nu_svr1 = NuSVR(kernel="rbf", gamma="scale", nu=0.5, C=1)
nu_svr1.fit(X_tr_1, y_tr1.to_numpy().ravel())
pred_SVM1 = nu_svr1.predict(X_te_1)

# The printed value is the R^2 of the predictions on the held-out split.
print("erreur test: ", r2_score(y_te1, pred_SVM1))

erreur test:  0.3745598443683087


#### 2 - Arbres de décision

Modèle entrainé sur data0.

In [11]:
# Grid-search log for the tree-based ensembles tried on data0.
#
# 1) GradientBoostingRegressor(random_state=10, loss="huber", learning_rate=0.15)
#    grid: {"learning_rate": [0.01, 0.05, 0.1, 0.15]}
#    best score: 0.3803194835261793
#
# 2) HistGradientBoostingRegressor(random_state=10, loss="poisson", learning_rate=0.15)
#    grid: {"loss": ['squared_error', 'absolute_error', 'poisson', 'quantile'],
#           "learning_rate": [0.15, 0.1, 0.075]}
#    best score: 0.4212414218497866
#
# 3) BaggingRegressor(random_state=10, n_estimators=2250)
#    grid: {'n_estimators': [1750, 2000, 2250]}
#    best: {'n_estimators': 2250}
#    best score: 0.4322135469402454
#
# 4) ExtraTreesRegressor -- best score of the log, this is the search kept below
#    best: {'n_estimators': 1000}
#    best score: 0.445343694590953

param_grid = {'n_estimators': [900, 1000, 1100]}
extra_trees0 = ExtraTreesRegressor(random_state=50, max_samples=None)

# The target is converted to a flat 1-D NumPy array before being handed over.
pred0 = param_selection(
    param_grid,
    extra_trees0,
    X_tr_0,
    y_tr0.to_numpy().ravel(),
    X_te_0,
)

# The printed value is the R^2 of the predictions on the held-out split.
print("erreur test: ", r2_score(y_te0, pred0))

Best Hyperparameters:  {'n_estimators': 1000}
Best Score:  0.445343694590953
erreur test:  0.5123931993006859


Modèle entrainé sur data1.

In [14]:
# Grid-search log for the tree-based ensembles tried on data1.
#
# 1) BaggingRegressor(random_state=10)
#    grid: {'n_estimators': [400, 500]}
#    best: {'n_estimators': 500}
#    best score: 0.4415537219663822
#
# 2) ExtraTreesRegressor -- marked as best in the original notes, this is the
#    search kept below
#    best: {'n_estimators': 2750}
#    best score: 0.44143303715384785

param_grid = {'n_estimators': [300, 2750, 100]}
extra_trees1 = ExtraTreesRegressor(random_state=50, max_samples=None)

# The target is converted to a flat 1-D NumPy array before being handed over.
pred1 = param_selection(
    param_grid,
    extra_trees1,
    X_tr_1,
    y_tr1.to_numpy().ravel(),
    X_te_1,
)

# The printed value is the R^2 of the predictions on the held-out split.
print("erreur test: ", r2_score(y_te1, pred1))

Best Hyperparameters:  {'n_estimators': 2750}
Best Score:  0.44143303715384785
erreur test:  0.4469262599949042


#### 3 - XGBoost

Modèle entraîné sur data0.

In [None]:
# Grid-search log for the XGBoost models tried on data0.
#
# 1) xgb.XGBRegressor(seed=2023, objective="reg:squarederror")
#    grid: {'n_estimators': [95, 100, 105]}
#    best: {'n_estimators': 100}
#    best score: 0.377947838196242
#
# 2) xgb.XGBRFRegressor -- marked as best in the original notes, this is the
#    search kept below
#    best: {'n_estimators': 200}
#    best score: 0.32986538836899715
#    NOTE(review): entry 1 logged a higher score than this one, and the output
#    stored with this cell reports n_estimators=150 with 0.3286... -- confirm
#    which model and which figures are current.

xgb_rf0 = xgb.XGBRFRegressor(random_state=50)
preds0 = param_selection(
    {'n_estimators': [100, 150, 200, 250, 300]},
    xgb_rf0,
    X_tr_0,
    y_tr0,
    X_te_0,
)

# The printed value is the R^2 of the predictions on the held-out split.
print("erreur test", r2_score(y_te0, preds0))

Best Hyperparameters:  {'n_estimators': 150}
Best Score:  0.3286218368868324
erreur test 0.3954196853517815


Modèle entrainé sur data1.

In [None]:
# Grid-search log for the XGBoost models tried on data1.
#
# 1) xgb.XGBRegressor(seed=2023)
#    grid: {'objective': ["reg:linear", "reg:squarederror", "reg:tweedie", "reg:huber"],
#           'n_estimators': [10, 50, 75, 100, 150, 200]}
#    best: {'n_estimators': 10, 'objective': 'reg:linear'}
#    best score: 0.37838078901841915
#
# 2) xgb.XGBRegressor(random_state=2023)
#    grid: {'objective': ["reg:linear", "reg:squarederror"],
#           'n_estimators': [5, 10, 15, 20]}
#    best: {'n_estimators': 10, 'objective': 'reg:linear'}
#    best score: 0.37838078901841915
#
# 3) xgb.XGBRFRegressor -- best score of the log, this is the search kept below
#    best: {'n_estimators': 420}
#    best score: 0.40799724415966443
#    NOTE(review): the output stored with this cell reports n_estimators=415
#    for the same best score -- confirm which value is current.

xgb_rf1 = xgb.XGBRFRegressor(seed=2023)
preds1 = param_selection(
    {'n_estimators': [415, 420, 425]},
    xgb_rf1,
    X_tr_1,
    y_tr1,
    X_te_1,
)

# The printed value is the R^2 of the predictions on the held-out split.
print("erreur test: ", r2_score(y_te1, preds1))

Best Hyperparameters:  {'n_estimators': 415}
Best Score:  0.40799724415966443
erreur test:  0.37489969193217565
