In [2]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor


## Data traitment

Collecte et traitement des données

In [3]:
# Collecte des données
data1= pd.read_csv(r"C:\Users\collo\Documents\Etudes\Dauphine\Machine Learning\Projet\DAUPHINE_Machine_Learning\X_train.csv",index_col=0)
data_test = data1[data1['predicted_spot_price'].isnull()]
data_train = data1[~data1['predicted_spot_price'].isnull()]

In [4]:
data2= pd.read_csv(r"C:\Users\collo\Documents\Etudes\Dauphine\Machine Learning\Projet\DAUPHINE_Machine_Learning\X_test.csv",index_col=0)
data_test_2 = data2[data2['predicted_spot_price'].isnull()]
data_train_2 = data2[~data2['predicted_spot_price'].isnull()]

On va entrainer sur les données de notre set de train et test pour lesquel la colonne est remplie

In [5]:
data_train = pd.concat([data_train,data_train_2])
data_train

Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-01-03 01:00:00+01:00,48275.0,3386.0,11945.0,42390.0,3792.0,0.0,64.529291,0.0,114.88
2023-01-03 02:00:00+01:00,47626.0,3386.0,11945.0,42390.0,4033.0,0.0,87.206239,0.0,110.62
2023-01-03 03:00:00+01:00,45648.0,3386.0,11945.0,42390.0,4200.0,0.0,68.271493,0.0,105.31
2023-01-03 04:00:00+01:00,44972.0,3386.0,11945.0,42390.0,4360.0,0.0,59.995098,0.0,94.95
2023-01-03 05:00:00+01:00,47372.0,3386.0,11945.0,42390.0,4495.0,0.0,49.122662,0.0,97.82
...,...,...,...,...,...,...,...,...,...
2023-10-24 19:00:00+02:00,49686.0,2226.0,11749.0,42980.0,4901.0,0.0,247.887323,0.0,125.67
2023-10-24 20:00:00+02:00,53397.0,2226.0,11749.0,42980.0,5584.0,0.0,343.192642,0.0,139.58
2023-10-24 21:00:00+02:00,50586.0,2226.0,11749.0,42980.0,6306.0,0.0,471.875973,0.0,147.93
2023-10-24 22:00:00+02:00,46777.0,2226.0,11749.0,42980.0,6959.0,0.0,595.528100,0.0,122.20


In [6]:
# Traitement
## Missing Data
data = data_train.dropna(axis=1, thresh=len(data_train)*0.25)
data = data.dropna()
#data = data.ffill()
X = data[[col for col in data.columns if col !='predicted_spot_price']]
Y = data['predicted_spot_price'].to_numpy()
## Format
X = pd.get_dummies(X)
## Split
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, Y, test_size=0.2, random_state=42)

## Model Selection

In [7]:
scalers = [None,StandardScaler(), MinMaxScaler(), RobustScaler()]
models = {
    'XGBoost' : XGBRegressor(),
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'Support Vector Regression': SVR(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'Gradient Boosting Regression': GradientBoostingRegressor(),
    'K-Neighbors Regression': KNeighborsRegressor()}

res = list()
for mod in models.values() :
    print(f"Model: {mod.__class__.__name__}")
    for scaler in scalers:
        # Appliquer le scaler
        if scaler is None :
            X_train_scaled = X_train
            X_test_scaled = X_test
        else :
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            
        # Initialiser le modèle
        model = mod
        
        # Entraîner le modèle de base
        model.fit(X_train_scaled, y_train)
        
        # Évaluation du modèle
        y_pred = model.predict(X_test_scaled)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        print(f"Scaler: {scaler.__class__.__name__}")
        
        # Validation croisée
        cv_scores = sk.model_selection.cross_val_score(model, X_train_scaled, y_train, cv=10, scoring='neg_mean_squared_error')
        res.append([mod.__class__.__name__,scaler.__class__.__name__,mse,r2,cv_scores])


Model: XGBRegressor
Scaler: NoneType
Scaler: StandardScaler
Scaler: MinMaxScaler
Scaler: RobustScaler
Model: LinearRegression
Scaler: NoneType
Scaler: StandardScaler
Scaler: MinMaxScaler
Scaler: RobustScaler
Model: Ridge
Scaler: NoneType
Scaler: StandardScaler
Scaler: MinMaxScaler
Scaler: RobustScaler
Model: Lasso
Scaler: NoneType
Scaler: StandardScaler
Scaler: MinMaxScaler
Scaler: RobustScaler
Model: ElasticNet
Scaler: NoneType
Scaler: StandardScaler
Scaler: MinMaxScaler
Scaler: RobustScaler
Model: SVR
Scaler: NoneType
Scaler: StandardScaler
Scaler: MinMaxScaler
Scaler: RobustScaler
Model: DecisionTreeRegressor
Scaler: NoneType
Scaler: StandardScaler
Scaler: MinMaxScaler
Scaler: RobustScaler
Model: RandomForestRegressor
Scaler: NoneType
Scaler: StandardScaler
Scaler: MinMaxScaler
Scaler: RobustScaler
Model: GradientBoostingRegressor
Scaler: NoneType
Scaler: StandardScaler
Scaler: MinMaxScaler
Scaler: RobustScaler
Model: KNeighborsRegressor
Scaler: NoneType
Scaler: StandardScaler
Scale

In [8]:
res = pd.DataFrame(res,columns=["model","scaler","mse","r2","cv_score"]).sort_values(by="r2",ascending=False)
res["cv_mean"] = res['cv_score'].apply(lambda x : np.mean(x))
res.head(1)

Unnamed: 0,model,scaler,mse,r2,cv_score,cv_mean
0,XGBRegressor,NoneType,180.855576,0.87186,"[-165.99278600766965, -154.86317123759338, -20...",-176.859641


## XGBRegressor

Optimisation des paramètres

In [9]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2,0.3],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.2,0.4,0.6, 0.8, 1.0],
    'colsample_bytree': [0.2,0.4,0.6, 0.8, 1.0]
}

grid_search = sk.model_selection.GridSearchCV(estimator=XGBRegressor(), param_grid=param_grid, cv=3, scoring='r2', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


In [10]:
y_pred_best = best_model.predict(X_test_scaled)
best_mse = mean_squared_error(y_test, y_pred_best)
best_r2 = r2_score(y_test, y_pred_best)

print(f"Best Mean Squared Error: {best_mse}")
print(f"Best R^2 Score: {best_r2}")

best_cv_scores = sk.model_selection.cross_val_score(best_model, X, Y, cv=5, scoring='neg_mean_squared_error')
print(f"Best Cross-Validation Scores (neg MSE): {best_cv_scores}")
print(f"Mean Best Cross-Validation Score (neg MSE): {best_cv_scores.mean()}")

Best Mean Squared Error: 161.70377686332674
Best R^2 Score: 0.8854297460806846
Best Cross-Validation Scores (neg MSE): [-487.88851344 -474.22477177 -429.7453366  -334.12472218 -505.36685016]
Mean Best Cross-Validation Score (neg MSE): -446.2700388305034


In [11]:
best_params

{'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_depth': 7,
 'n_estimators': 200,
 'subsample': 0.8}

Creation et Validation du modèle

In [12]:
best_model = XGBRegressor(**best_params)

# Entraîner le modèle sur les données d'entraînement complètes
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Calcul des métriques
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

Mean Squared Error: 161.70377686332674
R2 Score: 0.8854297460806846


In [13]:
best_model.fit(X_test,y_test)

On utilise le model sur les valeurs manquantes

In [14]:
X_test_test = data_test[[col for col in data_test.columns if col !='predicted_spot_price']]
X_test_test = pd.get_dummies(X_test_test)
y_test_test = best_model.predict(X_test_test)


In [15]:
X_test_test['predicted_spot_price'] = y_test_test
data_filled = pd.concat([data_train,X_test_test]).sort_index()
data_filled.to_csv("X_train_filled.csv")

In [16]:
X_test_test_2 = data_test_2[[col for col in data_test_2.columns if col !='predicted_spot_price']]
X_test_test_2 = pd.get_dummies(X_test_test_2)
y_test_test_2 = best_model.predict(X_test_test_2)

In [17]:
X_test_test_2['predicted_spot_price'] = y_test_test_2
data_filled_2 = pd.concat([data_train_2,X_test_test_2]).sort_index()
data_filled_2

Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-04-02 00:00:00+02:00,45814.0,3386.0,10902.0,36705.0,6359.0,0.0,56.764535,0.0,111.145187
2023-04-02 01:00:00+02:00,44084.0,3386.0,10902.0,36705.0,6469.0,0.0,54.262133,0.0,110.423241
2023-04-02 02:00:00+02:00,43281.0,3386.0,10902.0,36705.0,6511.0,0.0,78.105928,0.0,110.569878
2023-04-02 03:00:00+02:00,40825.0,3386.0,10902.0,36705.0,6628.0,0.0,78.187557,0.0,82.747284
2023-04-02 04:00:00+02:00,39181.0,3386.0,10902.0,36705.0,6700.0,0.0,96.765484,0.0,82.842125
...,...,...,...,...,...,...,...,...,...
2023-10-24 19:00:00+02:00,49686.0,2226.0,11749.0,42980.0,4901.0,0.0,247.887323,0.0,125.670000
2023-10-24 20:00:00+02:00,53397.0,2226.0,11749.0,42980.0,5584.0,0.0,343.192642,0.0,139.580000
2023-10-24 21:00:00+02:00,50586.0,2226.0,11749.0,42980.0,6306.0,0.0,471.875973,0.0,147.930000
2023-10-24 22:00:00+02:00,46777.0,2226.0,11749.0,42980.0,6959.0,0.0,595.528100,0.0,122.200000


In [18]:
data_filled_2.to_csv("X_test_filled.csv")