In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor


## Data traitment

Collecte et traitement des données

In [2]:
# Collecte des données
X= pd.read_csv(r"X_train_filled.csv",index_col=0)
Y = pd.read_csv(r"..\\y_train.csv",index_col=0)
data_train = pd.merge(X, Y, left_index=True, right_index=True)
data_train

Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,spot_id_delta
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-01-01 02:00:00+01:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.000000,135.062851,-36.874770
2022-01-01 03:00:00+01:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.000000,110.893448,-12.643588
2022-01-01 04:00:00+01:00,45158.0,3386.0,11487.0,44118.0,3288.0,0.0,44.291112,0.000000,82.303596,-1.950193
2022-01-01 05:00:00+01:00,44779.0,3386.0,11487.0,44118.0,3447.0,0.0,36.127588,0.000000,79.969795,1.938272
2022-01-01 06:00:00+01:00,45284.0,3386.0,11487.0,44118.0,3679.0,0.0,30.983023,0.000000,76.075630,0.199907
...,...,...,...,...,...,...,...,...,...,...
2023-03-29 19:00:00+02:00,50814.0,3386.0,11952.0,38320.0,7552.0,651.0,247.408490,7.821622,108.110000,6.029303
2023-03-29 20:00:00+02:00,50628.0,3386.0,11952.0,38320.0,8338.0,109.0,155.795012,2.534054,125.660000,13.576177
2023-03-29 21:00:00+02:00,48201.0,3386.0,11952.0,38320.0,9115.0,0.0,126.884684,0.000000,138.010000,17.478945
2023-03-29 22:00:00+02:00,47967.0,3386.0,11952.0,38320.0,9636.0,0.0,156.669189,0.000000,136.740000,17.559407


In [3]:
# Traitement
## Missing Data
data = data_train.dropna(axis=1, thresh=len(data_train)*0.25)
data = data.dropna()
#data = data.ffill()
X = data[[col for col in data.columns if col !='spot_id_delta']]
Y = data['spot_id_delta'].to_numpy()
## Format
X = pd.get_dummies(X)
## Split
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, Y, test_size=0.2, random_state=42)

## Model Selection

In [4]:
scalers = [None,StandardScaler(), MinMaxScaler(), RobustScaler()]
models = {
    'XGBoost' : XGBRegressor(),
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet': ElasticNet(),
    'Support Vector Regression': SVR(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'Gradient Boosting Regression': GradientBoostingRegressor(),
    #'K-Neighbors Regression': KNeighborsRegressor()
    }

res = list()
for mod in models.values() :
    print(f"Model: {mod.__class__.__name__}")
    for scaler in scalers:
        # Appliquer le scaler
        if scaler is None :
            X_train_scaled = X_train
            X_test_scaled = X_test
        else :
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            
        # Initialiser le modèle
        model = mod
        
        # Entraîner le modèle de base
        model.fit(X_train_scaled, y_train)
        
        # Évaluation du modèle
        y_pred = model.predict(X_test_scaled)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        # Validation croisée
        cv_scores = sk.model_selection.cross_val_score(model, X_train_scaled, y_train, cv=10, scoring='neg_mean_squared_error')
        res.append([mod.__class__.__name__,scaler.__class__.__name__,mse,r2,cv_scores])


Model: XGBRegressor
Model: LinearRegression
Model: Ridge
Model: Lasso
Model: ElasticNet
Model: SVR
Model: DecisionTreeRegressor
Model: RandomForestRegressor
Model: GradientBoostingRegressor


In [5]:
res = pd.DataFrame(res,columns=["model","scaler","mse","r2","cv_score"]).sort_values(by="r2",ascending=False)
res["cv_mean"] = res['cv_score'].apply(lambda x : np.mean(x))
res

Unnamed: 0,model,scaler,mse,r2,cv_score,cv_mean
29,RandomForestRegressor,StandardScaler,347.17922,0.584799,"[-2895.249689558415, -312.6028999571236, -322....",-978.974897
30,RandomForestRegressor,MinMaxScaler,354.802441,0.575682,"[-2784.4168476509803, -313.63087106522664, -31...",-962.119197
28,RandomForestRegressor,NoneType,359.236471,0.57038,"[-2708.6641802016034, -301.50420815626626, -32...",-967.49901
31,RandomForestRegressor,RobustScaler,381.519184,0.543731,"[-2698.292535560612, -310.9286316024971, -302....",-973.215418
0,XGBRegressor,NoneType,459.238568,0.450784,"[-2132.732669303874, -399.8495642356061, -345....",-828.944717
1,XGBRegressor,StandardScaler,459.238568,0.450784,"[-2132.732669303874, -399.8495642356061, -345....",-828.944717
2,XGBRegressor,MinMaxScaler,459.238568,0.450784,"[-2132.732669303874, -399.8495642356061, -345....",-828.944717
3,XGBRegressor,RobustScaler,459.238568,0.450784,"[-2132.732669303874, -399.8495642356061, -345....",-828.944717
34,GradientBoostingRegressor,MinMaxScaler,554.991453,0.336271,"[-2385.880577176395, -592.4453815940984, -461....",-1209.668333
35,GradientBoostingRegressor,RobustScaler,555.51067,0.33565,"[-2388.3020096922146, -599.5118791274514, -461...",-1189.687007


## XGBRegressor

Optimisation des paramètres

In [6]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

grid_search = sk.model_selection.GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=3, scoring='r2', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 216 candidates, totalling 648 fits


In [7]:
y_pred_best = best_model.predict(X_test_scaled)
best_mse = mean_squared_error(y_test, y_pred_best)
best_r2 = r2_score(y_test, y_pred_best)

print(f"Best Mean Squared Error: {best_mse}")
print(f"Best R^2 Score: {best_r2}")

X_scaled = scaler.fit_transform(X)
best_cv_scores = sk.model_selection.cross_val_score(best_model, X_scaled, Y, cv=5, scoring='neg_mean_squared_error')
print(f"Best Cross-Validation Scores (neg MSE): {best_cv_scores}")
print(f"Mean Best Cross-Validation Score (neg MSE): {best_cv_scores.mean()}")

Best Mean Squared Error: 368.11336361384025
Best R^2 Score: 0.559763405800065
Best Cross-Validation Scores (neg MSE): [-1165.29777872 -2850.28642023 -2205.56579047  -892.83008671
  -900.6307924 ]
Mean Best Cross-Validation Score (neg MSE): -1602.9221737074888


In [8]:
best_params

{'bootstrap': True,
 'max_depth': 30,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 50}

Creation et Validation du modèle

In [9]:
best_model = XGBRegressor(**best_params)

# Entraîner le modèle sur les données d'entraînement complètes
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Calcul des métriques
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.



Mean Squared Error: 392.3495490055356
R2 Score: 0.5307787049772204


In [10]:
best_model.fit(X_test,y_test)

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.



On utilie le model sur les valeurs manquantes

In [11]:
X_test = pd.read_csv(r"..\\X_test.csv",index_col=0)
X_test

Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-04-02 00:00:00+02:00,45814.0,3386.0,10902.0,36705.0,6359.0,0.0,56.764535,0.0,
2023-04-02 01:00:00+02:00,44084.0,3386.0,10902.0,36705.0,6469.0,0.0,54.262133,0.0,
2023-04-02 02:00:00+02:00,43281.0,3386.0,10902.0,36705.0,6511.0,0.0,78.105928,0.0,
2023-04-02 03:00:00+02:00,40825.0,3386.0,10902.0,36705.0,6628.0,0.0,78.187557,0.0,
2023-04-02 04:00:00+02:00,39181.0,3386.0,10902.0,36705.0,6700.0,0.0,96.765484,0.0,
...,...,...,...,...,...,...,...,...,...
2023-10-24 19:00:00+02:00,49686.0,2226.0,11749.0,42980.0,4901.0,0.0,247.887323,0.0,125.67
2023-10-24 20:00:00+02:00,53397.0,2226.0,11749.0,42980.0,5584.0,0.0,343.192642,0.0,139.58
2023-10-24 21:00:00+02:00,50586.0,2226.0,11749.0,42980.0,6306.0,0.0,471.875973,0.0,147.93
2023-10-24 22:00:00+02:00,46777.0,2226.0,11749.0,42980.0,6959.0,0.0,595.528100,0.0,122.20


In [12]:
X_test = pd.get_dummies(X_test)
y_test = best_model.predict(X_test)

In [15]:
Y_test = pd.Series(y_test,index = X_test.index)
Y_test.name = "spot_id_delta"
Y_test

DELIVERY_START
2023-04-02 00:00:00+02:00     3.753654
2023-04-02 01:00:00+02:00     1.634030
2023-04-02 02:00:00+02:00     1.526841
2023-04-02 03:00:00+02:00     1.095470
2023-04-02 04:00:00+02:00    10.310592
                               ...    
2023-10-24 19:00:00+02:00    -9.910774
2023-10-24 20:00:00+02:00     4.048954
2023-10-24 21:00:00+02:00    16.774851
2023-10-24 22:00:00+02:00    18.066935
2023-10-24 23:00:00+02:00    27.858685
Name: spot_id_delta, Length: 4942, dtype: float32

In [16]:
Y_test.to_csv("y_test.csv")