In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor


## Data traitment

Collecte et traitement des données

In [2]:
# Collecte des données
X= pd.read_csv(r"X_train_filled.csv",index_col=0)
Y = pd.read_csv(r"C:\Users\collo\Documents\Etudes\Dauphine\Machine Learning\Projet\DAUPHINE_Machine_Learning\y_train.csv",index_col=0)
data_train = pd.merge(X, Y, left_index=True, right_index=True)
data_train

Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,spot_id_delta
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-01-01 02:00:00+01:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.000000,116.808914,-36.874770
2022-01-01 03:00:00+01:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.000000,116.345711,-12.643588
2022-01-01 04:00:00+01:00,45158.0,3386.0,11487.0,44118.0,3288.0,0.0,44.291112,0.000000,108.858177,-1.950193
2022-01-01 05:00:00+01:00,44779.0,3386.0,11487.0,44118.0,3447.0,0.0,36.127588,0.000000,112.760529,1.938272
2022-01-01 06:00:00+01:00,45284.0,3386.0,11487.0,44118.0,3679.0,0.0,30.983023,0.000000,114.032677,0.199907
...,...,...,...,...,...,...,...,...,...,...
2023-03-29 19:00:00+02:00,50814.0,3386.0,11952.0,38320.0,7552.0,651.0,247.408490,7.821622,108.110000,6.029303
2023-03-29 20:00:00+02:00,50628.0,3386.0,11952.0,38320.0,8338.0,109.0,155.795012,2.534054,125.660000,13.576177
2023-03-29 21:00:00+02:00,48201.0,3386.0,11952.0,38320.0,9115.0,0.0,126.884684,0.000000,138.010000,17.478945
2023-03-29 22:00:00+02:00,47967.0,3386.0,11952.0,38320.0,9636.0,0.0,156.669189,0.000000,136.740000,17.559407


In [3]:
# Traitement
## Missing Data
data = data_train.dropna(axis=1, thresh=len(data_train)*0.25)
data = data.dropna()
#data = data.ffill()
X = data[[col for col in data.columns if col !='spot_id_delta']]
X_lag = X.shift(1)
X_lag.columns=[col+"_lag" for col in X_lag.columns]
X = pd.concat([X,X_lag],axis=1).bfill()

In [4]:
Y = data['spot_id_delta'].to_numpy()
## Format
X = pd.get_dummies(X)
## Split
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, Y, test_size=0.2, random_state=42)

## Model Selection

In [6]:
scalers = [None,StandardScaler(), MinMaxScaler(), RobustScaler()]
models = {
    'XGBoost' : XGBRegressor(),
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    #'Lasso Regression': Lasso(),
    #'ElasticNet': ElasticNet(),
    'Support Vector Regression': SVR(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'Gradient Boosting Regression': GradientBoostingRegressor(),
    #'K-Neighbors Regression': KNeighborsRegressor()
    }

res = list()
for mod in models.values() :
    print(f"Model: {mod.__class__.__name__}")
    for scaler in scalers:
        # Appliquer le scaler
        if scaler is None :
            X_train_scaled = X_train
            X_test_scaled = X_test
        else :
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            
        # Initialiser le modèle
        np.random.RandomState(42)
        model = mod
        
        # Entraîner le modèle de base
        model.fit(X_train_scaled, y_train)
        
        # Évaluation du modèle
        y_pred = model.predict(X_test_scaled)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        # Validation croisée
        cv_scores = sk.model_selection.cross_val_score(model, X_train_scaled, y_train, cv=10, scoring='neg_mean_squared_error')
        res.append([mod.__class__.__name__,scaler.__class__.__name__,mse,r2,cv_scores])


Model: XGBRegressor
Model: LinearRegression
Model: Ridge
Model: SVR
Model: DecisionTreeRegressor
Model: RandomForestRegressor
Model: GradientBoostingRegressor


In [7]:
res = pd.DataFrame(res,columns=["model","scaler","mse","r2","cv_score"]).sort_values(by="r2",ascending=False)
res["cv_mean"] = res['cv_score'].apply(lambda x : np.mean(x))
res

Unnamed: 0,model,scaler,mse,r2,cv_score,cv_mean
23,RandomForestRegressor,RobustScaler,379.813894,0.54577,"[-2367.3714438893385, -376.5672007050638, -299...",-923.976046
21,RandomForestRegressor,StandardScaler,382.509928,0.542546,"[-2434.4096526247567, -360.5378683352575, -296...",-938.470445
22,RandomForestRegressor,MinMaxScaler,385.783025,0.538632,"[-2629.568392880779, -334.9602710010324, -305....",-938.225227
20,RandomForestRegressor,NoneType,385.824842,0.538582,"[-2581.9503092234686, -344.33855605305166, -29...",-951.114164
0,XGBRegressor,NoneType,442.03993,0.471353,"[-1885.1077576442829, -379.64995353961723, -32...",-799.128702
1,XGBRegressor,StandardScaler,442.03993,0.471353,"[-1885.1077576442829, -379.64995353961723, -32...",-799.128702
2,XGBRegressor,MinMaxScaler,442.03993,0.471353,"[-1885.1077576442829, -379.64995353961723, -32...",-799.128702
3,XGBRegressor,RobustScaler,442.03993,0.471353,"[-1885.1077576442829, -379.64995353961723, -32...",-799.128702
24,GradientBoostingRegressor,NoneType,578.286994,0.308411,"[-3338.5592564405088, -467.75899757304, -461.9...",-1232.710462
26,GradientBoostingRegressor,MinMaxScaler,579.910428,0.30647,"[-3340.93541604471, -469.0950323008352, -462.0...",-1208.913387


## Model

Optimisation des paramètres

Creation et Validation du modèle

In [8]:
model = RandomForestRegressor(random_state=42) #**best_params)

# Entraîner le modèle sur les données d'entraînement complètes
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calcul des métriques
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

Mean Squared Error: 376.23053152501564
R2 Score: 0.5500558680984087


In [9]:
model.fit(X_test,y_test)

On utilie le model sur les valeurs manquantes

In [10]:
X_test = pd.read_csv(r"X_test_filled.csv",index_col=0)
X_test

Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-04-02 00:00:00+02:00,45814.0,3386.0,10902.0,36705.0,6359.0,0.0,56.764535,0.0,111.145187
2023-04-02 01:00:00+02:00,44084.0,3386.0,10902.0,36705.0,6469.0,0.0,54.262133,0.0,110.423241
2023-04-02 02:00:00+02:00,43281.0,3386.0,10902.0,36705.0,6511.0,0.0,78.105928,0.0,110.569878
2023-04-02 03:00:00+02:00,40825.0,3386.0,10902.0,36705.0,6628.0,0.0,78.187557,0.0,82.747284
2023-04-02 04:00:00+02:00,39181.0,3386.0,10902.0,36705.0,6700.0,0.0,96.765484,0.0,82.842125
...,...,...,...,...,...,...,...,...,...
2023-10-24 19:00:00+02:00,49686.0,2226.0,11749.0,42980.0,4901.0,0.0,247.887323,0.0,125.670000
2023-10-24 20:00:00+02:00,53397.0,2226.0,11749.0,42980.0,5584.0,0.0,343.192642,0.0,139.580000
2023-10-24 21:00:00+02:00,50586.0,2226.0,11749.0,42980.0,6306.0,0.0,471.875973,0.0,147.930000
2023-10-24 22:00:00+02:00,46777.0,2226.0,11749.0,42980.0,6959.0,0.0,595.528100,0.0,122.200000


In [11]:
X_test_lag = X_test.shift(1)
X_test_lag.columns=[col+"_lag" for col in X_test_lag.columns]
X_test = pd.concat([X_test,X_test_lag],axis=1).bfill()
X_test

Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,load_forecast_lag,coal_power_available_lag,gas_power_available_lag,nucelear_power_available_lag,wind_power_forecasts_average_lag,solar_power_forecasts_average_lag,wind_power_forecasts_std_lag,solar_power_forecasts_std_lag,predicted_spot_price_lag
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-04-02 00:00:00+02:00,45814.0,3386.0,10902.0,36705.0,6359.0,0.0,56.764535,0.0,111.145187,45814.0,3386.0,10902.0,36705.0,6359.0,0.0,56.764535,0.000000,111.145187
2023-04-02 01:00:00+02:00,44084.0,3386.0,10902.0,36705.0,6469.0,0.0,54.262133,0.0,110.423241,45814.0,3386.0,10902.0,36705.0,6359.0,0.0,56.764535,0.000000,111.145187
2023-04-02 02:00:00+02:00,43281.0,3386.0,10902.0,36705.0,6511.0,0.0,78.105928,0.0,110.569878,44084.0,3386.0,10902.0,36705.0,6469.0,0.0,54.262133,0.000000,110.423241
2023-04-02 03:00:00+02:00,40825.0,3386.0,10902.0,36705.0,6628.0,0.0,78.187557,0.0,82.747284,43281.0,3386.0,10902.0,36705.0,6511.0,0.0,78.105928,0.000000,110.569878
2023-04-02 04:00:00+02:00,39181.0,3386.0,10902.0,36705.0,6700.0,0.0,96.765484,0.0,82.842125,40825.0,3386.0,10902.0,36705.0,6628.0,0.0,78.187557,0.000000,82.747284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-24 19:00:00+02:00,49686.0,2226.0,11749.0,42980.0,4901.0,0.0,247.887323,0.0,125.670000,47307.0,2226.0,11749.0,42980.0,4424.0,694.0,272.109576,50.001802,99.820000
2023-10-24 20:00:00+02:00,53397.0,2226.0,11749.0,42980.0,5584.0,0.0,343.192642,0.0,139.580000,49686.0,2226.0,11749.0,42980.0,4901.0,0.0,247.887323,0.000000,125.670000
2023-10-24 21:00:00+02:00,50586.0,2226.0,11749.0,42980.0,6306.0,0.0,471.875973,0.0,147.930000,53397.0,2226.0,11749.0,42980.0,5584.0,0.0,343.192642,0.000000,139.580000
2023-10-24 22:00:00+02:00,46777.0,2226.0,11749.0,42980.0,6959.0,0.0,595.528100,0.0,122.200000,50586.0,2226.0,11749.0,42980.0,6306.0,0.0,471.875973,0.000000,147.930000


In [None]:
X_test = pd.get_dummies(X_test)
y_test = model.predict(X_test)

In [None]:
Y_test = pd.Series(y_test,index = X_test.index)
Y_test.name = "spot_id_delta"
Y_test

DELIVERY_START
2023-04-02 00:00:00+02:00    1.169882
2023-04-02 01:00:00+02:00    1.817912
2023-04-02 02:00:00+02:00    2.778389
2023-04-02 03:00:00+02:00    4.458980
2023-04-02 04:00:00+02:00   -0.427356
                               ...   
2023-10-24 19:00:00+02:00   -1.092920
2023-10-24 20:00:00+02:00   -6.371372
2023-10-24 21:00:00+02:00   -9.957579
2023-10-24 22:00:00+02:00    1.742788
2023-10-24 23:00:00+02:00    1.592608
Name: spot_id_delta, Length: 4942, dtype: float64

In [None]:
Y_test.to_csv("y_test.csv")