In [11]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split


## Data traitment

Collecte et traitement des données

In [12]:
# Load data
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')

In [13]:
# Preprocessing: index
X_train['DELIVERY_START'] = pd.to_datetime(X_train['DELIVERY_START'],utc=True)
X_test['DELIVERY_START'] = pd.to_datetime(X_test['DELIVERY_START'],utc=True)
X_train.set_index('DELIVERY_START',inplace=True)
X_test.set_index('DELIVERY_START',inplace=True)

Split between train and test

In [14]:
X_train_train = X_train.dropna(subset=['predicted_spot_price'])
X_train_test = X_train.loc[~X_train.index.isin(X_train_train.index)]
X_test_train = X_test.dropna(subset=['predicted_spot_price'])
X_test_test = X_test.loc[~X_test.index.isin(X_test_train.index)]

 NaN traitement

Separation between X and Y

In [15]:
# Assuming we will drop timestamps for clustering
X_train_clustering = X_train_train.drop(['predicted_spot_price'], axis=1)
y_train_clustering = X_train_train[['predicted_spot_price']]
X_test_clustering = X_test_train.drop(['predicted_spot_price'], axis=1)
y_test_clustering = X_test_train[['predicted_spot_price']]

Clustering

In [16]:
#Impute missing values for clustering dataset (using mean strategy)
imputer = SimpleImputer(strategy='mean')
X_train_clustering_imputed = imputer.fit_transform(X_train_clustering)
X_test_clustering_imputed = imputer.transform(X_test_clustering)

# Standardize features
scaler = StandardScaler()
X_train_clustering_scaled = scaler.fit_transform(X_train_clustering_imputed)
X_test_clustering_scaled = scaler.transform(X_test_clustering_imputed)

# Apply K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
X_train_clustering['cluster'] = kmeans.fit_predict(X_train_clustering_scaled)
X_test_clustering['cluster'] = kmeans.predict(X_test_clustering_scaled)

In [20]:
# Impute missing values for all features (except timestamps and cluster labels)
clusters = X_train_clustering.groupby(by='cluster')
for i,c in clusters : 
    c.fillna(c.mean(), inplace=True)
    X_train_clustering.loc[X_train_clustering['cluster'] == i] = c
clusters = X_test_clustering.groupby(by='cluster')
for i,c in clusters : 
    c.fillna(c.mean(), inplace=True)
    X_test_clustering.loc[X_test_clustering['cluster'] == i] = c

In [42]:
# Define and train models for imputing predicted_spot_price
cluster_models = {}

for cluster in X_train_clustering['cluster'].unique():
    cluster_data_train = X_train_clustering[X_train_clustering['cluster'] == cluster]
    X_cluster_train = cluster_data_train.drop(['cluster'], axis=1)
    y_cluster_train = y_train_clustering.loc[X_cluster_train.index]

    cluster_data_test = X_test_clustering[X_test_clustering['cluster'] == cluster]
    X_cluster_test = cluster_data_test.drop(['cluster'], axis=1)
    y_cluster_test = y_test_clustering.loc[X_cluster_test.index]

    if len(X_cluster_train) > 10:
        model = LinearRegression()
        model.fit(X_cluster_train, y_cluster_train)
        cluster_models[cluster] = model
        X_cluster_predict = X_cluster_test
        if not X_cluster_predict.empty:
            predicted_values = model.predict(X_cluster_predict)
            X_test_train.loc[cluster_data_test.index,'predicted_spot_price_imputed'] = predicted_values
    else:
        mean_value = y_cluster_test.mean()
        X_test_train.loc[cluster_data_test.index,'predicted_spot_price_imputed'] = y_cluster_test.fillna(mean_value).values

X_test_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_train.loc[cluster_data_test.index,'predicted_spot_price_imputed'] = predicted_values


Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,predicted_spot_price,predicted_spot_price_imputed
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-04-04 00:00:00+00:00,49276.0,3386.0,10902.0,38230.0,6290.0,0.0,50.187654,0.0,112.26,101.279458
2023-04-04 01:00:00+00:00,47821.0,3386.0,10902.0,38230.0,5881.0,0.0,43.212955,0.0,109.62,97.540504
2023-04-04 02:00:00+00:00,47758.0,3386.0,10902.0,38230.0,5601.0,0.0,55.637012,0.0,106.16,98.031739
2023-04-04 03:00:00+00:00,50500.0,3386.0,10902.0,38230.0,5399.0,0.0,69.342123,0.0,106.33,106.732964
2023-04-04 04:00:00+00:00,55648.0,3386.0,10902.0,38230.0,5223.0,0.0,126.094269,0.0,106.90,123.235674
...,...,...,...,...,...,...,...,...,...,...
2023-10-24 17:00:00+00:00,49686.0,2226.0,11749.0,42980.0,4901.0,0.0,247.887323,0.0,125.67,112.164271
2023-10-24 18:00:00+00:00,53397.0,2226.0,11749.0,42980.0,5584.0,0.0,343.192642,0.0,139.58,128.441189
2023-10-24 19:00:00+00:00,50586.0,2226.0,11749.0,42980.0,6306.0,0.0,471.875973,0.0,147.93,127.974725
2023-10-24 20:00:00+00:00,46777.0,2226.0,11749.0,42980.0,6959.0,0.0,595.528100,0.0,122.20,67.472368


In [43]:
y_cluster_test.predicted_spot_price

DELIVERY_START
2023-07-04 21:00:00+00:00    133.62
2023-10-19 20:00:00+00:00    113.86
2023-10-19 21:00:00+00:00     97.10
2023-10-20 15:00:00+00:00     82.00
2023-10-21 15:00:00+00:00     16.06
2023-10-21 16:00:00+00:00     34.56
2023-10-21 17:00:00+00:00     59.49
2023-10-21 18:00:00+00:00     78.72
2023-10-21 19:00:00+00:00     90.35
2023-10-21 20:00:00+00:00     74.11
2023-10-21 21:00:00+00:00     57.87
2023-10-24 20:00:00+00:00    122.20
2023-10-24 21:00:00+00:00    105.12
Name: predicted_spot_price, dtype: float64

In [None]:
# Prepare final datasets
X_train_final = X_train.drop(['DELIVERY_START', 'predicted_spot_price', 'cluster'], axis=1)
X_train_final['predicted_spot_price'] = X_train['predicted_spot_price_imputed']
y_train_final = y_train['spot_id_delta']

# Train and evaluate regression model to predict Y
X_train_model, X_val_model, y_train_model, y_val_model = train_test_split(X_train_final, y_train_final, test_size=0.2, random_state=42)
model_y = LinearRegression()
model_y.fit(X_train_model, y_train_model)
y_val_pred = model_y.predict(X_val_model)
mse = mean_squared_error(y_val_model, y_val_pred)

mse
