In [124]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from termcolor import colored
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn import metrics

In [131]:
# Loading final dataset
df = pd.read_csv('C:/Users/cpedr/OneDrive - Hertie School/Semester 4/Machine Learning/Project/all_features_merged.csv')
print(df.head())
print(df.shape)

#Counting NA
nan_count = df['energy_price'].isna().sum()
print(nan_count)

df_cleared = df.dropna(subset=['energy_price'])
print(df_cleared.shape)

               Date Country  Solar  Wind Onshore  Wind Offshore  \
0  2014-12-31 23:00      GB    0.0       4546.00         3165.0   
1  2015-01-01 00:00      AT    0.0        117.25            NaN   
2  2015-01-01 00:00      BE    0.0        246.00          420.0   
3  2015-01-01 00:00      CH    0.0          4.00            NaN   
4  2015-01-01 00:00      CZ    0.0           NaN            NaN   

   water_reservoirs_and_hydro_storage  energy_price  biomass  gas  nuclear  
0                                 NaN           NaN      NaN  NaN      NaN  
1                                 NaN           NaN      NaN  NaN      NaN  
2                                 NaN           NaN      NaN  NaN      NaN  
3                                 NaN           NaN      NaN  NaN      NaN  
4                                 NaN           NaN      NaN  NaN      NaN  
(5551063, 10)
2179723
(3371340, 10)


In [132]:
df.dtypes

Date                                   object
Country                                object
Solar                                 float64
Wind Onshore                          float64
Wind Offshore                         float64
water_reservoirs_and_hydro_storage    float64
energy_price                          float64
biomass                               float64
gas                                   float64
nuclear                               float64
dtype: object

### Handling Dates

In [133]:
# Helper function to handle multiple date formats
def convert_dates(df, column_name, formats):
    # Start with a copy of the column to avoid altering the original data
    temp_series = pd.Series(pd.NaT, index=df.index)
    
    # Try each format and update only NaT entries
    for fmt in formats:
        temp_series = temp_series.combine_first(pd.to_datetime(df[column_name], format=fmt, errors='coerce'))
    
    return temp_series

# List of date formats you expect in your data
date_formats = ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M']

# Convert the 'dates' column to datetime
df['Date'] = convert_dates(df, 'Date', date_formats)

# Extract components
df['year'] = df['Date'].dt.year.astype('Int64')
df['month'] = df['Date'].dt.month.astype('Int64')
df['day'] = df['Date'].dt.day.astype('Int64')
df['hour'] = df['Date'].dt.hour.astype('Int64')
df['week_number'] = df['Date'].dt.isocalendar().week.astype('Int64')

# Extract the day of the week as an integer (Monday=0, Sunday=6)
df['day_of_week'] = df['Date'].dt.dayofweek

df

Unnamed: 0,Date,Country,Solar,Wind Onshore,Wind Offshore,water_reservoirs_and_hydro_storage,energy_price,biomass,gas,nuclear,year,month,day,hour,week_number,day_of_week
0,2014-12-31 23:00:00,GB,0.0,4546.00,3165.0,,,,,,2014,12,31,23,1,2
1,2015-01-01 00:00:00,AT,0.0,117.25,,,,,,,2015,1,1,0,1,3
2,2015-01-01 00:00:00,BE,0.0,246.00,420.0,,,,,,2015,1,1,0,1,3
3,2015-01-01 00:00:00,CH,0.0,4.00,,,,,,,2015,1,1,0,1,3
4,2015-01-01 00:00:00,CZ,0.0,,,,,,,,2015,1,1,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5551058,2023-12-31 00:00:00,SE_2,,,,,44.87,,,,2023,12,31,0,52,6
5551059,2023-12-31 00:00:00,SE_3,,,,,44.87,,,,2023,12,31,0,52,6
5551060,2023-12-31 00:00:00,SE_4,,,,,44.87,,,,2023,12,31,0,52,6
5551061,2023-12-31 00:00:00,SI,,,,,33.30,,,,2023,12,31,0,52,6


In [134]:
df.to_csv('C:/Users/cpedr/OneDrive - Hertie School/Semester 4/Machine Learning/Project/dates_features.csv')

# Only dummies (except the energy variables)

In [140]:
# Loading final dataset
df = pd.read_csv('C:/Users/cpedr/OneDrive - Hertie School/Semester 4/Machine Learning/Project/dates_features.csv')

df.drop(['Unnamed: 0', 'Date'], axis=1, inplace=True)
df

Unnamed: 0,Country,Solar,Wind Onshore,Wind Offshore,water_reservoirs_and_hydro_storage,energy_price,biomass,gas,nuclear,year,month,day,hour,week_number,day_of_week
0,GB,0.0,4546.00,3165.0,,,,,,2014,12,31,23,1,2
1,AT,0.0,117.25,,,,,,,2015,1,1,0,1,3
2,BE,0.0,246.00,420.0,,,,,,2015,1,1,0,1,3
3,CH,0.0,4.00,,,,,,,2015,1,1,0,1,3
4,CZ,0.0,,,,,,,,2015,1,1,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5551058,SE_2,,,,,44.87,,,,2023,12,31,0,52,6
5551059,SE_3,,,,,44.87,,,,2023,12,31,0,52,6
5551060,SE_4,,,,,44.87,,,,2023,12,31,0,52,6
5551061,SI,,,,,33.30,,,,2023,12,31,0,52,6


In [141]:
# Create a 'weekend' dummy variable where Saturday and Sunday are marked as 1, others as 0
df['weekend'] = (df['day_of_week'] >= 5).astype(int)

In [143]:
# List of columns for which to create dummies
columns_to_dummy = ['Country','year', 'month', 'day', 'hour', 'week_number', 'day_of_week']

# Loop through each column and create dummy variables with a prefix
for column in columns_to_dummy:
    dummies = pd.get_dummies(df[column], prefix=column).astype(int)
    df = pd.concat([df, dummies], axis=1)

df.drop('Country', axis=1, inplace=True)
df

Unnamed: 0,Solar,Wind Onshore,Wind Offshore,water_reservoirs_and_hydro_storage,energy_price,biomass,gas,nuclear,year,month,...,week_number_51,week_number_52,week_number_53,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,0.0,4546.00,3165.0,,,,,,2014,12,...,0,0,0,0,0,1,0,0,0,0
1,0.0,117.25,,,,,,,2015,1,...,0,0,0,0,0,0,1,0,0,0
2,0.0,246.00,420.0,,,,,,2015,1,...,0,0,0,0,0,0,1,0,0,0
3,0.0,4.00,,,,,,,2015,1,...,0,0,0,0,0,0,1,0,0,0
4,0.0,,,,,,,,2015,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5551058,,,,,44.87,,,,2023,12,...,0,1,0,0,0,0,0,0,0,1
5551059,,,,,44.87,,,,2023,12,...,0,1,0,0,0,0,0,0,0,1
5551060,,,,,44.87,,,,2023,12,...,0,1,0,0,0,0,0,0,0,1
5551061,,,,,33.30,,,,2023,12,...,0,1,0,0,0,0,0,0,0,1


In [144]:
df.to_csv('C:/Users/cpedr/OneDrive - Hertie School/Semester 4/Machine Learning/Project/all_dummies.csv')

# Modelling 

In [151]:
df1 = pd.read_csv('C:/Users/cpedr/OneDrive - Hertie School/Semester 4/Machine Learning/Project/dates_features.csv')
df1.drop('Unnamed: 0', axis=1, inplace=True)

In [152]:
df1_cleaned = df1.dropna(subset=['energy_price'])
df1_cleaned

Unnamed: 0,Date,Country,Solar,Wind Onshore,Wind Offshore,water_reservoirs_and_hydro_storage,energy_price,biomass,gas,nuclear,year,month,day,hour,week_number,day_of_week
1976013,2015-01-01 00:00:00,CH,,,,,44.94,,,,2015,1,1,0,1,3
1976014,2015-01-01 00:00:00,CZ,,,,,26.48,135.0,172.0,2596.0,2015,1,1,0,1,3
1976017,2015-01-01 00:00:00,DK_1,,,,,25.02,18.0,233.0,,2015,1,1,0,1,3
1976018,2015-01-01 00:00:00,DK_2,,,,,27.38,25.0,304.0,,2015,1,1,0,1,3
1976019,2015-01-01 00:00:00,EE,,,,,27.38,,,,2015,1,1,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5551058,2023-12-31 00:00:00,SE_2,,,,,44.87,,,,2023,12,31,0,52,6
5551059,2023-12-31 00:00:00,SE_3,,,,,44.87,,,,2023,12,31,0,52,6
5551060,2023-12-31 00:00:00,SE_4,,,,,44.87,,,,2023,12,31,0,52,6
5551061,2023-12-31 00:00:00,SI,,,,,33.30,,,,2023,12,31,0,52,6


In [153]:
from sklearn.impute import SimpleImputer

In [180]:
df1_cleaned.isna().sum() 

Date                                        0
Country                                     0
Solar                                 3371340
Wind Onshore                          3371340
Wind Offshore                         3371340
water_reservoirs_and_hydro_storage    3371340
energy_price                                0
biomass                               1537445
gas                                   1313978
nuclear                               2591288
year                                        0
month                                       0
day                                         0
hour                                        0
week_number                                 0
day_of_week                                 0
dtype: int64

In [181]:
df1_cleaned.drop(['Solar', 'Wind Onshore', 'Wind Offshore', 'water_reservoirs_and_hydro_storage'], axis=1, inplace=True)

# Create an instance of SimpleImputer with mean strategy
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Apply fit_transform directly and replace in the DataFrame
columns_to_impute = ['biomass', 'gas', 'nuclear']
df1_cleaned[columns_to_impute] = imputer.fit_transform(df1_cleaned[columns_to_impute])

df1_cleaned

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_cleaned.drop(['Solar', 'Wind Onshore', 'Wind Offshore', 'water_reservoirs_and_hydro_storage'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_cleaned[columns_to_impute] = imputer.fit_transform(df1_cleaned[columns_to_impute])


Unnamed: 0,Date,Country,energy_price,biomass,gas,nuclear,year,month,day,hour,week_number,day_of_week
1976013,2015-01-01 00:00:00,CH,44.94,434.052289,1180.283774,7185.089448,2015,1,1,0,1,3
1976014,2015-01-01 00:00:00,CZ,26.48,135.000000,172.000000,2596.000000,2015,1,1,0,1,3
1976017,2015-01-01 00:00:00,DK_1,25.02,18.000000,233.000000,7185.089448,2015,1,1,0,1,3
1976018,2015-01-01 00:00:00,DK_2,27.38,25.000000,304.000000,7185.089448,2015,1,1,0,1,3
1976019,2015-01-01 00:00:00,EE,27.38,434.052289,1180.283774,7185.089448,2015,1,1,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...
5551058,2023-12-31 00:00:00,SE_2,44.87,434.052289,1180.283774,7185.089448,2023,12,31,0,52,6
5551059,2023-12-31 00:00:00,SE_3,44.87,434.052289,1180.283774,7185.089448,2023,12,31,0,52,6
5551060,2023-12-31 00:00:00,SE_4,44.87,434.052289,1180.283774,7185.089448,2023,12,31,0,52,6
5551061,2023-12-31 00:00:00,SI,33.30,434.052289,1180.283774,7185.089448,2023,12,31,0,52,6


In [182]:
df1_cleaned.isna().sum() 

Date            0
Country         0
energy_price    0
biomass         0
gas             0
nuclear         0
year            0
month           0
day             0
hour            0
week_number     0
day_of_week     0
dtype: int64