# Running models and cross validation 

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from termcolor import colored
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn import metrics

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
#Time series split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

In [None]:
df = pd.read_csv('')

In [None]:
# Separating features and target variable
X = df(['Price'], axis=1)
y = df['Price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Cross validation

In [None]:
tscv = TimeSeriesSplit(n_splits=5)

Iterate through cross validation

In [None]:
for train_index, test_index in tscv.split(data):
    train_data, test_data = data.iloc[train_index], data.iloc[test_index]

Create the dataframe 

In [18]:
df1 = pd.read_csv('merged_luisfer.CSV')
df2 = pd.read_csv('merged_isabella.csv')
df3 = pd.read_csv('merged_camilo.csv', index_col=False)

In [19]:
df1

Unnamed: 0.1,Unnamed: 0,Date,Solar,Wind Onshore,country_index,Wind Offshore
0,0,2015-01-01 00:00:00+01:00,0.0,118.0,AT,
1,1,2015-01-01 00:15:00+01:00,0.0,118.0,AT,
2,2,2015-01-01 00:30:00+01:00,0.0,117.0,AT,
3,3,2015-01-01 00:45:00+01:00,0.0,116.0,AT,
4,4,2015-01-01 01:00:00+01:00,0.0,146.0,AT,
...,...,...,...,...,...,...
705207,78611,2023-12-30 19:00:00+01:00,0.0,665.0,DK_1,570.0
705208,78612,2023-12-30 20:00:00+01:00,0.0,466.0,DK_1,452.0
705209,78613,2023-12-30 21:00:00+01:00,0.0,323.0,DK_1,352.0
705210,78614,2023-12-30 22:00:00+01:00,0.0,227.0,DK_1,315.0


In [20]:
df2

Unnamed: 0.1,Unnamed: 0,Date,Solar,Wind Onshore,country_index,Wind Offshore
0,0,2015-01-01 00:00:00+01:00,0.0,,IT_SARD,
1,1,2015-01-01 01:00:00+01:00,0.0,,IT_SARD,
2,2,2015-01-01 02:00:00+01:00,0.0,,IT_SARD,
3,3,2015-01-01 03:00:00+01:00,0.0,,IT_SARD,
4,4,2015-01-01 04:00:00+01:00,0.0,,IT_SARD,
...,...,...,...,...,...,...
1048570,76801,2023-10-08 16:00:00+02:00,0.0,89.0,NO_4,
1048571,76802,2023-10-08 17:00:00+02:00,0.0,83.0,NO_4,
1048572,76803,2023-10-08 18:00:00+02:00,0.0,76.0,NO_4,
1048573,76804,2023-10-08 19:00:00+02:00,0.0,74.0,NO_4,


In [21]:
df3

Unnamed: 0.1,Unnamed: 0,Date,Solar,Wind Offshore,country_index,Wind Onshore
0,0,2015-01-07 00:00:00+01:00,0.0,316.0,DK_2,267.0
1,1,2015-01-07 01:00:00+01:00,0.0,302.0,DK_2,235.0
2,2,2015-01-07 02:00:00+01:00,0.0,269.0,DK_2,204.0
3,3,2015-01-07 03:00:00+01:00,0.0,234.0,DK_2,173.0
4,4,2015-01-07 04:00:00+01:00,0.0,182.0,DK_2,142.0
...,...,...,...,...,...,...
1048570,58440,2021-09-01 07:00:00+02:00,65.0,,IT_CNOR,3.0
1048571,58441,2021-09-01 08:00:00+02:00,271.0,,IT_CNOR,4.0
1048572,58442,2021-09-01 09:00:00+02:00,509.0,,IT_CNOR,4.0
1048573,58443,2021-09-01 10:00:00+02:00,684.0,,IT_CNOR,6.0


In [22]:
print(df1.shape)
print(df2.shape)
print(df3.shape)

(705212, 6)
(1048575, 6)
(1048575, 6)


In [65]:
df_concatenated = pd.concat([df1, df2, df3], ignore_index=True)

In [66]:
df_concatenated

Unnamed: 0.1,Unnamed: 0,Date,Solar,Wind Onshore,country_index,Wind Offshore
0,0,2015-01-01 00:00:00+01:00,0.0,118.0,AT,
1,1,2015-01-01 00:15:00+01:00,0.0,118.0,AT,
2,2,2015-01-01 00:30:00+01:00,0.0,117.0,AT,
3,3,2015-01-01 00:45:00+01:00,0.0,116.0,AT,
4,4,2015-01-01 01:00:00+01:00,0.0,146.0,AT,
...,...,...,...,...,...,...
2802357,58440,2021-09-01 07:00:00+02:00,65.0,3.0,IT_CNOR,
2802358,58441,2021-09-01 08:00:00+02:00,271.0,4.0,IT_CNOR,
2802359,58442,2021-09-01 09:00:00+02:00,509.0,4.0,IT_CNOR,
2802360,58443,2021-09-01 10:00:00+02:00,684.0,6.0,IT_CNOR,


In [67]:
print(df_concatenated.head())


   Unnamed: 0                       Date  Solar  Wind Onshore country_index  \
0           0  2015-01-01 00:00:00+01:00    0.0         118.0            AT   
1           1  2015-01-01 00:15:00+01:00    0.0         118.0            AT   
2           2  2015-01-01 00:30:00+01:00    0.0         117.0            AT   
3           3  2015-01-01 00:45:00+01:00    0.0         116.0            AT   
4           4  2015-01-01 01:00:00+01:00    0.0         146.0            AT   

   Wind Offshore  
0            NaN  
1            NaN  
2            NaN  
3            NaN  
4            NaN  


In [26]:
print(df_concatenated.tail()) 

         Unnamed: 0                       Date  Solar  Wind Onshore  \
2802357       58440  2021-09-01 07:00:00+02:00   65.0           3.0   
2802358       58441  2021-09-01 08:00:00+02:00  271.0           4.0   
2802359       58442  2021-09-01 09:00:00+02:00  509.0           4.0   
2802360       58443  2021-09-01 10:00:00+02:00  684.0           6.0   
2802361       58444  2021-09-01 11:00:00+02:00  784.0           3.0   

        country_index  Wind Offshore  
2802357       IT_CNOR            NaN  
2802358       IT_CNOR            NaN  
2802359       IT_CNOR            NaN  
2802360       IT_CNOR            NaN  
2802361       IT_CNOR            NaN  


In [68]:
df_concatenated.to_csv('all_merged.csv', index=False)

Now we're missing merging all the dataframe


In [3]:
df4 = pd.read_csv('all_merged.csv')

In [4]:
if 'Date' in df4.columns and 'country_index' in df4.columns:
    df4 = df4.set_index(['Date', 'country_index'])
    df4.index = df4.index.rename(['Date', 'Country'])  # Rename index levels directly
else:
    print("Required columns for index are not present in df4")

In [75]:
print(df4.index)

MultiIndex([('2015-01-01 00:00:00+01:00',      'AT'),
            ('2015-01-01 00:15:00+01:00',      'AT'),
            ('2015-01-01 00:30:00+01:00',      'AT'),
            ('2015-01-01 00:45:00+01:00',      'AT'),
            ('2015-01-01 01:00:00+01:00',      'AT'),
            ('2015-01-01 01:15:00+01:00',      'AT'),
            ('2015-01-01 01:30:00+01:00',      'AT'),
            ('2015-01-01 01:45:00+01:00',      'AT'),
            ('2015-01-01 02:00:00+01:00',      'AT'),
            ('2015-01-01 02:15:00+01:00',      'AT'),
            ...
            ('2021-09-01 02:00:00+02:00', 'IT_CNOR'),
            ('2021-09-01 03:00:00+02:00', 'IT_CNOR'),
            ('2021-09-01 04:00:00+02:00', 'IT_CNOR'),
            ('2021-09-01 05:00:00+02:00', 'IT_CNOR'),
            ('2021-09-01 06:00:00+02:00', 'IT_CNOR'),
            ('2021-09-01 07:00:00+02:00', 'IT_CNOR'),
            ('2021-09-01 08:00:00+02:00', 'IT_CNOR'),
            ('2021-09-01 09:00:00+02:00', 'IT_CNOR'),
            

In [6]:
df5 =pd.read_csv('oil_hydro_biomass_gas_nuclear_merged.csv')

In [56]:
df5.dtypes

index                                   int64
water_reservoirs_and_hydro_storage    float64
energy_price                          float64
biomass                               float64
gas                                   float64
nuclear                               float64
dtype: object

In [77]:
print(df5.index)

MultiIndex([('2014-12-31 23:00:00',                      'PT'),
            ('2015-01-01 00:00:00',                      'BE'),
            ('2015-01-01 00:00:00',                      'CH'),
            ('2015-01-01 00:00:00',                      'CZ'),
            ('2015-01-01 00:00:00',    'Combined_AT_DE_AT_LU'),
            ('2015-01-01 00:00:00', 'Combined_DE_LU_DE_AT_LU'),
            ('2015-01-01 00:00:00',                    'DK_1'),
            ('2015-01-01 00:00:00',                    'DK_2'),
            ('2015-01-01 00:00:00',                      'EE'),
            ('2015-01-01 00:00:00',                      'ES'),
            ...
            ('2023-12-24 23:00:00',                    'NO_2'),
            ('2023-12-24 23:00:00',                    'NO_3'),
            ('2023-12-24 23:00:00',                    'NO_4'),
            ('2023-12-24 23:00:00',                    'NO_5'),
            ('2023-12-24 23:00:00',                      'RS'),
            ('2023-12-24

In [7]:
df4 = df4.reset_index()

# Resetting the index for df5
df5 = df5.reset_index()

In [8]:
df4 = df4.set_index(['Date', 'Country'])
df5 = df5.set_index(['Date', 'Country'])

Now the merge of a single dataframe with all features 

In [64]:
print(df4.index.get_level_values('Date').dtype)
print(df5.index.get_level_values('Date').dtype)

object
datetime64[ns]


In [9]:
all_featuresdf = pd.merge(df4, df5, left_index=True, right_index=True, how='outer')

In [11]:
all_featuresdf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,Solar,Wind Onshore,Wind Offshore,index,water_reservoirs_and_hydro_storage,energy_price,biomass,gas,nuclear
Date,Country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2014-12-31 23:00:00,PT,,,,,0.0,,,310.0,575.0,
2014-12-31 23:00:00+00:00,GB,0.0,0.0,4546.0,3165.0,,,,,,
2014-12-31 23:30:00+00:00,GB,1.0,0.0,4546.0,3165.0,,,,,,
2015-01-01 00:00:00,BE,,,,,1.0,,,68.0,2536.0,2875.0
2015-01-01 00:00:00,CH,,,,,2.0,,44.94,,,
...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 00:45:00+02:00,FI,94772.0,0.0,1739.0,,,,,,,
2023-12-31 01:00:00+02:00,BG,74639.0,0.0,315.0,,,,,,,
2023-12-31 01:00:00+02:00,EE,78528.0,0.0,0.0,,,,,,,
2023-12-31 01:00:00+02:00,GR,77208.0,0.0,235.0,,,,,,,


In [10]:
all_featuresdf.to_csv('all_features_merged.csv')

In [12]:
df = pd.read_csv('all_features_merged.csv')

In [13]:
df

Unnamed: 0.1,Date,Country,Unnamed: 0,Solar,Wind Onshore,Wind Offshore,index,water_reservoirs_and_hydro_storage,energy_price,biomass,gas,nuclear
0,2014-12-31 23:00:00,PT,,,,,0.0,,,310.0,575.0,
1,2014-12-31 23:00:00+00:00,GB,0.0,0.0,4546.0,3165.0,,,,,,
2,2014-12-31 23:30:00+00:00,GB,1.0,0.0,4546.0,3165.0,,,,,,
3,2015-01-01 00:00:00,BE,,,,,1.0,,,68.0,2536.0,2875.0
4,2015-01-01 00:00:00,CH,,,,,2.0,,44.94,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
6379643,2023-12-31 00:45:00+02:00,FI,94772.0,0.0,1739.0,,,,,,,
6379644,2023-12-31 01:00:00+02:00,BG,74639.0,0.0,315.0,,,,,,,
6379645,2023-12-31 01:00:00+02:00,EE,78528.0,0.0,0.0,,,,,,,
6379646,2023-12-31 01:00:00+02:00,GR,77208.0,0.0,235.0,,,,,,,
