Combine data from DMI, Energinet and Yahoo Finance
=======================
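This notebook loads the CSV file created by [loadbulkweather.ipynb](loadbulkweather.ipynb), pulls electricity spot prices from the Energinet API and oil and gas prices from Yahoo Finance, and combines them into a single dataset.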

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pre_processing as pp
import yfinance as yf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

%load_ext autoreload
%autoreload 2

## Loading DMI data

In [2]:
# path to the CSV file
# NOTE(review): absolute, machine-specific path — this only resolves on the
# original author's machine; consider a configurable data directory.
path = '/Users/johan/Documents/04 Uni/09 Asset Pricing Data/weather-muni/weather_muni.csv'
# relative path the combined dataset is written to by the df_merged.to_csv cell
path_out = '../../data/fulldata.csv'

# read the data
df_weather = pd.read_csv(path)

In [None]:
# summary of the raw weather frame: row count, column names and dtypes
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11955314 entries, 0 to 11955313
Data columns (total 25 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   municipalityId                int64  
 1   municipalityName              object 
 2   from                          object 
 3   to                            object 
 4   geometry_type                 object 
 5   coordinates                   object 
 6   mean_radiation                float64
 7   mean_wind_speed               float64
 8   acc_precip                    float64
 9   temp_soil_10                  float64
 10  max_wind_speed_3sec           float64
 11  temp_grass                    float64
 12  max_temp_w_date               float64
 13  mean_relative_hum             float64
 14  max_wind_speed_10min          float64
 15  mean_cloud_cover              float64
 16  leaf_moisture                 float64
 17  mean_temp                     float64
 18  vapour_pressure_defi

In [4]:
# Identify which id column(s) this weather extract uses. municipalityId takes
# precedence when both are present (as before); when neither exists, fail with
# a clear message instead of a NameError on `ids` further down.
if 'municipalityId' in df_weather.columns:
    ids = ['municipalityName', 'municipalityId']
elif 'cellId' in df_weather.columns:
    ids = ['cellId']
else:
    raise KeyError("df_weather must contain either 'municipalityId' or 'cellId'")

# convert the interval bounds to datetime
df_weather['from'] = pd.to_datetime(df_weather['from'])
df_weather['to'] = pd.to_datetime(df_weather['to'])

# columns to keep
cols = ids + ['from', 'to', 'coordinates', 'geometry_type',  # general columns
              'mean_temp',  # temperature data
              'mean_wind_speed', 'mean_wind_dir',  # wind data
              ]

# keep only the needed columns and shorten their names; rename() returns a new
# frame, so nothing is modified in place on the column slice
df_weather = df_weather[cols].rename(columns={'mean_temp': 'temp', 'mean_wind_speed': 'wind_speed'})

# adding electricity area to the dataframe DK1 is the price area for west Denmark, DK2 is for east Denmark https://energinet.dk/El/Systemydelser/Introduktion-til-Systemydelser/Oversigt-over-systemydelser/
if 'cellId' in df_weather.columns:
    # the last two characters of the cell id are compared against 62
    df_weather['area'] = df_weather['cellId'].apply(lambda x: 'DK1' if int(x[-2:]) < 62 else 'DK2')
elif 'municipalityId' in df_weather.columns:
    # municipality ids above 400 are assigned to DK1, all others to DK2
    df_weather['area'] = df_weather['municipalityId'].gt(400).map({True: 'DK1', False: 'DK2'})

display(df_weather.head())

Unnamed: 0,municipalityName,municipalityId,from,to,coordinates,geometry_type,temp,wind_speed,mean_wind_dir,area
0,København,101,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[12.49390862, 55.7040906]",Point,3.1,10.0,250,DK2
1,Frederiksberg,147,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[12.52373306, 55.67936546]",Point,3.2,9.7,248,DK2
2,Ballerup,151,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[12.36840182, 55.72775072]",Point,2.9,10.6,258,DK2
3,Brøndby,153,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[12.40438199, 55.64503727]",Point,3.1,10.4,253,DK2
4,Dragør,155,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[12.65022813, 55.59380739]",Point,3.1,11.8,259,DK2


In [5]:
# create a fact table with information on municipalities:
# one row per distinct combination of id column(s), area, coordinates and geometry_type
df_facts = df_weather[ids + ['area','coordinates','geometry_type']].drop_duplicates()

In [6]:
# count nan values in each column (one total per column of df_weather)
print(df_weather.isna().sum()) 

municipalityName    0
municipalityId      0
from                0
to                  0
coordinates         0
geometry_type       0
temp                0
wind_speed          0
mean_wind_dir       0
area                0
dtype: int64


In [7]:
# Distinct (from, to) intervals over all ids, indexed by the interval start.
control = (
    df_weather[['from', 'to']]
    .apply(pd.to_datetime)
    .drop_duplicates()
    .set_index('from')
)

# Every hour between the first and the last observed interval start.
full_range = pd.date_range(start=control.index.min(), end=control.index.max(), freq='h')

# Hours absent from the data show up as all-NaN rows after reindexing.
df_reindexed = control.reindex(full_range)
missing_hours = df_reindexed[df_reindexed.isnull().any(axis=1)]

print("Missing hours:")
print(missing_hours.index)

Missing hours:
DatetimeIndex([], dtype='datetime64[ns, UTC]', freq='h')


In [8]:
# Fill hourly gaps per id, but only when fewer than 720 hours are missing overall.
if missing_hours.shape[0] < 720:
    # make sure the interval start is a datetime
    df_weather['from'] = pd.to_datetime(df_weather['from'])

    # row count before gap filling, used for the report at the end
    shape_original = df_weather.shape

    # first and last observed hour for every id
    min_max_dates = df_weather.groupby(ids[0])['from'].agg(['min', 'max']).reset_index()

    # one frame per id holding every hour between its first and last observation
    date_ranges = [
        pd.DataFrame({ids[0]: id_value,
                      'from': pd.date_range(start=first_hour, end=last_hour, freq='h')})
        for id_value, first_hour, last_hour
        in zip(min_max_dates[ids[0]], min_max_dates['min'], min_max_dates['max'])
    ]
    df_all_hours = pd.concat(date_ranges, ignore_index=True)

    # left-join the observations onto the complete grid; gaps become NaN rows
    df_weather = pd.merge(df_all_hours, df_weather, on=[ids[0], 'from'], how='left')

    # every interval ends one hour after it starts
    df_weather['to'] = df_weather['from'] + pd.DateOffset(hours=1)

    # order by time, then id, and renumber the rows
    df_weather = df_weather.sort_values(by=['from', ids[0]]).reset_index(drop=True)

    shape_new = df_weather.shape

    print(f"Added {shape_new[0] - shape_original[0]} rows")

Added 0 rows


In [9]:
# Re-attach the per-id facts so that rows created by gap filling get their
# static attributes back. validate='many_to_one' makes the merge fail loudly if
# df_facts holds more than one row for an id, instead of silently multiplying
# weather rows.
df_weather = pd.merge(
    df_weather, df_facts, on=ids[0], how='left',
    suffixes=('_x', '_y'), validate='many_to_one',
)

# columns present on both sides come back twice: drop the left-hand ('_x')
# copies, which may contain NaN for the inserted rows ...
stale_columns = [col for col in df_weather.columns if col.endswith('_x')]
df_weather = df_weather.drop(columns=stale_columns)

# ... and give the df_facts ('_y') copies their original names back
df_weather = df_weather.rename(columns=lambda name: name[:-2] if name.endswith('_y') else name)

In [10]:
# above this many missing float values the affected period is dropped rather
# than interpolated
NAN_DROP_THRESHOLD = 10000

# select only float columns for interpolation
float_columns = df_weather.select_dtypes(include=['float']).columns

# count NaN values across all float columns
nan_count = df_weather[float_columns].isna().sum().sum()
print(f'Total NaN values: {nan_count}')

if nan_count > NAN_DROP_THRESHOLD:
    # rows with a NaN in any float column
    nan_mask = df_weather[float_columns].isna().any(axis=1)
    # Latest timestamp that still has a NaN. max() is used instead of the last
    # row so the result does not depend on df_weather being sorted by 'from'.
    last_nan_date = df_weather.loc[nan_mask, 'from'].max()
    # keep only rows strictly after that timestamp
    df_weather = df_weather[df_weather['from'] > last_nan_date].copy()

    print(f'Dropped all rows before {last_nan_date}, none interpolated')

    # check if na values are gone
    print(f'Total NaN values left: {df_weather[float_columns].isna().sum().sum()}')

    display(df_weather)

elif nan_count > 0:
    # interpolate within each id separately so values never bleed across ids
    for id_value in df_weather[ids[0]].unique():
        id_rows = df_weather[ids[0]] == id_value
        df_weather.loc[id_rows, float_columns] = df_weather.loc[id_rows, float_columns].interpolate()

    print(f'A total of {nan_count} NaN values were interpolated')

    # check if na values are gone
    print(f'Total NaN values left: {df_weather[float_columns].isna().sum().sum()}')

    display(df_weather)

Total NaN values: 0


In [11]:
# encode the wind direction via the project helper; the displayed result gains
# mean_wind_dir_sin and mean_wind_dir_cos columns
# NOTE(review): helper is defined in pre_processing; presumably the direction is in degrees — confirm
df_weather = pp.cyclical_wind_encoding(df_weather, 'mean_wind_dir')
display(df_weather)

Unnamed: 0,municipalityName,from,to,temp,wind_speed,mean_wind_dir,municipalityId,area,coordinates,geometry_type,mean_wind_dir_sin,mean_wind_dir_cos
0,Aabenraa,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,3.1,12.5,253,580,DK1,"[9.27705335, 54.95857106]",Point,-0.956305,-0.292372
1,Aalborg,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,3.8,12.9,284,851,DK1,"[9.99710026, 56.98086976]",Point,-0.970296,0.241922
2,Aarhus,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,4.1,10.1,274,751,DK1,"[10.10360521, 56.15511031]",Point,-0.997564,0.069756
3,Albertslund,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,2.9,11.1,257,165,DK2,"[12.35232083, 55.68497735]",Point,-0.974370,-0.224951
4,Allerød,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,2.7,11.1,261,201,DK2,"[12.31517796, 55.85193129]",Point,-0.987688,-0.156434
...,...,...,...,...,...,...,...,...,...,...,...,...
11955309,Vejle,2024-11-30 23:00:00+00:00,2024-12-01 00:00:00+00:00,4.3,3.0,185,630,DK1,"[9.36401747, 55.69959549]",Point,-0.087156,-0.996195
11955310,Vesthimmerlands,2024-11-30 23:00:00+00:00,2024-12-01 00:00:00+00:00,6.4,4.8,187,820,DK1,"[9.38374995, 56.81543648]",Point,-0.121869,-0.992546
11955311,Viborg,2024-11-30 23:00:00+00:00,2024-12-01 00:00:00+00:00,5.6,4.1,187,791,DK1,"[9.54980131, 56.47080654]",Point,-0.121869,-0.992546
11955312,Vordingborg,2024-11-30 23:00:00+00:00,2024-12-01 00:00:00+00:00,4.3,3.3,204,390,DK2,"[11.97379373, 55.06464257]",Point,-0.406737,-0.913545


In [12]:
# draw the fact table on a map coloured by price area; per the cell output the
# figure is written to output/map.png
pp.plot_map(df_facts,id=ids[0],color_by='area', save='map.png', do_print=False)

Map saved as: output/map.png


In [13]:
# Keep only DK1 and drop the columns that are not used as model inputs.
# The secondary id column(s) are taken from `ids[1:]` rather than hard-coding
# 'municipalityId', so this also works when the extract is keyed by 'cellId'.
unused_columns = ['area', 'coordinates', 'mean_wind_dir'] + ids[1:]
df_weather_dk1 = df_weather[df_weather['area'] == 'DK1'].drop(columns=unused_columns)

# removing data from before 2017-10-23 (explicit UTC cutoff, matching the
# timezone of the 'from' column)
cutoff = pd.Timestamp('2017-10-23', tz='UTC')
df_weather_dk1 = df_weather_dk1[df_weather_dk1['from'] >= cutoff].reset_index(drop=True)

In [14]:
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df_weather_dk1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3239808 entries, 0 to 3239807
Data columns (total 8 columns):
 #   Column             Dtype              
---  ------             -----              
 0   municipalityName   object             
 1   from               datetime64[ns, UTC]
 2   to                 datetime64[ns, UTC]
 3   temp               float64            
 4   wind_speed         float64            
 5   geometry_type      object             
 6   mean_wind_dir_sin  float64            
 7   mean_wind_dir_cos  float64            
dtypes: datetime64[ns, UTC](2), float64(4), object(2)
memory usage: 197.7+ MB
None


In [15]:
# hourly temperature in DK1, one thin line per municipality
fig, ax = plt.subplots(figsize=(14, 8))
for municipality, group_data in df_weather_dk1.groupby('municipalityName'):
    ax.plot(group_data['from'], group_data['temp'], label=municipality, alpha=0.5, linewidth=0.25)

# dashed vertical marker at 1 August 2024
ax.axvline(x=pd.to_datetime('2024-08-01 00:00:00'), color='black', linestyle='--', linewidth=2)

# axis labels, tick size and grid
ax.set_xlabel('Date', fontsize=26)
ax.set_ylabel('Mean Temperature (°C)', fontsize=26)
ax.tick_params(axis='both', which='major', labelsize=24)
ax.grid(True)
fig.tight_layout()

# write to disk and release the figure without showing it inline
fig.savefig('output/mean_temp.png')
plt.close(fig)

In [16]:
# hourly wind speed in DK1, one thin line per municipality
fig, ax = plt.subplots(figsize=(14, 8))
for municipality, group_data in df_weather_dk1.groupby('municipalityName'):
    ax.plot(group_data['from'], group_data['wind_speed'], label=municipality, alpha=0.5, linewidth=0.25)

# dashed vertical marker at 1 August 2024
ax.axvline(x=pd.to_datetime('2024-08-01 00:00:00'), color='black', linestyle='--', linewidth=2)

# axis labels, tick size and grid
ax.set_xlabel('Date', fontsize=26)
ax.set_ylabel('Mean Wind Speed (m/s)', fontsize=26)
ax.tick_params(axis='both', which='major', labelsize=24)
ax.grid(True)
fig.tight_layout()

# write to disk and release the figure without showing it inline
fig.savefig('output/mean_wind_speed.png')
plt.close(fig)

In [17]:
# Long -> wide: one row per (from, to) interval and one column per
# (measure, id) pair. aggfunc='first' takes the first value found for each pair.
df_dk1 = df_weather_dk1.pivot_table(
    index=['from', 'to'],
    columns=ids[0],
    values=['temp', 'wind_speed', 'mean_wind_dir_sin', 'mean_wind_dir_cos'],
    aggfunc='first',
).reset_index()

# flatten the two-level column labels into '<measure>_<id>'
flat_names = []
for col in df_dk1.columns:
    flat_names.append('_'.join(col).strip() if isinstance(col, tuple) else col)
df_dk1.columns = flat_names

# the index columns have an empty second level and come out as 'from_' / 'to_'
df_dk1 = df_dk1.rename(columns={'from_': 'from', 'to_': 'to'})

df_dk1

Unnamed: 0,from,to,mean_wind_dir_cos_Aabenraa,mean_wind_dir_cos_Aalborg,mean_wind_dir_cos_Aarhus,mean_wind_dir_cos_Assens,mean_wind_dir_cos_Billund,mean_wind_dir_cos_Brønderslev,mean_wind_dir_cos_Esbjerg,mean_wind_dir_cos_Faaborg-Midtfyn,...,wind_speed_Syddjurs,wind_speed_Sønderborg,wind_speed_Thisted,wind_speed_Tønder,wind_speed_Varde,wind_speed_Vejen,wind_speed_Vejle,wind_speed_Vesthimmerlands,wind_speed_Viborg,wind_speed_Ærø
0,2017-10-23 00:00:00+00:00,2017-10-23 01:00:00+00:00,-0.992546,0.615661,0.052336,-0.984808,-0.017452,0.642788,-0.453990,-0.891007,...,1.6,3.9,3.5,2.6,2.5,1.7,1.4,2.7,1.7,4.9
1,2017-10-23 01:00:00+00:00,2017-10-23 02:00:00+00:00,-0.898794,0.731354,0.224951,-0.994522,0.139173,0.798636,-0.390731,-0.933580,...,1.5,4.0,4.3,2.9,2.8,1.8,1.6,3.2,1.9,4.4
2,2017-10-23 02:00:00+00:00,2017-10-23 03:00:00+00:00,-0.819152,0.731354,0.241922,-1.000000,-0.139173,0.656059,-0.500000,-0.970296,...,2.0,3.5,4.5,2.8,2.9,1.8,1.8,3.5,2.0,4.6
3,2017-10-23 03:00:00+00:00,2017-10-23 04:00:00+00:00,-0.642788,0.766044,0.258819,-0.965926,-0.241922,0.719340,-0.469472,-0.994522,...,2.1,3.0,5.2,2.6,2.7,2.2,1.7,3.9,2.4,3.8
4,2017-10-23 04:00:00+00:00,2017-10-23 05:00:00+00:00,-0.358368,0.601815,0.515038,-0.970296,0.052336,0.453990,-0.052336,-0.999391,...,1.9,3.1,4.8,2.9,3.2,2.3,2.2,3.5,2.3,3.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62299,2024-11-30 19:00:00+00:00,2024-11-30 20:00:00+00:00,-0.961262,-0.933580,-0.891007,-0.906308,-0.978148,-0.920505,-1.000000,-0.906308,...,4.6,3.8,6.4,3.4,4.6,3.6,3.2,4.7,4.4,4.7
62300,2024-11-30 20:00:00+00:00,2024-11-30 21:00:00+00:00,-0.956305,-0.933580,-0.874620,-0.913545,-0.990268,-0.920505,-0.998630,-0.945519,...,4.3,3.0,6.2,3.2,4.2,3.7,3.4,4.8,4.4,4.1
62301,2024-11-30 21:00:00+00:00,2024-11-30 22:00:00+00:00,-0.998630,-0.974370,-0.978148,-0.951057,-1.000000,-0.956305,-0.992546,-0.974370,...,3.6,2.9,5.8,3.2,4.5,3.3,3.0,4.8,4.4,3.8
62302,2024-11-30 22:00:00+00:00,2024-11-30 23:00:00+00:00,-1.000000,-0.984808,-0.974370,-0.984808,-0.999391,-0.970296,-0.990268,-0.981627,...,3.5,2.7,5.7,3.2,4.7,2.9,2.7,4.5,4.0,3.9


In [18]:
# value passed as n_components to every PCA below; judging by the printed
# "explained variance" lines this is presumably the variance share to retain — confirm in pre_processing
pca_var = 0.99

# reduce each family of per-municipality columns, selected by column-name prefix
# NOTE(review): df_dk1 spans the whole period here, including dates after the
# 2024-08-01 marker drawn in the plots; if that marker is a train/test split,
# confirm that fitting the PCA on the full sample is intended.
pca_temp = pp.pca_filtering(df_dk1, prefix='temp_', n_components=pca_var)
pca_wind_speed = pp.pca_filtering(df_dk1, prefix='wind_speed_', n_components=pca_var)
# the 'mean_wind_dir_' prefix matches both the _sin and the _cos columns
pca_wind_dir = pp.pca_filtering(df_dk1, prefix='mean_wind_dir_', n_components=pca_var)

# merge the PCA components with the original DataFrame but only keeping the from and to columns
# (column-wise concat aligns on the row index — assumes pca_filtering keeps df_dk1's index; TODO confirm)
df_dk1 = pd.concat([df_dk1[['from', 'to']], pca_temp, pca_wind_speed, pca_wind_dir], axis=1)

Reduced from 52 to 3, explained variance: 0.9919464798710205
Reduced from 52 to 14, explained variance: 0.9902023832106164
Reduced from 104 to 39, explained variance: 0.990520017447982


In [19]:
# cyclical encoding of time variables, all taken from the interval start
# ('from', which is in UTC)
from_parts = df_dk1['from'].dt
df_dk1['hour'] = from_parts.hour
df_dk1['day'] = from_parts.dayofweek
df_dk1['month'] = from_parts.month

# encode each calendar column with its cycle length; the helper is called for
# its effect on df_dk1 (its return value is not used)
for time_column, cycle_length in (('hour', 24), ('day', 7), ('month', 12)):
    pp.cyclical_encoding(df_dk1, time_column, cycle_length)

display(df_dk1)

Unnamed: 0,from,to,temp_pca_1,temp_pca_2,temp_pca_3,wind_speed_pca_1,wind_speed_pca_2,wind_speed_pca_3,wind_speed_pca_4,wind_speed_pca_5,...,mean_wind_dir_pca_36,mean_wind_dir_pca_37,mean_wind_dir_pca_38,mean_wind_dir_pca_39,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos
0,2017-10-23 00:00:00+00:00,2017-10-23 01:00:00+00:00,-1.965042,-2.793365,1.331024,-14.991578,1.089768,2.674326,0.121991,3.878821,...,0.143676,0.443117,0.288199,0.153231,0.000000,1.000000,0.000000,1.000000,-0.866025,0.500000
1,2017-10-23 01:00:00+00:00,2017-10-23 02:00:00+00:00,-2.769109,-2.735380,1.354594,-14.144977,1.523653,1.517807,0.471239,3.776611,...,0.100583,0.553386,0.451921,0.058094,0.258819,0.965926,0.000000,1.000000,-0.866025,0.500000
2,2017-10-23 02:00:00+00:00,2017-10-23 03:00:00+00:00,-3.036467,-2.789144,1.323717,-13.758680,2.347449,1.341344,0.504761,3.357439,...,-0.119623,0.203594,0.282752,0.073587,0.500000,0.866025,0.000000,1.000000,-0.866025,0.500000
3,2017-10-23 03:00:00+00:00,2017-10-23 04:00:00+00:00,-2.117943,-2.070532,1.087930,-13.081191,4.589197,0.726156,0.510603,3.487723,...,-0.155833,0.152083,0.109397,0.171273,0.707107,0.707107,0.000000,1.000000,-0.866025,0.500000
4,2017-10-23 04:00:00+00:00,2017-10-23 05:00:00+00:00,-1.405781,-2.057688,0.749725,-12.360959,5.434530,0.681460,-0.897302,4.524487,...,-0.119831,-0.099463,-0.291321,-0.224099,0.866025,0.500000,0.000000,1.000000,-0.866025,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62299,2024-11-30 19:00:00+00:00,2024-11-30 20:00:00+00:00,-32.360111,9.982777,1.142634,-0.048666,3.439706,1.073724,1.912495,-1.644523,...,-0.134103,0.117466,0.061329,-0.110426,-0.965926,0.258819,-0.974928,-0.222521,-0.500000,0.866025
62300,2024-11-30 20:00:00+00:00,2024-11-30 21:00:00+00:00,-32.239192,10.276164,0.826658,-0.807312,3.943964,1.133963,1.187763,-1.950719,...,-0.091987,0.125553,0.026835,-0.095172,-0.866025,0.500000,-0.974928,-0.222521,-0.500000,0.866025
62301,2024-11-30 21:00:00+00:00,2024-11-30 22:00:00+00:00,-32.106334,10.854210,0.121364,-2.848535,4.101911,0.134174,0.980854,-0.768188,...,-0.002047,0.053607,0.016133,-0.055806,-0.707107,0.707107,-0.974928,-0.222521,-0.500000,0.866025
62302,2024-11-30 22:00:00+00:00,2024-11-30 23:00:00+00:00,-31.283511,10.747201,-0.280606,-4.228504,3.451502,0.074769,1.453930,-0.792821,...,0.007300,0.091745,0.019543,-0.075861,-0.500000,0.866025,-0.974928,-0.222521,-0.500000,0.866025


## Energinet data

--------
https://www.energidataservice.dk/tso-electricity/Elspotprices


In [20]:
# time span covered by the weather data
start_date, end_date = df_dk1['from'].min(), df_dk1['to'].max()

# format both bounds as YYYY-MM-DDTHH:MM strings
timestamp_format = '%Y-%m-%dT%H:%M'
start = start_date.strftime(timestamp_format)
# NOTE(review): `end` is computed but not sent with the request; the upper
# bound is applied after download by filtering on end_date.
end = end_date.strftime(timestamp_format)

# endpoint and query parameters for the Elspotprices dataset
base_url = 'https://api.energidataservice.dk/dataset/Elspotprices'
params = dict(
    offset=0,
    start=start,
    # DK1 is the price area for west Denmark, DK2 is for east Denmark https://energinet.dk/El/Systemydelser/Introduktion-til-Systemydelser/Oversigt-over-systemydelser/
    filter='{"PriceArea":["DK1"]}',
)

In [21]:
# Download the spot prices, keep only the UTC timestamp and the DKK price
# (the DMI data is in UTC, so the local-time HourDK column is not needed),
# make the timestamp timezone-aware and cut everything after end_date.
df_energy = (
    pp.get_energydata(url=base_url, params=params)
    .drop(columns=['HourDK', 'PriceArea', 'SpotPriceEUR'])
    .assign(HourUTC=lambda frame: pd.to_datetime(frame['HourUTC']).dt.tz_localize('UTC'))
    .loc[lambda frame: frame['HourUTC'] <= end_date]
)
df_energy


Unnamed: 0,HourUTC,SpotPriceDKK
358,2024-12-01 00:00:00+00:00,671.380005
359,2024-11-30 23:00:00+00:00,710.099976
360,2024-11-30 22:00:00+00:00,795.539978
361,2024-11-30 21:00:00+00:00,856.030029
362,2024-11-30 20:00:00+00:00,836.640015
...,...,...
62660,2017-10-23 02:00:00+00:00,165.399994
62661,2017-10-23 01:00:00+00:00,185.279999
62662,2017-10-23 00:00:00+00:00,201.580002
62663,2017-10-22 23:00:00+00:00,207.389999


In [22]:
# hourly DK1 spot price over the sample
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(df_energy['HourUTC'], df_energy['SpotPriceDKK'])

# dashed vertical marker at 1 August 2024
ax.axvline(x=pd.to_datetime('2024-08-01 00:00:00'), color='black', linestyle='--', linewidth=2)

# axis labels, tick size and grid
ax.set_xlabel('Date', fontsize=26)
ax.set_ylabel('Spot Price (DKK per MWh)', fontsize=26)
ax.tick_params(axis='both', which='major', labelsize=24)
ax.grid(True)
fig.tight_layout()

# write to disk and release the figure without showing it inline
fig.savefig('output/spot_price_dkk.png')
plt.close(fig)

In [23]:
# work on a copy so df_energy itself does not gain the helper column
df_energy_plot = df_energy.copy()

# first difference of the price between consecutive rows
# NOTE(review): the displayed df_energy is ordered newest-first, so each
# difference is taken against the next-later hour — confirm this is intended.
df_energy_plot['diff'] = df_energy_plot['SpotPriceDKK'].diff()
price_changes = df_energy_plot['diff'].dropna()

# ACF on the left panel, PACF on the right, 72 lags each
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
for panel, draw, title in zip(ax, (plot_acf, plot_pacf), ('ACF', 'PACF')):
    draw(price_changes, ax=panel, lags=72)
    panel.set_title(title, fontsize=22)
    panel.tick_params(axis='both', which='major', labelsize=18)

fig.tight_layout()
fig.savefig('output/acf_pacf.png')
plt.close(fig)


## Import oil and gas prices from Yahoo Finance


--------
https://finance.yahoo.com/quote/TTF=F/

https://finance.yahoo.com/quote/BZ=F/

In [24]:
# Tickers to download and the column name each Close series gets.
tickers = ["BZ=F", "TTF=F"]
column_names = {"BZ=F": "oil_price", "TTF=F": "gas_price"}

# Download window. Dedicated names are used so the start_date / end_date
# Timestamps derived from the weather data are not overwritten with strings.
commodity_start = "2017-01-01"
commodity_end = "2024-12-01"

# Fetch historical data, one frame per ticker
data = {
    ticker: yf.download(ticker, start=commodity_start, end=commodity_end)
    for ticker in tickers
}

# Keep only the Close price of each ticker and align both on date in a single
# concat, instead of growing a frame column by column from an empty DataFrame.
df_comodities = pd.concat(
    [frame["Close"].rename(column_names[ticker]) for ticker, frame in data.items()],
    axis=1,
)

# Expand to every calendar day, carry the last known price forward, and drop
# the leading days on which one of the two series has no price yet.
df_comodities.index = pd.to_datetime(df_comodities.index)
df_comodities = df_comodities.asfreq('D').ffill().dropna()

# Display the resulting dataframe
display(df_comodities.head())



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Unnamed: 0,oil_price,gas_price
2017-10-23,57.369999,18.09
2017-10-24,58.330002,17.959999
2017-10-25,58.439999,18.110001
2017-10-26,59.299999,18.07
2017-10-27,60.439999,18.15


In [25]:
# Brent crude price over the sample
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(df_comodities.index, df_comodities['oil_price'], label='Brent Crude')

# dashed vertical marker at 1 August 2024
ax.axvline(x=pd.to_datetime('2024-08-01'), color='black', linestyle='--', linewidth=2)

# axis labels, tick size and grid
ax.set_xlabel('Date', fontsize=26)
ax.set_ylabel('Spot Price (USD per Barrel)', fontsize=26)
ax.tick_params(axis='both', which='major', labelsize=24)
ax.grid(True)
fig.tight_layout()

# write to disk and release the figure without showing it inline
fig.savefig('output/brent_crude_prices.png')
plt.close(fig)

In [26]:
# TTF natural gas price over the sample
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(df_comodities.index, df_comodities['gas_price'], label='TTF Natural Gas')

# dashed vertical marker at 1 August 2024
ax.axvline(x=pd.to_datetime('2024-08-01'), color='black', linestyle='--', linewidth=2)

# axis labels, tick size and grid
ax.set_xlabel('Date', fontsize=26)
ax.set_ylabel('Spot Price (EUR per MWh)', fontsize=26)
ax.tick_params(axis='both', which='major', labelsize=24)
ax.grid(True)
fig.tight_layout()

# write to disk and release the figure without showing it inline
fig.savefig('output/ttf_natural_gas_prices.png')
plt.close(fig)

## Combine datasets



In [27]:
# Match each spot price to the weather interval starting at the same UTC hour.
# The inner join keeps only hours present in both sources; HourUTC duplicates
# 'from' after the join and is dropped.
df = (
    df_energy
    .merge(df_dk1, left_on='HourUTC', right_on='from', how='inner')
    .drop(columns='HourUTC')
)

display(df)

Unnamed: 0,SpotPriceDKK,from,to,temp_pca_1,temp_pca_2,temp_pca_3,wind_speed_pca_1,wind_speed_pca_2,wind_speed_pca_3,wind_speed_pca_4,...,mean_wind_dir_pca_36,mean_wind_dir_pca_37,mean_wind_dir_pca_38,mean_wind_dir_pca_39,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos
0,710.099976,2024-11-30 23:00:00+00:00,2024-12-01 00:00:00+00:00,-30.477286,10.129055,-0.518030,-3.725266,2.606497,-0.498396,1.235580,...,-0.019515,0.092657,0.022411,-0.058334,-0.258819,0.965926,-0.974928,-0.222521,-0.500000,0.866025
1,795.539978,2024-11-30 22:00:00+00:00,2024-11-30 23:00:00+00:00,-31.283511,10.747201,-0.280606,-4.228504,3.451502,0.074769,1.453930,...,0.007300,0.091745,0.019543,-0.075861,-0.500000,0.866025,-0.974928,-0.222521,-0.500000,0.866025
2,856.030029,2024-11-30 21:00:00+00:00,2024-11-30 22:00:00+00:00,-32.106334,10.854210,0.121364,-2.848535,4.101911,0.134174,0.980854,...,-0.002047,0.053607,0.016133,-0.055806,-0.707107,0.707107,-0.974928,-0.222521,-0.500000,0.866025
3,836.640015,2024-11-30 20:00:00+00:00,2024-11-30 21:00:00+00:00,-32.239192,10.276164,0.826658,-0.807312,3.943964,1.133963,1.187763,...,-0.091987,0.125553,0.026835,-0.095172,-0.866025,0.500000,-0.974928,-0.222521,-0.500000,0.866025
4,868.260010,2024-11-30 19:00:00+00:00,2024-11-30 20:00:00+00:00,-32.360111,9.982777,1.142634,-0.048666,3.439706,1.073724,1.912495,...,-0.134103,0.117466,0.061329,-0.110426,-0.965926,0.258819,-0.974928,-0.222521,-0.500000,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62299,284.950012,2017-10-23 04:00:00+00:00,2017-10-23 05:00:00+00:00,-1.405781,-2.057688,0.749725,-12.360959,5.434530,0.681460,-0.897302,...,-0.119831,-0.099463,-0.291321,-0.224099,0.866025,0.500000,0.000000,1.000000,-0.866025,0.500000
62300,220.339996,2017-10-23 03:00:00+00:00,2017-10-23 04:00:00+00:00,-2.117943,-2.070532,1.087930,-13.081191,4.589197,0.726156,0.510603,...,-0.155833,0.152083,0.109397,0.171273,0.707107,0.707107,0.000000,1.000000,-0.866025,0.500000
62301,165.399994,2017-10-23 02:00:00+00:00,2017-10-23 03:00:00+00:00,-3.036467,-2.789144,1.323717,-13.758680,2.347449,1.341344,0.504761,...,-0.119623,0.203594,0.282752,0.073587,0.500000,0.866025,0.000000,1.000000,-0.866025,0.500000
62302,185.279999,2017-10-23 01:00:00+00:00,2017-10-23 02:00:00+00:00,-2.769109,-2.735380,1.354594,-14.144977,1.523653,1.517807,0.471239,...,0.100583,0.553386,0.451921,0.058094,0.258819,0.965926,0.000000,1.000000,-0.866025,0.500000


In [28]:
# calendar date (of the UTC interval start) used as the join key on both sides
df['date'] = df['from'].dt.date
df_comodities['date'] = df_comodities.index.date

# Attach the daily oil and gas prices to every hour of the same date; hours on
# dates without a commodity row are dropped by the inner join. The helper key
# is removed again and the rows renumbered.
# NOTE(review): every hour of a day is paired with that same day's Close value —
# if this dataset feeds a forecasting model, confirm that is intended.
df_merged = (
    df.merge(df_comodities, on='date', how='inner')
    .drop(columns=['date'])
    .reset_index(drop=True)
)

# constant column of ones
df_merged['constant'] = 1

# show the combined frame
display(df_merged)


Unnamed: 0,SpotPriceDKK,from,to,temp_pca_1,temp_pca_2,temp_pca_3,wind_speed_pca_1,wind_speed_pca_2,wind_speed_pca_3,wind_speed_pca_4,...,mean_wind_dir_pca_39,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos,oil_price,gas_price,constant
0,596.570007,2024-11-29 23:00:00+00:00,2024-11-30 00:00:00+00:00,-33.480249,7.339981,1.718044,3.248249,2.159318,0.612078,3.505092,...,-0.093112,-0.258819,0.965926,-0.433884,-0.900969,-0.500000,0.866025,72.940002,47.811001,1
1,770.270020,2024-11-29 22:00:00+00:00,2024-11-29 23:00:00+00:00,-32.131543,8.173021,2.038769,2.387323,3.215623,0.288633,3.701821,...,-0.004524,-0.500000,0.866025,-0.433884,-0.900969,-0.500000,0.866025,72.940002,47.811001,1
2,848.200012,2024-11-29 21:00:00+00:00,2024-11-29 22:00:00+00:00,-30.659681,8.780246,1.967827,1.620036,2.482432,0.516816,3.950176,...,-0.081036,-0.707107,0.707107,-0.433884,-0.900969,-0.500000,0.866025,72.940002,47.811001,1
3,836.049988,2024-11-29 20:00:00+00:00,2024-11-29 21:00:00+00:00,-29.321284,9.297498,1.494915,1.490229,2.968878,1.071656,3.674438,...,-0.073105,-0.866025,0.500000,-0.433884,-0.900969,-0.500000,0.866025,72.940002,47.811001,1
4,894.219971,2024-11-29 19:00:00+00:00,2024-11-29 20:00:00+00:00,-27.818541,9.207599,0.853380,2.309477,2.890789,0.990981,2.872615,...,-0.040381,-0.965926,0.258819,-0.433884,-0.900969,-0.500000,0.866025,72.940002,47.811001,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62275,284.950012,2017-10-23 04:00:00+00:00,2017-10-23 05:00:00+00:00,-1.405781,-2.057688,0.749725,-12.360959,5.434530,0.681460,-0.897302,...,-0.224099,0.866025,0.500000,0.000000,1.000000,-0.866025,0.500000,57.369999,18.090000,1
62276,220.339996,2017-10-23 03:00:00+00:00,2017-10-23 04:00:00+00:00,-2.117943,-2.070532,1.087930,-13.081191,4.589197,0.726156,0.510603,...,0.171273,0.707107,0.707107,0.000000,1.000000,-0.866025,0.500000,57.369999,18.090000,1
62277,165.399994,2017-10-23 02:00:00+00:00,2017-10-23 03:00:00+00:00,-3.036467,-2.789144,1.323717,-13.758680,2.347449,1.341344,0.504761,...,0.073587,0.500000,0.866025,0.000000,1.000000,-0.866025,0.500000,57.369999,18.090000,1
62278,185.279999,2017-10-23 01:00:00+00:00,2017-10-23 02:00:00+00:00,-2.769109,-2.735380,1.354594,-14.144977,1.523653,1.517807,0.471239,...,0.058094,0.258819,0.965926,0.000000,1.000000,-0.866025,0.500000,57.369999,18.090000,1


In [29]:
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print() would only append a stray "None" line.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62280 entries, 0 to 62279
Data columns (total 68 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   SpotPriceDKK          62280 non-null  float64            
 1   from                  62280 non-null  datetime64[ns, UTC]
 2   to                    62280 non-null  datetime64[ns, UTC]
 3   temp_pca_1            62280 non-null  float64            
 4   temp_pca_2            62280 non-null  float64            
 5   temp_pca_3            62280 non-null  float64            
 6   wind_speed_pca_1      62280 non-null  float64            
 7   wind_speed_pca_2      62280 non-null  float64            
 8   wind_speed_pca_3      62280 non-null  float64            
 9   wind_speed_pca_4      62280 non-null  float64            
 10  wind_speed_pca_5      62280 non-null  float64            
 11  wind_speed_pca_6      62280 non-null  float64            
 12  wind

In [30]:
# save the data: write the combined frame to path_out as CSV without the row
# index, then report the destination
df_merged.to_csv(path_out, index=False)
print(f'Data saved to {path_out}')

Data saved to ../../data/fulldata.csv
