Combine data from DMI & Energinet
=======================
This notebook loads the CSV file created by [loadbulkweather.ipynb](loadbulkweather.ipynb), pulls spot-price data from the Energinet API, and combines the two into a single dataset.

In [15]:
from pathlib import Path

import pandas as pd

import assetpricing_functions as ap
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading DMI data

In [16]:
# Path to the CSV file produced by loadbulkweather.ipynb.
# Built relative to the current user's home directory instead of a hard-coded
# '/Users/johan/...' prefix, so the notebook also runs under another account.
path = Path.home() / 'Downloads' / 'weather-out' / 'weather_data.csv'

# Read the complete weather export into memory.
df_weather = pd.read_csv(path)

In [17]:
# List every column of the raw weather export together with its dtype.
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21490296 entries, 0 to 21490295
Data columns (total 23 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   cellId                        object 
 1   from                          object 
 2   to                            object 
 3   geometry_type                 object 
 4   coordinates                   object 
 5   mean_temp                     float64
 6   max_wind_speed_10min          float64
 7   temp_soil_30                  float64
 8   mean_wind_speed               float64
 9   temp_soil_10                  float64
 10  mean_pressure                 float64
 11  temp_grass                    float64
 12  mean_relative_hum             float64
 13  min_temp                      float64
 14  mean_radiation                float64
 15  mean_wind_dir                 int64  
 16  bright_sunshine               float64
 17  vapour_pressure_deficit_mean  float64
 18  leaf_moisture       

In [18]:
# Restrict the frame to the columns that are used further down.
# 'mean_cloud_cover' and 'bright_sunshine' are deliberately left out:
# some of their data is missing.
columns_to_keep = [
    'cellId',
    'from',
    'to',
    'coordinates',
    'mean_temp',
    'mean_wind_speed',
    'mean_wind_dir',
]
df_weather = df_weather[columns_to_keep]

In [19]:
# Parse the interval bounds into datetimes.
df_weather['from'] = pd.to_datetime(df_weather['from'])
df_weather['to'] = pd.to_datetime(df_weather['to'])

# Turn the wind direction in degrees into a compass label with the project helper
# (e.g. 249 -> 'W' in the displayed rows).
df_weather['wind_dir'] = df_weather['mean_wind_dir'].apply(ap.degrees_to_cardinal)

# Adding electricity area to the dataframe.
# DK1 is the price area for west Denmark, DK2 is for east Denmark
# https://energinet.dk/El/Systemydelser/Introduktion-til-Systemydelser/Oversigt-over-systemydelser/
# The number after the last underscore of cellId decides the area: below 62 -> DK1, otherwise DK2.
# It is extracted with vectorised string methods rather than a per-row lambda slicing the
# last two characters, so it stays correct if that number is not exactly two digits wide.
cell_suffix = df_weather['cellId'].str.rsplit('_', n=1).str[-1].astype(int)
df_weather['area'] = (cell_suffix < 62).map({True: 'DK1', False: 'DK2'})

display(df_weather.head())

Unnamed: 0,cellId,from,to,coordinates,mean_temp,mean_wind_speed,mean_wind_dir,wind_dir,area
0,20km_604_64,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[[[11.1613, 54.4883], [11.4698, 54.4824], [11....",2.0,12.9,249,W,DK2
1,20km_604_66,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[[[11.4698, 54.4824], [11.7781, 54.4757], [11....",2.0,13.2,250,W,DK2
2,20km_604_68,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[[[11.7781, 54.4757], [12.0863, 54.4682], [12....",1.8,13.7,248,W,DK2
3,20km_606_50,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[[[9, 54.6873], [9.3102, 54.6869], [9.3116, 54...",3.0,12.7,250,W,DK1
4,20km_606_52,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[[[9.3102, 54.6869], [9.6205, 54.6858], [9.623...",3.1,12.6,251,W,DK1


In [20]:
# One row per grid cell: its id, price area and polygon coordinates.
geometry_columns = ['cellId', 'area', 'coordinates']
df_geometry = df_weather[geometry_columns].drop_duplicates()

# Draw the cell polygons coloured by price area and save the map to file.
ap.plot_polygons(
    df_geometry,
    color_by='area',
    save='polygon_by_area.png',
)

Map saved as: output/polygon_by_area.png


In [21]:
# Flag every row that has at least one missing value, then show those rows.
row_has_missing = df_weather.isna().any(axis='columns')
display(df_weather.loc[row_has_missing])

Unnamed: 0,cellId,from,to,coordinates,mean_temp,mean_wind_speed,mean_wind_dir,wind_dir,area
3123010,20km_604_64,2013-01-01 00:00:00+00:00,2013-01-01 01:00:00+00:00,"[[[11.1613, 54.4883], [11.4698, 54.4824], [11....",,11.3,198,S,DK2
3123011,20km_604_66,2013-01-01 00:00:00+00:00,2013-01-01 01:00:00+00:00,"[[[11.4698, 54.4824], [11.7781, 54.4757], [11....",,11.7,200,S,DK2
3123012,20km_604_68,2013-01-01 00:00:00+00:00,2013-01-01 01:00:00+00:00,"[[[11.7781, 54.4757], [12.0863, 54.4682], [12....",,12.2,202,S,DK2
3123013,20km_606_50,2013-01-01 00:00:00+00:00,2013-01-01 01:00:00+00:00,"[[[9, 54.6873], [9.3102, 54.6869], [9.3116, 54...",,8.8,218,SW,DK1
3123014,20km_606_52,2013-01-01 00:00:00+00:00,2013-01-01 01:00:00+00:00,"[[[9.3102, 54.6869], [9.6205, 54.6858], [9.623...",,8.7,218,SW,DK1
...,...,...,...,...,...,...,...,...,...
4682107,20km_636_56,2013-12-31 22:00:00+00:00,2013-12-31 23:00:00+00:00,"[[[9.9979, 57.3789], [10.3304, 57.3758], [10.3...",,,163,S,DK1
4682108,20km_636_58,2013-12-31 22:00:00+00:00,2013-12-31 23:00:00+00:00,"[[[10.3304, 57.3758], [10.6629, 57.3718], [10....",,,165,S,DK1
4682109,20km_638_54,2013-12-31 22:00:00+00:00,2013-12-31 23:00:00+00:00,"[[[9.6686, 57.5607], [10.0028, 57.5585], [10.0...",,,163,S,DK1
4682110,20km_638_56,2013-12-31 22:00:00+00:00,2013-12-31 23:00:00+00:00,"[[[10.0028, 57.5585], [10.3369, 57.5554], [10....",,,165,S,DK1


In [22]:
# Build the DK1-only weather frame in a single chain:
#   1. keep DK1 rows only (DK2 is not used),
#   2. drop the helper columns that are no longer needed,
#   3. keep observations from 2014 onwards, because 2013 is missing temperature
#      data and December 2013 is missing wind data,
#   4. one-hot encode the wind direction,
#   5. renumber the rows from zero.
df_weather_dk1 = (
    df_weather
    .loc[df_weather['area'] == 'DK1']
    .drop(columns=['area', 'coordinates', 'mean_wind_dir'])
    .loc[lambda frame: frame['from'] >= '2014-01-01']
    .pipe(pd.get_dummies, columns=['wind_dir'], prefix='wind_dir')
    .reset_index(drop=True)
)

In [23]:
# Count the missing values per column.
display(df_weather_dk1.isnull().sum())

# Summarise columns, dtypes and memory use.
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
df_weather_dk1.info()

# Show the first rows.
display(df_weather_dk1.head())

cellId             0
from               0
to                 0
mean_temp          0
mean_wind_speed    0
wind_dir_E         0
wind_dir_N         0
wind_dir_NE        0
wind_dir_NW        0
wind_dir_S         0
wind_dir_SE        0
wind_dir_SW        0
wind_dir_W         0
dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11803375 entries, 0 to 11803374
Data columns (total 13 columns):
 #   Column           Dtype              
---  ------           -----              
 0   cellId           object             
 1   from             datetime64[ns, UTC]
 2   to               datetime64[ns, UTC]
 3   mean_temp        float64            
 4   mean_wind_speed  float64            
 5   wind_dir_E       bool               
 6   wind_dir_N       bool               
 7   wind_dir_NE      bool               
 8   wind_dir_NW      bool               
 9   wind_dir_S       bool               
 10  wind_dir_SE      bool               
 11  wind_dir_SW      bool               
 12  wind_dir_W       bool               
dtypes: bool(8), datetime64[ns, UTC](2), float64(2), object(1)
memory usage: 540.3+ MB
None


Unnamed: 0,cellId,from,to,mean_temp,mean_wind_speed,wind_dir_E,wind_dir_N,wind_dir_NE,wind_dir_NW,wind_dir_S,wind_dir_SE,wind_dir_SW,wind_dir_W
0,20km_606_50,2014-01-01 00:00:00+00:00,2014-01-01 01:00:00+00:00,3.3,4.3,False,False,False,False,False,True,False,False
1,20km_606_52,2014-01-01 00:00:00+00:00,2014-01-01 01:00:00+00:00,3.4,4.3,False,False,False,False,False,True,False,False
2,20km_606_54,2014-01-01 00:00:00+00:00,2014-01-01 01:00:00+00:00,4.2,6.2,False,False,False,False,False,True,False,False
3,20km_606_58,2014-01-01 00:00:00+00:00,2014-01-01 01:00:00+00:00,3.6,6.5,False,False,False,False,False,True,False,False
4,20km_606_60,2014-01-01 00:00:00+00:00,2014-01-01 01:00:00+00:00,3.5,6.7,False,False,False,False,False,True,False,False


In [24]:
# Measurements that get one column per grid cell in the wide frame.
# NOTE(review): 'wind_dir_NW' exists in df_weather_dk1 (created by the dummy
# encoding) but is not listed here, so it is absent from the wide frame.
# Confirm that it is meant to be the omitted reference category and not an oversight.
pivot_values = [
    'mean_temp', 'mean_wind_speed',
    'wind_dir_N', 'wind_dir_NE', 'wind_dir_E',
    'wind_dir_SE', 'wind_dir_S', 'wind_dir_SW',
    'wind_dir_W',
]

# Long -> wide: one row per (from, to) interval, one column per (measurement, cellId).
# aggfunc='first' keeps the first value found for each combination.
df_wide_dk1 = df_weather_dk1.pivot_table(
    index=['from', 'to'],
    columns='cellId',
    values=pivot_values,
    aggfunc='first',
).reset_index()

# Flatten the two-level column labels into '<measurement>_<cellId>' strings.
flat_names = []
for label in df_wide_dk1.columns:
    if isinstance(label, tuple):
        flat_names.append('_'.join(label).strip())
    else:
        flat_names.append(label)
df_wide_dk1.columns = flat_names

# The index columns have an empty second level and therefore end in '_'; restore their names.
df_wide_dk1 = df_wide_dk1.rename(columns={'from_': 'from', 'to_': 'to'})

df_wide_dk1

Unnamed: 0,from,to,mean_temp_20km_606_50,mean_temp_20km_606_52,mean_temp_20km_606_54,mean_temp_20km_606_58,mean_temp_20km_606_60,mean_temp_20km_608_46,mean_temp_20km_608_48,mean_temp_20km_608_50,...,wind_dir_W_20km_634_54,wind_dir_W_20km_634_56,wind_dir_W_20km_634_58,wind_dir_W_20km_634_61,wind_dir_W_20km_636_54,wind_dir_W_20km_636_56,wind_dir_W_20km_636_58,wind_dir_W_20km_638_54,wind_dir_W_20km_638_56,wind_dir_W_20km_638_58
0,2014-01-01 00:00:00+00:00,2014-01-01 01:00:00+00:00,3.3,3.4,4.2,3.6,3.5,3.7,3.4,3.3,...,False,False,False,False,False,False,False,False,False,False
1,2014-01-01 01:00:00+00:00,2014-01-01 02:00:00+00:00,2.8,2.9,4.2,3.6,3.5,3.4,3.0,2.8,...,False,False,False,False,False,False,False,False,False,False
2,2014-01-01 02:00:00+00:00,2014-01-01 03:00:00+00:00,3.6,3.6,4.3,3.5,3.4,3.4,3.5,3.6,...,False,False,False,False,False,False,False,False,False,False
3,2014-01-01 03:00:00+00:00,2014-01-01 04:00:00+00:00,2.6,2.7,4.1,3.3,3.2,3.4,2.8,2.6,...,False,False,False,False,False,False,False,False,False,False
4,2014-01-01 04:00:00+00:00,2014-01-01 05:00:00+00:00,3.0,3.1,3.7,3.0,2.8,3.5,3.1,3.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94422,2024-10-09 06:00:00+00:00,2024-10-09 07:00:00+00:00,11.7,11.7,13.0,13.2,13.3,11.0,11.4,11.6,...,False,False,False,False,False,False,False,False,False,False
94423,2024-10-09 07:00:00+00:00,2024-10-09 08:00:00+00:00,12.4,12.4,13.5,13.8,13.8,12.0,12.2,12.4,...,False,False,False,False,False,False,False,False,False,False
94424,2024-10-09 08:00:00+00:00,2024-10-09 09:00:00+00:00,12.7,12.8,13.6,14.1,14.1,12.4,12.6,12.7,...,False,False,False,False,False,False,False,False,False,False
94425,2024-10-09 09:00:00+00:00,2024-10-09 10:00:00+00:00,13.5,13.4,13.4,13.8,13.9,13.2,13.4,13.4,...,False,False,False,False,False,False,False,False,False,False


## Energinet data

--------
https://www.energidataservice.dk/tso-electricity/Elspotprices


In [25]:
# Endpoint of the Elspotprices dataset.
base_url = 'https://api.energidataservice.dk/dataset/Elspotprices'

# Query parameters for the request.
# The filter keeps price area DK1 only. DK1 is the price area for west Denmark,
# DK2 is for east Denmark
# https://energinet.dk/El/Systemydelser/Introduktion-til-Systemydelser/Oversigt-over-systemydelser/
# NOTE(review): 'start'/'end' are presumably interpreted in Danish local time by the
# API unless a timezone is requested explicitly; confirm against the API guide.
params = dict(
    offset=0,
    start='2014-01-01T00:00',
    end='2024-10-01T03:00',
    filter='{"PriceArea":["DK1"]}',
)

In [26]:
# Download the spot prices and shape them in one chain:
#   - HourDK is dropped because the DMI data is in UTC; PriceArea and SpotPriceEUR
#     are not used either,
#   - HourUTC is parsed to datetimes and labelled as UTC so it can be matched
#     against the timezone-aware weather timestamps.
df_energy = (
    ap.get_energydata(url=base_url, params=params)
    .drop(columns=['HourDK', 'PriceArea', 'SpotPriceEUR'])
    .assign(HourUTC=lambda frame: pd.to_datetime(frame['HourUTC']).dt.tz_localize('UTC'))
)
df_energy


Unnamed: 0,HourUTC,SpotPriceDKK
0,2024-10-01 00:00:00+00:00,0.370000
1,2024-09-30 23:00:00+00:00,0.520000
2,2024-09-30 22:00:00+00:00,23.930000
3,2024-09-30 21:00:00+00:00,73.370003
4,2024-09-30 20:00:00+00:00,149.199997
...,...,...
94221,2014-01-01 03:00:00+00:00,86.970001
94222,2014-01-01 02:00:00+00:00,87.269997
94223,2014-01-01 01:00:00+00:00,90.180000
94224,2014-01-01 00:00:00+00:00,96.669998


## Combine datasets



In [27]:
# Join prices and weather on the start of the hour (HourUTC == from).
# An inner join keeps only the hours that are present in both frames.
# validate='one_to_one' makes pandas raise a MergeError if either side contains a
# duplicated hour, instead of silently multiplying rows in the combined dataset.
df = pd.merge(
    df_energy,
    df_wide_dk1,
    left_on=['HourUTC'],
    right_on=['from'],
    how='inner',
    validate='one_to_one',
).drop(columns=['HourUTC'])  # 'from' carries the same timestamp, so HourUTC is redundant

display(df)

Unnamed: 0,SpotPriceDKK,from,to,mean_temp_20km_606_50,mean_temp_20km_606_52,mean_temp_20km_606_54,mean_temp_20km_606_58,mean_temp_20km_606_60,mean_temp_20km_608_46,mean_temp_20km_608_48,...,wind_dir_W_20km_634_54,wind_dir_W_20km_634_56,wind_dir_W_20km_634_58,wind_dir_W_20km_634_61,wind_dir_W_20km_636_54,wind_dir_W_20km_636_56,wind_dir_W_20km_636_58,wind_dir_W_20km_638_54,wind_dir_W_20km_638_56,wind_dir_W_20km_638_58
0,0.370000,2024-10-01 00:00:00+00:00,2024-10-01 01:00:00+00:00,11.4,11.5,12.7,11.6,11.2,11.1,11.4,...,False,False,False,False,False,False,False,False,False,False
1,0.520000,2024-09-30 23:00:00+00:00,2024-10-01 00:00:00+00:00,11.8,11.8,12.8,11.8,11.4,11.2,11.6,...,False,False,False,False,False,False,False,False,False,False
2,23.930000,2024-09-30 22:00:00+00:00,2024-09-30 23:00:00+00:00,11.8,11.8,12.9,11.9,11.5,11.0,11.6,...,False,False,False,False,False,False,False,False,False,False
3,73.370003,2024-09-30 21:00:00+00:00,2024-09-30 22:00:00+00:00,12.0,12.0,13.0,12.1,11.7,11.0,11.7,...,False,False,False,False,False,False,False,False,False,False
4,149.199997,2024-09-30 20:00:00+00:00,2024-09-30 21:00:00+00:00,11.9,12.0,13.2,12.3,11.9,10.9,11.7,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94220,84.660004,2014-01-01 04:00:00+00:00,2014-01-01 05:00:00+00:00,3.0,3.1,3.7,3.0,2.8,3.5,3.1,...,False,False,False,False,False,False,False,False,False,False
94221,86.970001,2014-01-01 03:00:00+00:00,2014-01-01 04:00:00+00:00,2.6,2.7,4.1,3.3,3.2,3.4,2.8,...,False,False,False,False,False,False,False,False,False,False
94222,87.269997,2014-01-01 02:00:00+00:00,2014-01-01 03:00:00+00:00,3.6,3.6,4.3,3.5,3.4,3.4,3.5,...,False,False,False,False,False,False,False,False,False,False
94223,90.180000,2014-01-01 01:00:00+00:00,2014-01-01 02:00:00+00:00,2.8,2.9,4.2,3.6,3.5,3.4,3.0,...,False,False,False,False,False,False,False,False,False,False


In [28]:
# Save the combined dataset without the row index.
# The target is built relative to the current user's home directory instead of a
# hard-coded '/Users/johan/...' prefix, so the notebook also runs under another account.
df.to_csv(Path.home() / 'Downloads' / 'data.csv', index=False)