Combine data from DMI, Energinet and FRED
=======================
This loads in csv file created by the [loadbulkweather.ipynb](loadbulkweather.ipynb) and pulls data from the Energinet API

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import assetpricing_functions as ap
import yfinance as yf
%load_ext autoreload
%autoreload 2

## Loading DMI data

In [2]:
# path to the CSV file
path = '/Users/johan/Documents/04 Uni/09 Asset Pricing Data/weather-muni/weather_muni.csv'
path_out = '../../data/fulldata.csv'

# read the data
df_weather = pd.read_csv(path)

In [3]:
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11955314 entries, 0 to 11955313
Data columns (total 25 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   municipalityId                int64  
 1   municipalityName              object 
 2   from                          object 
 3   to                            object 
 4   geometry_type                 object 
 5   coordinates                   object 
 6   mean_radiation                float64
 7   mean_wind_speed               float64
 8   acc_precip                    float64
 9   temp_soil_10                  float64
 10  max_wind_speed_3sec           float64
 11  temp_grass                    float64
 12  max_temp_w_date               float64
 13  mean_relative_hum             float64
 14  max_wind_speed_10min          float64
 15  mean_cloud_cover              float64
 16  leaf_moisture                 float64
 17  mean_temp                     float64
 18  vapour_pressure_defi

In [4]:
# define ids
if 'cellId' in df_weather.columns:
    ids = ['cellId']
if 'municipalityId' in df_weather.columns:
    ids = ['municipalityName','municipalityId']

# convert to datetime
df_weather['from'] = pd.to_datetime(df_weather['from'])
df_weather['to'] = pd.to_datetime(df_weather['to'])

# columns to keep
cols = ids + ['from', 'to', 'coordinates', 'geometry_type', # general columns
              'mean_temp', # temperature data
              'mean_wind_speed', 'mean_wind_dir', # wind data
            #   'mean_cloud_cover', 'bright_sunshine', # cloud and sunshine data
              ]

# keep only the columns we need
df_weather = df_weather[cols]

# rename columns
df_weather.rename(columns={'mean_temp':'temp','mean_wind_speed':'wind_speed'}, inplace=True)

# adding electricity area to the dataframe DK1 is the price area for west Denmark, DK2 is for east Denmark https://energinet.dk/El/Systemydelser/Introduktion-til-Systemydelser/Oversigt-over-systemydelser/
if 'cellId' in df_weather.columns:
    df_weather['area'] = df_weather['cellId'].apply(lambda x: 'DK1' if int(x[-2:]) < 62 else 'DK2')
elif 'municipalityId' in df_weather.columns:
    df_weather['area'] = df_weather['municipalityId'].apply(lambda x: 'DK1' if x > 400 else 'DK2')

display(df_weather.head())

Unnamed: 0,municipalityName,municipalityId,from,to,coordinates,geometry_type,temp,wind_speed,mean_wind_dir,area
0,København,101,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[12.49390862, 55.7040906]",Point,3.1,10.0,250,DK2
1,Frederiksberg,147,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[12.52373306, 55.67936546]",Point,3.2,9.7,248,DK2
2,Ballerup,151,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[12.36840182, 55.72775072]",Point,2.9,10.6,258,DK2
3,Brøndby,153,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[12.40438199, 55.64503727]",Point,3.1,10.4,253,DK2
4,Dragør,155,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,"[12.65022813, 55.59380739]",Point,3.1,11.8,259,DK2


In [5]:
# create a fact table with information on municipalities
df_facts = df_weather[ids + ['area','coordinates','geometry_type']].drop_duplicates()

In [6]:
# count nan values in each column
print(df_weather.isna().sum()) 

municipalityName    0
municipalityId      0
from                0
to                  0
coordinates         0
geometry_type       0
temp                0
wind_speed          0
mean_wind_dir       0
area                0
dtype: int64


In [7]:
# controlling for values completely missing
control = df_weather[['from', 'to']].apply(pd.to_datetime).drop_duplicates()
control.set_index('from', inplace=True)

# Create a complete date range from the start to the end of your data
full_range = pd.date_range(start=control.index.min(), end=control.index.max(), freq='h')

# Reindex your DataFrame to this full range
df_reindexed = control.reindex(full_range)

# Identify missing hours
missing_hours = df_reindexed[df_reindexed.isnull().any(axis=1)]

print("Missing hours:")
print(missing_hours.index)

Missing hours:
DatetimeIndex([], dtype='datetime64[ns, UTC]', freq='h')


In [8]:
# adding missing hours
if missing_hours.shape[0] < 720:
    # ensure datetime format
    df_weather['from'] = pd.to_datetime(df_weather['from'])

    # original shape
    shape_original = df_weather.shape

    # get the min and max 'from' dates per ID
    min_max_dates = df_weather.groupby(ids[0])['from'].agg(['min', 'max']).reset_index()

    # for each ID, create a DataFrame with all hourly 'from' timestamps between min and max dates
    date_ranges = []
    for idx, row in min_max_dates.iterrows():
        id_value = row[ids[0]]
        start_date = row['min']
        end_date = row['max']
        date_range = pd.date_range(start=start_date, end=end_date, freq='h')
        df_id = pd.DataFrame({ids[0]: id_value, 'from': date_range})
        date_ranges.append(df_id)

    # concatenate all date ranges into a single DataFrame
    df_all_hours = pd.concat(date_ranges, ignore_index=True)

    # merge the original data with the full hourly data
    df_weather = pd.merge(df_all_hours, df_weather, on=[ids[0], 'from'], how='left')

    # create the 'to' column by adding one hour to 'from'
    df_weather['to'] = df_weather['from'] + pd.DateOffset(hours=1)

    # Optional: Sort the DataFrame
    df_weather.sort_values(by=['from', ids[0]], inplace=True)

    # reset index
    df_weather.reset_index(drop=True, inplace=True)

    # new shape
    shape_new = df_weather.shape

    print(f"Added {shape_new[0] - shape_original[0]} rows")

Added 0 rows


In [9]:
# adding the missing values
df_weather = pd.merge(df_weather, df_facts, on=ids[0], how='left')

# remove columns that end with '_x'
df_weather = df_weather.drop(columns=[col for col in df_weather.columns if col.endswith('_x')])

# rename columns that end with '_y' by removing the '_y' suffix
df_weather = df_weather.rename(columns=lambda x: x[:-2] if x.endswith('_y') else x)

In [10]:
# select only float columns for interpolation
float_columns = df_weather.select_dtypes(include=['float']).columns

# count NaN values across all float columns
nan_count = df_weather[float_columns].isna().sum().sum()
print(f'Total NaN values: {nan_count}')

# Check if NaN values exceed 10,000
if nan_count > 10000:
    # Create a boolean mask for rows with NaN values in any float column
    nan_mask = df_weather[float_columns].isna().any(axis=1)
    # Find the last NaN date
    last_nan_date = df_weather['from'][nan_mask].iloc[-1]
    # Drop all rows before the last NaN date
    df_weather = df_weather[df_weather['from'] > last_nan_date].copy()

    print(f'Dropped all rows before {last_nan_date}, none interpolated')

else:
    for idx in df_weather[ids[0]].unique():
        df_weather.loc[df_weather[ids[0]] == idx, float_columns] = df_weather.loc[df_weather[ids[0]] == idx, float_columns].interpolate()
    
    print(f'A total of {nan_count} NaN values were interpolated')

# check if na values are gone
print(f'Total NaN values left: {df_weather[float_columns].isna().sum().sum()}')

display(df_weather)

Total NaN values: 0
A total of 0 NaN values were interpolated
Total NaN values left: 0


Unnamed: 0,municipalityName,from,to,temp,wind_speed,mean_wind_dir,municipalityId,area,coordinates,geometry_type
0,Aabenraa,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,3.1,12.5,253,580,DK1,"[9.27705335, 54.95857106]",Point
1,Aalborg,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,3.8,12.9,284,851,DK1,"[9.99710026, 56.98086976]",Point
2,Aarhus,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,4.1,10.1,274,751,DK1,"[10.10360521, 56.15511031]",Point
3,Albertslund,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,2.9,11.1,257,165,DK2,"[12.35232083, 55.68497735]",Point
4,Allerød,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,2.7,11.1,261,201,DK2,"[12.31517796, 55.85193129]",Point
...,...,...,...,...,...,...,...,...,...,...
11955309,Vejle,2024-11-30 23:00:00+00:00,2024-12-01 00:00:00+00:00,4.3,3.0,185,630,DK1,"[9.36401747, 55.69959549]",Point
11955310,Vesthimmerlands,2024-11-30 23:00:00+00:00,2024-12-01 00:00:00+00:00,6.4,4.8,187,820,DK1,"[9.38374995, 56.81543648]",Point
11955311,Viborg,2024-11-30 23:00:00+00:00,2024-12-01 00:00:00+00:00,5.6,4.1,187,791,DK1,"[9.54980131, 56.47080654]",Point
11955312,Vordingborg,2024-11-30 23:00:00+00:00,2024-12-01 00:00:00+00:00,4.3,3.3,204,390,DK2,"[11.97379373, 55.06464257]",Point


In [11]:
df_weather['wind_dir'] = df_weather['mean_wind_dir'].apply(ap.degrees_to_cardinal)

display(df_weather.head())

Unnamed: 0,municipalityName,from,to,temp,wind_speed,mean_wind_dir,municipalityId,area,coordinates,geometry_type,wind_dir
0,Aabenraa,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,3.1,12.5,253,580,DK1,"[9.27705335, 54.95857106]",Point,W
1,Aalborg,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,3.8,12.9,284,851,DK1,"[9.99710026, 56.98086976]",Point,W
2,Aarhus,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,4.1,10.1,274,751,DK1,"[10.10360521, 56.15511031]",Point,W
3,Albertslund,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,2.9,11.1,257,165,DK2,"[12.35232083, 55.68497735]",Point,W
4,Allerød,2010-12-31 23:00:00+00:00,2011-01-01 00:00:00+00:00,2.7,11.1,261,201,DK2,"[12.31517796, 55.85193129]",Point,W


In [12]:
ap.plot_map(df_facts,id=ids[0],color_by='area', save='map.png', do_print=False)

Map saved as: output/map.png


In [13]:
# removing dk2 as we will only use dk1
df_weather_dk1 = df_weather[df_weather['area'] == 'DK1'].drop(columns=['area','coordinates','mean_wind_dir'], axis=1)

# removing data from before April 1st 2016
df_weather_dk1 = df_weather_dk1[df_weather_dk1['from'] >= '2017-10-23']

# create wind direction dummies
df_weather_dk1 = pd.get_dummies(df_weather_dk1, columns=['wind_dir'], prefix='wind_dir')

# reset index
df_weather_dk1.reset_index(drop=True, inplace=True)

In [14]:
# print info
print(df_weather_dk1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3239808 entries, 0 to 3239807
Data columns (total 15 columns):
 #   Column            Dtype              
---  ------            -----              
 0   municipalityName  object             
 1   from              datetime64[ns, UTC]
 2   to                datetime64[ns, UTC]
 3   temp              float64            
 4   wind_speed        float64            
 5   municipalityId    int64              
 6   geometry_type     object             
 7   wind_dir_E        bool               
 8   wind_dir_N        bool               
 9   wind_dir_NE       bool               
 10  wind_dir_NW       bool               
 11  wind_dir_S        bool               
 12  wind_dir_SE       bool               
 13  wind_dir_SW       bool               
 14  wind_dir_W        bool               
dtypes: bool(8), datetime64[ns, UTC](2), float64(2), int64(1), object(2)
memory usage: 197.7+ MB
None


In [15]:
plt.figure(figsize=(14, 8))
for municipality, group_data in df_weather_dk1.groupby('municipalityName'):
    plt.plot(group_data['from'], group_data['temp'], label=municipality, alpha=0.8)

# add vertical line at august 1st 2024
plt.axvline(x=pd.to_datetime('2024-08-01 00:00:00'), color='black', linestyle='--', linewidth=2)

# adding titles and labels
plt.xlabel('Date', fontsize=26)
plt.ylabel('Mean Temperature (°C)', fontsize=26)
plt.tick_params(axis='both', which='major', labelsize=24)
plt.grid(True)
plt.tight_layout()

plt.savefig('output/mean_temp.png')
plt.close()

In [16]:
# print figure of mean wind speed data for DK1
plt.figure(figsize=(14, 8))
for municipality, group_data in df_weather_dk1.groupby('municipalityName'):
    plt.plot(group_data['from'], group_data['wind_speed'], label=municipality, alpha=0.8)

# add vertical line at august 1st 2024
plt.axvline(x=pd.to_datetime('2024-08-01 00:00:00'), color='black', linestyle='--', linewidth=2)

# adding titles and labels
plt.xlabel('Date', fontsize=26)
plt.ylabel('Mean Wind Speed (m/s)', fontsize=26)
plt.tick_params(axis='both', which='major', labelsize=24)
plt.grid(True)
plt.tight_layout()

plt.savefig('output/mean_wind_speed.png')
plt.close()

In [17]:
df_wide_dk1 = df_weather_dk1.pivot_table(index=['from','to'], columns=ids[0], 
                                             values=['temp', 'wind_speed',
                                                     'wind_dir_N', 'wind_dir_NE', 'wind_dir_E', 
                                                     'wind_dir_SE', 'wind_dir_S', 'wind_dir_SW',
                                                      'wind_dir_W'], # 'wind_dir_NW' is dropped as it is the reference category 
                                             aggfunc='first').reset_index()

# column names
df_wide_dk1.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in df_wide_dk1.columns]

# rename columns
df_wide_dk1.rename(columns={'from_': 'from', 'to_': 'to'}, inplace=True)

df_wide_dk1

Unnamed: 0,from,to,temp_Aabenraa,temp_Aalborg,temp_Aarhus,temp_Assens,temp_Billund,temp_Brønderslev,temp_Esbjerg,temp_Faaborg-Midtfyn,...,wind_speed_Syddjurs,wind_speed_Sønderborg,wind_speed_Thisted,wind_speed_Tønder,wind_speed_Varde,wind_speed_Vejen,wind_speed_Vejle,wind_speed_Vesthimmerlands,wind_speed_Viborg,wind_speed_Ærø
0,2017-10-23 00:00:00+00:00,2017-10-23 01:00:00+00:00,9.3,8.9,8.3,10.6,9.5,9.1,8.9,10.5,...,1.6,3.9,3.5,2.6,2.5,1.7,1.4,2.7,1.7,4.9
1,2017-10-23 01:00:00+00:00,2017-10-23 02:00:00+00:00,9.3,8.8,7.9,10.5,9.5,9.1,8.8,10.3,...,1.5,4.0,4.3,2.9,2.8,1.8,1.6,3.2,1.9,4.4
2,2017-10-23 02:00:00+00:00,2017-10-23 03:00:00+00:00,9.8,8.7,7.6,10.3,9.3,9.0,9.1,10.1,...,2.0,3.5,4.5,2.8,2.9,1.8,1.8,3.5,2.0,4.6
3,2017-10-23 03:00:00+00:00,2017-10-23 04:00:00+00:00,9.7,8.7,8.3,10.2,9.3,9.1,9.2,10.1,...,2.1,3.0,5.2,2.6,2.7,2.2,1.7,3.9,2.4,3.8
4,2017-10-23 04:00:00+00:00,2017-10-23 05:00:00+00:00,9.9,8.7,8.7,10.2,9.3,8.8,9.2,10.1,...,1.9,3.1,4.8,2.9,3.2,2.3,2.2,3.5,2.3,3.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62299,2024-11-30 19:00:00+00:00,2024-11-30 20:00:00+00:00,3.0,5.8,3.4,3.7,4.3,5.7,5.0,3.9,...,4.6,3.8,6.4,3.4,4.6,3.6,3.2,4.7,4.4,4.7
62300,2024-11-30 20:00:00+00:00,2024-11-30 21:00:00+00:00,3.1,6.0,3.4,3.7,4.4,6.0,5.2,3.7,...,4.3,3.0,6.2,3.2,4.2,3.7,3.4,4.8,4.4,4.1
62301,2024-11-30 21:00:00+00:00,2024-11-30 22:00:00+00:00,3.4,6.3,3.4,3.6,4.4,6.2,5.2,3.2,...,3.6,2.9,5.8,3.2,4.5,3.3,3.0,4.8,4.4,3.8
62302,2024-11-30 22:00:00+00:00,2024-11-30 23:00:00+00:00,3.4,6.4,3.6,3.8,4.5,6.4,5.3,3.2,...,3.5,2.7,5.7,3.2,4.7,2.9,2.7,4.5,4.0,3.9


## Energinet data

--------
https://www.energidataservice.dk/tso-electricity/Elspotprices


In [18]:
# find the first date
start_date = df_wide_dk1['from'].min()
end_date = df_wide_dk1['to'].max()

# convert to YYYY-MM-DDTHH:MM string
start = start_date.strftime('%Y-%m-%dT%H:%M')
end = end_date.strftime('%Y-%m-%dT%H:%M')

# define url and parameters
base_url = 'https://api.energidataservice.dk/dataset/Elspotprices'
params = {
    'offset': 0,
    'start': start,
    'filter': '{"PriceArea":["DK1"]}' # DK1 is the price area for west Denmark, DK2 is for east Denmark https://energinet.dk/El/Systemydelser/Introduktion-til-Systemydelser/Oversigt-over-systemydelser/
    }

In [19]:
# download the data
df_energy = ap.get_energydata(url=base_url, params=params)

# drop the HourDK column as dmi data is in UTC
df_energy.drop(columns=['HourDK', 'PriceArea', 'SpotPriceEUR'], inplace=True)

# convert to datetime
df_energy['HourUTC'] = pd.to_datetime(df_energy['HourUTC']).dt.tz_localize('UTC')
# remove data after end_date
df_energy = df_energy[df_energy['HourUTC'] < end_date]
df_energy


Unnamed: 0,HourUTC,SpotPriceDKK
311,2024-11-30 23:00:00+00:00,710.099976
312,2024-11-30 22:00:00+00:00,795.539978
313,2024-11-30 21:00:00+00:00,856.030029
314,2024-11-30 20:00:00+00:00,836.640015
315,2024-11-30 19:00:00+00:00,868.260010
...,...,...
62612,2017-10-23 02:00:00+00:00,165.399994
62613,2017-10-23 01:00:00+00:00,185.279999
62614,2017-10-23 00:00:00+00:00,201.580002
62615,2017-10-22 23:00:00+00:00,207.389999


In [20]:
# print figure of mean wind speed data for DK1
plt.figure(figsize=(14, 8))
plt.plot(df_energy['HourUTC'], df_energy['SpotPriceDKK'])

# add vertical line at august 1st 2024
plt.axvline(x=pd.to_datetime('2024-08-01 00:00:00'), color='black', linestyle='--', linewidth=2)

# Adding titles and labels
plt.xlabel('Date', fontsize=26)
plt.ylabel('Spot Price (DKK per MWh)', fontsize=26)
plt.tick_params(axis='both', which='major', labelsize=24)
plt.grid(True)
plt.tight_layout()

plt.savefig('output/spot_price_dkk.png')
# plt.show()
plt.close()

## Import oil and gas prices from Yahoo Finance


--------
https://finance.yahoo.com/quote/TTF=F/

https://finance.yahoo.com/quote/BZ=F/

In [21]:
# define tickers and time range
tickers = ["BZ=F", "TTF=F"]
start_date = "2017-01-01"
end_date = "2024-12-01"

# Fetch historical data
data = {}
for ticker in tickers:
    data[ticker] = yf.download(ticker, start=start_date, end=end_date)

# Combine dataframes and align dates
df_comodities = pd.DataFrame()
for ticker, df in data.items():
    column_name = "brent_crude" if ticker == "BZ=F" else "TTF_natural_gas"
    df = df["Close"].rename(column_name)  # Keep only the Close price and rename the column
    df_comodities = pd.concat([df_comodities, df], axis=1)

# Expand to include all dates and fill missing values
df_comodities.index = pd.to_datetime(df_comodities.index)
df_comodities = df_comodities.asfreq('D')  # Expand to daily frequency
df_comodities = df_comodities.ffill()  # Forward fill only
df_comodities.dropna(inplace=True)  # Drop any remaining NaN values

# Display the resulting dataframe
display(df_comodities.head())



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Unnamed: 0,brent_crude,TTF_natural_gas
2017-10-23,57.369999,18.09
2017-10-24,58.330002,17.959999
2017-10-25,58.439999,18.110001
2017-10-26,59.299999,18.07
2017-10-27,60.439999,18.15


In [22]:
# Plot Brent Crude
plt.figure(figsize=(14, 8))
plt.plot(df_comodities.index, df_comodities['brent_crude'], label='Brent Crude')
plt.axvline(x=pd.to_datetime('2024-08-01'), color='black', linestyle='--', linewidth=2)
plt.xlabel('Date', fontsize=26)
plt.ylabel('Spot Price (USD per Barrel)', fontsize=26)
plt.title('Brent Crude Oil Prices', fontsize=28)
plt.tick_params(axis='both', which='major', labelsize=24)
plt.grid(True)
plt.tight_layout()
plt.savefig('output/brent_crude_prices.png')
plt.close()

In [23]:
# Plot Natural Gas
plt.figure(figsize=(14, 8))
plt.plot(df_comodities.index, df_comodities['TTF_natural_gas'], label='TTF Natural Gas', color='orange')
plt.axvline(x=pd.to_datetime('2024-08-01'), color='black', linestyle='--', linewidth=2)
plt.xlabel('Date', fontsize=26)
plt.ylabel('Spot Price (EUR per MWh)', fontsize=26)
plt.title('TTF Natural Gas Prices', fontsize=28)
plt.tick_params(axis='both', which='major', labelsize=24)
plt.grid(True)
plt.tight_layout()
plt.savefig('output/ttf_natural_gas_prices.png')
plt.close()

## Combine datasets



In [24]:
# join on from and HourUTC
df = pd.merge(df_energy, df_wide_dk1, left_on=['HourUTC'], right_on=['from'], how='inner').drop(columns=['HourUTC'], axis=1)

display(df)

Unnamed: 0,SpotPriceDKK,from,to,temp_Aabenraa,temp_Aalborg,temp_Aarhus,temp_Assens,temp_Billund,temp_Brønderslev,temp_Esbjerg,...,wind_speed_Syddjurs,wind_speed_Sønderborg,wind_speed_Thisted,wind_speed_Tønder,wind_speed_Varde,wind_speed_Vejen,wind_speed_Vejle,wind_speed_Vesthimmerlands,wind_speed_Viborg,wind_speed_Ærø
0,710.099976,2024-11-30 23:00:00+00:00,2024-12-01 00:00:00+00:00,3.6,6.4,3.7,4.1,4.6,6.4,5.5,...,3.3,3.3,6.0,3.3,4.6,3.1,3.0,4.8,4.1,3.8
1,795.539978,2024-11-30 22:00:00+00:00,2024-11-30 23:00:00+00:00,3.4,6.4,3.6,3.8,4.5,6.4,5.3,...,3.5,2.7,5.7,3.2,4.7,2.9,2.7,4.5,4.0,3.9
2,856.030029,2024-11-30 21:00:00+00:00,2024-11-30 22:00:00+00:00,3.4,6.3,3.4,3.6,4.4,6.2,5.2,...,3.6,2.9,5.8,3.2,4.5,3.3,3.0,4.8,4.4,3.8
3,836.640015,2024-11-30 20:00:00+00:00,2024-11-30 21:00:00+00:00,3.1,6.0,3.4,3.7,4.4,6.0,5.2,...,4.3,3.0,6.2,3.2,4.2,3.7,3.4,4.8,4.4,4.1
4,868.260010,2024-11-30 19:00:00+00:00,2024-11-30 20:00:00+00:00,3.0,5.8,3.4,3.7,4.3,5.7,5.0,...,4.6,3.8,6.4,3.4,4.6,3.6,3.2,4.7,4.4,4.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62299,284.950012,2017-10-23 04:00:00+00:00,2017-10-23 05:00:00+00:00,9.9,8.7,8.7,10.2,9.3,8.8,9.2,...,1.9,3.1,4.8,2.9,3.2,2.3,2.2,3.5,2.3,3.1
62300,220.339996,2017-10-23 03:00:00+00:00,2017-10-23 04:00:00+00:00,9.7,8.7,8.3,10.2,9.3,9.1,9.2,...,2.1,3.0,5.2,2.6,2.7,2.2,1.7,3.9,2.4,3.8
62301,165.399994,2017-10-23 02:00:00+00:00,2017-10-23 03:00:00+00:00,9.8,8.7,7.6,10.3,9.3,9.0,9.1,...,2.0,3.5,4.5,2.8,2.9,1.8,1.8,3.5,2.0,4.6
62302,185.279999,2017-10-23 01:00:00+00:00,2017-10-23 02:00:00+00:00,9.3,8.8,7.9,10.5,9.5,9.1,8.8,...,1.5,4.0,4.3,2.9,2.8,1.8,1.6,3.2,1.9,4.4


In [25]:
df['date'] = df['from'].dt.date
df_comodities['date'] = df_comodities.index.date

# merge datasets
df_merged = pd.merge(df, df_comodities, on='date', how='inner')

# drop the date column
df_merged.drop(columns=['date'], inplace=True)
df_merged.reset_index(drop=True, inplace=True)

# add constant
df_merged['constant'] = 1

# Display the resulting merged dataframe
display(df_merged)


Unnamed: 0,SpotPriceDKK,from,to,temp_Aabenraa,temp_Aalborg,temp_Aarhus,temp_Assens,temp_Billund,temp_Brønderslev,temp_Esbjerg,...,wind_speed_Tønder,wind_speed_Varde,wind_speed_Vejen,wind_speed_Vejle,wind_speed_Vesthimmerlands,wind_speed_Viborg,wind_speed_Ærø,brent_crude,TTF_natural_gas,constant
0,596.570007,2024-11-29 23:00:00+00:00,2024-11-30 00:00:00+00:00,2.8,5.5,3.3,4.3,4.2,5.7,4.8,...,4.2,6.0,3.6,3.6,5.8,4.6,5.7,72.940002,47.811001,1
1,770.270020,2024-11-29 22:00:00+00:00,2024-11-29 23:00:00+00:00,3.1,6.0,3.4,4.5,4.2,6.2,5.1,...,4.0,6.0,3.4,3.5,5.9,4.5,5.9,72.940002,47.811001,1
2,848.200012,2024-11-29 21:00:00+00:00,2024-11-29 22:00:00+00:00,3.3,6.2,4.1,4.5,3.9,6.4,5.2,...,4.1,5.6,3.2,3.0,5.4,4.3,5.7,72.940002,47.811001,1
3,836.049988,2024-11-29 20:00:00+00:00,2024-11-29 21:00:00+00:00,3.1,6.5,4.3,4.6,4.2,6.6,5.1,...,3.8,5.5,3.2,3.0,5.3,4.2,5.5,72.940002,47.811001,1
4,894.219971,2024-11-29 19:00:00+00:00,2024-11-29 20:00:00+00:00,3.1,6.6,4.7,4.7,4.8,6.6,5.3,...,4.0,5.5,3.7,3.4,5.7,4.2,5.3,72.940002,47.811001,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62275,284.950012,2017-10-23 04:00:00+00:00,2017-10-23 05:00:00+00:00,9.9,8.7,8.7,10.2,9.3,8.8,9.2,...,2.9,3.2,2.3,2.2,3.5,2.3,3.1,57.369999,18.090000,1
62276,220.339996,2017-10-23 03:00:00+00:00,2017-10-23 04:00:00+00:00,9.7,8.7,8.3,10.2,9.3,9.1,9.2,...,2.6,2.7,2.2,1.7,3.9,2.4,3.8,57.369999,18.090000,1
62277,165.399994,2017-10-23 02:00:00+00:00,2017-10-23 03:00:00+00:00,9.8,8.7,7.6,10.3,9.3,9.0,9.1,...,2.8,2.9,1.8,1.8,3.5,2.0,4.6,57.369999,18.090000,1
62278,185.279999,2017-10-23 01:00:00+00:00,2017-10-23 02:00:00+00:00,9.3,8.8,7.9,10.5,9.5,9.1,8.8,...,2.9,2.8,1.8,1.6,3.2,1.9,4.4,57.369999,18.090000,1


In [26]:
print(df_merged.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62280 entries, 0 to 62279
Columns: 474 entries, SpotPriceDKK to constant
dtypes: bool(364), datetime64[ns, UTC](2), float64(107), int64(1)
memory usage: 73.9 MB
None


In [27]:
# save the data
df_merged.to_csv(path_out, index=False)
print(f'Data saved to {path_out}')

Data saved to ../../data/fulldata.csv
