# Weather data

### Read HDF5 file, convert to pandas format, concat data for 2018-2020

This file contains the code to

1) Read in the weather data in HDF5 format, with each year stored in a separate file, and convert the data to a Python dictionary containing the weather data over the available time span

2) Some data exploration for the weather data

3. Merge all weather data features into one dataframe with continuous 15 min timestamps

4. Some more data exploration containing

 - the visualization of each parameter

- a correlation analysis of the parameters, concluding that only 8 of 10 parameters are relevant for further use

(5. Additional code used to check code functionality and data quality)

-------------

#### Imports

In [None]:
import h5py
import pandas as pd
import numpy as np
import pickle 
from datetime import datetime
import math
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.mode.chained_assignment = None 

#### Functions to convert data

In [None]:
def hdf_to_pandas(hdf_dataset):
    """Convert a compound (structured) HDF5 dataset into a pandas DataFrame.

    Args:
        hdf_dataset: Object exposing a structured ``dtype`` and supporting
            ``hdf_dataset[:]`` (e.g. a 1-D h5py dataset or a NumPy structured
            array).

    Returns:
        pandas.DataFrame with one column per dtype field (in field order) and
        one row per record.
    """
    # dtype.names lists exactly the field names in storage order; dtype.fields
    # additionally contains field *titles* as keys, which would yield more
    # column names than there are values per record.
    column_list = list(hdf_dataset.dtype.names)

    # Read all records in a single slice instead of one read per row.
    # tolist() turns every record into a tuple of plain Python values.
    list_of_rows = hdf_dataset[:].tolist()

    return pd.DataFrame(data=list_of_rows, columns=column_list)

def first_n_digits(num, n):
    """Return the ``n`` leading decimal digits of a non-negative integer.

    The digit count is taken from the decimal string representation, because
    ``int(math.log(num, 10))`` is off by one for some exact powers of ten
    (``math.log(1000, 10)`` evaluates to 2.9999999999999996) and is undefined
    for 0.

    Args:
        num: Non-negative integer to truncate.
        n: Number of leading digits to keep.

    Returns:
        ``num`` reduced to its first ``n`` digits. ``num`` is returned
        unchanged when it has ``n`` digits or fewer.

    Raises:
        ValueError: If ``num`` is negative.
    """
    if num < 0:
        raise ValueError("first_n_digits expects a non-negative number")

    digit_count = len(str(int(num)))
    if digit_count <= n:
        # Nothing to cut off; avoids a negative exponent (float division).
        return num
    return num // 10 ** (digit_count - n)

-------------

### 1. Read in hdf5 data and convert to pandas format

#### weather data for 2018 to one dictionary

In [None]:
# Load every 2018 weather parameter table into a dict of DataFrames keyed by
# parameter name. The file is opened in a context manager so the HDF5 handle
# is closed once all tables have been copied into memory.
weather_dict_2018 = {}
with h5py.File('Data/HDF5data/weather/2018_weather.hdf5', 'r') as file:
    dset_weather = file["WEATHER_SERVICE"]["IN"]

    for key in dset_weather:
        df_variable = dset_weather[key]['table']
        weather_dict_2018[key] = hdf_to_pandas(df_variable)

        # Keep only the 10 leading digits of each raw time stamp; later cells
        # treat the shortened value as seconds (900 s windows, unit='s').
        weather_dict_2018[key]["index"] = weather_dict_2018[key]["index"].apply(lambda x: first_n_digits(x, 10))

#### weather data for 2019 to one dictionary

In [None]:
# Load every 2019 weather parameter table into a dict of DataFrames keyed by
# parameter name. The file is opened in a context manager so the HDF5 handle
# is closed once all tables have been copied into memory.
weather_dict_2019 = {}
with h5py.File('Data/HDF5data/weather/2019_weather.hdf5', 'r') as file:
    dset_weather = file["WEATHER_SERVICE"]["IN"]

    for key in dset_weather:
        df_variable = dset_weather[key]['table']
        weather_dict_2019[key] = hdf_to_pandas(df_variable)

        # Keep only the 10 leading digits of each raw time stamp; later cells
        # treat the shortened value as seconds (900 s windows, unit='s').
        weather_dict_2019[key]["index"] = weather_dict_2019[key]["index"].apply(lambda x: first_n_digits(x, 10))

#### weather data for 2020 to one dictionary

In [None]:
# Load every 2020 weather parameter table into a dict of DataFrames keyed by
# parameter name. The file is opened in a context manager so the HDF5 handle
# is closed once all tables have been copied into memory.
weather_dict_2020 = {}
with h5py.File('Data/HDF5data/weather/2020_weather.hdf5', 'r') as file:
    dset_weather = file["WEATHER_SERVICE"]["IN"]

    for key in dset_weather:
        df_variable = dset_weather[key]['table']
        weather_dict_2020[key] = hdf_to_pandas(df_variable)

        # Keep only the 10 leading digits of each raw time stamp; later cells
        # treat the shortened value as seconds (900 s windows, unit='s').
        weather_dict_2020[key]["index"] = weather_dict_2020[key]["index"].apply(lambda x: first_n_digits(x, 10))

#### concat weather data, 2018-2020 for each parameter in one dataframe

In [None]:
# For every parameter, stack the yearly frames in chronological order
# (2018, 2019, 2020) into a single frame.
yearly_dicts = (weather_dict_2018, weather_dict_2019, weather_dict_2020)
weather_dict = {
    parameter: pd.concat([year_dict[parameter] for year_dict in yearly_dicts])
    for parameter in weather_dict_2018
}

#### save to pickle file

In [None]:
# Write the combined per-parameter frames to disk.
with open('Data/weather/data_weather.pkl', 'wb') as pickle_file:
    pickle.dump(weather_dict, pickle_file)

#### read saved file

In [None]:
# Restore the combined per-parameter frames from disk.
# NOTE: pickle.load executes arbitrary code; only open files you created yourself.
with open('Data/weather/data_weather.pkl', 'rb') as pickle_file:
    weather_dict = pickle.load(pickle_file)

______________________________

### 2. Raw data exploration

Number of available records for each feature

In [None]:
# Print "<parameter name> <row count>" for every weather parameter.
for parameter, frame in weather_dict.items():
    print(f"{parameter!s} {len(frame)}")

time resolution for temperature

In [None]:
# Show the first five rows of the 2019 temperature table.
parameter = 'WEATHER_TEMPERATURE_TOTAL'
weather_dict_2019[parameter].head(n=5)

In [None]:
# Distance between each time stamp and its predecessor, followed by a
# frequency table of the distances that occur.
temperature_2019 = weather_dict_2019[parameter]
timestamps = temperature_2019['index']
temperature_2019['time_difference'] = timestamps - timestamps.shift(1)
temperature_2019['time_difference'].value_counts()

-> No standardized time stamps

______________________________

### 3. Merge weather data

Get load data index as reference 

In [None]:
# The 'index' column of the 'SFH10' entry of the heat pump data serves as the
# reference time grid for the merge below.
with open('Data/heatpump/data_heatpump.pkl', 'rb') as heatpump_file:
    load_dict = pickle.load(heatpump_file)

ref_index = load_dict['SFH10']['index']

In [None]:
# Resample every weather parameter onto the reference time stamps: each
# reference stamp gets the mean of all readings whose time stamp lies in
# [stamp, stamp + 900]; when that window holds no reading, the previously
# computed value is carried forward.
df_list = []
for df_type in weather_dict:
    df_temp = weather_dict[df_type]
    timestamps = df_temp['index']

    resampled = []
    # Stays NaN until the first window that contains data. For reference stamps
    # spaced exactly 900 apart this equals looking up the value at index - 900,
    # but it does not raise KeyError when the very first window is empty.
    last_value = np.nan
    for index in ref_index:
        sub_df = df_temp[(timestamps >= index) & (timestamps <= index + 900)]
        if not sub_df.empty:
            # take mean value of the second column
            # (presumably the measurement column - TODO confirm)
            last_value = sub_df.iloc[:, 1].mean()
        # otherwise: keep the previous value
        resampled.append(last_value)

    # Assign the whole column at once rather than cell by cell through chained
    # indexing (df.loc[i][col] = ...), which may write to a temporary copy and
    # leave the frame unchanged.
    df_ref = ref_index.to_frame().set_index('index')
    df_ref[df_type] = resampled
    df_list.append(df_ref)

weather_data = pd.concat(df_list, axis=1)
with open('Data/weather/data_weather_merged.pkl', 'wb') as f:
    pickle.dump(weather_data, f)

In [None]:
# Restore the merged weather frame from disk.
with open('Data/weather/data_weather_merged.pkl', 'rb') as merged_file:
    weather_data = pickle.load(merged_file)

______________________________

### 4. Weather data exploration

In [None]:
# Summary table: one column per weather parameter, one row per statistic.
df_analysis = pd.DataFrame(columns=weather_data.columns, index=['min', 'max', 'mean', 'median', 'missing values'])
for column in weather_data.columns:
    series = weather_data[column]
    # Use a single .loc[row, column] call: chained indexing
    # (df.loc[row][column] = ...) may write to a temporary copy.
    df_analysis.loc['min', column] = series.min()
    df_analysis.loc['max', column] = series.max()
    df_analysis.loc['mean', column] = series.mean()
    df_analysis.loc['median', column] = series.median()
    # Number of NaN entries in the column.
    df_analysis.loc['missing values', column] = series.isna().sum()
df_analysis

In [None]:
# Turn the index (interpreted as seconds via unit='s') into datetimes on a
# separate frame, leaving weather_data untouched.
data_plots = (
    weather_data
    .reset_index()
    .assign(index=lambda frame: pd.to_datetime(frame['index'], unit='s'))
    .set_index('index')
)

# One panel per column on a 5 x 2 grid of axes.
fig, axes_grid = plt.subplots(nrows=5, ncols=2, figsize=(20, 20), tight_layout=True)
data_plots.plot(subplots=True, ax=axes_grid, rot=60)

Korrelation zwischen den einzelnen Wetterparametern

In [None]:
# German display labels for the heatmap axes, keyed by the original column name.
columns_dict = {
    'WEATHER_APPARENT_TEMPERATURE_TOTAL': 'Scheintemperatur',
    'WEATHER_ATMOSPHERIC_PRESSURE_TOTAL': 'Luftdruck',
    'WEATHER_PRECIPITATION_RATE_TOTAL': 'Niederschlag',
    'WEATHER_PROBABILITY_OF_PRECIPITATION_TOTAL': 'Niederschlagswahrscheinlichkeit',
    'WEATHER_RELATIVE_HUMIDITY_TOTAL': 'Relative Luftfeuchtigkeit',
    'WEATHER_SOLAR_IRRADIANCE_GLOBAL': 'Sonneneinstrahlung',
    'WEATHER_TEMPERATURE_TOTAL': 'Temperatur',
    'WEATHER_WIND_DIRECTION_TOTAL': 'Windrichtung',
    'WEATHER_WIND_GUST_SPEED_TOTAL': 'Windböenstärke',
    'WEATHER_WIND_SPEED_TOTAL': 'Windgeschwindigkeit',
}

# Pairwise correlation of all parameters, drawn as an annotated heatmap.
labelled_weather = weather_data.rename(columns=columns_dict)
correlation_matrix = labelled_weather.corr()

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Korrelationsmatrix')
plt.show()


-> Entfernen der Scheintemperatur sowie der Windböenstärke, da diese von der Absoluttemperatur sowie der Windgeschwindigkeit bereits gut erfasst werden

In [None]:
# Drop the two redundant parameters and store the reduced frame.
# drop() must not be called with inplace=True here: in that mode it returns
# None, so None would be pickled instead of the reduced data.
# weather_data itself keeps all columns.
reduced_weather_data = weather_data.drop(columns=['WEATHER_APPARENT_TEMPERATURE_TOTAL', 'WEATHER_WIND_GUST_SPEED_TOTAL'])
with open('Data/weather/data_weather_v1.pkl', 'wb') as f:
    pickle.dump(reduced_weather_data, f)