# Data Exploration

This notebook contains code for

1. Processing the WPUQ data: loading heat pump and weather data

2. Exploration of the heat pump load data: data availability, analysis of missing intervals, time series plots, aggregated load and correlation, influence of day type, operation mode, nominal capacity of heat pumps

3. Exploration of the weather data: statistical distribution, time series plots, correlation

4. Exploration of additional information: building information

To keep this notebook readable, some functions have been moved to the `utils/` package.

____

### Imports

In [2]:
import pandas as pd
import numpy as np
import pickle
import math
import h5py

import plotly.graph_objs as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt

____

## 1. Processing WPUQ Data

In [3]:
def hdf_to_pandas(hdf_dataset):
    '''
    Convert an HDF5 compound ("table") dataset into a pandas DataFrame.

    input:  hdf table - any object that exposes ``dtype.fields`` and supports
            reading all records with ``[()]`` (e.g. an h5py dataset with a
            compound dtype or a NumPy structured array)
    output: pandas DataFrame with one column per field of the compound dtype
            (in field order) and one row per record
    '''
    # The field names of the compound dtype become the column names.
    column_list = list(hdf_dataset.dtype.fields)

    # Read the complete table with one call instead of one read per row;
    # row-wise access is extremely slow for tables with ~100k records.
    # ``tolist()`` turns every record into a tuple of plain Python scalars.
    records = np.atleast_1d(np.asarray(hdf_dataset[()]))
    list_of_rows = records.tolist()

    return pd.DataFrame(data=list_of_rows, columns=column_list)

def first_n_digits(num, n):
    """
    Return the leading ``n`` decimal digits of the integer ``num``.

    The digit count is taken from the decimal string representation instead
    of ``math.log``: the floating-point logarithm is inexact at powers of ten
    (``math.log(1000, 10)`` is slightly below 3), which produced one digit too
    many for such inputs.

    Parameters:
    - num: integer-like value (Python int or NumPy integer).
    - n (int): number of leading digits to keep.

    Returns:
    - int: ``num`` truncated to its first ``n`` digits. Values that already
      have ``n`` or fewer digits (including 0) are returned unchanged; the
      sign of negative values is preserved.
    """
    value = int(num)
    magnitude = abs(value)
    excess_digits = len(str(magnitude)) - n
    if excess_digits <= 0:
        # Nothing to cut off.
        return value
    truncated = magnitude // 10 ** excess_digits
    return -truncated if value < 0 else truncated

### 1.1 Load heat pump data

Read in the heat pump load data for 36 houses in HDF5 format (each year is stored in a separate file) and convert it to a Python dictionary containing the load data of each house over the available time span.

In [3]:
# Input directories used below for the raw WPUQ HDF5 files (heat pump load and weather data)
load_path_WPUQ = "data/WPUQ/heatpumps"
weather_path_WPUQ = "data/WPUQ/weather"
# Output directories: pickles of the concatenated raw data, and cleaned data sets
path_concat = "data/raw"
path_cleaned = "data/cleaned"

In [None]:
HEATPUMP_YEARS = (2018, 2019, 2020)
# Row count every house is checked against after concatenation. It equals the
# number of 15-minute intervals in 2018-2020 (35040 + 35040 + 35136).
EXPECTED_ROWS = 105216


def load_heatpump_year(year):
    """
    Read the heat pump tables of all houses for one year.

    Parameters:
    - year (int): year contained in the file name ``<year>_data_15min.hdf5``.

    Returns:
    - dict: house key -> DataFrame of that house's ``HEATPUMP`` table. Houses
      from the ``NO_PV`` group are read first, then those from ``WITH_PV``.
    """
    frames = {}
    # The context manager closes the HDF5 file again once all tables are converted.
    with h5py.File(f'{load_path_WPUQ}/{year}_data_15min.hdf5', 'r') as hdf_file:
        for group_name in ('NO_PV', 'WITH_PV'):
            houses = hdf_file[group_name]
            for key in houses.keys():
                frames[key] = hdf_to_pandas(houses[key]["HEATPUMP"]['table'])
    return frames


heatpump_by_year = {year: load_heatpump_year(year) for year in HEATPUMP_YEARS}

# concat: one DataFrame per house covering all years (houses are taken from the last year)
df_dict = {}
for key_house in heatpump_by_year[HEATPUMP_YEARS[-1]]:
    df_dict[key_house] = pd.concat(
        [heatpump_by_year[year][key_house] for year in HEATPUMP_YEARS]
    )

# Report houses whose concatenated table does not have the expected length
for key_house in df_dict:
    if len(df_dict[key_house]) != EXPECTED_ROWS:
        print("issue with " + str(key_house))

print("data for {} houses".format(len(df_dict)))

with open(f'{path_concat}/data_heatpump.pkl', 'wb') as f:
    pickle.dump(df_dict, f)

### 1.2 Load Weather Data

In [None]:
WEATHER_YEARS = (2018, 2019, 2020)


def load_weather_year(year):
    """
    Read all weather parameter tables for one year.

    Parameters:
    - year (int): year contained in the file name ``<year>_weather.hdf5``.

    Returns:
    - dict: weather parameter name -> DataFrame of that parameter's table, with
      the ``index`` column truncated to its first 10 digits.
    """
    tables = {}
    # NOTE(review): weather_path_WPUQ already ends in "weather", so this resolves to
    # ".../weather/weather/<year>_weather.hdf5" - confirm against the data layout.
    with h5py.File(f'{weather_path_WPUQ}/weather/{year}_weather.hdf5', 'r') as hdf_file:
        dset_weather = hdf_file["WEATHER_SERVICE"]["IN"]
        for key in dset_weather:
            df_variable = hdf_to_pandas(dset_weather[key]['table'])
            # Keep only the first 10 digits of the timestamp (presumably converting a
            # nanosecond timestamp to seconds, like the heat pump index - confirm).
            df_variable["index"] = df_variable["index"].apply(lambda x: first_n_digits(x, 10))
            tables[key] = df_variable
    return tables


weather_by_year = {year: load_weather_year(year) for year in WEATHER_YEARS}

# One DataFrame per weather parameter covering all years (parameters are taken from the first year)
weather_dict = {}
for parameter in weather_by_year[WEATHER_YEARS[0]]:
    weather_dict[parameter] = pd.concat(
        [weather_by_year[year][parameter] for year in WEATHER_YEARS]
    )

with open(f'{path_concat}/data_weather.pkl', 'wb') as f:
    pickle.dump(weather_dict, f)

Merge weather data - create consistent index

In [None]:
def merge_weather_on_index(weather_dict, ref_index, window=900):
    """
    Resample every weather parameter onto a common reference time index.

    For each reference timestamp ``t`` the mean of all measurements with
    ``t <= index <= t + window`` is used. If no measurement falls into that
    window, the value of the previous reference timestamp is carried forward
    (NaN if there is no previous value).

    Parameters:
    - weather_dict (dict): parameter name -> DataFrame with an ``index`` column
      and the measured value in the second column.
    - ref_index: iterable of reference timestamps, in ascending order.
    - window (int): window length, in the same unit as the timestamps.

    Returns:
    - pandas.DataFrame: indexed by the reference timestamps (index name
      ``index``), one float column per weather parameter.
    """
    ref_values = np.asarray(ref_index)
    merged = {}
    for df_type, df_temp in weather_dict.items():
        # Sort once so each window can be located by binary search instead of
        # masking the whole table for every reference timestamp.
        df_sorted = df_temp.sort_values('index', kind='stable')
        timestamps = df_sorted['index'].to_numpy()
        values = df_sorted.iloc[:, 1]
        starts = np.searchsorted(timestamps, ref_values, side='left')
        stops = np.searchsorted(timestamps, ref_values + window, side='right')

        column = np.full(len(ref_values), np.nan)
        previous = np.nan
        for position, (start, stop) in enumerate(zip(starts, stops)):
            if stop > start:
                # take mean value of the window
                previous = values.iloc[start:stop].mean()
            # otherwise: take previous value
            column[position] = previous
        merged[df_type] = column
    return pd.DataFrame(merged, index=pd.Index(ref_values, name='index'))


# The reference index is the timestamp column of one heat pump data set; the
# pickle is read from the location it was written to in section 1.1.
with open(f'{path_concat}/data_heatpump.pkl', 'rb') as f:
    load_dict = pickle.load(f)
ref_index = load_dict['SFH10']['index']

weather_data = merge_weather_on_index(weather_dict, ref_index)
with open(f'{path_concat}/data_weather_merged.pkl', 'wb') as f:
    pickle.dump(weather_data, f)

____

## 2. Exploration of Heat Pump Data

In [9]:
# Load the per-house heat pump DataFrames that section 1.1 pickled to path_concat.
# pickle.load executes arbitrary code from the file - only open pickles created by this notebook.
with open(f'{path_concat}/data_heatpump.pkl', 'rb') as f:
    load_dict = pickle.load(f)

#### 2.1 Data Availability

In [None]:
from utils.plot_utils import plot_data_availability

Create Dataframe containing binary information about data availability for each timestamp

In [None]:
def check_nan(x):
    """Return 1 if ``x >= 0`` holds and 0 otherwise (NaN never compares true, so it yields 0)."""
    return 1 if x >= 0 else 0
    
# Build the availability table in one concat: the timestamp column of a reference
# house plus one 0/1 column per house. The flags are computed on the fly, so the
# loaded house DataFrames in load_dict are left untouched.
availability_columns = [load_dict['SFH10']['index'].to_frame()]
for house in load_dict:
    availability_columns.append(load_dict[house]['P_TOT'].apply(check_nan).rename(house))

df_result = pd.concat(availability_columns, axis=1).set_index('index')
df_result

In [None]:
# Use a German locale for date formatting (LC_TIME affects how dates are rendered,
# not the time zone). The locale name differs between platforms, so several
# spellings are tried and the default locale is kept if none is installed.
import locale

for locale_name in ('de_DE', 'de_DE.UTF-8', 'de_DE.utf8', 'German_Germany'):
    try:
        locale.setlocale(locale.LC_TIME, locale_name)
        break
    except locale.Error:
        continue
else:
    print("German locale not available - keeping the default locale for date labels")

# 1525125600 = 2018-04-30 22:00:00 UTC; only later timestamps are plotted
plot_data_availability(df_result[df_result.index > 1525125600])

Improve data quality by changing start time and selecting datasets

In [None]:
# index from start of data availability of SFH37
df_reduced = df_result[df_result.index >= 1528965000] 
plot_data_availability(df_reduced)

In [None]:
# Minimum share of available samples a data set needs to be kept
threshold = 0.85
column_list, missing_list, incomplete_list, complete_list = [], [], [], []

n_rows = len(df_reduced)
for column in df_reduced.columns:
    percentage = df_reduced[column].sum() / n_rows
    if not percentage > threshold:
        # not enough data: data set is discarded
        missing_list.append(column)
        continue
    # kept data sets are split into fully and partially available ones
    target = complete_list if percentage == 1 else incomplete_list
    target.append(column)
    column_list.append(column)

print('reduced to {} datasets'.format(len(column_list)))

plot_data_availability(df_reduced[column_list])

#### 2.2 Analysis of missing intervals

In [None]:
def get_missing_intervals(df, column):
    """
    Identifies contiguous intervals of missing data (represented by zeros) in a given DataFrame column.

    The input DataFrame is not modified: the run labels are kept in a separate
    Series instead of a temporary 'group' column, so an existing column named
    'group' is preserved.

    Parameters:
    - df (pandas.DataFrame): DataFrame containing the data.
    - column (str): The name of the column to analyze for missing data intervals.

    Returns:
    - list of tuples: Each tuple represents a missing data interval with the start and end
      index labels (both inclusive), ordered by position in the DataFrame.
    """
    values = df[column]

    # A new run starts wherever the value differs from the previous row
    run_ids = (values != values.shift()).cumsum()

    # Rows holding a 0 mark missing data
    is_missing = values == 0
    zero_rows = values[is_missing]

    # Group the missing rows by run and record the first and last index label of each run.
    # The key is passed as a plain array so grouping is positional, not label-aligned.
    result = []
    for _, run in zero_rows.groupby(run_ids[is_missing].to_numpy()):
        result.append((run.index[0], run.index[-1]))

    return result

def get_interval_length(interval):
    """Return the difference between the second and the first element of ``interval``."""
    start, end = interval[0], interval[1]
    return end - start

In [None]:
# Missing intervals of every data set that is kept but not fully available.
# df_reduced is used directly: incomplete_list was derived from it, so both use
# the same time window (including its first timestamp), and df_result is not
# overwritten by this cell.
dict_intervalls = {}

for column in df_reduced.columns:
    if column in incomplete_list:
        dict_intervalls[column] = get_missing_intervals(df_reduced[column].to_frame(), column)

dict_intervalls

In [None]:
# Summary of the availability classes determined above
print(f"complete list 100%: {complete_list}")
print(f"incomplete list >85%: {incomplete_list}")
print(f"insufficient list <85%: {missing_list}")

In [None]:
SECONDS_PER_DAY = 60 * 60 * 24

# Accumulators for the mean gap length. The total is deliberately not called
# `sum`, which would shadow the builtin for every later cell.
total_missing_seconds = 0
nb = 0
# data set -> calendar months (1-12) touched by at least one missing interval
month_dict = {}

for key, intervals in dict_intervalls.items():
    months = []
    for start_ts, end_ts in intervals:
        total_missing_seconds += end_ts - start_ts
        nb += 1
        # Every month from the start month to the end month (inclusive) is affected;
        # period_range also handles intervals that run across a year boundary.
        first_month = pd.to_datetime(start_ts, unit='s').to_period('M')
        last_month = pd.to_datetime(end_ts, unit='s').to_period('M')
        for period in pd.period_range(first_month, last_month, freq='M'):
            if period.month not in months:
                months.append(period.month)
    month_dict[key] = months

# Mean gap length in days; NaN when there are no missing intervals at all
mean_missing_days = total_missing_seconds / (SECONDS_PER_DAY * nb) if nb else float('nan')
print("Mittlere Anzahl an fehlenden Tagen: " + str(mean_missing_days))

Distribution of missing intervals per month and dataset

In [None]:
# Dictionary mapping sensor identifiers to the months with missing data
dict_months = {
    'SFH10': [11, 12],
    'SFH11': [5, 6, 7, 8, 9],
    'SFH20': [6, 7, 8, 9, 10, 11],
    'SFH21': [6, 7, 8],
    'SFH23': [7, 8, 9],
    'SFH38': [6, 7, 8],
    'SFH39': [6, 7, 8],
    'SFH5': [8, 9],
    'SFH7': [9, 10, 11],
}

# Extract all unique months from the dictionary values to ensure coverage
unique_months = sorted(set(month for months in dict_months.values() for month in months))

# List of month names for x-axis labels
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Creating the stacked bar chart
fig = go.Figure()
for key in dict_months:
    fig.add_trace(go.Bar(
        name=key,
        x=[month_names[month - 1] for month in unique_months],  # Convert month numbers to names
        y=[1 if month in dict_months[key] else 0 for month in unique_months]  # Presence of the month in the data
    ))

# Updating layout properties using update methods
fig.update_layout(
    barmode='stack',
    title='Number of Missing Records per Month',
    title_x=0.5,
    title_y=0.85,
    yaxis_title='Number of Missing Months',
    legend_title='Dataset',
    template="plotly_white"
)
fig.update_xaxes(
    title='Month',
    tickvals=[month_names[month - 1] for month in unique_months],  # Set custom tick values
    ticktext=[month_names[month - 1] for month in unique_months]  # Set custom tick labels
)

fig.show()

#### 2.3 Time series plots

In [None]:
from utils.plot_utils import plot_consumption, plot_resampled_consumption, plot_yearly_resampled_consumption, plot_consumption_type_histo

In [None]:
# Exemplary time series plots for house SFH3, using the plotting helpers from utils.plot_utils
df_3 = load_dict["SFH3"]
plot_consumption(df_3, 'SFH3')
plot_resampled_consumption(df_3, 'SFH3')
plot_yearly_resampled_consumption(df_3, 'SFH3')

#### 2.4 Aggregated Load - Correlation

In [None]:
# Unix timestamp (seconds) after which data is aggregated: 2018-05-02 14:15:00 UTC
INDEX_START = 1525270500
# Columns of the heat pump tables that are aggregated over all houses
COLUMNS = ['P_TOT', 'Q_TOT', 'S_TOT', 'PF_TOT']

In [None]:
# Unix timestamp (seconds) after which data is aggregated: 2018-05-02 14:15:00 UTC
INDEX_START = 1525270500
COLUMNS = ['P_TOT', 'Q_TOT', 'S_TOT', 'PF_TOT']
start_time = pd.to_datetime(INDEX_START, unit='s')

# exemplary index: timestamps of SFH3 after INDEX_START define the common time axis
df_3 = load_dict["SFH3"]
df_3 = df_3[df_3['index'] > INDEX_START].set_index('index')
df_3.index = pd.to_datetime(df_3.index, unit='s')

# aggregate load of all datasets
df_summe = pd.DataFrame(0.0, index=df_3.index, columns=COLUMNS)
# number of houses that actually delivered a power factor value per timestamp
pf_samples = pd.Series(0, index=df_3.index)
for key in load_dict:
    df_house = load_dict[key].copy()
    df_house['index'] = pd.to_datetime(df_house['index'], unit='s')
    df_house = df_house.set_index('index')
    # align every house to the common time axis before summing
    df_house = df_house.loc[df_house.index > start_time, COLUMNS].reindex(df_summe.index)
    df_summe = df_summe + df_house.fillna(0)
    pf_samples = pf_samples + df_house['PF_TOT'].notna().astype(int)

# calculate mean of cos(phi) over the houses that reported a value; dividing by the
# number of all houses would count every missing value as a power factor of 0.
# Timestamps without any value become NaN.
df_summe['PF_TOT'] = df_summe['PF_TOT'] / pf_samples.where(pf_samples > 0)

In [None]:
# get correlation matrix
correlation_matrix = df_summe.corr()
ylabels = ['Wirkleistung P', 'Scheinleistung S', 'Blindleistung Q', 'Leistungsfaktor cos(φ)']
xlabels = ['P', 'S', 'Q', 'cos(φ)']

heatmap = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Korrelationsmatrix',fontsize=14)
heatmap.set_yticklabels(ylabels, rotation=0, fontsize=14)
heatmap.set_xticklabels(xlabels, rotation=45, fontsize=14)  # Rotation für bessere Lesbarkeit
plt.show()

#### 2.5 Influence of day type

In [None]:
# Convert the DataFrame index to datetime, assuming the index represents Unix timestamps
df_summe.index = pd.to_datetime(df_summe.index, unit='s')

# Separate the DataFrame into weekdays and weekends based on the day of the week
df_weekdays = df_summe[df_summe.index.dayofweek < 5]  # Monday=0, Sunday=6
df_weekend = df_summe[df_summe.index.dayofweek >= 5]

# Calculate the average values for weekdays and weekends
weekdays_avg = df_weekdays[['P_TOT', 'Q_TOT', 'S_TOT']].mean()
weekend_avg = df_weekend[['P_TOT', 'Q_TOT', 'S_TOT']].mean()

# Create a new DataFrame with the average values
df_avg = pd.DataFrame({'Weekdays': weekdays_avg, 'Weekend': weekend_avg})

# Create a horizontal bar chart
fig = go.Figure()

# Add bars for each dataset
for i, col in enumerate(df_avg.columns):
    fig.add_trace(go.Bar(
        y=df_avg.index,  # Column names are shown on the y-axis
        x=df_avg[col],  # Average values are shown on the x-axis
        name=col,  # Legend name
        orientation='h',  # Horizontal bars
        text=df_avg[col].round(2),  # Display rounded average values inside the bars
        textposition='inside'
        # marker_color=colors[i]  # Color of the bars, assuming 'colors' is a predefined list
    ))

# Update the layout for a grouped bar chart
fig.update_layout(
    barmode='group',  # Group the bars
    title='Average Values for Weekdays and Weekends',
    title_x=0.5,
    xaxis_title='Average Values',
    yaxis_title='',
    legend_title='Type of Day',
    bargap=0.2,  # Space between the bar groups
    template='simple_white'
)

# Display the chart
fig.show()

#### 2.6 Operation modes

- P < 100 W: standby
- P between 100 W and 4 kW: compressor mode
- P ≥ 4 kW: heating rod mode

In [13]:
def check_operation_mode(x):
    """
    Classify a power value (in W) into an operation mode.

    Returns:
    - 1: standby (x < 100)
    - 2: compressor mode (100 <= x < 4000)
    - 3: heating rod mode (x >= 4000)
    - 0: no valid value (e.g. NaN, for which every comparison is false)

    The lower bound of the compressor range is inclusive so that a value of
    exactly 100 is no longer reported as 0 (missing).
    """
    if x < 100:
        return 1
    if x < 4000:
        return 2
    if x >= 4000:
        return 3
    return 0
    
# Operation mode of every house per timestamp, built in one concat: the timestamp
# column of a reference house plus one mode column per house. The modes are computed
# on the fly, so the loaded house DataFrames in load_dict are left untouched.
mode_columns = [load_dict['SFH10']['index'].to_frame()]
for house in load_dict:
    mode_columns.append(load_dict[house]['P_TOT'].apply(check_operation_mode).rename(house))

df_result = pd.concat(mode_columns, axis=1).set_index('index')

Plot histogram for each year

In [None]:
MODE_COLUMNS = ['Standby', 'Kompressions-Modus', 'Heizstab-Modus']


def consumption_by_mode(year):
    """
    Sum the hourly mean power of every house per operation mode for one year.

    Parameters:
    - year (int): calendar year to evaluate.

    Returns:
    - pandas.DataFrame: one row per house (sorted by house number), one column per
      operation mode; hourly mean values of P_TOT summed per mode and divided by 1000.
    """
    houses = sorted(df_result.columns, key=lambda x: int(x.replace("SFH", "")))
    df_consumptions = pd.DataFrame(0.0, index=houses, columns=MODE_COLUMNS)

    for house in houses:
        power = load_dict[house].set_index('index')['P_TOT'].fillna(0)
        power.index = pd.to_datetime(power.index, unit='s')
        # hourly mean power of the selected year; a Timedelta rule avoids the
        # version-dependent 'H'/'h' frequency aliases
        power = power[power.index.year == year].resample(pd.Timedelta(hours=1)).mean()

        # Explicit (row, column) assignment instead of chained indexing, which may
        # write to a temporary copy. The compressor range includes exactly 100 W.
        df_consumptions.loc[house, 'Standby'] = power[power < 100].sum()
        df_consumptions.loc[house, 'Kompressions-Modus'] = power[(power >= 100) & (power < 4000)].sum()
        df_consumptions.loc[house, 'Heizstab-Modus'] = power[power >= 4000].sum()

    return df_consumptions / 1000


df_consumptions_2018 = consumption_by_mode(2018)
plot_consumption_type_histo(df_consumptions_2018, 2018)

df_consumptions_2019 = consumption_by_mode(2019)
plot_consumption_type_histo(df_consumptions_2019, 2019)

df_consumptions_2020 = consumption_by_mode(2020)
plot_consumption_type_histo(df_consumptions_2020, 2020)

#### 2.7 Calculation of heat pump nominal power 

In [3]:
from utils import data_utils
import config 
from utils.plot_utils import plot_consumption_with_band, plot_quantile_comparison

In [None]:
data = data_utils.data_loader(config.columns)

# Plot only data sets whose peak reaches 7900 and that exceed 7900 in at least 200 samples
for index in data.index.unique():
    sub_df = data[data.index == index]
    power = sub_df["P_TOT"]
    peak_too_low = power.max() < 7900
    too_few_samples = (power > 7900).sum() < 200
    if peak_too_low or too_few_samples:
        continue

    sub_df = sub_df.set_index("index")
    sub_df.index = pd.to_datetime(sub_df.index, unit="s")
    plot_consumption_with_band(sub_df, '2019-01-01', '2021-01-01', index)

plot_quantile_comparison(data)

____

## 3. Exploration of Weather Data

In [5]:
# Load the weather pickles written in section 1.2: the per-parameter tables
# (weather_dict) and the table merged onto the heat pump time index (weather_data).
# pickle.load executes arbitrary code from the file - only open pickles created by this notebook.
with open(f'{path_concat}/data_weather.pkl', 'rb') as f:
    weather_dict = pickle.load(f)
with open(f'{path_concat}/data_weather_merged.pkl', 'rb') as f:
    weather_data = pickle.load(f)

In [14]:
# Summary statistics per weather parameter, computed column-wise in one call
# (no chained `.loc[row][column] = ...` assignment, which may write to a copy).
df_analysis = weather_data.agg(['min', 'max', 'mean', 'median'])
# Number of missing (NaN) entries per parameter
df_analysis.loc['missing values'] = weather_data.isna().sum()
df_analysis

Unnamed: 0,WEATHER_APPARENT_TEMPERATURE_TOTAL,WEATHER_ATMOSPHERIC_PRESSURE_TOTAL,WEATHER_PRECIPITATION_RATE_TOTAL,WEATHER_PROBABILITY_OF_PRECIPITATION_TOTAL,WEATHER_RELATIVE_HUMIDITY_TOTAL,WEATHER_SOLAR_IRRADIANCE_GLOBAL,WEATHER_TEMPERATURE_TOTAL,WEATHER_WIND_DIRECTION_TOTAL,WEATHER_WIND_GUST_SPEED_TOTAL,WEATHER_WIND_SPEED_TOTAL
min,-18.0,977.599976,0.0,5.0,19.0,0.0,-12.6,0.0,0.277778,0.1
max,48.099998,1047.699951,38.550001,100.0,100.0,975.0,37.400002,360.0,34.166668,14.1
mean,9.919311,1015.48095,0.085169,34.733337,72.6693,139.729787,11.020408,196.629154,7.416688,2.985826
median,10.2,1016.0,0.0,35.0,75.0,5.0,10.2,210.0,6.666667,2.675
missing values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Plot time series for each parameter

In [None]:
data_plots = weather_data.copy()
data_plots.index = pd.to_datetime(data_plots.index, unit="s")

# Grid of 5 rows x 2 columns, one panel per weather parameter
n_rows, n_cols = 5, 2
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=data_plots.columns, vertical_spacing=0.05)

# Fill the grid row by row; columns beyond the grid size are not plotted
for position, column in enumerate(data_plots.columns[:n_rows * n_cols]):
    grid_row, grid_col = divmod(position, n_cols)
    trace = go.Scatter(x=data_plots.index, y=data_plots[column], name=column)
    fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    title_text='Verlauf verschiedener Spalten über die Zeit',
    title_x=0.5,
    showlegend=False,
    template='plotly_white',
    height=2000  # enough vertical space for all panels
)

# Rotate the tick labels of every x-axis
x_axis_names = [name for name in fig.layout if name.startswith('xaxis')]
for name in x_axis_names:
    fig.layout[name].tickangle = 45

fig.show()

Plot time series for temperature and wind speed

In [None]:
df = weather_data.copy()
df.index = pd.to_datetime(df.index, unit="s")

# Two panels side by side: temperature (left) and wind speed (right)
fig = make_subplots(rows=1, cols=2, subplot_titles=('Messwerte Temperatur', 'Messwerte Windgeschwindigkeit'))

# (grid column, source column, trace name, y-axis title) per panel
panels = [
    (1, 'WEATHER_TEMPERATURE_TOTAL', 'Temperatur', 'Temperatur in °C'),
    (2, 'WEATHER_WIND_SPEED_TOTAL', 'Windgeschwindigkeit', 'Windgeschwindigkeit in m/s'),
]
for grid_col, source_column, trace_name, y_title in panels:
    fig.add_trace(go.Scatter(x=df.index, y=df[source_column], name=trace_name), row=1, col=grid_col)
    fig.update_xaxes(title_text='Zeit', row=1, col=grid_col)
    fig.update_yaxes(title_text=y_title, row=1, col=grid_col)

fig.update_layout(
    title_text='Verlauf der Wettermessdaten',
    showlegend=False,
    title_x=0.5,
    template='plotly_white',
    width=900
)

fig.show()

Correlation

In [None]:
# German display names for the weather parameters
columns_dict = {
    'WEATHER_APPARENT_TEMPERATURE_TOTAL': 'Scheintemperatur',
    'WEATHER_ATMOSPHERIC_PRESSURE_TOTAL': 'Luftdruck',
    'WEATHER_PRECIPITATION_RATE_TOTAL': 'Niederschlag',
    'WEATHER_PROBABILITY_OF_PRECIPITATION_TOTAL': 'Niederschlagswahrscheinlichkeit',
    'WEATHER_RELATIVE_HUMIDITY_TOTAL': 'Relative Luftfeuchtigkeit',
    'WEATHER_SOLAR_IRRADIANCE_GLOBAL': 'Sonneneinstrahlung',
    'WEATHER_TEMPERATURE_TOTAL': 'Temperatur',
    'WEATHER_WIND_DIRECTION_TOTAL': 'Windrichtung',
    'WEATHER_WIND_GUST_SPEED_TOTAL': 'Windböenstärke',
    'WEATHER_WIND_SPEED_TOTAL': 'Windgeschwindigkeit',
}

# Correlation between all weather parameters, labelled with the display names
weather_renamed = weather_data.rename(columns=columns_dict)
correlation_matrix = weather_renamed.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Korrelationsmatrix')

plt.show()

-> Entfernen der Scheintemperatur sowie der Windböenstärke, da diese von der Absoluttemperatur sowie der Windgeschwindigkeit bereits gut erfasst werden

In [None]:
# Drop apparent temperature and wind gust speed (see the note above the cell) and
# store the remaining weather parameters as the cleaned weather data set.
reduced_weather_data = weather_data.drop(columns=['WEATHER_APPARENT_TEMPERATURE_TOTAL', 'WEATHER_WIND_GUST_SPEED_TOTAL'])
with open(f'{path_cleaned}/data_weather_v1.pkl', 'wb') as f:
    pickle.dump(reduced_weather_data, f)

____

## 4. Exploration of Additional Information

In [6]:
# Load the building information sheet; the first row holds the column names.
info = pd.read_excel('data/raw/Gebaeudeinformationen.xlsx', header=0)
# Set the building area of row label 27 to the mean area of all complete rows
# (no missing values) with 3 inhabitants.
# NOTE(review): the hard-coded label 27 presumably marks the only building with a
# missing area - confirm against the spreadsheet.
info.at[27, "Building area"] = info[info["Number of inhabitants"]==3].dropna()["Building area"].mean()
# Store the completed table; to_excel writes the row index as an extra first column by default.
info.to_excel("data/cleaned/Gebaeudeinformationen_cleaned.xlsx")
info.head(3)

Unnamed: 0,Building number,Building area,Number of inhabitants
0,3,140.0,2
1,4,160.0,2
2,5,160.0,3
