In [1]:
import json

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the OOZG measurement table (power readings plus stringified one-hot
# device labels) and preview the first rows.
source_csv = 'data/oozg/oozg_timestamp_ohe.csv'
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,Power,Messpunkt_ID,Timestamp,Bezeichnung_Strom,OHE_Labels
0,0.018,CH1024201234500000000000010008655,2022-03-01 00:00:00,"Waschmaschine,Boiler","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
1,0.024,CH1024201234500000000000010008655,2022-03-01 00:15:00,"Waschmaschine,Boiler","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
2,0.01,CH1024201234500000000000010008655,2022-03-01 00:30:00,"Waschmaschine,Boiler","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
3,0.03,CH1024201234500000000000010008655,2022-03-01 00:45:00,"Waschmaschine,Boiler","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
4,0.013,CH1024201234500000000000010008655,2022-03-01 01:00:00,"Waschmaschine,Boiler","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"


In [5]:
# Convert the string representation (e.g. "[0, 0, 1]") into a Python list.
# json.loads only accepts data literals, whereas eval() would execute any
# expression that happens to be stored in the CSV.
# Values that are already parsed are passed through unchanged, so re-running
# this cell on the same kernel does not fail.
df['OHE_Labels'] = df['OHE_Labels'].apply(
    lambda raw: json.loads(raw) if isinstance(raw, str) else raw
)

In [6]:
# Load the mapping from one-hot position (stored as a string key) to device label.
# The labels contain non-ASCII characters (e.g. "Wärmepumpe"), so the encoding
# is pinned to UTF-8 instead of relying on the platform default, which would
# otherwise garble the column names derived from these labels on some systems.
with open("data/oozg/idx2label_oozg.json", encoding="utf-8") as f:
    idx2label = json.load(f)
idx2label

{'0': 'Autoladestation',
 '1': 'Beleuchtung',
 '2': 'Blockspeicher',
 '3': 'Boiler',
 '4': 'Direktheizung',
 '5': 'Doppeltarif/Privatzähler',
 '6': 'Einspeiser Photovoltaik Anlage',
 '7': 'Einzelspeicher',
 '8': 'Saunaofen',
 '9': 'Tumbler',
 '10': 'Unknown',
 '11': 'WP Boiler',
 '12': 'Waschmaschine',
 '13': 'Waschmaschine und Tumbler',
 '14': 'Wärmepumpe'}

In [8]:
# Spread the one-hot vector into one column per device label: the column named
# after a label receives the vector entry at that label's position.
for idx, label in idx2label.items():
    position = int(idx)  # JSON object keys are strings; list indexing needs an int
    df[label] = df['OHE_Labels'].apply(lambda vector, position=position: vector[position])
df.head()

Unnamed: 0,Power,Messpunkt_ID,Timestamp,Bezeichnung_Strom,OHE_Labels,Autoladestation,Beleuchtung,Blockspeicher,Boiler,Direktheizung,Doppeltarif/Privatzähler,Einspeiser Photovoltaik Anlage,Einzelspeicher,Saunaofen,Tumbler,Unknown,WP Boiler,Waschmaschine,Waschmaschine und Tumbler,Wärmepumpe
0,0.018,CH1024201234500000000000010008655,2022-03-01 00:00:00,"Waschmaschine,Boiler","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
1,0.024,CH1024201234500000000000010008655,2022-03-01 00:15:00,"Waschmaschine,Boiler","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
2,0.01,CH1024201234500000000000010008655,2022-03-01 00:30:00,"Waschmaschine,Boiler","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,0.03,CH1024201234500000000000010008655,2022-03-01 00:45:00,"Waschmaschine,Boiler","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
4,0.013,CH1024201234500000000000010008655,2022-03-01 01:00:00,"Waschmaschine,Boiler","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]",0,0,0,1,0,0,0,0,0,0,0,0,1,0,0


In [9]:
# Parse the textual timestamps into datetime values so that the date
# arithmetic in the following cells works.
df['Timestamp'] = df['Timestamp'].pipe(pd.to_datetime)

In [18]:
def _filter_from_first_timestamp(group, offset):
    """Return the rows of ``group`` within ``offset`` of its earliest timestamp.

    Args:
        group: DataFrame with a datetime-typed 'Timestamp' column.
        offset: ``pd.DateOffset`` giving the length of the window.

    Returns:
        The rows whose 'Timestamp' lies in the half-open interval
        ``[earliest, earliest + offset)``, in their original order.
    """
    window_end = group['Timestamp'].min() + offset
    # No explicit lower bound is needed: the window starts at the minimum, so
    # every non-missing timestamp already satisfies it. Missing timestamps
    # compare as False here and are dropped.
    return group[group['Timestamp'] < window_end]


def filter_one_year(group):
    """Keep the first year of data, counted from the group's earliest timestamp.

    Args:
        group: DataFrame with a datetime-typed 'Timestamp' column.

    Returns:
        DataFrame with the rows from the earliest timestamp up to, but not
        including, the same instant one calendar year later.
    """
    return _filter_from_first_timestamp(group, pd.DateOffset(years=1))


def filter_half_year(group):
    """Keep the first six months of data, counted from the group's earliest timestamp.

    Args:
        group: DataFrame with a datetime-typed 'Timestamp' column.

    Returns:
        DataFrame with the rows from the earliest timestamp up to, but not
        including, the same instant six calendar months later.
    """
    return _filter_from_first_timestamp(group, pd.DateOffset(months=6))

In [19]:
# Restrict every metering point to the first year / first six months of its
# own data and stack the per-meter results into one frame each.
# The groups are iterated explicitly instead of going through GroupBy.apply:
# passing the grouping column to the applied function is deprecated since
# pandas 2.2, and without it 'Messpunkt_ID' would be missing from the result.
# Iteration yields the groups in sorted key order, so the row order matches
# what GroupBy.apply produced.
grouped_by_meter = df.groupby('Messpunkt_ID')
one_year_df = pd.concat(
    [filter_one_year(group) for _, group in grouped_by_meter],
    ignore_index=True,
)
half_year_df = pd.concat(
    [filter_half_year(group) for _, group in grouped_by_meter],
    ignore_index=True,
)

In [21]:
# Report the row counts before and after the time-window filtering.
print(
    f"Length of original df: {len(df)}",
    f"Length of one year df: {len(one_year_df)}",
    f"Length of half year df: {len(half_year_df)}",
    sep="\n",
)

Length of one year df: 2803200
Length of half year df: 1412800
Length of original df: 2803200


In [22]:
import os

# Write one CSV per device type with columns (id, label): for every metering
# point, the label is the maximum of that device's one-hot column over the
# one-year window, i.e. 1 if the device is flagged in at least one row.
LABELS_DIR = 'labels'
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing into it.
os.makedirs(LABELS_DIR, exist_ok=True)

for device in idx2label.values():
    device_df = one_year_df.groupby('Messpunkt_ID')[device].max().reset_index()
    device_df.columns = ['id', 'label']
    # Build a file name from the label: "/" would be interpreted as a path
    # separator, and spaces are replaced to keep the names shell-friendly.
    filename = f'{device.replace("/", "_").replace(" ", "_").lower()}.csv'
    try:
        device_df.to_csv(os.path.join(LABELS_DIR, filename), index=False)
    except OSError as exc:
        # Keep exporting the remaining devices, but report why this one failed
        # instead of hiding the cause.
        print(f'Error with filename: {filename} ({exc})')

In [23]:
# Persist the one-year subset as CSV, leaving out the positional index.
one_year_csv = 'data/oozg/oozg_timestamp_ohe_one_year.csv'
one_year_df.to_csv(one_year_csv, index=False)

: 