**Stromverbrauch Cleaning**

In [41]:
import pandas as pd

file_path = "../../00_Uncleaned_Data/Features/06Stromverbrauch_15minIntervall.csv"
df = pd.read_csv(file_path, delimiter=';')

df.head()

Unnamed: 0,Start der Messung,Start der Messung (Text),Stromverbrauch,Grundversorgte Kunden,Freie Kunden,Jahr,Monat,Tag,Wochentag,Tag des Jahres,Quartal,Woche des Jahres
0,2024-11-20T22:45:00+00:00,2024-11-20T23:45:00+0100,29723.678,14692.131,15031.548,2024,11,20,2,325,4,47
1,2024-11-20T22:30:00+00:00,2024-11-20T23:30:00+0100,30489.012,15280.308,15208.704,2024,11,20,2,325,4,47
2,2024-11-20T22:15:00+00:00,2024-11-20T23:15:00+0100,30939.471,15676.639,15262.832,2024,11,20,2,325,4,47
3,2024-11-20T22:00:00+00:00,2024-11-20T23:00:00+0100,31937.689,16444.321,15493.368,2024,11,20,2,325,4,47
4,2024-11-20T21:45:00+00:00,2024-11-20T22:45:00+0100,32717.173,17053.991,15663.182,2024,11,20,2,325,4,47


In [42]:
df.drop(columns=['Jahr', 'Monat', 'Tag', 'Wochentag', 'Tag des Jahres', 'Quartal', 'Woche des Jahres', 'Grundversorgte Kunden', 'Freie Kunden'], inplace=True)
df.head()

Unnamed: 0,Start der Messung,Start der Messung (Text),Stromverbrauch
0,2024-11-20T22:45:00+00:00,2024-11-20T23:45:00+0100,29723.678
1,2024-11-20T22:30:00+00:00,2024-11-20T23:30:00+0100,30489.012
2,2024-11-20T22:15:00+00:00,2024-11-20T23:15:00+0100,30939.471
3,2024-11-20T22:00:00+00:00,2024-11-20T23:00:00+0100,31937.689
4,2024-11-20T21:45:00+00:00,2024-11-20T22:45:00+0100,32717.173


In [43]:
# Ensure 'Start der Messung' is in datetime format
df['Start der Messung'] = pd.to_datetime(df['Start der Messung'], errors='coerce')

# Extract the hour from the 'Start der Messung' column
df['Hour'] = df['Start der Messung'].dt.floor('h')  

# Group by the hour and sum the relevant columns
aggregated_df = df.groupby('Hour').agg({
    'Stromverbrauch': 'sum'
}).reset_index()

# Display the resulting DataFrame 
aggregated_df = aggregated_df.sort_values(by='Hour', ascending=False)
aggregated_df.head()

Unnamed: 0,Hour,Stromverbrauch
112978,2024-11-20 22:00:00+00:00,123089.85
112977,2024-11-20 21:00:00+00:00,133712.379
112976,2024-11-20 20:00:00+00:00,143317.149
112975,2024-11-20 19:00:00+00:00,155465.131
112974,2024-11-20 18:00:00+00:00,169421.384


In [44]:
#checking for missing values

# Create a date range from the minimum to the maximum date in the 'Datum' column
date_range = pd.date_range(start=aggregated_df['Hour'].min(), end=aggregated_df['Hour'].max(), freq='D')

# Create a DataFrame with all the dates in the range
all_dates = pd.DataFrame({'Hour': date_range})

# Merge this new DataFrame with the original to find missing dates
missing_dates = pd.merge(all_dates, aggregated_df, on='Hour', how='left', indicator=True)

# Rows that are missing in the original DataFrame will have '_merge' == 'left_only'
missing_rows = missing_dates[missing_dates['_merge'] == 'left_only']

# Output the missing rows (if any)
if not missing_rows.empty:
    print(f"Missing rows for the following dates: {missing_rows['Hour'].tolist()}")
else:
    print("No rows are missing.")

No rows are missing.


In [45]:
# Ensure 'Datum' is a datetime object
df['Hour'] = pd.to_datetime(df['Hour'])

# Create a complete range of hourly timestamps from the first to the last point
full_range = pd.date_range(start=df['Hour'].min(), end=df['Hour'].max(), freq='h')

# Identify missing timestamps
missing_timestamps = full_range.difference(df['Hour'])

# Display the results
if missing_timestamps.empty:
    print("No missing hourly timestamps. All hours are accounted for.")
else:
    print("Missing hourly timestamps:")
    print(missing_timestamps)

Missing hourly timestamps:
DatetimeIndex(['2012-10-28 01:00:00+00:00', '2013-10-27 01:00:00+00:00',
               '2014-10-26 01:00:00+00:00', '2015-10-25 01:00:00+00:00',
               '2016-10-30 01:00:00+00:00', '2017-10-29 01:00:00+00:00',
               '2018-10-28 01:00:00+00:00', '2019-10-27 01:00:00+00:00',
               '2020-10-25 01:00:00+00:00', '2021-10-31 01:00:00+00:00',
               '2022-10-30 01:00:00+00:00', '2023-10-29 01:00:00+00:00',
               '2024-10-27 01:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq=None)


In [46]:
print(aggregated_df.index)


Index([112978, 112977, 112976, 112975, 112974, 112973, 112972, 112971, 112970,
       112969,
       ...
            9,      8,      7,      6,      5,      4,      3,      2,      1,
            0],
      dtype='int64', length=112979)


In [47]:
#Rename cloumn and set index
aggregated_df.rename(columns={'Hour': 'Datum'}, inplace=True)
aggregated_df.set_index('Datum', inplace=True)
aggregated_df.head()

Unnamed: 0_level_0,Stromverbrauch
Datum,Unnamed: 1_level_1
2024-11-20 22:00:00+00:00,123089.85
2024-11-20 21:00:00+00:00,133712.379
2024-11-20 20:00:00+00:00,143317.149
2024-11-20 19:00:00+00:00,155465.131
2024-11-20 18:00:00+00:00,169421.384


In [48]:
#make a moving average for the two missing values we are interested in

# Define the missing timestamps we are interested in
missing_hours = ['2022-10-30 01:00:00+00:00', '2023-10-29 01:00:00+00:00']

# Convert them to datetime
missing_hours = pd.to_datetime(missing_hours)

# Ensure the index of the DataFrame is consistently tz-aware
aggregated_df.index = pd.to_datetime(aggregated_df.index).tz_convert('UTC')

# Ensure the missing hours are in the DataFrame
for hour in missing_hours:
    if hour not in aggregated_df.index:
        # Insert with pd.NA to maintain consistency
        aggregated_df.loc[hour] = pd.NA

# Sort the DataFrame by index to maintain chronological order
aggregated_df.sort_index(inplace=True)

# Calculate the moving average for the missing hours
for hour in missing_hours:
    # Calculate the moving average (using a window of 3 for simplicity)
    moving_avg = aggregated_df.loc[hour - pd.Timedelta(hours=1):hour + pd.Timedelta(hours=1), 'Stromverbrauch'].mean(skipna=True)
    
    # Fill the missing value with the moving average
    aggregated_df.at[hour, 'Stromverbrauch'] = moving_avg

# Display the updated DataFrame
print(aggregated_df.loc[missing_hours])



                           Stromverbrauch
2022-10-30 01:00:00+00:00      95567.1525
2023-10-29 01:00:00+00:00      96768.2645


  aggregated_df.loc[hour] = pd.NA
  aggregated_df.loc[hour] = pd.NA


In [49]:
# Check for 0 values
zero_values = (df['Stromverbrauch'] == 0).sum()

# Check for NaN values
nan_values = df['Stromverbrauch'].isna().sum()

# Check for '/N' values
slash_n_values = (df['Stromverbrauch'] == '/N').sum()

print(f"Zero values: {zero_values}")
print(f"NaN values: {nan_values}")
print(f"'/N' values: {slash_n_values}")

Zero values: 0
NaN values: 0
'/N' values: 0


In [50]:
save_path = "../../02_Cleaned/Features/06Stromverbrauch_hourly_cleaned.csv"
aggregated_df.to_csv(save_path)