**Gas Cleaning**

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import pytz

In [2]:
Gas = pd.read_csv("../../00_Uncleaned_Data/Features/05Gasverbrauch_im_Versorgungsgebiet_der_IWB.csv", sep=";")

In [57]:

Gas = Gas.drop(columns=["Datum","Zeit", "Jahr", "Monat", "Tag", "Wochentag", "Tag des Jahres", "Quartal", "Woche des Jahres"])
Gas.head()


Unnamed: 0,Start der Messung,Gasverbrauch
0,2023-10-18T23:00:00+00:00,185054.4299
1,2023-10-19T00:00:00+00:00,184888.9114
2,2023-10-19T01:00:00+00:00,196871.6645
3,2023-10-19T04:00:00+00:00,322512.8374
4,2023-10-19T05:00:00+00:00,348277.549


In [None]:


def fix_dst_transitions(df, timestamp_column=None, value_columns=None, return_utc=True):
    """
    Fix time series data affected by DST transitions in Basel, Switzerland.
    
    Parameters:
    df: pandas DataFrame with timestamp data
    timestamp_column: name of the column containing timestamps. If None, will try to auto-detect
    value_columns: list of columns to process (must be numeric). If None, will process all numeric columns
    return_utc: boolean, if True returns timestamps in UTC format
    
    Returns:
    DataFrame with corrected timestamps and values, in UTC if specified
    """
    # Make a copy to avoid modifying original data
    df = df.copy()
    
    # Try to automatically detect timestamp column if not specified
    if timestamp_column is None:
        time_like_columns = df.select_dtypes(include=['datetime64']).columns
        if len(time_like_columns) > 0:
            timestamp_column = time_like_columns[0]
        else:
            # Try to find columns with common timestamp names
            possible_names = ['timestamp', 'time', 'date', 'datetime', 'Timestamp', 'Time', 'Date', 'DateTime', 'Start der Messung']
            for name in possible_names:
                if name in df.columns:
                    timestamp_column = name
                    break
    
    if timestamp_column is None or timestamp_column not in df.columns:
        raise ValueError(f"Could not find timestamp column. Available columns are: {df.columns.tolist()}")
    
    # Convert to datetime if not already
    if not pd.api.types.is_datetime64_any_dtype(df[timestamp_column]):
        df[timestamp_column] = pd.to_datetime(df[timestamp_column])
    
    # Identify numeric columns if value_columns not specified
    if value_columns is None:
        value_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
        # Remove timestamp column if it's in the numeric columns
        value_columns = [col for col in value_columns if col != timestamp_column]
    
    if not value_columns:
        raise ValueError("No numeric columns found to process")
    
    # Store non-numeric columns for later
    all_columns = df.columns.tolist()
    non_numeric_cols = [col for col in all_columns if col not in value_columns and col != timestamp_column]
    
    # Set timezone to Europe/Zurich (covers Basel) if not already set
    zurich_tz = pytz.timezone('Europe/Zurich')
    
    # Check if timestamp is already tz-aware
    if df[timestamp_column].dt.tz is None:
        df[timestamp_column] = df[timestamp_column].dt.tz_localize('UTC').dt.tz_convert(zurich_tz)
    elif df[timestamp_column].dt.tz != zurich_tz:
        df[timestamp_column] = df[timestamp_column].dt.tz_convert(zurich_tz)
    
    # Process numeric columns
    df_numeric = df[[timestamp_column] + value_columns].copy()
    df_numeric = df_numeric.set_index(timestamp_column).sort_index()
    df_resampled = df_numeric.resample('1H').mean()
    df_processed = df_resampled.interpolate(method='time')
    df_processed = df_processed.reset_index()
    
    # Process non-numeric columns
    if non_numeric_cols:
        df_non_numeric = df[[timestamp_column] + non_numeric_cols].copy()
        df_non_numeric = df_non_numeric.set_index(timestamp_column).sort_index()
        
        # For non-numeric columns, use forward fill (or another appropriate method)
        df_non_numeric_resampled = df_non_numeric.resample('1H').ffill()
        df_non_numeric_processed = df_non_numeric_resampled.reset_index()
        
        # Merge numeric and non-numeric data
        df_processed = pd.merge(
            df_processed,
            df_non_numeric_processed,
            on=timestamp_column,
            how='outer'
        )
    
    # Convert back to UTC if requested
    if return_utc:
        df_processed[timestamp_column] = df_processed[timestamp_column].dt.tz_convert('UTC')
    
    # Sort by timestamp
    df_processed = df_processed.sort_values(timestamp_column)
    
    # Ensure original column order
    final_columns = [timestamp_column] + [col for col in all_columns if col != timestamp_column]
    df_processed = df_processed[final_columns]
    
    return df_processed

# Example usage:
# corrected_Gas = fix_dst_transitions(Gas, value_columns=['your_numeric_column'])


In [58]:
import pandas as pd
from pytz import timezone


# Konvertieren der Zeitspalte in pandas-Datetime
Gas['Start der Messung'] = pd.to_datetime(Gas['Start der Messung'], utc=True)

# Funktion zur Anpassung der Zeiten
def adjust_timestamps(df):
    local_tz = timezone('Europe/Berlin')
    adjusted_times = []
    
    for timestamp in df['Start der Messung']:
        # Lokale Zeit mit Zeitzone
        local_time = timestamp.astimezone(local_tz)
        
        # Sommerzeit prüfen und anpassen
        if local_time.dst().total_seconds() != 0:  # Wenn Sommerzeit aktiv
            timestamp -= pd.Timedelta(hours=1)
        
        # Sicherstellen, dass keine Duplikate entstehen
        while timestamp in adjusted_times:
            timestamp += pd.Timedelta(hours=1)  # Verschieben, falls doppelter Eintrag
        
        adjusted_times.append(timestamp)
    
    return pd.Series(adjusted_times)

# Anwendung der Anpassung
Gas['Adjusted UTC'] = adjust_timestamps(Gas)

print(Gas)

              Start der Messung  Gasverbrauch              Adjusted UTC
0     2023-10-18 23:00:00+00:00   185054.4299 2023-10-18 22:00:00+00:00
1     2023-10-19 00:00:00+00:00   184888.9114 2023-10-18 23:00:00+00:00
2     2023-10-19 01:00:00+00:00   196871.6645 2023-10-19 00:00:00+00:00
3     2023-10-19 04:00:00+00:00   322512.8374 2023-10-19 03:00:00+00:00
4     2023-10-19 05:00:00+00:00   348277.5490 2023-10-19 04:00:00+00:00
...                         ...           ...                       ...
28244 2023-11-11 07:00:00+00:00   440050.3820 2023-11-11 07:00:00+00:00
28245 2023-11-11 10:00:00+00:00   409672.0796 2023-11-11 10:00:00+00:00
28246 2023-11-11 11:00:00+00:00   395212.7034 2023-11-11 11:00:00+00:00
28247 2023-11-11 12:00:00+00:00   363730.7498 2023-11-11 12:00:00+00:00
28248 2023-11-11 13:00:00+00:00   375884.6556 2023-11-11 13:00:00+00:00

[28249 rows x 3 columns]


In [59]:
Gas.sort_values(by="Adjusted UTC", inplace=True)
Gas.head()

Unnamed: 0,Start der Messung,Gasverbrauch,Adjusted UTC
12528,2021-09-01 03:00:00+00:00,269919.3321,2021-09-01 02:00:00+00:00
12529,2021-09-01 04:00:00+00:00,267289.9266,2021-09-01 03:00:00+00:00
8069,2021-09-01 05:00:00+00:00,269941.12,2021-09-01 04:00:00+00:00
12530,2021-09-01 06:00:00+00:00,259909.3179,2021-09-01 05:00:00+00:00
21263,2021-09-01 07:00:00+00:00,243958.8099,2021-09-01 06:00:00+00:00


In [60]:
Gas[Gas["Start der Messung"] == "2022-03-27T01"]

Unnamed: 0,Start der Messung,Gasverbrauch,Adjusted UTC
3201,2022-03-27 01:00:00+00:00,346278.76,2022-03-27 00:00:00+00:00
23891,2022-03-27 01:00:00+00:00,251581.2461,2022-03-27 03:00:00+00:00


In [61]:
Gas[Gas["Start der Messung"] == "2022-03-27T00"]

Unnamed: 0,Start der Messung,Gasverbrauch,Adjusted UTC
23890,2022-03-27 00:00:00+00:00,227953.7917,2022-03-27 01:00:00+00:00


In [62]:
Gas[Gas["Start der Messung"] == "2022-03-27T02"]

Unnamed: 0,Start der Messung,Gasverbrauch,Adjusted UTC
23892,2022-03-27 02:00:00+00:00,438581.9031,2022-03-27 04:00:00+00:00


In [37]:
corrected_Gas = fix_dst_transitions(Gas, timestamp_column="Start der Messung", value_columns="Gasverbrauch", return_utc=True)
print("\nSample output:")
print(corrected_df.head())

TypeError: can only concatenate list (not "str") to list

In [15]:
print(Gas.head())
print(Gas.dtypes)

           Start der Messung  Gasverbrauch       Datum   Zeit  Jahr  Monat  \
0  2023-10-18T23:00:00+00:00   185054.4299  2023-10-19  01:00  2023     10   
1  2023-10-19T00:00:00+00:00   184888.9114  2023-10-19  02:00  2023     10   
2  2023-10-19T01:00:00+00:00   196871.6645  2023-10-19  03:00  2023     10   
3  2023-10-19T04:00:00+00:00   322512.8374  2023-10-19  06:00  2023     10   
4  2023-10-19T05:00:00+00:00   348277.5490  2023-10-19  07:00  2023     10   

   Tag  Wochentag  Tag des Jahres  Quartal  Woche des Jahres  
0   19          3             292        4                42  
1   19          3             292        4                42  
2   19          3             292        4                42  
3   19          3             292        4                42  
4   19          3             292        4                42  
Start der Messung     object
Gasverbrauch         float64
Datum                 object
Zeit                  object
Jahr                   int64
Monat   

Aufgrund der Zeitverschiebung haben sich die Daten in der CSV File verschoben. Nach langem Probieren hat es trotzdem nicht geklappt, weshalb ich es mit der parquet file probiert hatte, wo es auch komsich war aber einfacher zu lösen.


In [145]:
Gas_new = pd.read_parquet(r"C:\Users\maxd2\OneDrive - Universitaet St.Gallen\Desktop\DSF\Gasverbrauch_new_2411.parquet")

In [143]:
Gas_new[Gas_new["timestamp"] == "2022-03-27T01"]

Unnamed: 0,timestamp,value,date,time,year,month,day,weekday,dayofyear,quarter,weekofyear
11037,2022-03-27 01:00:00+01:00,227953.7917,2022-03-27,01:00,2022,3,27,6,86,1,12


In [154]:
Gas_new[Gas_new["timestamp"] == pd.Timestamp('2022-03-27 01:00:00+00:00')]

Unnamed: 0,timestamp,value,date,time,year,month,day,weekday,dayofyear,quarter,weekofyear
11038,2022-03-27 03:00:00+02:00,251581.2461,2022-03-27,02:00,2022,3,27,6,86,1,12
26585,2022-03-27 03:00:00+02:00,346278.76,2022-03-27,03:00,2022,3,27,6,86,1,12


In [177]:
# so we import the data as usual, but drop the timestamp column as there are mistakes with the daylight saving time

Gas_new = pd.read_parquet(r"C:\Users\maxd2\OneDrive - Universitaet St.Gallen\Desktop\DSF\Gasverbrauch_new_2411.parquet")

Gas_new.drop(columns=["timestamp"], inplace=True)


# Assuming these columns exist in your DataFrame
Gas_new['timestamp'] = pd.to_datetime(
    Gas_new['year'].astype(str) + '-' + 
    Gas_new['month'].astype(str) + '-' + 
    Gas_new['day'].astype(str) + ' ' + 
    Gas_new['time'].astype(str),
    format='%Y-%m-%d %H:%M',
    errors='coerce', 
    utc=True
)

# Check the result
print(Gas_new[['year', 'month', 'day', 'time', 'timestamp']])


       year  month  day   time                 timestamp
0      2023     10   19  05:00 2023-10-19 05:00:00+00:00
1      2023     10   19  08:00 2023-10-19 08:00:00+00:00
2      2023     10   19  10:00 2023-10-19 10:00:00+00:00
3      2023     10   19  11:00 2023-10-19 11:00:00+00:00
4      2023     10   19  19:00 2023-10-19 19:00:00+00:00
...     ...    ...  ...    ...                       ...
28316  2023     12   31  22:00 2023-12-31 22:00:00+00:00
28317  2023      1    1  06:00 2023-01-01 06:00:00+00:00
28318  2023      1    1  10:00 2023-01-01 10:00:00+00:00
28319  2023      1    1  16:00 2023-01-01 16:00:00+00:00
28320  2023      1    1  21:00 2023-01-01 21:00:00+00:00

[28321 rows x 5 columns]


In [178]:
Gas_new[Gas_new["timestamp"] == pd.Timestamp('2022-03-27 03:00:00+00:00')]

Unnamed: 0,value,date,time,year,month,day,weekday,dayofyear,quarter,weekofyear,timestamp
26585,346278.76,2022-03-27,03:00,2022,3,27,6,86,1,12,2022-03-27 03:00:00+00:00


In [179]:
Gas_new['timestamp'] = Gas_new['timestamp'] + pd.Timedelta(hours= -1)


In [180]:
Gas_new[Gas_new["timestamp"] == pd.Timestamp('2022-03-27 00:00:00+00:00')]

Unnamed: 0,value,date,time,year,month,day,weekday,dayofyear,quarter,weekofyear,timestamp
11037,227953.7917,2022-03-27,01:00,2022,3,27,6,86,1,12,2022-03-27 00:00:00+00:00


In [181]:
Gas_new = Gas_new[Gas_new['timestamp'] >= '2022-01-01']

In [182]:
Gas_new.drop(columns=["year", "month", "day", "time", "date","weekday","dayofyear","quarter","weekofyear"], inplace=True)

In [187]:
Gas_new.rename(columns={"value": "Gasverbrauch"}, inplace=True)
Gas_new.rename(columns={"timestamp": "Datum"}, inplace=True)

In [1]:
Gas_new.to_csv("../../02_Cleaned/Features/05Gasverbrauch_cleaned.csv", index=False)

NameError: name 'Gas_new' is not defined