In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [10]:
def load_and_preprocess_data(file_path='./data/time_series_60min_singleindex.csv'):
    """Load the hourly time-series CSV and return a cleaned German wind/solar/load frame.

    Processing steps, in order:

    1. Read the CSV with ``utc_timestamp`` parsed as the datetime index.
    2. Keep the four DE columns and give them short names.
    3. Fill gaps by linear interpolation (edge gaps are handled by fills).
    4. Per numeric column, blank out values whose |z-score| exceeds 3 and
       fill the resulting gaps again.
    5. Add ``total_wind`` = ``wind_onshore`` + ``wind_offshore``.
    6. Keep rows from 2015-01-01 onward and drop rows that still contain NaN.

    Parameters
    ----------
    file_path : str
        Path of the CSV file. It must contain ``utc_timestamp`` plus the four
        ``DE_*`` columns listed in ``column_names`` below.

    Returns
    -------
    pandas.DataFrame
        Columns ``wind_onshore``, ``wind_offshore``, ``solar_gen``, ``load``
        and ``total_wind``, indexed by ``utc_timestamp``.

    Raises
    ------
    KeyError
        If any of the expected ``DE_*`` columns is missing from the file.
    """
    # Source column -> short name. An explicit mapping keeps selection and
    # renaming in sync (a positional rename silently mislabels on reorder).
    column_names = {
        'DE_wind_onshore_generation_actual': 'wind_onshore',
        'DE_wind_offshore_generation_actual': 'wind_offshore',
        'DE_solar_generation_actual': 'solar_gen',   # weather proxy
        'DE_load_actual_entsoe_transparency': 'load',  # demand proxy
    }

    raw = pd.read_csv(file_path, parse_dates=['utc_timestamp'], index_col='utc_timestamp')
    df = raw[list(column_names)].rename(columns=column_names)

    # Initial missing-value handling. Interpolate FIRST: forward-filling before
    # interpolating would already close every interior gap and leave the
    # interpolation with nothing to do. The trailing ffill only covers gaps at
    # the end of a series; leading gaps are back-filled after outlier removal.
    df = df.interpolate(method='linear').ffill()

    # Remove outliers per numeric column (|z| > 3), keeping index alignment.
    z_limit = 3
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        s = df[col]
        std = s.std(ddof=0)
        # A constant or all-NaN column has no usable z-score; leave it as is.
        if std == 0 or np.isnan(std):
            continue
        z = (s - s.mean()) / std
        df.loc[z.abs() > z_limit, col] = np.nan

    # Re-interpolate after outlier removal; ffill/bfill close gaps at both ends.
    df = df.interpolate(method='linear').ffill().bfill()

    # Create total wind yield target AFTER cleaning so it is built from the
    # cleaned components.
    df['total_wind'] = df['wind_onshore'] + df['wind_offshore']

    # Filter to 2015 onward; dropna removes rows a fully-empty column left NaN.
    df = df[df.index >= '2015-01-01'].dropna()

    print(f"Processed data shape: {df.shape} (2015–mid-2020, hourly)")
    return df



In [11]:
# Build the cleaned frame, then show its first rows and summary statistics.
df = load_and_preprocess_data()
print(df.head())
print(df.describe())

Processed data shape: (50400, 5) (2015–mid-2020, hourly)
                           wind_onshore  wind_offshore  solar_gen     load  \
utc_timestamp                                                                
2015-01-01 00:00:00+00:00        8336.0          517.0       71.0  41151.0   
2015-01-01 01:00:00+00:00        8540.0          514.0       71.0  40135.0   
2015-01-01 02:00:00+00:00        8552.0          518.0       71.0  39106.0   
2015-01-01 03:00:00+00:00        8643.0          520.0       71.0  38765.0   
2015-01-01 04:00:00+00:00        8712.0          520.0       71.0  38941.0   

                           total_wind  
utc_timestamp                          
2015-01-01 00:00:00+00:00      8853.0  
2015-01-01 01:00:00+00:00      9054.0  
2015-01-01 02:00:00+00:00      9070.0  
2015-01-01 03:00:00+00:00      9163.0  
2015-01-01 04:00:00+00:00      9232.0  
       wind_onshore  wind_offshore     solar_gen          load    total_wind
count  50400.000000   50400.000000  504