In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

# Raise pandas' display limits so wide/long frames are not truncated when shown.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 200)

In [2]:
# Optional one-time installs, left commented out so that "Run All" does not
# reinstall packages. pandas needs a parquet engine (pyarrow or fastparquet)
# for the read_parquet calls below; xgboost is not used in the cells shown here.
# !pip install pyarrow
# !pip install fastparquet
# !pip install xgboost


# Method of preprocessing
1. Import the datasets
2. Transform the datasets into hourly format
3. Handle missing values and rows
4. Final feature engineering and storing
5. Create additional combined super-dataset

# 1. Import the datasets

In [3]:
# Target tables (they hold the 'pv_measurement' column), one per location.
train_a, train_b, train_c = map(
    pd.read_parquet,
    ('A/train_targets.parquet', 'B/train_targets.parquet', 'C/train_targets.parquet'),
)

In [4]:
# Estimated training features, one frame per location.
x_train_estimated_a, x_train_estimated_b, x_train_estimated_c = map(
    pd.read_parquet,
    ('A/x_train_estimated.parquet', 'B/x_train_estimated.parquet', 'C/x_train_estimated.parquet'),
)

In [5]:
# Observed training features, one frame per location.
x_train_observed_a, x_train_observed_b, x_train_observed_c = map(
    pd.read_parquet,
    ('A/x_train_observed.parquet', 'B/x_train_observed.parquet', 'C/x_train_observed.parquet'),
)


In [6]:
# Estimated test features, one frame per location.
x_test_estimated_a, x_test_estimated_b, x_test_estimated_c = map(
    pd.read_parquet,
    ('A/x_test_estimated.parquet', 'B/x_test_estimated.parquet', 'C/x_test_estimated.parquet'),
)


In [7]:
# Stack the observed rows on top of the estimated rows for each location.
x_train_merged_a, x_train_merged_b, x_train_merged_c = (
    pd.concat([observed, estimated])
    for observed, estimated in (
        (x_train_observed_a, x_train_estimated_a),
        (x_train_observed_b, x_train_estimated_b),
        (x_train_observed_c, x_train_estimated_c),
    )
)

# 2. Transform into hourly

- Observed and estimated measurements are taken every 15 minutes, while energy is measured every hour
- We need to transform the measurements into hourly aggregations to match the labels

In [8]:
# Calculating from 15-minute intervals to hourly intervals based on different aggregation methods
def resample_to_hourly(df, aggregation_methods):
    """Aggregate a frame into one row per hour of 'date_forecast'.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a datetime-like 'date_forecast' column. The frame is left
        untouched, so the cell calling this function can be re-run safely
        (the previous in-place ``set_index`` removed the column from the
        caller's frame and a second call raised ``KeyError``).
    aggregation_methods : dict
        Maps a column name to the aggregation handed to ``Resampler.agg``
        (e.g. ``'mean'``, ``'sum'``, ``'max'``). Only the listed columns appear
        in the result.

    Returns
    -------
    pd.DataFrame
        One row per hourly bin, with 'date_forecast' restored as a regular
        column in front of the aggregated columns.
    """
    # A Timedelta rule gives the same hourly bins as the 'H' alias, without
    # relying on an alias that newer pandas releases deprecate.
    hourly_bins = df.set_index('date_forecast').resample(pd.Timedelta(hours=1))
    return hourly_bins.agg(aggregation_methods).reset_index()

# Aggregation methods based on features' names
# Maps every column to the aggregation used when the rows falling into the same
# hour are collapsed by resample_to_hourly: 'mean' for continuous readings,
# 'sum' for accumulated quantities and 'max' for index/flag columns and date_calc.
# The key order is left as-is; it presumably determines the column order of the
# resampled frames (and therefore of the stored CSV files).
# NOTE(review): several summed columns look like rolling-window accumulations
# (e.g. '*_1h:J', 'fresh_snow_12h:cm', 'fresh_snow_24h:cm'). If each 15-minute
# row already covers its full window, summing the rows of one hour counts
# overlapping windows several times - confirm against the data provider's docs.
aggregation_methods = {
    'date_calc' : 'max',
    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'mean',
    'clear_sky_energy_1h:J': 'sum',
    'clear_sky_rad:W': 'mean',
    'cloud_base_agl:m': 'mean',
    'dew_or_rime:idx': 'max',
    'dew_point_2m:K': 'mean',
    'diffuse_rad:W': 'mean',
    'diffuse_rad_1h:J': 'sum',
    'direct_rad:W': 'mean',
    'direct_rad_1h:J': 'sum',
    'effective_cloud_cover:p': 'mean',
    'elevation:m': 'mean',
    'fresh_snow_12h:cm': 'sum',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'sum',
    'fresh_snow_3h:cm': 'sum',
    'fresh_snow_6h:cm': 'sum',
    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'max',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'mean',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'mean',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',
    'sun_azimuth:d': 'mean',
    'sun_elevation:d': 'mean',
    'super_cooled_liquid_water:kgm2': 'mean',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean'
}

# Resample the merged training features of every location to hourly rows
x_train_a_hourly, x_train_b_hourly, x_train_c_hourly = (
    resample_to_hourly(merged, aggregation_methods)
    for merged in (x_train_merged_a, x_train_merged_b, x_train_merged_c)
)

# Same treatment for the test features
x_test_a_hourly, x_test_b_hourly, x_test_c_hourly = (
    resample_to_hourly(estimated, aggregation_methods)
    for estimated in (x_test_estimated_a, x_test_estimated_b, x_test_estimated_c)
)


# 3. Handle missing values and rows
- Remove NaN pv measurement values from y
- Remove rows that are not present in both x and y

In [9]:
# Index labels of the target rows whose 'pv_measurement' is missing
nan_indices_a = train_a.index[train_a['pv_measurement'].isna().values]
nan_indices_b = train_b.index[train_b['pv_measurement'].isna().values]
nan_indices_c = train_c.index[train_c['pv_measurement'].isna().values]

# Remove those labels from each target frame
train_a = train_a.drop(index=nan_indices_a)
train_b = train_b.drop(index=nan_indices_b)
train_c = train_c.drop(index=nan_indices_c)


In [10]:
# Remove all rows with date-time values that are not present in both x and y in order to synchronize x and its labels.
def remove_non_synchronous_rows(x_train, y_train, x_date_column='date_forecast', y_date_column='time'):
    """Keep only the rows whose timestamp occurs in both frames.

    Parameters
    ----------
    x_train : pd.DataFrame
        Feature frame holding its timestamps in ``x_date_column``.
    y_train : pd.DataFrame
        Target frame holding its timestamps in ``y_date_column``.
    x_date_column, y_date_column : str
        Names of the timestamp columns; values are parsed with ``pd.to_datetime``.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Filtered copies of ``x_train`` and ``y_train`` with their date columns
        converted to datetime. Row order and index labels are kept. The input
        frames are not modified.
    """
    # Parse into local Series instead of writing the parsed values back into
    # the caller's frames.
    x_dates = pd.to_datetime(x_train[x_date_column])
    y_dates = pd.to_datetime(y_train[y_date_column])

    # A row survives when its timestamp also exists on the other side.
    x_keep = x_dates.isin(y_dates)
    y_keep = y_dates.isin(x_dates)

    # Explicit copies: later in-place column edits on the results then do not
    # raise SettingWithCopyWarning.
    x_train_synced = x_train.loc[x_keep].copy()
    x_train_synced[x_date_column] = x_dates.loc[x_keep]
    y_train_synced = y_train.loc[y_keep].copy()
    y_train_synced[y_date_column] = y_dates.loc[y_keep]

    return x_train_synced, y_train_synced

# Keep, per location, only the timestamps shared by the features and the targets
x_train_a_hourly, train_a = remove_non_synchronous_rows(x_train=x_train_a_hourly, y_train=train_a)
x_train_b_hourly, train_b = remove_non_synchronous_rows(x_train=x_train_b_hourly, y_train=train_b)
x_train_c_hourly, train_c = remove_non_synchronous_rows(x_train=x_train_c_hourly, y_train=train_c)


# 4. Final feature engineering and storing
- Extract year, month, day and hour features from each datetime column
- Store the cleaned data for each location

In [11]:
# Replaces a datetime column by its year, month, day and hour components
def extract_date_features(df, date_column, prefix):
    """Split ``df[date_column]`` into four numeric columns, in place.

    The column is parsed with ``pd.to_datetime``, the columns
    ``<prefix>_year``, ``<prefix>_month``, ``<prefix>_day`` and
    ``<prefix>_hour`` are appended to ``df``, and the source column is
    dropped. ``df`` itself is modified; nothing is returned.
    """
    # Make sure the column really is datetime before using the .dt accessor
    df[date_column] = pd.to_datetime(df[date_column])
    components = df[date_column].dt

    derived_columns = {
        f'{prefix}_year': components.year,
        f'{prefix}_month': components.month,
        f'{prefix}_day': components.day,
        f'{prefix}_hour': components.hour,
    }
    for column_name, values in derived_columns.items():
        df[column_name] = values

    # The raw timestamp is no longer needed once its parts are stored
    df.drop(columns=[date_column], inplace=True)

# Every frame that still carries raw datetime columns
datasets = [
    x_train_a_hourly, x_train_b_hourly, x_train_c_hourly,
    x_test_a_hourly, x_test_b_hourly, x_test_c_hourly,
    train_a, train_b, train_c,
]

# (datetime column, prefix of the generated feature columns), in processing order
date_columns = (
    ('date_forecast', 'forecast'),
    ('date_calc', 'calc'),
    ('time', 'time'),
)

# Expand whichever of the datetime columns each frame contains
for dataset in datasets:
    for date_column, prefix in date_columns:
        if date_column in dataset.columns:
            extract_date_features(dataset, date_column, prefix)


In [17]:
# Store the cleaned datasets
output_dir = 'cleaned_data'

# Directories for each location
dir_a = os.path.join(output_dir, 'A')
dir_b = os.path.join(output_dir, 'B')
dir_c = os.path.join(output_dir, 'C')

# (directory, file-name suffix, training features, test features, targets) per location
cleaned_outputs = [
    (dir_a, 'a', x_train_a_hourly, x_test_a_hourly, train_a),
    (dir_b, 'b', x_train_b_hourly, x_test_b_hourly, train_b),
    (dir_c, 'c', x_train_c_hourly, x_test_c_hourly, train_c),
]

for dir_path, suffix, features_train_loc, features_test_loc, targets_loc in cleaned_outputs:
    # exist_ok=True creates output_dir and the location folder in one call and
    # is a no-op when they already exist, replacing the check-then-create pairs.
    os.makedirs(dir_path, exist_ok=True)

    # Written without the index, as x_train_<loc>.csv, x_test_<loc>.csv and train_<loc>.csv
    features_train_loc.to_csv(os.path.join(dir_path, f'x_train_{suffix}.csv'), index=False)
    features_test_loc.to_csv(os.path.join(dir_path, f'x_test_{suffix}.csv'), index=False)
    targets_loc.to_csv(os.path.join(dir_path, f'train_{suffix}.csv'), index=False)


# 5. Create super-sets for x_train, y_train and x_test
- For experimental purposes we create three supersets, each containing one-hot encoded columns that tell whether a row comes from location A, B or C.
- This is used to see if training a single model is better than training three separate models
- Sorted by forecast time (derived from `date_forecast`) for x_train, by measurement time (derived from `time`) for y_train, and by location and then forecast time for x_test (in order to match the submission format)

In [13]:
# Adds one-hot location columns (location_<name>) to every dataset, in place
def add_and_encode_location(dfs, locations):
    """One-hot encode the location of each frame, modifying the frames in place.

    Parameters
    ----------
    dfs : iterable of pd.DataFrame
        Frames to annotate.
    locations : iterable of str
        Location of the frame at the same position in ``dfs``. Values may
        repeat (e.g. ``['A', 'B', 'C', 'A', 'B', 'C']``).

    Each frame receives one integer column ``location_<name>`` per distinct
    location, set to 1 where ``<name>`` is the frame's own location and 0
    otherwise. As before, no column named 'location' remains on the frames.
    Nothing is returned.
    """
    locations = list(locations)
    # Encode each distinct location once; dict.fromkeys removes repeats while
    # keeping first-seen order, so the column order is the same as before.
    unique_locations = list(dict.fromkeys(locations))

    for df, loc in zip(dfs, locations):
        for unique_loc in unique_locations:
            # The whole frame belongs to one location, so the flag is a constant
            df[f'location_{unique_loc}'] = int(loc == unique_loc)
        # The earlier implementation went through a temporary 'location' column
        # and dropped it; keep the same end state if such a column exists.
        df.drop(columns='location', errors='ignore', inplace=True)

# Frames in the order training features, test features, targets, each for A, B and C
datasets = [
    x_train_a_hourly, x_train_b_hourly, x_train_c_hourly,
    x_test_a_hourly, x_test_b_hourly, x_test_c_hourly,
    train_a, train_b, train_c,
]
# The A, B, C pattern repeats once for each of the three groups above
locations = ['A', 'B', 'C'] * 3

add_and_encode_location(datasets, locations)

In [14]:
# Build the supersets: stack the A, B and C frames of each kind with a fresh 0..n-1 index
x_train_combined, x_test_combined, y_train_combined = (
    pd.concat(per_location, ignore_index=True)
    for per_location in (
        [x_train_a_hourly, x_train_b_hourly, x_train_c_hourly],
        [x_test_a_hourly, x_test_b_hourly, x_test_c_hourly],
        [train_a, train_b, train_c],
    )
)

In [15]:
# Sort keys: one-hot location flags and the timestamp parts of each frame
location_flags = ['location_A', 'location_B', 'location_C']
forecast_parts = ['forecast_year', 'forecast_month', 'forecast_day', 'forecast_hour']
time_parts = ['time_year', 'time_month', 'time_day', 'time_hour']

# Sorting x_train chronologically; the flags only break ties between locations
x_train = x_train_combined.sort_values(by=forecast_parts + location_flags)
# Sorting y_train with the equivalent keys so its rows line up with x_train
y_train = y_train_combined.sort_values(by=time_parts + location_flags)
# Sorting x_test by location first, then chronologically.
# The flags are sorted descending so that A (1,0,0) comes before B (0,1,0) and
# C (0,0,1); sorting them ascending put the locations in the order C, B, A.
# NOTE(review): A, B, C is assumed to be the order used in test.csv - confirm.
x_test = x_test_combined.sort_values(
    by=location_flags + forecast_parts,
    ascending=[False] * len(location_flags) + [True] * len(forecast_parts),
)

In [16]:
# Store the data as csv-files in a folder called cleaned_and_combined_data
output_dir = 'cleaned_and_combined_data'

# exist_ok=True makes this a no-op when the folder is already there
os.makedirs(output_dir, exist_ok=True)

# Save the dataframes into the folder (without the index column)
x_train.to_csv(os.path.join(output_dir, 'x_train_combined.csv'), index=False)
# NOTE(review): the unsorted x_test_combined (concatenated in the order A, B, C)
# is written here, not the sorted x_test - confirm that this is intended.
x_test_combined.to_csv(os.path.join(output_dir, 'x_test_combined.csv'), index=False)
y_train.to_csv(os.path.join(output_dir, 'y_train_combined.csv'), index=False)