In [46]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import holidays
from autogluon.tabular import TabularDataset, TabularPredictor

In [47]:
# Read the four challenge CSV files from the working directory. The names on
# the left are bound, in order, to the files listed on the right.
X_test, X_train, y_train, y_pred = (
    pd.read_csv(csv_name)
    for csv_name in (
        'X_test_GgyECq8.csv',
        'X_train_Wwou3IE.csv',
        'y_train_jJtXgMX.csv',
        'y_random_pt8afo8.csv',
    )
)

In [48]:
def compute_weighted_accuracy(y_actual, y_pred):
    """Return the magnitude-weighted directional accuracy, rounded to 2 decimals.

    Each sample counts as correct when the sign of the prediction equals the
    sign of the actual value, and is weighted by the absolute actual value.

    Parameters
    ----------
    y_actual, y_pred : pandas.DataFrame, pandas.Series or array-like
        A DataFrame must contain a 'spot_id_delta' column, which is the one
        used. Anything else is converted to a flat float array and compared
        position by position (index labels are ignored).

    Returns
    -------
    float
        sum(|actual| where the signs agree) / sum(|actual|), rounded to two
        decimals. NaN when the total weight is zero (empty or all-zero
        actuals), because the ratio is undefined.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of values.
    """
    def _as_vector(values):
        # Going through numpy drops index labels, so two Series with different
        # indices are compared positionally instead of raising on `==`.
        if isinstance(values, pd.DataFrame):
            values = values["spot_id_delta"]
        return np.asarray(values, dtype=float).ravel()

    actual = _as_vector(y_actual)
    predicted = _as_vector(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_actual and y_pred must have the same length, "
            f"got {actual.size} and {predicted.size}"
        )

    weights = np.abs(actual)
    total_weight = weights.sum()
    if total_weight == 0:
        # Undefined ratio: report NaN explicitly rather than via a 0/0 warning.
        return float("nan")

    same_direction = np.sign(actual) == np.sign(predicted)
    return round(float(weights[same_direction].sum() / total_weight), 2)

In [49]:
def preprocess_data(df):
    """Build the model feature frame from a raw feature table.

    Steps, in order:
      1. If a 'DELIVERY_START' column is present, parse it as UTC, make it the
         index and convert the index to the 'Europe/Berlin' timezone.
      2. Add 'hour', 'dayofweek' and 'month' columns derived from the index.
      3. Fill missing values in numeric columns with that column's mean,
         computed on the frame being processed.
      4. Drop 'predicted_spot_price' if it exists.
      5. Add 'delta_load_wo_renewables' = load_forecast minus the sum of the
         coal, gas and nuclear available power columns.

    The input frame is not modified; a new frame is returned. Calling the
    function again on its own output is supported (step 1 is then skipped).

    Raises
    ------
    ValueError
        If, after step 1, the index is not a DatetimeIndex.
    KeyError
        If one of the columns needed for step 5 is missing.
    """
    # Work on a copy so the caller's frame is never written to.
    df = df.copy()

    if 'DELIVERY_START' in df.columns:
        df['DELIVERY_START'] = pd.to_datetime(df['DELIVERY_START'], utc=True)
        df = df.set_index('DELIVERY_START')
        df.index = df.index.tz_convert('Europe/Berlin')

    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError(
            "preprocess_data needs a 'DELIVERY_START' column or a DatetimeIndex"
        )

    # Calendar features read from the (local-time) index.
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    df['month'] = df.index.month

    # numeric_only=True keeps this working when a non-numeric column is
    # present; recent pandas versions raise on such columns otherwise.
    df = df.fillna(df.mean(numeric_only=True))

    df = df.drop(columns=['predicted_spot_price'], errors='ignore')

    # 'nucelear_power_available' is spelled this way in the data columns.
    dispatchable_power = (
        df['coal_power_available']
        + df['gas_power_available']
        + df['nucelear_power_available']
    )
    df['delta_load_wo_renewables'] = df['load_forecast'] - dispatchable_power

    return df

def preprocess_y(y):
    """Index a target frame by its 'DELIVERY_START' timestamps.

    If a 'DELIVERY_START' column is present it is parsed as UTC, set as the
    index and converted to the 'Europe/Berlin' timezone. Without that column
    the data is returned unchanged. In both cases the input frame itself is
    not modified; a new frame is returned.
    """
    # Copy first so the parsed timestamps are not written into the caller's frame.
    y = y.copy()
    if 'DELIVERY_START' in y.columns:
        y['DELIVERY_START'] = pd.to_datetime(y['DELIVERY_START'], utc=True)
        y = y.set_index('DELIVERY_START')
        y.index = y.index.tz_convert('Europe/Berlin')
    return y

def split_train_data(X, y):
    """Cut X and y into three consecutive positional parts.

    The part size is len(X) // 3; the third part also receives any remainder
    rows. The same positional boundaries are applied to both inputs.

    Returns a 6-tuple: the three parts of X followed by the three parts of y.
    """
    third = len(X) // 3
    # None as a bound means "open-ended", exactly like an omitted slice bound.
    bounds = ((None, third), (third, 2 * third), (2 * third, None))
    X_parts = [X.iloc[start:stop] for start, stop in bounds]
    y_parts = [y.iloc[start:stop] for start, stop in bounds]
    return (*X_parts, *y_parts)

def evaluate_model(model, X, y_actual):
    """Score a fitted model on (X, y_actual).

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X : pandas.DataFrame
        Feature frame passed to ``model.predict``.
    y_actual : pandas.DataFrame
        True targets; the weighted accuracy reads its 'spot_id_delta' column.

    Returns
    -------
    tuple
        ``(mse, mae, weighted_acc)``: mean squared error, mean absolute error
        and the value returned by ``compute_weighted_accuracy``.
    """
    predictions = model.predict(X)

    # Flatten to a plain array before building the frame. Building a DataFrame
    # directly from a named Series with columns=[...] selects by label, which
    # silently produces an all-NaN column if the Series carries another name.
    predicted_values = np.asarray(predictions).ravel()
    y_pred_df = pd.DataFrame({"spot_id_delta": predicted_values}, index=X.index)
    weighted_acc = compute_weighted_accuracy(y_actual, y_pred_df)

    mse = mean_squared_error(y_actual, predicted_values)
    mae = mean_absolute_error(y_actual, predicted_values)
    return mse, mae, weighted_acc

# Run the feature pipeline on the train and test feature tables (train first).
X_train, X_test = (preprocess_data(features) for features in (X_train, X_test))

# The target table only gets the DELIVERY_START index treatment.
y_train = preprocess_y(y_train)

In [50]:
# Show the preprocessed training features as the cell output.
X_train

Unnamed: 0_level_0,load_forecast,coal_power_available,gas_power_available,nucelear_power_available,wind_power_forecasts_average,solar_power_forecasts_average,wind_power_forecasts_std,solar_power_forecasts_std,hour,dayofweek,month,delta_load_wo_renewables
DELIVERY_START,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-01-01 02:00:00+01:00,49439.0,3386.0,11487.0,44118.0,3035.0,0.0,79.248348,0.000000,2,5,1,-9552.0
2022-01-01 03:00:00+01:00,46511.0,3386.0,11487.0,44118.0,3143.0,0.0,61.776532,0.000000,3,5,1,-12480.0
2022-01-01 04:00:00+01:00,45158.0,3386.0,11487.0,44118.0,3288.0,0.0,44.291112,0.000000,4,5,1,-13833.0
2022-01-01 05:00:00+01:00,44779.0,3386.0,11487.0,44118.0,3447.0,0.0,36.127588,0.000000,5,5,1,-14212.0
2022-01-01 06:00:00+01:00,45284.0,3386.0,11487.0,44118.0,3679.0,0.0,30.983023,0.000000,6,5,1,-13707.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-29 19:00:00+02:00,50814.0,3386.0,11952.0,38320.0,7552.0,651.0,247.408490,7.821622,19,2,3,-2844.0
2023-03-29 20:00:00+02:00,50628.0,3386.0,11952.0,38320.0,8338.0,109.0,155.795012,2.534054,20,2,3,-3030.0
2023-03-29 21:00:00+02:00,48201.0,3386.0,11952.0,38320.0,9115.0,0.0,126.884684,0.000000,21,2,3,-5457.0
2023-03-29 22:00:00+02:00,47967.0,3386.0,11952.0,38320.0,9636.0,0.0,156.669189,0.000000,22,2,3,-5691.0


In [51]:
# The split below is positional and the label columns are attached by index
# alignment, so features and targets must share exactly the same index.
assert X_train.index.equals(y_train.index), "X_train and y_train are not aligned"

# Chronological hold-out: the first 80% of rows are used for fitting, the last
# 20% are passed to AutoGluon as tuning data and reused for validation scores.
split = int(len(X_train)*0.8)
X_train1, X_train2 = X_train.iloc[:split], X_train.iloc[split:]
y_train1, y_train2 = y_train.iloc[:split], y_train.iloc[split:]

# AutoGluon expects the label as a column of the training table. Attach the
# target as a Series: assigning the whole y frame only works while it has
# exactly one column.
target = y_train1.columns[0]
train_data = TabularDataset(X_train1.copy())
train_data[target] = y_train1[target]
validation_data = TabularDataset(X_train2.copy())
validation_data[target] = y_train2[target]
model = TabularPredictor(label=target, verbosity=0).fit(train_data, tuning_data=validation_data)

# Prepare test data
test_data = TabularDataset(X_test.copy())
y_pred = model.predict(test_data)
y_pred = pd.DataFrame(y_pred)

# Train data predictions (not used by the visible code in this cell; kept so
# the name stays defined for any later cell that reads it).
y_pred_train = model.predict(train_data.drop(columns=[target]))
y_pred_train = pd.DataFrame(y_pred_train)

# Save predictions: one 'spot_id_delta' column plus the index as a regular column.
y_pred.columns = ["spot_id_delta"]
y_pred = y_pred.reset_index()
y_pred.to_csv('y_pred_test.csv', index=False)

# Weighted directional accuracy (third element of evaluate_model's result).
print('TRAIN: ', evaluate_model(model, X_train1, y_train1)[2])
print('Validation: ', evaluate_model(model, X_train2, y_train2)[2])
print('Full TRAIN: ', evaluate_model(model, X_train, y_train)[2])

model.leaderboard(validation_data)

		module 'pandas.core.strings' has no attribute 'StringMethods'
		module 'pandas.core.strings' has no attribute 'StringMethods'
		module 'pandas.core.strings' has no attribute 'StringMethods'


TRAIN:  0.71
Validation:  0.57
Full TRAIN:  0.69


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-16.913286,-16.913286,root_mean_squared_error,0.085829,0.075917,15.976072,0.004,0.000778,0.029329,2,True,9
1,CatBoost,-16.932592,-16.932592,root_mean_squared_error,0.016613,0.0,1.651879,0.016613,0.0,1.651879,1,True,4
2,XGBoost,-17.0365,-17.0365,root_mean_squared_error,0.0,0.020177,0.070928,0.0,0.020177,0.070928,1,True,7
3,NeuralNetTorch,-18.553637,-18.553637,root_mean_squared_error,0.038646,0.020895,48.982542,0.038646,0.020895,48.982542,1,True,8
4,NeuralNetFastAI,-18.673413,-18.673413,root_mean_squared_error,0.065216,0.054962,14.223937,0.065216,0.054962,14.223937,1,True,6
5,RandomForestMSE,-19.380335,-19.380335,root_mean_squared_error,0.494277,0.03164,1.866882,0.494277,0.03164,1.866882,1,True,3
6,KNeighborsUnif,-22.216666,-22.216666,root_mean_squared_error,0.029155,0.020694,0.031883,0.029155,0.020694,0.031883,1,True,1
7,KNeighborsDist,-22.397829,-22.397829,root_mean_squared_error,0.02581,0.02053,0.027041,0.02581,0.02053,0.027041,1,True,2
8,ExtraTreesMSE,-24.252334,-24.252334,root_mean_squared_error,0.374077,0.037924,0.549128,0.374077,0.037924,0.549128,1,True,5
