In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
import holidays
from autogluon.tabular import TabularDataset, TabularPredictor

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Raw competition inputs: training features and their targets.
X_train = pd.read_csv('X_train_Wwou3IE.csv')
y_train = pd.read_csv('y_train_jJtXgMX.csv')

# Features to predict on, plus the file named 'y_random' loaded under y_pred.
X_test = pd.read_csv('X_test_GgyECq8.csv')
y_pred = pd.read_csv('y_random_pt8afo8.csv')

In [3]:
def compute_weighted_accuracy(y_actual, y_pred):
    """Return the magnitude-weighted directional accuracy, rounded to 2 decimals.

    A prediction counts as correct when its sign equals the sign of the actual
    value; each sample is weighted by the absolute actual value.

    Parameters
    ----------
    y_actual, y_pred : pandas.DataFrame, pandas.Series or array-like
        A DataFrame must contain a 'spot_id_delta' column. Any other input is
        converted to a flat float array and compared positionally, so Series
        with different indexes and (n, 1) arrays are handled.

    Returns
    -------
    float
        sum(weight * correct) / sum(weight), rounded to 2 decimals, or NaN
        when every actual value is zero (the ratio is then undefined).

    Raises
    ------
    ValueError
        If the two inputs have different lengths (a single-value prediction
        is still broadcast against all actual values).
    """
    def _delta_values(data):
        # DataFrames carry the values in 'spot_id_delta'; everything else is
        # flattened so the comparison is purely positional.
        if isinstance(data, pd.DataFrame):
            data = data["spot_id_delta"]
        return np.asarray(data, dtype=float).ravel()

    actual = _delta_values(y_actual)
    predicted = _delta_values(y_pred)

    if predicted.size != actual.size and predicted.size != 1:
        raise ValueError(
            f"y_actual has {actual.size} values but y_pred has {predicted.size}"
        )

    weights = np.abs(actual)
    total_weight = np.sum(weights)
    if total_weight == 0:
        # Avoid a 0/0 division (and its RuntimeWarning); the score is undefined.
        return float("nan")

    correct_direction = (np.sign(actual) == np.sign(predicted)).astype(int)
    weighted_accuracy = np.sum(correct_direction * weights) / total_weight

    return round(weighted_accuracy, 2)

In [4]:
def preprocess_data(df):
    """Build the model feature table from a raw feature DataFrame.

    The input frame is never modified; a processed copy is returned.

    Steps, in order:
      1. If a 'DELIVERY_START' column exists, parse it as UTC, make it the
         index and convert the index to 'Europe/Berlin'.
      2. Add 'hour', 'dayofweek' and 'month' from the index.
      3. Fill missing values with the per-column mean of numeric columns.
      4. One-hot encode categorical columns (first level dropped).
      5. Drop 'predicted_spot_price' if present.
      6. Add supply/demand features: 'delta_load',
         'delta_load_wo_renewables', 'renewables_share', 'wind_share',
         'facteur_de_charge' and 'cyclicite_demand'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'load_forecast', 'coal_power_available',
        'gas_power_available', 'nucelear_power_available',
        'wind_power_forecasts_average' and 'solar_power_forecasts_average'.
        Without a 'DELIVERY_START' column the index must already be a
        DatetimeIndex.

    Returns
    -------
    pandas.DataFrame
        The processed feature table.

    Notes
    -----
    The fill means and the 'facteur_de_charge' denominator are computed from
    the frame passed in, so they differ between separate calls (e.g. train
    versus test).
    """
    # Work on a copy so re-running a cell never sees a half-processed input.
    df = df.copy()

    if 'DELIVERY_START' in df.columns:
        df['DELIVERY_START'] = pd.to_datetime(df['DELIVERY_START'], utc=True)
        df = df.set_index('DELIVERY_START')
        df.index = df.index.tz_convert('Europe/Berlin')

    # Calendar features taken from the (local-time) index.
    df['hour'] = df.index.hour
    df['dayofweek'] = df.index.dayofweek
    df['month'] = df.index.month

    # numeric_only=True keeps this working when non-numeric columns are present.
    df = df.fillna(df.mean(numeric_only=True))

    df = pd.get_dummies(df, drop_first=True)  # Encode categorical variables

    df = df.drop(columns=['predicted_spot_price'], errors='ignore')

    # Shared building blocks; the summation order matches the original
    # expressions so the resulting floats are unchanged.
    conventional = (df['coal_power_available'] + df['gas_power_available'] +
                    df['nucelear_power_available'])
    total_supply = (conventional + df['wind_power_forecasts_average'] +
                    df['solar_power_forecasts_average'])
    renewables = df['wind_power_forecasts_average'] + df['solar_power_forecasts_average']

    df['delta_load'] = df['load_forecast'] - total_supply
    df['delta_load_wo_renewables'] = df['load_forecast'] - conventional
    df['renewables_share'] = renewables / total_supply
    df['wind_share'] = df['wind_power_forecasts_average'] / total_supply

    # Total supply relative to the sum of each source's maximum in this frame.
    max_supply = (df['coal_power_available'].max() +
                  df['gas_power_available'].max() +
                  df['nucelear_power_available'].max() +
                  df['wind_power_forecasts_average'].max() +
                  df['solar_power_forecasts_average'].max())
    df['facteur_de_charge'] = total_supply / max_supply

    # Load modulated by a 24-hour sine cycle (the factor was mistyped as 242).
    df['cyclicite_demand'] = df['load_forecast'] * np.sin(2 * np.pi * df['hour'] / 24)

    return df

def preprocess_y(y):
    """Index the target frame by its 'DELIVERY_START' timestamps.

    If `y` has a 'DELIVERY_START' column, it is parsed as UTC, moved to the
    index, and the index is converted to 'Europe/Berlin'. The caller's frame
    is not modified. A frame without that column is returned as-is.

    Parameters
    ----------
    y : pandas.DataFrame
        Target frame, optionally containing a 'DELIVERY_START' column.

    Returns
    -------
    pandas.DataFrame
        The re-indexed frame (or `y` itself when there is nothing to do).
    """
    if 'DELIVERY_START' in y.columns:
        # assign() builds a new frame, so the parsed column never leaks back
        # into the DataFrame the caller passed in.
        y = y.assign(DELIVERY_START=pd.to_datetime(y['DELIVERY_START'], utc=True))
        y = y.set_index('DELIVERY_START')
        y.index = y.index.tz_convert('Europe/Berlin')

    return y

def split_train_data(X, y):
    """Cut X and y into three consecutive positional chunks.

    The chunk size is len(X) // 3; the last chunk absorbs any remainder.
    Both inputs are sliced at the same positions with `.iloc`.

    Returns
    -------
    tuple
        (X_1, X_2, X_3, y_1, y_2, y_3)
    """
    third = len(X) // 3
    # Same positional boundaries for features and targets.
    bounds = (slice(None, third), slice(third, 2 * third), slice(2 * third, None))
    feature_parts = tuple(X.iloc[b] for b in bounds)
    target_parts = tuple(y.iloc[b] for b in bounds)
    return (*feature_parts, *target_parts)

def evaluate_model(model, X, y_actual):
    """Score `model` on (X, y_actual).

    Parameters
    ----------
    model : object
        Anything exposing `predict(X)`.
    X : pandas.DataFrame
        Feature table; its index is reused for the prediction frame.
    y_actual : pandas.DataFrame or array-like
        True values, in the same row order as `X`.

    Returns
    -------
    tuple
        (mse, mae, weighted_accuracy) where the first two come from
        scikit-learn and the last from `compute_weighted_accuracy`.
    """
    # Flatten to a plain array first: passing a named Series together with
    # `columns=[...]` makes pandas select by name, which yields an all-NaN
    # column whenever the Series is not itself called 'spot_id_delta'.
    predictions = np.asarray(model.predict(X)).ravel()

    y_pred_df = pd.DataFrame({"spot_id_delta": predictions}, index=X.index)
    weighted_acc = compute_weighted_accuracy(y_actual, y_pred_df)

    mse = mean_squared_error(y_actual, predictions)
    mae = mean_absolute_error(y_actual, predictions)
    return mse, mae, weighted_acc

def launch(X_train, y_train, X_test):
    """Preprocess the data, fit an AutoGluon predictor and predict the test set.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Raw feature tables, passed through `preprocess_data`.
    y_train : pandas.DataFrame
        Raw targets, passed through `preprocess_y`; its first column is used
        as the label.

    Returns
    -------
    tuple
        (X_test, model, y_pred): the preprocessed test features, the fitted
        TabularPredictor, and a DataFrame holding the former index as a
        column plus the predictions in 'spot_id_delta'.

    Side effects
    ------------
    Prints the (mse, mae, weighted accuracy) tuple measured on the training
    data, then the AutoGluon leaderboard.
    """
    # Preprocess X_train and X_test
    X_train = preprocess_data(X_train)
    X_test = preprocess_data(X_test)

    # Preprocess y_train (only index adjustment)
    y_train = preprocess_y(y_train)

    # AutoGluon: attach the label as a Series so extra columns in y_train
    # cannot break the single-column assignment.
    train_data = TabularDataset(X_train.copy())
    target = y_train.columns[0]
    train_data[target] = y_train[target]
    model = TabularPredictor(label=target, verbosity=0).fit(train_data)

    # Predict the test set and shape the result for submission.
    test_data = TabularDataset(X_test.copy())
    y_pred = pd.DataFrame(model.predict(test_data))
    y_pred.columns = ["spot_id_delta"]
    y_pred = y_pred.reset_index()
    # y_pred.to_csv('y_pred_test.csv', index=False)

    # These scores are in-sample (training data), so they are optimistic.
    print(evaluate_model(model, X_train, y_train))

    # test_data carries no label column, so it cannot be scored; show the
    # leaderboard with the validation scores computed during fit instead.
    print(model.leaderboard())

    return X_test, model, y_pred

In [None]:
# Execute the pipeline
# NOTE: this rebinds X_test to the preprocessed features returned by launch();
# the raw X_test loaded earlier is no longer reachable under that name.
X_test, model, y_pred_test = launch(X_train, y_train, X_test)

		module 'pandas.core.strings' has no attribute 'StringMethods'
		module 'pandas.core.strings' has no attribute 'StringMethods'
