## Imports

In [138]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost
import optuna
import time
import lightgbm as lgb
import os

from catboost import CatBoostRegressor
from sklearn.metrics import root_mean_squared_error
from scipy.optimize import minimize as sp_minimize

## Configs

In [139]:
train_path = '../data/train.csv'
test_path = '../data/test_for_participants.csv'
sample_path = '../data/sample_submission.csv'
VAL_START = '2025-01-01'
SEED = 42
N_TRIALS_LGB = 30
SAVED_LGB_PATH = '../models/lgb_final.txt'
ROUND_MULTIPLIER = 1.15

os.makedirs('../models', exist_ok=True)

## Data loading

In [140]:
train_raw = pd.read_csv(train_path)
test_raw = pd.read_csv(test_path)
sample_sub = pd.read_csv(sample_path)

In [141]:
for df in [train_raw, test_raw]:
    df['delivery_start'] = pd.to_datetime(df['delivery_start'])
    df['delivery_end'] = pd.to_datetime(df['delivery_end'])

train_raw['is_test'] = 0
test_raw['is_test'] = 1
test_raw['target'] = np.nan

df = pd.concat([train_raw, test_raw], ignore_index=True)
df = df.sort_values(['market', 'delivery_start']).reset_index(drop=True)
df

Unnamed: 0,id,target,market,global_horizontal_irradiance,diffuse_horizontal_irradiance,direct_normal_irradiance,cloud_cover_total,cloud_cover_low,cloud_cover_mid,cloud_cover_high,...,wind_speed_80m,wind_direction_80m,wind_gust_speed_10m,wind_speed_10m,solar_forecast,wind_forecast,load_forecast,delivery_start,delivery_end,is_test
0,0,-1.913,Market A,0.0,0.0,0.0,2.0,0.0,0.0,2.0,...,31.253719,245.501450,25.199999,15.077082,0.0,24050.1,38163.0100,2023-01-01 00:00:00,2023-01-01 01:00:00,0
1,5,-0.839,Market A,0.0,0.0,0.0,15.0,0.0,0.0,15.0,...,30.918108,242.241547,23.400000,14.186923,0.0,23886.3,37379.1898,2023-01-01 01:00:00,2023-01-01 02:00:00,0
2,10,-1.107,Market A,0.0,0.0,0.0,17.0,0.0,0.0,17.0,...,26.983196,224.999893,21.240000,12.413477,0.0,23366.5,36336.8303,2023-01-01 02:00:00,2023-01-01 03:00:00,0
3,15,0.035,Market A,0.0,0.0,0.0,16.0,0.0,0.0,16.0,...,22.218153,229.600174,16.199999,10.483357,0.0,22829.8,35337.7595,2023-01-01 03:00:00,2023-01-01 04:00:00,0
4,20,-0.829,Market A,0.0,0.0,0.0,10.0,0.0,0.0,10.0,...,27.210381,244.113022,18.359999,11.918120,0.0,22347.6,34474.3403,2023-01-01 04:00:00,2023-01-01 05:00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145701,146752,,Market F,0.0,0.0,0.0,100.0,100.0,100.0,100.0,...,24.863468,357.510498,27.359999,17.673029,0.0,9943.7,53190.5901,2025-11-30 18:00:00,2025-11-30 19:00:00,1
145702,146758,,Market F,0.0,0.0,0.0,100.0,100.0,100.0,100.0,...,23.857979,354.805664,27.719999,16.700275,0.0,10235.4,54071.1413,2025-11-30 19:00:00,2025-11-30 20:00:00,1
145703,146764,,Market F,0.0,0.0,0.0,100.0,100.0,100.0,71.0,...,21.485697,351.326904,25.559999,15.745627,0.0,10333.9,54517.9095,2025-11-30 20:00:00,2025-11-30 21:00:00,1
145704,146770,,Market F,0.0,0.0,0.0,100.0,100.0,100.0,100.0,...,19.559633,353.659912,21.599998,14.471821,0.0,10214.1,54572.1696,2025-11-30 21:00:00,2025-11-30 22:00:00,1


## Feature engineering

In [142]:
ds = df["delivery_start"]
df["hour"]         = ds.dt.hour
df["day_of_week"]  = ds.dt.dayofweek
df["day_of_month"] = ds.dt.day
df["month"]        = ds.dt.month
df["quarter"]      = ds.dt.quarter
df["day_of_year"]  = ds.dt.dayofyear
df["year"]         = ds.dt.year
df["is_weekend"]   = (ds.dt.dayofweek >= 5).astype(np.int8)
df["week_of_year"] = ds.dt.isocalendar().week.astype(int)

In [143]:
market_map = {f"Market {c}": i for i, c in enumerate("ABCDEF")}
df["market_id"] = df["market"].map(market_map).astype(np.int8)

In [144]:
# Demand measurements
df["residual_demand"] = df["load_forecast"] - df["solar_forecast"] - df["wind_forecast"]
df["tightness_ratio"] = df["residual_demand"] / (df["load_forecast"] + 1)

df["tightness_x_month"] = df["tightness_ratio"] * df["month"]
df["tightness_x_hour"] = df["tightness_ratio"] * df["hour"]

df.drop(columns=["tightness_ratio", "residual_demand"], inplace=True)

In [145]:
# 1. Weather Momentum (Deltas)
# Prices react to the CHANGE in wind and solar, not just the level.
for col in ['wind_speed_80m', 'solar_forecast', 'load_forecast']:
    df[f'{col}_diff_1h'] = df.groupby('market_id')[col].diff(1)
    df[f'{col}_diff_3h'] = df.groupby('market_id')[col].diff(3)

# 2. Temperature Anomaly (Relative to the last 24h)
# Is it hotter than it was yesterday? (Better than raw temp)
df['temp_24h_mean'] = df.groupby('market_id')['air_temperature_2m'].transform(
    lambda x: x.rolling(24, min_periods=1).mean()
)
df['temp_anomaly'] = df['air_temperature_2m'] - df['temp_24h_mean']

# 3. Wind Direction Stability (Circular Encoding)
# RAW degrees (0-360) confuse models. 359 is the same as 1.
df['wind_dir_sin'] = np.sin(np.deg2rad(df['wind_direction_80m']))
df['wind_dir_cos'] = np.cos(np.deg2rad(df['wind_direction_80m']))

# 4. Clean Physics (Removing the raw ones, keeping interactions)
df["residual_demand"] = df["load_forecast"] - df["solar_forecast"] - df["wind_forecast"]
df["tightness_ratio"] = df["residual_demand"] / (df["load_forecast"] + 1)
df["tightness_x_month"] = df["tightness_ratio"] * df["month"]
df.drop(columns=['tightness_ratio', 'residual_demand'], inplace=True)

## Prepare X and y

In [146]:
observed_df = df[df['is_test'] == 0].copy()
test_df = df[df['is_test'] == 1].copy()

val_mask = observed_df['delivery_start'] >= VAL_START
train_df = observed_df[~val_mask]
val_df = observed_df[val_mask]

In [147]:
drop_cols = set(['id', 'target', 'market', 'delivery_start', 'delivery_end', 'is_test'])
feat_cols = sorted([c for c in df.columns if c not in drop_cols])
cat_idx = [feat_cols.index('market_id')]

X_train = train_df[feat_cols]
y_train_real = train_df['target'].values
y_train = np.arcsinh(train_df['target'].values)

X_val = val_df[feat_cols]
y_val_real = val_df['target'].values
y_val = np.arcsinh(val_df['target'].values)

X_all = observed_df[feat_cols]
y_all_real = observed_df['target'].values
y_all = np.arcsinh(observed_df['target'].values)

X_test = test_df[feat_cols]

## LightGBM baseline

In [148]:
print("Starting baseline evaluation...")
start_time = time.time()

# 1. The "Goldilocks" Hyperparameters
baseline_params = {
    "objective": "huber",     # Robust to the massive price spikes
    "alpha": 1.5,             # Moderate Huber threshold
    "metric": "rmse",
    "verbosity": -1,
    "seed": 42,               # STRICTLY FIXED for apples-to-apples comparison
    "n_jobs": -1,
    
    # Tree Structure (Moderate)
    "max_depth": 8,           # Deep enough for physics/weather interactions, shallow enough to stop overfitting
    "num_leaves": 127,        # 2^7 - 1 (Standard moderate size; your 980 leaves was massively overfitting)
    "min_child_samples": 50,  # Forces the model to generalize terminal nodes
    
    # Learning & Sampling
    "learning_rate": 0.05,    # Fast enough to train quickly, slow enough to learn smoothly
    "feature_fraction": 0.75, # Randomly hides 25% of features per tree so 'tightness_ratio' doesn't dominate every split
    "bagging_fraction": 0.8,  # Uses 80% of rows per tree to improve generalization
    "bagging_freq": 1,        # Perform bagging every iteration
    
    # Light Regularization
    "reg_alpha": 0.1,         # L1 regularization (Lasso)
    "reg_lambda": 0.1         # L2 regularization (Ridge)
}

# 2. Prepare Datasets
dataset_train = lgb.Dataset(X_train, y_train, categorical_feature=cat_idx, free_raw_data=False)
dataset_val = lgb.Dataset(X_val, y_val, reference=dataset_train, free_raw_data=False)

# 3. Train the Model (No Optuna)
model = lgb.train(
    baseline_params, 
    dataset_train, 
    num_boost_round=3000,     # High maximum rounds...
    valid_sets=[dataset_val],
    callbacks=[
        lgb.early_stopping(100, verbose=False), # ...but stops early if validation stops improving
        lgb.log_evaluation(200)                 # Prints progress every 200 rounds
    ],
)

# 4. Predict and Back-Transform
# (Predicting on X_val, then reversing the arcsinh transformation)
preds = model.predict(X_val)
real_preds = np.sinh(preds)

# 5. Evaluate True RMSE
final_rmse = root_mean_squared_error(y_val_real, real_preds)

print("\n" + "="*50)
print(f"âœ… BASELINE RMSE: {final_rmse:.4f}")
print(f"   Iterations: {model.best_iteration}")
print(f"   Time taken: {time.time() - start_time:.0f}s")
print("="*50 + "\n")

# Optional: Print Top 10 features to verify your new feature is being used
imp = pd.Series(model.feature_importance("gain"), index=feat_cols)
print("Top 10 Features (Gain):")
print(imp.nlargest(10).round(0))

Starting baseline evaluation...
[200]	valid_0's rmse: 1.75112

âœ… BASELINE RMSE: 50.1483
   Iterations: 113
   Time taken: 2s

Top 10 Features (Gain):
wind_speed_80m       230677.0
wind_forecast        185597.0
tightness_x_hour      78142.0
solar_forecast        72158.0
market_id             60469.0
tightness_x_month     52256.0
load_forecast         51005.0
wind_speed_10m        42109.0
surface_pressure      37705.0
day_of_year           29123.0
dtype: float64


In [149]:
# Create a dataframe to analyze the validation results
val_results = val_df[['delivery_start', 'market_id']].copy()
val_results['actual'] = y_val_real
val_results['predicted'] = real_preds
val_results['month'] = val_results['delivery_start'].dt.month

print("\n" + "="*50)
print("ðŸ“… RMSE BY MONTH (VALIDATION SET)")
print("="*50)

# Calculate and print RMSE for each month in the validation set
for m in sorted(val_results['month'].unique()):
    month_data = val_results[val_results['month'] == m]
    month_rmse = root_mean_squared_error(month_data['actual'], month_data['predicted'])
    print(f"Month {m:02d} | Rows: {len(month_data):4d} | RMSE: {month_rmse:.4f}")

print("="*50 + "\n")


ðŸ“… RMSE BY MONTH (VALIDATION SET)
Month 01 | Rows: 4464 | RMSE: 67.0054
Month 02 | Rows: 4032 | RMSE: 40.3173
Month 03 | Rows: 4458 | RMSE: 26.8413
Month 04 | Rows: 4320 | RMSE: 63.8706
Month 05 | Rows: 4464 | RMSE: 56.0183
Month 06 | Rows: 4320 | RMSE: 32.1066
Month 07 | Rows: 4464 | RMSE: 48.3347
Month 08 | Rows: 4464 | RMSE: 51.1147



## LightGBM training

In [24]:
def obj_lgb(trial):
    params = {
        "objective": "huber",
        "alpha": trial.suggest_float("huber_alpha", 0.1, 3.0),
        "metric": "rmse",
        "verbosity": -1,
        "seed": SEED,
        "n_jobs": -1,
        "num_leaves": trial.suggest_int("num_leaves", 128, 1024),
        "learning_rate": trial.suggest_float("lr", 0.005, 0.05, log=True),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 0.8),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10, log=True),
        "max_depth": trial.suggest_int("max_depth", 6, 20),
    }

    dataset_train = lgb.Dataset(X_train, y_train, categorical_feature=cat_idx, free_raw_data=False)
    dataset_val = lgb.Dataset(X_val, y_val, reference=dataset_train, free_raw_data=False)
    
    model = lgb.train(
        params, dataset_train, num_boost_round=8000,
        valid_sets=[dataset_val],
        callbacks=[lgb.early_stopping(150), lgb.log_evaluation(0)],
    )
    
    preds = model.predict(X_val)
    real_preds = np.sinh(preds)

    rmse = root_mean_squared_error(y_val_real, real_preds)
    trial.set_user_attr("n_iter", model.best_iteration)
    
    return rmse

In [25]:
start_time = time.time()

study_lgb = optuna.create_study(
    direction="minimize", sampler=optuna.samplers.TPESampler(seed=SEED)
)
study_lgb.optimize(obj_lgb, n_trials=N_TRIALS_LGB)

lgb_rmse = study_lgb.best_value
lgb_iterations = study_lgb.best_trial.user_attrs["n_iter"]
lgb_best_params = study_lgb.best_params.copy()

if "lr" in lgb_best_params:
    lgb_best_params["learning_rate"] = lgb_best_params.pop("lr")

lgb_best_params.update({
    "objective": "huber",
    "metric": "rmse",
    "verbosity": -1,
    "seed": SEED,
    "n_jobs": -1})
print(f"-> Best val RMSE: {lgb_rmse:.4f} ({lgb_iterations} rounds, {time.time() - start_time:.0f}s)")

n_lgb = int(lgb_iterations * ROUND_MULTIPLIER)
dataset_all = lgb.Dataset(X_all, y_all, categorical_feature=cat_idx, free_raw_data=False)
lgb_final = lgb.train(lgb_best_params, dataset_all, num_boost_round=n_lgb)
lgb_test_preds = np.sinh(lgb_final.predict(X_test))

lgb_final.save_model(SAVED_LGB_PATH)
print(f"ðŸ’¾ LightGBM saved")


[32m[I 2026-02-19 23:38:56,585][0m A new study created in memory with name: no-name-4610f0cf-2de6-4527-b2f9-8f6896590f9c[0m


ValueError: No trials are completed yet.

In [None]:
imp = pd.Series(lgb_final.feature_importance("gain"), index=feat_cols)
print("  Top-10 features:")
for f, v in imp.nlargest(10).items():
    print(f"    {f}: {v:.0f}")

  Top-10 features:
    tightness_ratio: 304292
    residual_demand: 184922
    wind_speed_80m: 173009
    wind_forecast: 116429
    wind_speed_10m: 107949
    day_of_year: 99272
    surface_pressure: 95529
    wind_gust_speed_10m: 83442
    target_histmean_mm: 75095
    wind_direction_80m: 75050
