In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/aoml-m-2-test-feb-2025/sample_submission.csv
/kaggle/input/aoml-m-2-test-feb-2025/train.csv
/kaggle/input/aoml-m-2-test-feb-2025/test.csv


In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd

train_df_dir = "/kaggle/input/aoml-m-2-test-feb-2025/train.csv"
test_df_dir = "/kaggle/input/aoml-m-2-test-feb-2025/test.csv"

In [4]:
train_df = pd.read_csv(train_df_dir)
test_df = pd.read_csv(test_df_dir)

test_uid = test_df["uid"]

train_df.drop("uid", axis=1, inplace=True)
test_df.drop("uid", axis=1, inplace=True)

day_map = {
    "Friday": 0,
    "Saturday": 1,
}

train_df["day"] = train_df["day"].map(day_map)
test_df["day"] = test_df["day"].map(day_map)

train_df.drop("minute", axis=1, inplace=True)
test_df.drop("minute", axis=1, inplace=True)

print(train_df.isna().sum())

day                              5479
hour                             5613
C_motion                         5517
feed_water_motion                5597
faucet_hole                      5566
vapour_pressure                  5479
vapour_enthalpy                  5437
vapour_pressure_at_division      5477
vapour_motion                    5477
feed_water_enth                  5496
vapour_temperature               5517
output_electricity_generation       0
dtype: int64


In [5]:
from sklearn.neighbors import KNeighborsRegressor


def KNN_Imputer(df):
    for i in df.columns:
        if df[i].isna().sum() == 0:
            continue

        temp_df = df.copy()
        for x in df.columns:
            if x == i:
                continue
            temp_df[x].fillna(value=temp_df[x].mean(), inplace=True)

        col = i
        other_cols = [x for x in df.columns if x != col]
        X = temp_df[other_cols][df[col].notna()]
        y = temp_df[col][df[col].notna()]

        neigh = KNeighborsRegressor(n_neighbors=3)
        neigh.fit(np.array(X), np.array(y))

        print(f"Imputing {i}")
        for i, j in enumerate(df[col].isnull()):
            if j == True:
                df[col].iloc[i] = neigh.predict(temp_df[other_cols].iloc[i : i + 1, :])[
                    0
                ]
    return df


train_df = KNN_Imputer(train_df)
test_df = KNN_Imputer(test_df)

print(train_df.isna().sum())

Imputing day
Imputing hour
Imputing C_motion
Imputing feed_water_motion
Imputing faucet_hole
Imputing vapour_pressure
Imputing vapour_enthalpy
Imputing vapour_pressure_at_division
Imputing vapour_motion
Imputing feed_water_enth
Imputing vapour_temperature
Imputing day
Imputing hour
Imputing C_motion
Imputing feed_water_motion
Imputing faucet_hole
Imputing vapour_pressure
Imputing vapour_enthalpy
Imputing vapour_pressure_at_division
Imputing vapour_motion
Imputing feed_water_enth
Imputing vapour_temperature
day                              0
hour                             0
C_motion                         0
feed_water_motion                0
faucet_hole                      0
vapour_pressure                  0
vapour_enthalpy                  0
vapour_pressure_at_division      0
vapour_motion                    0
feed_water_enth                  0
vapour_temperature               0
output_electricity_generation    0
dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

X = train_df.drop("output_electricity_generation", axis=1)
y = train_df["output_electricity_generation"]

X["vapour_pressure_per_temp"] = X["vapour_pressure"] / X["vapour_temperature"]
test_df["vapour_pressure_per_temp"] = (
    test_df["vapour_pressure"] / test_df["vapour_temperature"]
)

X["vapour_enthalpy_per_temp"] = X["vapour_enthalpy"] / X["vapour_temperature"]
test_df["vapour_enthalpy_per_temp"] = (
    test_df["vapour_enthalpy"] / test_df["vapour_temperature"]
)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train["hour_sin"] = np.sin(2 * np.pi * X_train["hour"] / 23.0)
X_train["hour_cos"] = np.cos(2 * np.pi * X_train["hour"] / 23.0)

X_test["hour_sin"] = np.sin(2 * np.pi * X_test["hour"] / 23.0)
X_test["hour_cos"] = np.cos(2 * np.pi * X_test["hour"] / 23.0)

test_df["hour_sin"] = np.sin(2 * np.pi * test_df["hour"] / 23.0)
test_df["hour_cos"] = np.cos(2 * np.pi * test_df["hour"] / 23.0)

In [7]:
import optuna
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


def objective(trial):
    param = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    model = lgb.train(param, lgb_train, valid_sets=[lgb_train, lgb_eval])

    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return rmse


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

lgb_params = study.best_params
lgb_params["objective"] = "regression"
lgb_params["metric"] = "rmse"
lgb_params["verbosity"] = -1
lgb_params["boosting_type"] = "gbdt"

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

lgb_model = lgb.train(lgb_params, lgb_train, valid_sets=[lgb_train, lgb_eval])

lgb_pred = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)
lgb_rmse = np.sqrt(mean_squared_error(y_test, lgb_pred))
print(f"LightGBM RMSE: {lgb_rmse}\n")

[I 2025-03-09 11:45:33,100] A new study created in memory with name: no-name-709cefa1-b656-4a4e-9c1a-661a69069b2d
[I 2025-03-09 11:45:36,103] Trial 0 finished with value: 1.0285551031208844 and parameters: {'lambda_l1': 0.00012717141357882817, 'lambda_l2': 1.326529126063463, 'num_leaves': 185, 'feature_fraction': 0.5063428418406376, 'bagging_fraction': 0.9339303012853281, 'bagging_freq': 4, 'min_child_samples': 26}. Best is trial 0 with value: 1.0285551031208844.
[I 2025-03-09 11:45:37,179] Trial 1 finished with value: 1.031457135839787 and parameters: {'lambda_l1': 6.569802155602175e-06, 'lambda_l2': 0.00013782163220482733, 'num_leaves': 151, 'feature_fraction': 0.4136898037127542, 'bagging_fraction': 0.7251493480360292, 'bagging_freq': 2, 'min_child_samples': 9}. Best is trial 0 with value: 1.0285551031208844.
[I 2025-03-09 11:45:38,637] Trial 2 finished with value: 1.2867616193811704 and parameters: {'lambda_l1': 1.1862749798386223e-06, 'lambda_l2': 0.07780438745293133, 'num_leaves'

LightGBM RMSE: 0.8698517181104317



In [8]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor

estimators = [
    ("lgb", lgb.LGBMRegressor(**lgb_params)),
    ("rf", RandomForestRegressor(max_depth=32, random_state=1)),
]

stacker = StackingRegressor(
    estimators=estimators, final_estimator=RandomForestRegressor()
)
stacker.fit(X_train, y_train)

stacker_pred = stacker.predict(X_test)

stacker_rmse = np.sqrt(mean_squared_error(y_test, stacker_pred))

print(f"Stacker RMSE: {stacker_rmse}\n")

Stacker RMSE: 0.8270236914958408



In [10]:
stacker_pred = stacker.predict(test_df)
stacker_submission = pd.DataFrame(
    {"uid": test_uid, "output_electricity_generation": stacker_pred}
)
train = pd.read_csv(train_df_dir)
test = pd.read_csv(test_df_dir)

grp3_df = (
    train.groupby(["day", "hour", "minute"])["output_electricity_generation"]
    .mean()
    .reset_index()
)
grp3_df = grp3_df.rename(
    columns={"output_electricity_generation": "3_output_electricity_generation"}
)
train = train.merge(grp3_df, on=["day", "hour", "minute"], how="left")
test = test.merge(grp3_df, on=["day", "hour", "minute"], how="left")

stacker_submission.loc[
    test["3_output_electricity_generation"].notnull(), "output_electricity_generation"
] = test.loc[
    test["3_output_electricity_generation"].notnull(), "3_output_electricity_generation"
].values
stacker_submission.to_csv(f"submission.csv", index=False)