# Competition Model

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from scipy.optimize import minimize
import warnings
warnings.filterwarnings("ignore")

In [11]:
# Read the raw long-format tables (one row per observation/timepoint).
train_raw = pd.read_csv("train_competition_2026.csv")
test_raw = pd.read_csv("test_no_outcome.csv")

# Parse the timestamp column in both tables the same way.
for raw_frame in (train_raw, test_raw):
    raw_frame["time"] = pd.to_datetime(raw_frame["time"])

# Quick sanity check on the loaded shapes.
print(f"Train raw: {train_raw.shape}")
print(f"Test raw:  {test_raw.shape}")

Train raw: (432600, 18)
Test raw:  (103500, 16)


## 2. Feature Engineering

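Each observation has 30 timepoints across 5 signals (t_0 to t_4). We collapse each observation into one row of per-signal summary statistics: mean, std, min, max, median, first, last, slope (computed here simply as last − first), range (max − min), coefficient of variation, quantiles (10%, 25%, 75%, 90%), skewness, and kurtosis. We also add time-based features (hour, day of week, weekend flag), a few interaction features, and the number of observations each subject has.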

In [None]:
def engineer_features(df):
    """Collapse the long per-timepoint table into one feature row per observation.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with one row per (``obs``, ``time``) pair. It must
        contain the columns ``obs``, ``time``, ``sub_id``, ``num_0``..``num_2``,
        ``cat_0``..``cat_4`` and the signals ``t_0``..``t_4``. The input frame
        is not modified (a sorted copy is used internally).

    Returns
    -------
    pandas.DataFrame
        One row per ``obs`` (ordered by ``obs``) holding the static columns
        (``*_first``), per-signal summary statistics, quantiles, skewness,
        kurtosis, interaction terms, time features of the earliest timepoint
        and ``sub_obs_count`` (observations per subject within ``df``).
    """
    t_cols = [f"t_{i}" for i in range(5)]
    num_cols = ["num_0", "num_1", "num_2"]
    cat_cols = [f"cat_{i}" for i in range(5)]

    # Sorting by time makes "first"/"last" mean earliest/latest timepoint.
    df = df.sort_values(["obs", "time"]).copy()
    by_obs = df.groupby("obs")

    # Step 1: Basic aggregations per observation
    agg_dict = {c: "first" for c in num_cols + cat_cols}
    agg_dict["sub_id"] = "first"
    for c in t_cols:
        agg_dict[c] = ["mean", "std", "min", "max", "first", "last", "median"]

    grouped = by_obs.agg(agg_dict)
    # Flatten the (column, statistic) MultiIndex into "column_statistic" names.
    grouped.columns = ["_".join(col).strip("_") for col in grouped.columns]
    grouped = grouped.reset_index()

    # Step 2: Derived stats from the basic aggregations
    for c in t_cols:
        # "slope" is the net change over the window (last - first), not a fitted slope.
        grouped[f"{c}_slope"] = grouped[f"{c}_last"] - grouped[f"{c}_first"]
        grouped[f"{c}_range"] = grouped[f"{c}_max"] - grouped[f"{c}_min"]
        # The small epsilon avoids division by zero when the mean is 0.
        grouped[f"{c}_cv"] = grouped[f"{c}_std"] / (grouped[f"{c}_mean"].abs() + 1e-8)

    # Step 3: Quantiles
    quantile_feats = by_obs[t_cols].quantile([0.1, 0.25, 0.75, 0.9]).unstack(level=-1)
    # Round before int() so float error (e.g. 0.29 * 100 == 28.999...) cannot
    # produce a wrong or duplicated percentile label.
    quantile_feats.columns = [
        f"{c}_q{int(round(q * 100))}" for c, q in quantile_feats.columns
    ]
    grouped = grouped.merge(quantile_feats.reset_index(), on="obs")

    # Step 4: Skewness and kurtosis
    skew_feats = by_obs[t_cols].skew()
    skew_feats.columns = [f"{c}_skew" for c in t_cols]
    grouped = grouped.merge(skew_feats.reset_index(), on="obs")

    kurt_feats = by_obs[t_cols].apply(lambda x: x.kurtosis())
    kurt_feats.columns = [f"{c}_kurt" for c in t_cols]
    grouped = grouped.merge(kurt_feats.reset_index(), on="obs")

    # Step 5: Interaction features between signals
    grouped["t0_minus_t1"] = grouped["t_0_mean"] - grouped["t_1_mean"]
    grouped["t2_minus_t3"] = grouped["t_2_mean"] - grouped["t_3_mean"]
    grouped["t_mean_all"] = grouped[[f"{c}_mean" for c in t_cols]].mean(axis=1)

    # Step 6: Interaction features between static numerics
    grouped["num0_times_num1"] = grouped["num_0_first"] * grouped["num_1_first"]
    grouped["num0_times_num2"] = grouped["num_0_first"] * grouped["num_2_first"]

    # Step 7: Time features
    # Look the start time up by obs key instead of assigning positionally, so
    # the features stay correct even if the row order of `grouped` changes.
    first_time = by_obs["time"].first()
    obs_start = pd.to_datetime(grouped["obs"].map(first_time))
    grouped["hour"] = obs_start.dt.hour
    grouped["dayofweek"] = obs_start.dt.dayofweek
    grouped["is_weekend"] = (grouped["dayofweek"] >= 5).astype(int)

    # Step 8: How many observations each subject has
    sub_counts = df.groupby("sub_id")["obs"].nunique().reset_index()
    sub_counts.columns = ["sub_id", "sub_obs_count"]
    grouped = grouped.merge(sub_counts, left_on="sub_id_first", right_on="sub_id", how="left")
    # The merge key from the right side duplicates sub_id_first; drop it.
    grouped = grouped.drop(columns=["sub_id"])

    return grouped

In [None]:
# Build one feature row per observation for both splits.
train_agg = engineer_features(train_raw)
test_agg = engineer_features(test_raw)

# Attach the targets: take the first y_1 / y_2 value recorded for each
# observation and join it onto the aggregated training table.
obs_targets = train_raw.groupby("obs", as_index=False)[["y_1", "y_2"]].first()
train_agg = train_agg.merge(obs_targets, on="obs")

print(f"Train: {train_agg.shape}")
print(f"Test:  {test_agg.shape}")

Engineering features...
Train aggregated: (14420, 199)
Test aggregated:  (3450, 197)


## 3. Prepare Features

Encode categorical columns as numbers using LabelEncoder, select all feature columns, and separate out the targets (y_1, y_2) and group IDs (sub_id) for cross-validation.

In [None]:
# Identifier, grouping and target columns are never used as model inputs.
drop_cols = ["obs", "sub_id_first", "y_1", "y_2"]
feature_cols = [col for col in train_agg.columns if col not in drop_cols]

# Integer-encode the categorical columns. Each encoder is fitted on the
# string-cast values of train and test together so both share one mapping.
cat_features_raw = [f"cat_{i}_first" for i in range(5)]
label_encoders = {}
for cat_col in cat_features_raw:
    combined_values = pd.concat([train_agg[cat_col], test_agg[cat_col]]).astype(str)
    encoder = LabelEncoder().fit(combined_values)
    for frame in (train_agg, test_agg):
        frame[cat_col] = encoder.transform(frame[cat_col].astype(str))
    label_encoders[cat_col] = encoder

# Model inputs, targets and the per-row subject id used for grouped CV.
# Infinite values are turned into NaN in the feature matrices.
infinities = [np.inf, -np.inf]
X_all = train_agg[feature_cols].copy().replace(infinities, np.nan)
X_test = test_agg[feature_cols].copy().replace(infinities, np.nan)
y_all = train_agg[["y_1", "y_2"]].copy()
groups = train_agg["sub_id_first"].values

print(f"Feature count: {len(feature_cols)}")
print(f"Train obs: {len(X_all)}, Test obs: {len(X_test)}")

## 4. Train Models (LightGBM + XGBoost + CatBoost)

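We train 3 different models (LightGBM, XGBoost, CatBoost) for each target. We use GroupKFold so that all observations of a subject fall in the same fold, i.e. the same subject is never in both train and validation at the same time. We repeat the whole cross-validation with 3 random seeds to make results more stable: out-of-fold predictions are averaged over the seeds, and test predictions are averaged over all folds and seeds. The three models are then blended in Section 5.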

In [None]:
N_FOLDS = 7
SEEDS = [42, 123, 2026]
TARGETS = ["y_1", "y_2"]
# Patience (in boosting rounds) shared by all three libraries.
EARLY_STOPPING_ROUNDS = 150
gkf = GroupKFold(n_splits=N_FOLDS)

n_train, n_test = len(X_all), len(X_test)
n_seeds = len(SEEDS)

# Store out of fold and test predictions for each model and target
oof_preds = {model: {t: np.zeros(n_train) for t in TARGETS} for model in ["lgb", "xgb", "cat"]}
test_preds = {model: {t: np.zeros(n_test) for t in TARGETS} for model in ["lgb", "xgb", "cat"]}

for seed in SEEDS:
    print(f"SEED {seed}")

    # The seed only changes the models' randomness; gkf is built without a
    # seed, so the same splitter is reused for every seed.
    for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_all, y_all, groups)):
        print(f"  Fold {fold+1}/{N_FOLDS}")
        X_tr, X_va = X_all.iloc[tr_idx], X_all.iloc[va_idx]
        y_tr, y_va = y_all.iloc[tr_idx], y_all.iloc[va_idx]

        for target in TARGETS:
            # NOTE(review): the validation fold is used both for early stopping
            # and for the out-of-fold score, so the OOF MAE is likely a bit
            # optimistic.

            # LightGBM
            m_lgb = lgb.LGBMRegressor(
                objective="mae", metric="mae", verbosity=-1,
                n_estimators=3000, learning_rate=0.02, num_leaves=63,
                min_child_samples=50, subsample=0.7, colsample_bytree=0.6,
                reg_alpha=0.5, reg_lambda=5.0, random_state=seed, n_jobs=-1, subsample_freq=1,
            )
            m_lgb.fit(X_tr, y_tr[target], eval_set=[(X_va, y_va[target])],
                      callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False),
                                 lgb.log_evaluation(0)])
            oof_preds["lgb"][target][va_idx] += m_lgb.predict(X_va)
            # Each fold/seed model contributes an equal share of the test prediction.
            test_preds["lgb"][target] += m_lgb.predict(X_test) / (N_FOLDS * n_seeds)

            # XGBoost
            # early_stopping_rounds is set on the estimator so XGBoost stops on
            # the validation fold like LightGBM and CatBoost instead of always
            # building all 3000 trees.
            m_xgb = xgb.XGBRegressor(
                objective="reg:absoluteerror", eval_metric="mae",
                n_estimators=3000, learning_rate=0.02, max_depth=6,
                min_child_weight=50, subsample=0.7, colsample_bytree=0.6,
                reg_alpha=0.5, reg_lambda=5.0, random_state=seed, n_jobs=-1, verbosity=0, tree_method="hist",
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )
            m_xgb.fit(X_tr, y_tr[target], eval_set=[(X_va, y_va[target])], verbose=False)
            oof_preds["xgb"][target][va_idx] += m_xgb.predict(X_va)
            test_preds["xgb"][target] += m_xgb.predict(X_test) / (N_FOLDS * n_seeds)

            # CatBoost
            m_cat = CatBoostRegressor(
                loss_function="MAE", iterations=3000, learning_rate=0.03, depth=6,
                l2_leaf_reg=5.0, min_data_in_leaf=50, random_seed=seed, verbose=0, subsample=0.7,
            )
            m_cat.fit(X_tr, y_tr[target], eval_set=(X_va, y_va[target]),
                      early_stopping_rounds=EARLY_STOPPING_ROUNDS)
            oof_preds["cat"][target][va_idx] += m_cat.predict(X_va)
            test_preds["cat"][target] += m_cat.predict(X_test) / (N_FOLDS * n_seeds)

# Average OOF predictions across seeds
# (every row is predicted exactly once per seed, so the sums hold n_seeds values).
for model in oof_preds:
    for target in TARGETS:
        oof_preds[model][target] /= n_seeds

## 5. Evaluate and Optimize Ensemble Weights

Check how each model did on its own, then find the best way to combine them. We search for weights (adding up to 1) that minimize MAE when blending the 3 models together. We do this separately for y_1 and y_2.

In [None]:
# Ground-truth target vectors, reused by the weight search below.
y1_true = y_all["y_1"].values
y2_true = y_all["y_2"].values

# Out-of-fold MAE of every model on its own, per target and averaged.
print("Individual Model Scores")
display_names = {"lgb": "LightGBM", "xgb": "XGBoost", "cat": "CatBoost"}
for model_key, label in display_names.items():
    mae_y1 = mean_absolute_error(y1_true, oof_preds[model_key]["y_1"])
    mae_y2 = mean_absolute_error(y2_true, oof_preds[model_key]["y_2"])
    mean_mae = (mae_y1 + mae_y2) / 2
    print(f"  {label:10s} → y1: {mae_y1:.4f}, y2: {mae_y2:.4f}, avg: {mean_mae:.4f}")

def find_weights(oof_list, y_true):
    """Search for blend weights that minimise the MAE of the weighted sum.

    Parameters
    ----------
    oof_list : list of array-like
        One prediction vector per model.
    y_true : array-like
        True target values the blend is scored against.

    Returns
    -------
    The result object returned by ``scipy.optimize.minimize`` (SLSQP); the
    search starts from equal weights, keeps every weight within [0, 1] and
    constrains the weights to sum to 1.
    """
    n_models = len(oof_list)

    def blend_mae(weights):
        # MAE of the weighted sum of the model predictions.
        blended = sum(weight * preds for weight, preds in zip(weights, oof_list))
        return mean_absolute_error(y_true, blended)

    def weights_sum_to_one(weights):
        # Equality constraint: zero exactly when the weights add up to 1.
        return sum(weights) - 1

    equal_start = [1 / n_models] * n_models
    unit_bounds = [(0, 1)] * n_models
    return minimize(
        blend_mae,
        x0=equal_start,
        method="SLSQP",
        bounds=unit_bounds,
        constraints={"type": "eq", "fun": weights_sum_to_one},
    )

# Fit one set of blend weights per target; the order is LGB, XGB, CAT.
blend_order = ["lgb", "xgb", "cat"]
res_y1 = find_weights([oof_preds[key]["y_1"] for key in blend_order], y1_true)
res_y2 = find_weights([oof_preds[key]["y_2"] for key in blend_order], y2_true)
w_y1 = res_y1.x
w_y2 = res_y2.x

print("Optimal Weights (LGB / XGB / CAT)")
for target_label, weights, result in (("y_1", w_y1, res_y1), ("y_2", w_y2, res_y2)):
    print(f"  {target_label}: [{weights[0]:.3f}, {weights[1]:.3f}, {weights[2]:.3f}] → MAE: {result.fun:.4f}")
blend_avg_mae = (res_y1.fun + res_y2.fun) / 2
print(f"  Avg MAE: {blend_avg_mae:.4f}")

## 6. Generate Submission

Combine the 3 models' test predictions using the optimized weights from step 5, then save to CSV.

In [None]:
# Model order must match the order in which the blend weights were fitted.
models = ["lgb", "xgb", "cat"]

# Weighted sum of the per-model test predictions, one blend per target.
final_y1 = sum(weight * test_preds[name]["y_1"] for weight, name in zip(w_y1, models))
final_y2 = sum(weight * test_preds[name]["y_2"] for weight, name in zip(w_y2, models))

submission_path = "sample_submission3.csv"
submission_columns = {"obs": test_agg["obs"], "y_1": final_y1, "y_2": final_y2}
submission = pd.DataFrame(submission_columns)
submission.to_csv(submission_path, index=False)

print(f"Saved {len(submission)} rows to {submission_path}")
# Last expression of the cell: shows a preview in the notebook.
submission.head(10)