In [1]:
# 4_Feature_AE_and_Submission (Step 4)
# - Train the AE on the Step 3 selection (cfcs top4), using the test split as the validation set
# - Early stopping: patience=100, max_epochs=5000
# - Save AE artifacts with _valid suffix

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import joblib
import sys
import subprocess
import random

try:
    import torch
except ModuleNotFoundError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "torch"])
    import torch

import torch.nn as nn

# Reproducibility seed: applied to Python's `random`, NumPy and PyTorch RNGs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

from sklearn.preprocessing import StandardScaler

# Wider DataFrame display for notebook output.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

# Risk category names; pick_ae_features uses them to build candidate column names.
RISK_CATEGORIES = ["heat_stress", "unseasonably_cold", "excess_precip", "drought"]

# Local / Kaggle compatibility: write under /kaggle/working when the Kaggle
# input mount exists, otherwise under the current working directory.
if Path("/kaggle/input").exists():
    OUT_DIR = Path("/kaggle/working")
else:
    OUT_DIR = Path.cwd()

# Directory holding the selection parquet inputs and all AE artifacts written below.
SEL_DIR = OUT_DIR / "Data" / "Selection"
SEL_DIR.mkdir(parents=True, exist_ok=True)

# --- Selection_2 (cfcs top4) ---
# NOTE(review): in the visible cells this threshold only feeds the file tag;
# compute_cfcs_official is called with its own default sig_th.
SIGNIFICANCE_THRESHOLD = 0.5
# Format to two decimals, then strip trailing zeros and a dangling dot (0.5 -> "0.5").
THRESH_TAG_2 = f"{SIGNIFICANCE_THRESHOLD:.2f}".rstrip("0").rstrip(".")
TAG_2 = f"cfcs_{THRESH_TAG_2}_top4_2"
SEL2_TRAIN = SEL_DIR / f"train_{TAG_2}.parquet"
SEL2_TEST = SEL_DIR / f"test_{TAG_2}.parquet"
# Tag used in log lines, and the "_valid" variant used as the artifact file suffix.
AE_TAG_2 = f"ae_{TAG_2}"
AE_TAG_2_VALID = f"{AE_TAG_2}_valid"

print("SEL2_TRAIN:", SEL2_TRAIN)
print("SEL2_TEST :", SEL2_TEST)
print("AE_TAG_2_VALID:", AE_TAG_2_VALID)



SEL2_TRAIN: e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\Data\Selection\train_cfcs_0.5_top4_2.parquet
SEL2_TEST : e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\Data\Selection\test_cfcs_0.5_top4_2.parquet
AE_TAG_2_VALID: ae_cfcs_0.5_top4_2_valid


In [3]:
class AutoEncoder(nn.Module):
    """Small fully connected autoencoder with one hidden layer on each side.

    The encoder maps ``input_dim`` features to ``latent_dim`` values through a
    16-unit ReLU layer; the decoder mirrors it back to ``input_dim``.
    """

    def __init__(self, input_dim: int, latent_dim: int = 3):
        super().__init__()
        hidden_width = 16

        # Encoder layers are instantiated before decoder layers so that the
        # parameter initialisation draws from the RNG in a fixed order.
        encoder_layers = [
            nn.Linear(input_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, latent_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(latent_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, latent)`` for the input batch ``x``."""
        latent = self.encoder(x)
        return self.decoder(latent), latent


def pick_ae_features(df: pd.DataFrame) -> list[str]:
    """Choose the columns of ``df`` to feed into the autoencoder.

    Preferred candidates are, for each entry of ``RISK_CATEGORIES`` in order,
    the ``_score``, ``_change_7d``, ``_ma_14d`` and country ``_weighted_sum``
    columns, followed by the two cross-category aggregates. Only candidates
    that exist in ``df`` are kept. If fewer than three are found, every other
    ``climate_risk_`` column not ending in ``_low``/``_med``/``_high`` is
    appended in column order.

    Returns:
        Ordered list of unique column names (possibly empty).
    """
    per_risk_templates = (
        "climate_risk_{}_score",
        "climate_risk_{}_change_7d",
        "climate_risk_{}_ma_14d",
        "climate_risk_country_{}_weighted_sum",
    )

    # Prefer engineered continuous features, in a fixed priority order.
    candidates = [
        template.format(risk)
        for risk in RISK_CATEGORIES
        for template in per_risk_templates
    ]
    candidates.append("climate_risk_heat_drought_interact")
    candidates.append("climate_risk_overall_weighted")

    # dict.fromkeys de-duplicates while keeping first-seen order.
    chosen = list(dict.fromkeys(name for name in candidates if name in df.columns))

    # Fallback to all continuous climate_risk_ features if too few.
    if len(chosen) < 3:
        bucket_suffixes = ("_low", "_med", "_high")
        extras = [
            name
            for name in df.columns
            if name.startswith("climate_risk_") and not name.endswith(bucket_suffixes)
        ]
        for name in extras:
            if name not in chosen:
                chosen.append(name)

    return chosen


def add_ae_time_features(df: pd.DataFrame, base_cols: list[str]) -> list[str]:
    """Return a fresh list of the AE feature names.

    No time-based transforms are applied: ``df`` is left untouched and the
    result is simply a copy of ``base_cols``.
    """
    return [*base_cols]


def compute_cfcs_official(
    df: pd.DataFrame,
    climate_cols: list[str],
    futures_cols: list[str],
    sig_th: float = 0.5,
) -> dict:
    """Compute the CFCS score from per-country, per-month Pearson correlations.

    Rows are grouped by ``country_name`` and calendar month of ``date_on``.
    Within each group, every climate column is correlated with every futures
    column (columns with zero or undefined standard deviation in the group are
    skipped). Correlations are rounded to 5 decimals and NaNs are discarded.

    The score is ``0.5 * avg_sig + 0.3 * max + 0.2 * sig_pct`` where
    ``avg_sig`` is the mean absolute correlation at or above ``sig_th`` (as a
    percentage, capped at 100), ``max`` is the largest absolute correlation
    (as a percentage, capped at 100) and ``sig_pct`` is the percentage of
    correlations at or above ``sig_th``.

    Args:
        df: Frame containing ``date_on``, ``country_name`` and the listed columns.
        climate_cols: Climate feature column names.
        futures_cols: Futures column names.
        sig_th: Absolute-correlation threshold for "significant".

    Returns:
        Dict with keys ``cfcs``, ``avg_sig_corr``, ``max_corr``, ``sig_pct``,
        ``sig_count`` and ``total``; all zeros when no correlation is available.

    Raises:
        ValueError: If ``date_on`` or ``country_name`` is missing from ``df``.
    """
    if "date_on" not in df.columns:
        raise ValueError("df missing date_on")
    if "country_name" not in df.columns:
        raise ValueError("df missing country_name")

    def _varying(group: pd.DataFrame, names: list[str]) -> list[str]:
        # Keep only columns with strictly positive population std in this group.
        return [name for name in names if group[name].std(ddof=0) > 0]

    month_key = pd.to_datetime(df["date_on"], errors="coerce").dt.to_period("M")
    grouped = df.assign(_ym=month_key).groupby(["country_name", "_ym"], dropna=False)

    chunks = []
    for _, group in grouped:
        climate_live = _varying(group, climate_cols)
        futures_live = _varying(group, futures_cols)
        if not (climate_live and futures_live):
            continue

        pairwise = group[climate_live + futures_live].corr(method="pearson")
        values = pairwise.loc[climate_live, futures_live].round(5).to_numpy().ravel()
        chunks.append(values[~np.isnan(values)])

    all_corrs = np.concatenate(chunks) if chunks else np.empty(0)

    if all_corrs.size == 0:
        return {
            "cfcs": 0.0,
            "avg_sig_corr": 0.0,
            "max_corr": 0.0,
            "sig_pct": 0.0,
            "sig_count": 0,
            "total": 0,
        }

    magnitudes = np.abs(all_corrs)
    significant = magnitudes[magnitudes >= sig_th]

    if significant.size:
        avg_sig = float(significant.mean())
    else:
        avg_sig = 0.0
    max_corr = float(magnitudes.max())
    sig_pct = float(significant.size / magnitudes.size * 100.0)

    # Both correlation components are expressed as percentages capped at 100.
    avg_component = min(100.0, avg_sig * 100.0)
    max_component = min(100.0, max_corr * 100.0)

    cfcs = (0.5 * avg_component) + (0.3 * max_component) + (0.2 * sig_pct)

    return {
        "cfcs": round(cfcs, 4),
        "avg_sig_corr": round(avg_sig, 6),
        "max_corr": round(max_corr, 6),
        "sig_pct": round(sig_pct, 6),
        "sig_count": int(significant.size),
        "total": int(magnitudes.size),
    }


def run_ae_pipeline(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    tag: str,
    max_epochs: int = 200,
    patience: int | None = None,
    output_tag: str | None = None,
) -> None:
    """Train the autoencoder on ``train_df`` and persist features and artifacts.

    Steps:
      1. Pick AE input columns from ``train_df`` and standardise them (scaler
         fitted on train only; missing values are filled with 0 first).
      2. Train ``AutoEncoder`` full-batch with Adam and MSE reconstruction loss
         for up to ``max_epochs`` epochs.
      3. If ``patience`` is given, compute the CFCS of the latent/recon features
         on ``test_df`` after every epoch, keep the best-scoring weights and
         stop once ``patience`` consecutive epochs bring no improvement.
         Note that this uses ``test_df`` as the validation set.
      4. Add ``climate_risk_ae_z1..z3`` and ``climate_risk_ae_recon_error`` to
         BOTH input frames in place, then write the scaler, input feature list,
         model weights, selected feature list and the train/test AE parquet
         files into ``SEL_DIR``.

    Args:
        train_df: Training frame; mutated in place (AE columns are added).
        test_df: Test/validation frame; must contain ``date_on``,
            ``country_name`` and at least one ``futures_`` column. Mutated in
            place (AE columns are added).
        tag: Prefix used in progress log lines; also the artifact suffix when
            ``output_tag`` is not given.
        max_epochs: Maximum number of full-batch training epochs.
        patience: Early-stopping patience in epochs, or ``None`` to train for
            exactly ``max_epochs`` epochs without validation.
        output_tag: Optional override for the artifact file-name suffix.

    Raises:
        ValueError: If fewer than three AE features are found, or ``test_df``
            lacks the columns needed to compute CFCS.
    """
    ae_features = pick_ae_features(train_df)
    print(f"[{tag}] ae_features: {len(ae_features)}")
    if len(ae_features) < 3:
        raise ValueError(f"[{tag}] Not enough AE features (<3). Check input columns.")

    if "date_on" not in test_df.columns or "country_name" not in test_df.columns:
        raise ValueError("test_df missing date_on or country_name; cannot compute CFCS")

    futures_cols = [c for c in test_df.columns if c.startswith("futures_")]
    if not futures_cols:
        raise ValueError("test_df missing futures_ columns; cannot compute CFCS")

    X_train = train_df[ae_features].fillna(0.0)
    X_test = test_df[ae_features].fillna(0.0)

    # Fit the scaler on train only, then apply the same transform to test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    save_tag = output_tag or tag

    joblib.dump(scaler, SEL_DIR / f"ae_scaler_{save_tag}.joblib")
    pd.DataFrame({"feature": ae_features}).to_parquet(SEL_DIR / f"ae_input_features_{save_tag}.parquet", index=False)

    # Re-seed right before model construction so weight init is reproducible
    # regardless of what ran earlier in the kernel.
    torch.manual_seed(42)
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

    model = AutoEncoder(input_dim=X_train_tensor.shape[1], latent_dim=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    base_ae_cols = [
        "climate_risk_ae_z1",
        "climate_risk_ae_z2",
        "climate_risk_ae_z3",
        "climate_risk_ae_recon_error",
    ]
    latent_cols = base_ae_cols[:3]
    recon_col = base_ae_cols[3]

    # The key/futures part of the validation frame never changes between
    # epochs, so build it once; only the AE columns are overwritten per epoch.
    val_df = None
    if patience is not None:
        val_df = test_df[["country_name", "date_on"] + futures_cols].copy()

    best_state = None
    best_cfcs = float("-inf")
    best_epoch = -1
    wait = 0

    for epoch in range(max_epochs):
        # One full-batch gradient step per epoch.
        model.train()
        optimizer.zero_grad()
        x_hat, _ = model(X_train_tensor)
        loss = loss_fn(x_hat, X_train_tensor)
        loss.backward()
        optimizer.step()

        if patience is not None:
            model.eval()
            with torch.no_grad():
                x_hat_val, z_val = model(X_test_tensor)
                recon = ((x_hat_val - X_test_tensor) ** 2).mean(dim=1).numpy()
                z_val = z_val.numpy()

            for idx, col in enumerate(latent_cols):
                val_df[col] = z_val[:, idx]
            val_df[recon_col] = recon

            val_cfcs = compute_cfcs_official(
                val_df,
                climate_cols=base_ae_cols,
                futures_cols=futures_cols,
            )["cfcs"]

            if val_cfcs > best_cfcs + 1e-8:
                best_cfcs = val_cfcs
                # Snapshot the weights; clone so later optimizer steps don't alter them.
                best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
                best_epoch = epoch
                wait = 0
            else:
                wait += 1

            if epoch % 50 == 0:
                print(f"[{tag}] epoch {epoch}, train_loss={loss.item():.6f}, cfcs={val_cfcs:.4f}, best={best_cfcs:.4f}")

            if wait >= patience:
                print(f"[{tag}] early stop at epoch {epoch}, best_epoch={best_epoch}, best_cfcs={best_cfcs:.4f}")
                break
        else:
            if epoch % 50 == 0:
                print(f"[{tag}] epoch {epoch}, loss={loss.item():.6f}")

    # Restore the best-scoring weights when early stopping tracked any.
    if best_state is not None:
        model.load_state_dict(best_state)

    torch.save(model.state_dict(), SEL_DIR / f"ae_model_{save_tag}.pth")

    # Final feature extraction: inference mode, one forward pass per dataset
    # (a single pass already yields both the reconstruction and the latent code).
    model.eval()
    with torch.no_grad():
        xhat_train, z_train = model(X_train_tensor)
        xhat_test, z_test = model(X_test_tensor)
        recon_train = ((xhat_train - X_train_tensor) ** 2).mean(dim=1).numpy()
        recon_test = ((xhat_test - X_test_tensor) ** 2).mean(dim=1).numpy()

    z_train = z_train.numpy()
    z_test = z_test.numpy()

    # NumPy arrays are assigned positionally, so the frames' indexes need not be contiguous.
    for idx, col in enumerate(latent_cols):
        train_df[col] = z_train[:, idx]
    for idx, col in enumerate(latent_cols):
        test_df[col] = z_test[:, idx]

    train_df[recon_col] = recon_train
    test_df[recon_col] = recon_test

    ae_cols = add_ae_time_features(train_df, base_ae_cols)
    _ = add_ae_time_features(test_df, base_ae_cols)

    pd.DataFrame({"feature": ae_cols}).to_parquet(SEL_DIR / f"selected_features_ae_{save_tag}.parquet", index=False)

    key_cols = [c for c in ["ID", "date_on", "country_name", "region_name", "region_id"] if c in train_df.columns]
    futures_cols = [c for c in train_df.columns if c.startswith("futures_")]

    # Output layout: keys, AE features, then futures columns (as present in train).
    keep_cols = key_cols + ae_cols + futures_cols
    keep_cols = [c for c in keep_cols if c in train_df.columns]

    train_out = train_df[keep_cols].copy()
    test_out = test_df[keep_cols].copy()

    train_out.to_parquet(SEL_DIR / f"train_ae_{save_tag}.parquet", index=False)
    test_out.to_parquet(SEL_DIR / f"test_ae_{save_tag}.parquet", index=False)

    print(f"[{save_tag}] saved train_ae/test_ae + selected_features_ae")



In [4]:
# === Selection_2 (cfcs top4) ===
# Load the Step 3 selection, hold out the most recent 90 days of train,
# then train the AE with CFCS-based early stopping on the test frame.

# Fail fast with the offending path if a Step 3 output is missing.
assert SEL2_TRAIN.exists(), f"missing: {SEL2_TRAIN}"
assert SEL2_TEST.exists(), f"missing: {SEL2_TEST}"

train_df_2 = pd.read_parquet(SEL2_TRAIN)
test_df_2 = pd.read_parquet(SEL2_TEST)

# Exclude last 90 days from train
if "date_on" not in train_df_2.columns:
    raise ValueError("train_df_2 missing date_on; cannot filter by time")

train_df_2 = train_df_2.copy()
# Unparseable dates become NaT; NaT compares False against the cutoff below,
# so such rows are dropped from train as well.
train_df_2["date_on"] = pd.to_datetime(train_df_2["date_on"], errors="coerce")
max_date = train_df_2["date_on"].max()
cutoff = max_date - pd.Timedelta(days=90)
train_df_2 = train_df_2[train_df_2["date_on"] <= cutoff].copy()

print("sel2 train (exclude last 90d):", train_df_2.shape, "test:", test_df_2.shape)

# Log lines are prefixed with AE_TAG_2; artifacts are saved under AE_TAG_2_VALID.
# run_ae_pipeline adds the AE columns to train_df_2 / test_df_2 in place.
run_ae_pipeline(
    train_df_2,
    test_df_2,
    AE_TAG_2,
    max_epochs=5000,
    patience=100,
    output_tag=AE_TAG_2_VALID,
)

sel2 train (exclude last 90d): (247805, 26) test: (64691, 26)
[ae_cfcs_0.5_top4_2] ae_features: 4
[ae_cfcs_0.5_top4_2] epoch 0, train_loss=1.075589, cfcs=65.9457, best=65.9457
[ae_cfcs_0.5_top4_2] epoch 50, train_loss=0.898554, cfcs=65.8220, best=66.2882
[ae_cfcs_0.5_top4_2] epoch 100, train_loss=0.467929, cfcs=65.0134, best=66.4525
[ae_cfcs_0.5_top4_2] epoch 150, train_loss=0.067967, cfcs=64.5696, best=66.4525
[ae_cfcs_0.5_top4_2] early stop at epoch 155, best_epoch=55, best_cfcs=66.4525
[ae_cfcs_0.5_top4_2_valid] saved train_ae/test_ae + selected_features_ae
