In [1]:
# 3_Feature_Selection (Step 3)
# - Read Step 2 output: Data/Feature/train.parquet and Data/Feature/test.parquet
# - Compute climate vs futures correlation on train by country × month
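# - Score each climate feature:
#   score = 0.5 × Avg_Sig_Corr_Score + 0.2 × Sig_Count_Score
#   (Avg_Sig_Corr_Score = mean of the significant |corr| values × 100, capped at 100;
#    Sig_Count_Score = percentage of correlations that are significant;
#    significance: |corr| >= 0.5, with correlations rounded to 5 decimals first)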
# - Select Top 4 features (used by Step 4 AE)
# - Save: feature_cfcs_{THRESH}_top4_2.parquet / selected_features_cfcs_{THRESH}_top4_2.parquet /
#         removed_features_cfcs_{THRESH}_top4_2.parquet
# - Generate Selection/train & test subsets

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

# Display options for the wide feature tables shown in this notebook.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

# Minimum |correlation| for a climate/futures pair to count as significant.
SIGNIFICANCE_THRESHOLD = 0.5
# Compact text form of the threshold: two decimals, then trailing zeros and a
# dangling "." are trimmed (0.5 -> "0.5").
THRESH_TAG = format(SIGNIFICANCE_THRESHOLD, ".2f").rstrip("0").rstrip(".")

# Local / Kaggle compatibility: write under /kaggle/working when the Kaggle
# input mount exists, otherwise under the current working directory.
OUT_DIR = Path("/kaggle/working") if Path("/kaggle/input").exists() else Path.cwd()

FEAT_DIR = OUT_DIR.joinpath("Data", "Feature")
SEL_DIR = OUT_DIR.joinpath("Data", "Selection")
# Only the output directory is created here; FEAT_DIR is read from, not created.
SEL_DIR.mkdir(parents=True, exist_ok=True)

train_path = FEAT_DIR.joinpath("train.parquet")
test_path = FEAT_DIR.joinpath("test.parquet")

print("train_path:", train_path)
print("test_path :", test_path)
print("SEL_DIR   :", SEL_DIR)



train_path: e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\Data\Feature\train.parquet
test_path : e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\Data\Feature\test.parquet
SEL_DIR   : e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\Data\Selection


In [3]:
# Load the train/test feature tables and sort their columns into groups.

train_df = pd.read_parquet(train_path)
test_df = pd.read_parquet(test_path)

print("train:", train_df.shape, "test:", test_df.shape)

# Identifier columns, limited to the ones that actually exist in train.
KEY_COLS = [
    name
    for name in ("ID", "date_on", "country_name", "region_name", "region_id")
    if name in train_df.columns
]
# Climate and futures columns are recognised purely by their name prefix.
climate_cols = [name for name in train_df.columns if name.startswith("climate_risk_")]
futures_cols = [name for name in train_df.columns if name.startswith("futures_")]

print("key cols    :", KEY_COLS)
print("climate cols:", len(climate_cols))
print("futures cols:", len(futures_cols))



train: (255756, 365) test: (64691, 365)
key cols    : ['ID', 'date_on', 'country_name', 'region_name', 'region_id']
climate cols: 326
futures cols: 17


In [4]:
# Compute correlation by country × month and score each climate feature

# Month key used for grouping: reuse a precomputed `date_on_month` column when
# train has one, otherwise parse `date_on` (unparseable dates are coerced to
# missing values rather than raising).
train_df["_month"] = (
    train_df["date_on_month"]
    if "date_on_month" in train_df.columns
    else pd.to_datetime(train_df["date_on"], errors="coerce").dt.month
)

# Per-feature accumulators, filled while walking the country × month groups:
#   sig_corrs : the |corr| values that reached SIGNIFICANCE_THRESHOLD
#   sig_count : how many correlations reached the threshold
#   total     : how many non-NaN correlations were examined
stats = {}
for feature_name in climate_cols:
    stats[feature_name] = {"sig_corrs": [], "sig_count": 0, "total": 0}

# One pass per (country, month) slice of train; rows with a missing key form
# their own group because dropna=False.
for _group_key, group_df in train_df.groupby(["country_name", "_month"], dropna=False):
    # Keep only columns whose population std is strictly positive in this slice;
    # constant or all-NaN columns fail the test and are left out.
    varying_climate = [col for col in climate_cols if group_df[col].std(ddof=0) > 0]
    varying_futures = [col for col in futures_cols if group_df[col].std(ddof=0) > 0]

    if not (varying_climate and varying_futures):
        continue

    # Only the climate-rows × futures-columns part of the Pearson matrix is used;
    # it is rounded to 5 decimals before the threshold comparison.
    pearson = group_df[varying_climate + varying_futures].corr(method="pearson")
    cross_corr = pearson.loc[varying_climate, varying_futures].round(5)

    for feature_name in varying_climate:
        magnitudes = np.abs(cross_corr.loc[feature_name].dropna().values)
        if magnitudes.size == 0:
            continue
        is_significant = magnitudes >= SIGNIFICANCE_THRESHOLD
        feature_stats = stats[feature_name]
        feature_stats["total"] += len(magnitudes)
        feature_stats["sig_count"] += int(is_significant.sum())
        # Extending with an empty list is a no-op, so no separate "any" guard is needed.
        feature_stats["sig_corrs"].extend(magnitudes[is_significant].tolist())

# Aggregate metrics
# Turn the per-feature accumulators in `stats` into one score row per feature:
#   avg_sig_corr : mean of the significant |corr| values (0.0 when there are none)
#   sig_pct      : sig_count / total in percent (0.0 when total is 0)
#   score        : 0.5 * min(100, avg_sig_corr * 100) + 0.2 * sig_pct
rows = []
for feat, s in stats.items():
    total = s["total"]
    sig_count = s["sig_count"]
    sig_corrs = s["sig_corrs"]

    avg_sig = float(np.mean(sig_corrs)) if sig_corrs else 0.0
    sig_pct = (sig_count / total * 100.0) if total > 0 else 0.0

    # Put the average correlation on a 0-100 scale so both score terms are comparable.
    avg_sig_score = min(100.0, avg_sig * 100.0)
    score = 0.5 * avg_sig_score + 0.2 * sig_pct

    rows.append(
        {
            "feature": feat,
            "avg_sig_corr": round(avg_sig, 6),
            "sig_count": sig_count,
            "total": total,
            "sig_pct": round(sig_pct, 6),
            "score": round(score, 6),
        }
    )

# Explicit columns keep the frame well-formed (and sortable) even when `rows`
# is empty. The sort is stable (mergesort): features with an equal score keep
# their `stats` iteration order instead of an arbitrary quicksort order, so a
# later top-N cut does not depend on unspecified tie ordering.
feature_score = (
    pd.DataFrame(
        rows,
        columns=["feature", "avg_sig_corr", "sig_count", "total", "sig_pct", "score"],
    )
    .sort_values("score", ascending=False, kind="mergesort")
    .reset_index(drop=True)
)

print("top 20:")
display(feature_score.head(20))
print("bottom 20:")
display(feature_score.tail(20))



top 20:


Unnamed: 0,feature,avg_sig_corr,sig_count,total,sig_pct,score
0,climate_risk_unseasonably_cold_time_since_high_20,0.684035,189,2244,8.42246,35.886219
1,climate_risk_unseasonably_cold_time_since_high_30,0.662958,212,2244,9.447415,35.037393
2,climate_risk_heat_stress_time_since_high_30,0.623515,386,2244,17.201426,34.616029
3,climate_risk_heat_stress_time_since_high_20,0.631805,302,2244,13.458111,34.281897
4,climate_risk_unseasonably_cold_vol_30d,0.620221,35,1802,1.942286,31.399514
5,climate_risk_drought_ema_30d,0.607084,76,2244,3.386809,31.031539
6,climate_risk_drought_cumsum_90d,0.593945,145,2244,6.461676,30.989583
7,climate_risk_drought_cumsum_60d,0.597878,116,2244,5.16934,30.927756
8,climate_risk_drought_ma_60d,0.597723,113,2244,5.035651,30.893267
9,climate_risk_drought_cumsum_30d,0.602103,86,2244,3.832442,30.871657


bottom 20:


Unnamed: 0,feature,avg_sig_corr,sig_count,total,sig_pct,score
306,climate_risk_excess_precip_seasonal_z,0.0,0,2244,0.0,0.0
307,climate_risk_country_heat_stress_top1_minus_top2,0.0,0,1462,0.0,0.0
308,climate_risk_country_heat_stress_score_wstd,0.0,0,1462,0.0,0.0
309,climate_risk_unseasonably_cold_balance,0.0,0,2244,0.0,0.0
310,climate_risk_heat_stress_weighted_severity_051,0.0,0,1462,0.0,0.0
311,climate_risk_heat_stress_weighted_balance,0.0,0,2244,0.0,0.0
312,climate_risk_unseasonably_cold_score,0.0,0,1581,0.0,0.0
313,climate_risk_unseasonably_cold_weighted,0.0,0,1581,0.0,0.0
314,climate_risk_unseasonably_cold_high_share,0.0,0,1581,0.0,0.0
315,climate_risk_unseasonably_cold_seasonal_z,0.0,0,1581,0.0,0.0


In [5]:
# Take the TOP_K highest-scoring features as the selection and persist everything.
#
# Files written to SEL_DIR, all suffixed with `tag`:
#   feature_{tag}.parquet            full score table
#   selected_features_{tag}.parquet  names of the TOP_K selected features
#   removed_features_{tag}.parquet   names of every other scored feature
#   train_{tag}.parquet / test_{tag}.parquet  key + selected + futures columns

# Number of features to keep. It drives the selection, the log message and the
# file tag together, so the three can no longer drift apart.
TOP_K = 4

selected = feature_score.head(TOP_K)["feature"].tolist()
removed = feature_score[~feature_score["feature"].isin(selected)]["feature"].tolist()

print(f"Top{TOP_K} selected: {len(selected)} | removed: {len(removed)}")

# The trailing "_2" is kept verbatim from the existing tag (presumably a run/version
# suffix — confirm before changing, since other steps may read these file names).
tag = f"cfcs_{THRESH_TAG}_top{TOP_K}_2"

feature_score.to_parquet(SEL_DIR / f"feature_{tag}.parquet", index=False)
pd.DataFrame({"feature": selected}).to_parquet(SEL_DIR / f"selected_features_{tag}.parquet", index=False)
pd.DataFrame({"feature": removed}).to_parquet(SEL_DIR / f"removed_features_{tag}.parquet", index=False)

print(f"saved feature / selected_features / removed_features (tag={tag})")

keep_cols = KEY_COLS + selected + futures_cols
keep_cols = [c for c in keep_cols if c in train_df.columns]

# Apply the same "only columns that exist" rule to the test frame, so a column
# absent from test is reported instead of aborting the cell with a KeyError
# after the feature lists above were already written.
test_keep_cols = [c for c in keep_cols if c in test_df.columns]
missing_in_test = [c for c in keep_cols if c not in test_df.columns]
if missing_in_test:
    print("columns absent from test (skipped):", missing_in_test)

train_sel = train_df[keep_cols].copy()
test_sel = test_df[test_keep_cols].copy()

train_sel.to_parquet(SEL_DIR / f"train_{tag}.parquet", index=False)
test_sel.to_parquet(SEL_DIR / f"test_{tag}.parquet", index=False)

print("saved:", SEL_DIR / f"train_{tag}.parquet")
print("saved:", SEL_DIR / f"test_{tag}.parquet")



Top4 selected: 4 | removed: 322
saved feature / selected_features / removed_features (tag=cfcs_0.5_top4_2)
saved: e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\Data\Selection\train_cfcs_0.5_top4_2.parquet
saved: e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\Data\Selection\test_cfcs_0.5_top4_2.parquet


In [6]:
# Echo the list of selected feature names as this cell's output.
selected

['climate_risk_unseasonably_cold_time_since_high_20',
 'climate_risk_unseasonably_cold_time_since_high_30',
 'climate_risk_heat_stress_time_since_high_30',
 'climate_risk_heat_stress_time_since_high_20']