In [1]:
# 1_Data_Processing (Step 1)
# - Load raw data (competition CSV)
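# - Forward-fill NaN in climate_risk_* / futures_* columns within each region_id, ordered by date_on
# - Fill any climate_risk_* NaN that remain after the forward-fill with 0
# - Drop rows where futures_* are still missing after the forward-fill
#   (no earlier trading-day value in that region's series to carry forward)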
# - Save to Data/Processing/processed.parquet
# - Create valid_ids.parquet (used for Step 4 submission alignment)


In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

# Notebook-wide pandas display settings so wide frames print without truncation
pd.options.display.max_columns = 200
pd.options.display.width = 140

# Resolve input/output roots.
# Local run: everything lives under the current working directory.
# Kaggle run: detected by the presence of the /kaggle/input mount.
if not Path("/kaggle/input").exists():
    BASE_DIR = Path.cwd()
    DATA_DIR = BASE_DIR / "forecasting-the-future-the-helios-corn-climate-challenge"
    OUT_DIR = BASE_DIR
else:
    DATA_DIR = Path("/kaggle/input/forecasting-the-future-the-helios-corn-climate-challenge")
    OUT_DIR = Path("/kaggle/working")

# Raw competition files
RAW_MAIN = DATA_DIR.joinpath("corn_climate_risk_futures_daily_master.csv")
RAW_SHARE = DATA_DIR.joinpath("corn_regional_market_share.csv")

# Folder that receives this step's parquet outputs (created if absent)
PROC_DIR = OUT_DIR.joinpath("Data", "Processing")
PROC_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations and whether the raw inputs are actually there
print(f"DATA_DIR: {DATA_DIR}")
print(f"OUT_DIR : {OUT_DIR}")
print(f"RAW_MAIN: {RAW_MAIN.exists()} {RAW_MAIN}")
print(f"RAW_SHARE: {RAW_SHARE.exists()} {RAW_SHARE}")
print(f"PROC_DIR: {PROC_DIR}")


DATA_DIR: e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\forecasting-the-future-the-helios-corn-climate-challenge
OUT_DIR : e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo
RAW_MAIN: True e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\forecasting-the-future-the-helios-corn-climate-challenge\corn_climate_risk_futures_daily_master.csv
RAW_SHARE: True e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\forecasting-the-future-the-helios-corn-climate-challenge\corn_regional_market_share.csv
PROC_DIR: e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\Data\Processing


In [3]:
# Load data
#
# Read the raw master CSV in one pass (low_memory=False) and parse date_on.
# Values that cannot be parsed become NaT (errors="coerce"); their count is
# printed below so a bad date column is visible immediately.

df = pd.read_csv(RAW_MAIN, low_memory=False).assign(
    date_on=lambda frame: pd.to_datetime(frame["date_on"], errors="coerce")
)

n_bad_dates = int(df["date_on"].isna().sum())
first_day, last_day = df["date_on"].min(), df["date_on"].max()

print(f"raw df: {df.shape}")
print(f"date_on null: {n_bad_dates}")
print(f"date range: {first_day} ~ {last_day}")

# Column groups are selected purely by name prefix
climate_cols = df.columns[df.columns.str.startswith("climate_risk_")].tolist()
futures_cols = df.columns[df.columns.str.startswith("futures_")].tolist()

print(f"climate cols: {len(climate_cols)}")
print(f"futures cols: {len(futures_cols)}")


raw df: (320661, 41)
date_on null: 0
date range: 2016-01-01 00:00:00 ~ 2025-12-15 00:00:00
climate cols: 12
futures cols: 17


In [4]:
# Forward-fill NaN in climate_risk_* and futures_* columns, one region_id at a time.
# Rows are ordered by (region_id, date_on) first so the fill runs forward in time.
# Climate NaN that survive the fill (nothing earlier in the region to copy from)
# are then set to 0; futures NaN are left as-is and only counted here.

KEY_COLS = ["ID", "region_id", "date_on"]
key_names = set(KEY_COLS)
ffill_cols = list(
    filter(lambda name: name not in key_names, [*climate_cols, *futures_cols])
)

work = df.sort_values(by=["region_id", "date_on"]).copy()
filled_by_region = work.groupby("region_id")[ffill_cols].ffill()
work[ffill_cols] = filled_by_region

# Remaining climate NaN -> 0
work[climate_cols] = work[climate_cols].fillna(0)

climate_nan_left = int(work[climate_cols].isna().sum().sum())
futures_nan_left = int(work[futures_cols].isna().sum().sum())
print(f"after ffill: climate NaN total: {climate_nan_left}")
print(f"after ffill: futures NaN total: {futures_nan_left}")


after ffill: climate NaN total: 0
after ffill: futures NaN total: 3638


In [5]:
# Drop rows with missing futures_* and save parquet
#
# Any row that still has a NaN in one of the futures_* columns is removed;
# the remaining rows are written to processed.parquet. The distinct IDs of
# those rows (cast to str) are written to valid_ids.parquet.

before_rows = len(work)
work_valid = work.dropna(subset=futures_cols).copy()

print("rows before:", before_rows)
print("rows after dropna(futures):", len(work_valid))

out_path = PROC_DIR / "processed.parquet"
work_valid.to_parquet(out_path, index=False)
print("saved:", out_path)

# Save valid ID list (for later submission alignment)
valid_ids_path = PROC_DIR / "valid_ids.parquet"
# Build the ID frame once so the count reported below is exactly the number
# of rows written. Series.nunique() on the raw column skips NaN and compares
# values in their original dtype, so it can disagree with the string-cast,
# de-duplicated frame that actually lands in the file.
valid_ids = pd.DataFrame({"ID": work_valid["ID"].astype(str)}).drop_duplicates()
valid_ids.to_parquet(valid_ids_path, index=False)
print("saved:", valid_ids_path, "| unique IDs:", len(valid_ids))


rows before: 320661
rows after dropna(futures): 320447
saved: e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\Data\Processing\processed.parquet
saved: e:\Desktop\Mr RRR_Helios Corn Futures Climate Challenge_Repo\Data\Processing\valid_ids.parquet | unique IDs: 320447
