In [1]:
# ====================== Advanced RMSE pipeline for yield_tpha ======================
# - Robust validation: StratifiedKFold by target bins (RMSE per fold + OOF)
# - Models: CatBoostRegressor + LightGBM (category-aware), stacking via LinearRegression
# - Feature eng: auto date parsing -> year/month/dayofyear/week/quarter, missingness flags
# - No hardcoded save path: writes "submission.csv" in CWD
# - Test path fixed as requested: /kaggle/input/crop-yield-prediction-challenge/crop_yield_test.csv
# ================================================================================
import os, sys, gc, warnings, numpy as np, pandas as pd
warnings.filterwarnings("ignore")

SEED = 42
np.random.seed(SEED)

# --------------------------- Utils ---------------------------
def rmse(a, b):
    return float(np.sqrt(np.mean((np.asarray(a) - np.asarray(b))**2)))

def make_stratified_kfold_bins(y, n_bins=10):
    # quantile bins for regression stratification
    q = np.linspace(0, 1, n_bins + 1)
    # handle duplicates: np.unique on quantiles
    bins = np.unique(np.quantile(y, q))
    # digitize (rightmost bin inclusive)
    return np.digitize(y, bins[1:-1], right=True)

def try_read(path_list):
    for p in path_list:
        if os.path.exists(p):
            return pd.read_csv(p)
    raise FileNotFoundError(f"Training file not found. Tried:\n" + "\n".join(path_list))

def parse_dates_inplace(df, prefix_keep=True):
    """Parse likely date columns in df and add date-derived features."""
    candidates = [c for c in df.columns if any(k in c.lower() for k in ["date", "time", "dt", "ts"])]
    # Also try to parse object columns heuristically
    for c in df.columns:
        if df[c].dtype == "object" and c not in candidates:
            sample = df[c].dropna().astype(str).head(100).tolist()
            if any("-" in s or "/" in s or ":" in s for s in sample):
                candidates.append(c)
    candidates = list(dict.fromkeys(candidates))  # uniq & keep order

    for col in candidates:
        try:
            dt = pd.to_datetime(df[col], errors="coerce", utc=False, infer_datetime_format=True)
            ok_ratio = (~dt.isna()).mean()
            if ok_ratio < 0.5:  # too many NaT -> skip
                continue
            base = col if prefix_keep else "dt_" + col
            df[f"{base}_year"]       = dt.dt.year.astype("Int64")
            df[f"{base}_month"]      = dt.dt.month.astype("Int64")
            df[f"{base}_dayofyear"]  = dt.dt.dayofyear.astype("Int64")
            df[f"{base}_week"]       = dt.dt.isocalendar().week.astype("Int64")
            df[f"{base}_quarter"]    = dt.dt.quarter.astype("Int64")
        except Exception:
            pass

def add_missingness_flags(df, num_cols):
    for c in num_cols:
        if df[c].isna().any():
            df[f"{c}__isna"] = df[c].isna().astype(np.int8)

def unify_categories(train, test, cat_cols):
    """Make sure train/test have identical categorical categories (LightGBM friendly)."""
    for c in cat_cols:
        tr = train[c].astype("category")
        te = test[c].astype("category")
        cats = pd.Index(sorted(set(tr.astype(str)).union(set(te.astype(str)))))
        train[c] = pd.Categorical(tr.astype(str), categories=cats)
        test[c]  = pd.Categorical(te.astype(str), categories=cats)

# --------------------------- Paths ---------------------------
TRAIN_CANDIDATES = [
    "/kaggle/input/crop-yield-prediction-challenge/crop_yield_train.csv",
    "/mnt/data/crop_yield_train.csv",  # fallback (если вы запускали локально)
]
TEST_PATH = "/kaggle/input/crop-yield-prediction-challenge/crop_yield_test.csv"
SAMPLE_SUB_CANDIDATES = [
    "/kaggle/input/crop-yield-prediction-challenge/sample_submission.csv",
    "/mnt/data/sample_submission.csv",
]

# --------------------------- Load ---------------------------
train = try_read(TRAIN_CANDIDATES)
test  = pd.read_csv(TEST_PATH)
try:
    sample_sub = try_read(SAMPLE_SUB_CANDIDATES)
except Exception:
    # Если файла sample_submission нет, создадим болванку из первого столбца теста
    first_col = test.columns[0]
    sample_sub = pd.DataFrame({first_col: test[first_col].values, "yield_tpha": 0.0})

TARGET = "yield_tpha"
assert TARGET in train.columns, f"`{TARGET}` нет в колонках train: {train.columns.tolist()}"

id_col = sample_sub.columns[0]
if id_col not in test.columns:
    id_col = "id" if "id" in test.columns else test.columns[0]

# выбросим строки без таргета
train = train[~train[TARGET].isna()].copy()

# --------------------------- Basic FE ---------------------------
# Парсим даты (safe)
parse_dates_inplace(train, prefix_keep=True)
parse_dates_inplace(test,  prefix_keep=True)

# синхронизируем столбцы между train/test (кроме таргета)
feat_cols = [c for c in train.columns if c != TARGET]
missing_in_test  = [c for c in feat_cols if c not in test.columns]
missing_in_train = [c for c in test.columns if c not in feat_cols and c != TARGET]

for c in missing_in_test:
    test[c] = np.nan
for c in missing_in_train:
    # добавляем пустые в train, если вдруг есть фичи, которых нет в train, но есть в test
    if c != TARGET:
        train[c] = np.nan

# итоговый список фич одинаков
feat_cols = [c for c in train.columns if c != TARGET]
train = train[feat_cols + [TARGET]]
test  = test[feat_cols]

# типы
num_cols = train[feat_cols].select_dtypes(include=[np.number, "Int64", "Float64"]).columns.tolist()
cat_cols = [c for c in feat_cols if c not in num_cols]

# индикаторы пропусков по числовым
add_missingness_flags(train, num_cols)
add_missingness_flags(test, num_cols)

# обновим списки фич
feat_cols = [c for c in train.columns if c != TARGET]
num_cols = train[feat_cols].select_dtypes(include=[np.number, "Int64", "Float64"]).columns.tolist()
cat_cols = [c for c in feat_cols if c not in num_cols]

# приведение категорий
for c in cat_cols:
    train[c] = train[c].astype("category")
    test[c]  = test[c].astype("category")
unify_categories(train, test, cat_cols)

# --------------------------- CV setup ---------------------------
from sklearn.model_selection import StratifiedKFold
y = train[TARGET].values
bins = make_stratified_kfold_bins(y, n_bins=10)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

X = train[feat_cols].reset_index(drop=True)
y = train[TARGET].reset_index(drop=True).astype(float)
X_test = test[feat_cols].reset_index(drop=True)

# --------------------------- Models ---------------------------
use_catboost = True
try:
    from catboost import CatBoostRegressor, Pool
except Exception:
    use_catboost = False

use_lightgbm = True
try:
    import lightgbm as lgb
    from lightgbm import LGBMRegressor
except Exception:
    use_lightgbm = False

if not (use_catboost or use_lightgbm):
    raise ImportError("Need at least one of CatBoost or LightGBM installed in the Kaggle environment.")

# параметры (чуть консервативные, но сильные; правьте при желании)
cb_params = dict(
    loss_function="RMSE",
    eval_metric="RMSE",
    iterations=5000,
    depth=8,
    learning_rate=0.03,
    l2_leaf_reg=3.0,
    random_seed=SEED,
    od_type="Iter",
    od_wait=200,
    verbose=False
)

lgb_params = dict(
    objective="regression",
    metric="rmse",
    n_estimators=10000,
    learning_rate=0.03,
    max_depth=-1,
    num_leaves=63,
    min_data_in_leaf=40,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=SEED
)

# --------------------------- OOF training ---------------------------
oof_cb = np.zeros(len(X))
oof_lgb = np.zeros(len(X))
pred_cb = np.zeros(len(X_test))
pred_lgb = np.zeros(len(X_test))

fold_rmse_cb, fold_rmse_lgb = [], []

for fold, (tr_idx, va_idx) in enumerate(cv.split(X, bins), 1):
    X_tr, X_va = X.iloc[tr_idx].copy(), X.iloc[va_idx].copy()
    y_tr, y_va = y.iloc[tr_idx].values, y.iloc[va_idx].values

    # Ensure the same categories in fold slices
    for c in cat_cols:
        X_tr[c] = X_tr[c].astype("category")
        X_va[c] = X_va[c].astype("category")
        X_tr[c] = X_tr[c].cat.set_categories(X[c].cat.categories)
        X_va[c] = X_va[c].cat.set_categories(X[c].cat.categories)

    # --------- CatBoost ---------
    if use_catboost:
        tr_pool = Pool(X_tr, y_tr, cat_features=[X_tr.columns.get_loc(c) for c in cat_cols])
        va_pool = Pool(X_va, y_va, cat_features=[X_va.columns.get_loc(c) for c in cat_cols])
        model_cb = CatBoostRegressor(**cb_params)
        model_cb.fit(tr_pool, eval_set=va_pool, verbose=False)
        pred_va = model_cb.predict(X_va)
        pred_te = model_cb.predict(X_test)
        oof_cb[va_idx] = pred_va
        pred_cb += pred_te / cv.n_splits
        fold_rmse_cb.append(rmse(y_va, pred_va))

    # --------- LightGBM ---------
    if use_lightgbm:
        # LightGBM понимает category dtype напрямую
        model_lgb = LGBMRegressor(**lgb_params)
        model_lgb.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="rmse",
            callbacks=[
                lgb.early_stopping(stopping_rounds=200, verbose=False),
            ],
        )
        pred_va = model_lgb.predict(X_va, num_iteration=model_lgb.best_iteration_)
        pred_te = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration_)
        oof_lgb[va_idx] = pred_va
        pred_lgb += pred_te / cv.n_splits
        fold_rmse_lgb.append(rmse(y_va, pred_va))

    print(f"[Fold {fold}] "
          + (f"CB RMSE={fold_rmse_cb[-1]:.5f}  " if use_catboost else "")
          + (f"LGBM RMSE={fold_rmse_lgb[-1]:.5f}" if use_lightgbm else ""))

# --------------------------- CV summary ---------------------------
metrics_log = []
if use_catboost:
    cv_cb = rmse(y, oof_cb)
    metrics_log.append(("CatBoost OOF RMSE", cv_cb))
if use_lightgbm:
    cv_lgb = rmse(y, oof_lgb)
    metrics_log.append(("LightGBM OOF RMSE", cv_lgb))

print("\n=== Cross-Validation summary (OOF) ===")
for name, val in metrics_log:
    print(f"{name}: {val:.6f}")

# --------------------------- Stacking (level-2) ---------------------------
from sklearn.linear_model import LinearRegression
stack_feats = []
stack_test = []
if use_catboost: stack_feats.append(oof_cb.reshape(-1,1)); stack_test.append(pred_cb.reshape(-1,1))
if use_lightgbm: stack_feats.append(oof_lgb.reshape(-1,1)); stack_test.append(pred_lgb.reshape(-1,1))

Z_tr = np.hstack(stack_feats)
Z_te = np.hstack(stack_test)

meta = LinearRegression(fit_intercept=True)
meta.fit(Z_tr, y.values)
oof_blend = meta.predict(Z_tr)
pred_blend = meta.predict(Z_te)

print("\nMeta weights (LR):", meta.coef_, " Intercept:", meta.intercept_)
print(f"Stacked OOF RMSE: {rmse(y, oof_blend):.6f}")

# --------------------------- Feature importance (top-30) ---------------------------
def top_importances(model, cols, k=30):
    imp = None
    try:
        if hasattr(model, "feature_importances_"):
            imp = model.feature_importances_
        elif hasattr(model, "get_feature_importance"):
            imp = model.get_feature_importance()
    except Exception:
        pass
    if imp is not None:
        fi = pd.DataFrame({"feature": cols, "importance": imp})
        fi = fi.sort_values("importance", ascending=False).head(k)
        print("\nTop features:")
        print(fi.to_string(index=False))
    else:
        print("\n(Feature importance not available for this model)")

# покажем важности последней обученной модели LGBM/CB (если они есть)
if use_lightgbm:
    top_importances(model_lgb, X.columns.tolist(), k=30)
elif use_catboost:
    top_importances(model_cb, X.columns.tolist(), k=30)

# --------------------------- Submission ---------------------------
# Берём стек (обычно лучше OOF-мета), но при желании можно усреднить с лучшей моделью
final_pred = pred_blend

sub = sample_sub.copy()
if id_col in sub.columns and id_col in test.columns:
    sub = sub[[id_col]].merge(
        pd.DataFrame({id_col: test[id_col].values, "yield_tpha": final_pred}),
        on=id_col, how="left"
    )
else:
    # fallback: если sample_sub странный
    key = sub.columns[0]
    sub[key] = test[key].values
    if "yield_tpha" not in sub.columns:
        sub["yield_tpha"] = final_pred
    else:
        sub["yield_tpha"] = final_pred

sub.to_csv("submission.csv", index=False)
print("\nSaved: submission.csv")
print(sub.head(10))
# ================================================================================


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000645 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3981
[LightGBM] [Info] Number of data points in the train set: 3840, number of used features: 21
[LightGBM] [Info] Start training from score 6.266775
[Fold 1] CB RMSE=0.65719  LGBM RMSE=0.66621
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3980
[LightGBM] [Info] Number of data points in the train set: 3840, number of used features: 21
[LightGBM] [Info] Start training from score 6.266255
[Fold 2] CB RMSE=0.63572  LGBM RMSE=0.65022
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000537 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3980
[LightGBM] [Info] Number of data p