In [None]:
# =========================================================
# Predict the Flood — FULL Kaggle Notebook (Fixed RMSE)
# Competition folder: /kaggle/input/predict-the-floodd
# Target: FloodProbability (0..1)
# Model: LightGBM CV + Safe Features + Optional CatBoost Blend
# Output: submission.csv
#
# FIX: sklearn mean_squared_error no longer supports squared=False
#      -> use root_mean_squared_error OR sqrt(MSE)
# =========================================================

import os, gc, random, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold

# ✅ FIXED RMSE (works even if sklearn removed squared= parameter)
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    The square root is applied to sklearn's ``mean_squared_error`` result
    here, so the helper does not rely on the ``squared=`` keyword.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

import lightgbm as lgb

# -----------------------------
# Reproducibility
# -----------------------------
SEED = 42


def seed_everything(seed=SEED):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once at import time so everything below starts from a fixed RNG state.
seed_everything(SEED)

# =========================================================
# 1) Load Data (Kaggle paths)
# =========================================================
DATA_DIR = "/kaggle/input/predict-the-floodd/"

# Read the three competition CSVs in order: train, test, sample submission.
train, test, sub = (
    pd.read_csv(f"{DATA_DIR}/{stem}.csv")
    for stem in ("train", "test", "sample_submission")
)

# Quick sanity check of what was loaded.
print("train:", train.shape, "| test:", test.shape, "| sub:", sub.shape)
display(train.head())

# =========================================================
# 2) Target and ID
# =========================================================
TARGET = "FloodProbability"
assert TARGET in train.columns, f"TARGET '{TARGET}' not found in train columns!"


def _is_id_like(col):
    """Return True for columns that identify a row rather than describe it.

    Besides ``id``/``index`` this also matches ``Unnamed: N`` columns, the
    name pandas gives a header-less CSV column (typically a saved index).
    """
    name = str(col).strip().lower()
    return name in ("id", "index") or name.startswith("unnamed:")


# All identifier-like columns; ID_COL keeps the first one (or None) as before.
ID_COLS = [c for c in train.columns if _is_id_like(c)]
ID_COL = ID_COLS[0] if ID_COLS else None
print("ID_COL:", ID_COL)

# =========================================================
# 3) Split features/target + align columns
# =========================================================
# Row identifiers are not real features: left in, they would be fed to the
# models and would also be folded into the row-wise mean/std/min/max/sum
# statistics computed in the feature-engineering step.
X = train.drop(columns=[TARGET, *ID_COLS]).copy()
y = train[TARGET].values

# Ensure test has same feature columns
absent = [c for c in X.columns if c not in test.columns]
if absent:
    raise KeyError(f"test is missing feature columns: {absent}")
test = test[X.columns].copy()

print("\nMissing values:")
print("Train missing:", X.isna().sum().sum())
print("Test missing :", test.isna().sum().sum())

print("\nTarget stats:")
print(pd.Series(y).describe())

# =========================================================
# 4) Safe Feature Engineering (row-wise stats)
# =========================================================
def add_row_stats(df):
    """Return a copy of ``df`` with per-row summary statistics appended.

    The statistics are computed across every numeric column present in
    ``df`` at call time and stored as ``row_mean``, ``row_std``, ``row_min``,
    ``row_max`` and ``row_sum`` (in that order). The input frame is not
    modified.
    """
    numeric = df.select_dtypes(include=[np.number])
    return df.assign(
        row_mean=numeric.mean(axis=1),
        row_std=numeric.std(axis=1),
        row_min=numeric.min(axis=1),
        row_max=numeric.max(axis=1),
        row_sum=numeric.sum(axis=1),
    )

# Apply the same row-wise feature engineering to train and test features.
X_fe, test_fe = (add_row_stats(frame) for frame in (X, test))

print("\nAfter FE:")
print("X_fe:", X_fe.shape, "test_fe:", test_fe.shape)

# =========================================================
# 5) CV Strategy: StratifiedKFold on binned target
# =========================================================
N_SPLITS = 5

# Quantile-bin the continuous target (up to 20 bins, duplicate edges dropped)
# so the folds can be stratified on it.
bins = pd.qcut(x=train[TARGET], q=20, labels=False, duplicates="drop")
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=SEED, shuffle=True)

# =========================================================
# 6) LightGBM CV Training
# =========================================================
lgb_params = dict(
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    learning_rate=0.03,
    num_leaves=256,
    min_data_in_leaf=80,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=1,
    lambda_l1=0.0,
    lambda_l2=2.0,
    verbosity=-1,
    seed=SEED,
)

# Per-fold scores, out-of-fold predictions and fold-averaged test predictions.
fold_scores_lgb = []
oof_lgb = np.zeros(len(train))
pred_test_lgb = np.zeros(len(test_fe))

print("\n===== LightGBM CV =====")
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_fe, bins), start=1):
    X_tr, y_tr = X_fe.iloc[tr_idx], y[tr_idx]
    X_va, y_va = X_fe.iloc[va_idx], y[va_idx]

    dtr = lgb.Dataset(X_tr, label=y_tr)
    dva = lgb.Dataset(X_va, label=y_va)

    # Train with a large round budget; early stopping on the validation fold
    # decides the effective number of trees.
    model = lgb.train(
        params=lgb_params,
        train_set=dtr,
        num_boost_round=20000,
        valid_sets=[dva],
        valid_names=["valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=500, verbose=False),
            lgb.log_evaluation(period=500),
        ],
    )
    best_iter = model.best_iteration

    # Score the held-out fold and add this fold's share of the test prediction.
    va_pred = model.predict(X_va, num_iteration=best_iter)
    oof_lgb[va_idx] = va_pred
    pred_test_lgb += model.predict(test_fe, num_iteration=best_iter) / N_SPLITS

    fold_rmse = rmse(y_va, va_pred)
    fold_scores_lgb.append(fold_rmse)
    print(f"Fold {fold}: RMSE={fold_rmse:.6f} | best_iter={best_iter}")

    # Release fold-local objects before the next fold.
    del model, dtr, dva, X_tr, X_va, y_tr, y_va
    gc.collect()

# Overall out-of-fold score plus the per-fold summary.
cv_lgb = rmse(y, oof_lgb)
print("\n[LGB] CV RMSE:", cv_lgb)
rounded_scores = [round(score, 6) for score in fold_scores_lgb]
print("[LGB] Fold scores:", rounded_scores,
      "Mean:", np.mean(fold_scores_lgb), "Std:", np.std(fold_scores_lgb))

# =========================================================
# 7) Optional CatBoost + Blend (often boosts LB)
# =========================================================
USE_CATBOOST = True   # set False if you want only LGBM

# Default to the LightGBM prediction; replaced by the blend if CatBoost runs.
final_pred = pred_test_lgb.copy()

# cat_ok is True only when CatBoost was requested and could be imported.
cat_ok = False
if USE_CATBOOST:
    try:
        from catboost import CatBoostRegressor
    except ImportError:
        # Only a missing/unimportable package triggers the LGB-only fallback;
        # a bare except here would also swallow KeyboardInterrupt and real bugs.
        print("\nCatBoost not available. Using LGB only.")
    else:
        cat_ok = True

if cat_ok:
    print("\n===== CatBoost CV =====")
    oof_cb = np.zeros(len(X_fe))
    pred_test_cb = np.zeros(len(test_fe))
    fold_scores_cb = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_fe, bins), 1):
        X_tr, X_va = X_fe.iloc[tr_idx], X_fe.iloc[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        cb = CatBoostRegressor(
            loss_function="RMSE",
            iterations=30000,
            learning_rate=0.03,
            depth=8,
            l2_leaf_reg=6,
            random_seed=SEED,
            verbose=1000,
            od_type="Iter",
            od_wait=800,
        )
        cb.fit(X_tr, y_tr, eval_set=(X_va, y_va), use_best_model=True)

        va_pred = cb.predict(X_va)
        # Keep out-of-fold predictions so the blend can be scored below.
        oof_cb[va_idx] = va_pred
        fold_rmse = rmse(y_va, va_pred)
        fold_scores_cb.append(fold_rmse)

        pred_test_cb += cb.predict(test_fe) / N_SPLITS

        print(f"[CB] Fold {fold}: RMSE={fold_rmse:.6f}")

        del cb, X_tr, X_va, y_tr, y_va
        gc.collect()

    print("\n[CB] Mean:", np.mean(fold_scores_cb), "Std:", np.std(fold_scores_cb))

    # Blend weights
    W_LGB = 0.6
    W_CB  = 0.4
    final_pred = W_LGB * pred_test_lgb + W_CB * pred_test_cb
    print(f"\nBlending: LGB={W_LGB}  CB={W_CB}")

    # Score the same weights on out-of-fold predictions so the blend can be
    # compared against the LightGBM-only CV score instead of being trusted
    # blindly. Both loops call skf.split with the same seeded splitter.
    cv_cb = rmse(y, oof_cb)
    cv_blend = rmse(y, W_LGB * oof_lgb + W_CB * oof_cb)
    print("[CB] CV RMSE:", cv_cb, "| [Blend] CV RMSE:", cv_blend)

# =========================================================
# 8) Create submission.csv
# =========================================================
sub2 = sub.copy()

# Find submission target column (not ID)
# Prefer the known target name. Otherwise fall back to "the single column
# that is not an identifier", where identifiers include the "Unnamed: N"
# columns pandas creates for a header-less (saved index) CSV column.
if TARGET in sub2.columns:
    sub_target = TARGET
else:
    sub_target = [
        c for c in sub2.columns
        if str(c).lower() not in ("id", "index")
        and not str(c).lower().startswith("unnamed:")
    ]
    if len(sub_target) != 1:
        print("Submission columns:", sub2.columns.tolist())
        raise ValueError("Could not infer submission target column.")
    sub_target = sub_target[0]

# Predictions are assigned by position, so the row counts must agree.
if len(final_pred) != len(sub2):
    raise ValueError(
        f"Prediction length {len(final_pred)} does not match "
        f"submission length {len(sub2)}."
    )

sub2[sub_target] = final_pred
sub2.to_csv("submission.csv", index=False)

print("\nSaved submission.csv âœ…")
display(sub2.head())

train: (376387, 21) | test: (185385, 21) | sub: (185385, 2)


Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,10.907827,3.450074,4.03175,4.849948,9.671641,4.934358,,5.719451,4.197449,5.450808,...,1.826199,,2.100308,4.781873,3.550904,6.009351,5.178502,1.737822,5.820953,0.5
1,,4.923656,,35.602568,8.867987,5.133364,3.503824,2.413313,1.978167,8.52566,...,4.946545,7.822959,3.35229,3.887192,3.686099,,9.037464,3.950068,,0.545
2,7.036374,9.198907,7.335568,7.727015,5.059921,1.875038,2.785596,5.246909,7.119318,7.838039,...,2.566081,3.892975,6.976857,38.595509,5.927815,8.026921,1.947679,3.335545,4.717355,0.545
3,5.853376,8.221989,6.968215,3.617405,,2.236813,3.315606,,8.114619,6.212545,...,,3.090245,5.943337,3.259453,4.208167,3.250709,2.405658,4.857694,6.087312,0.5
4,5.883916,2.269436,5.460944,5.848432,6.754861,,8.53518,4.219204,5.938841,2.57356,...,5.912197,5.859797,6.334664,4.076011,8.260247,3.315948,3.834493,3.291773,4.810802,0.545


ID_COL: None

Missing values:
Train missing: 565726
Test missing : 278802

Target stats:
count    376387.000000
mean          0.504375
std           0.051012
min           0.285000
25%           0.470000
50%           0.505000
75%           0.540000
max           0.725000
dtype: float64

After FE:
X_fe: (376387, 25) test_fe: (185385, 25)

===== LightGBM CV =====
[500]	valid's rmse: 0.0267246
[1000]	valid's rmse: 0.0267127
Fold 1: RMSE=0.026708 | best_iter=808
[500]	valid's rmse: 0.0266136
[1000]	valid's rmse: 0.0266015
Fold 2: RMSE=0.026599 | best_iter=760
