# Deep Learning


In [1]:
# ============================================================
# 0) COLAB SETUP
# ============================================================
!pip -q install pandas numpy scikit-learn matplotlib tqdm

import os, re, zipfile, random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: cuda


In [3]:
# ============================================================
# 1) LOAD DATA
# Put your CSVs in /content (upload to Colab or mount Drive)
# ============================================================

TRAIN_FEATURES_PATH = "/content/train_hh_features.csv"
TEST_FEATURES_PATH  = "/content/test_hh_features.csv"
TRAIN_GT_PATH       = "/content/train_hh_gt.csv"
TRAIN_RATES_PATH    = "/content/train_rates_gt.csv"

train_feat = pd.read_csv(TRAIN_FEATURES_PATH)
test_feat  = pd.read_csv(TEST_FEATURES_PATH)
train_gt   = pd.read_csv(TRAIN_GT_PATH)
train_rates = pd.read_csv(TRAIN_RATES_PATH)

print("train_feat:", train_feat.shape)
print("test_feat :", test_feat.shape)
print("train_gt  :", train_gt.shape)
print("train_rates:", train_rates.shape)

train_feat.head()

train_feat: (104234, 88)
test_feat : (103023, 88)
train_gt  : (104234, 3)
train_rates: (3, 20)


Unnamed: 0,hhid,com,weight,strata,utl_exp_ppp17,male,hsize,num_children5,num_children10,num_children18,...,consumed4200,consumed4300,consumed4400,consumed4500,consumed4600,consumed4700,consumed4800,consumed4900,consumed5000,survey_id
0,100001,1,75,4,594.80627,Female,1,0,0,0,...,Yes,No,No,No,Yes,Yes,Yes,Yes,No,100000
1,100002,1,150,4,1676.2723,Female,2,0,0,0,...,Yes,No,No,No,No,Yes,Yes,No,No,100000
2,100003,1,375,4,506.93719,Male,5,0,0,2,...,Yes,Yes,No,Yes,Yes,Yes,Yes,No,Yes,100000
3,100004,1,375,4,824.61786,Male,5,0,0,1,...,No,Yes,No,No,No,Yes,Yes,No,No,100000
4,100005,1,525,4,351.47644,Male,7,1,0,0,...,Yes,No,No,Yes,No,Yes,Yes,Yes,No,100000


In [4]:
# ============================================================
# 2) KEYS + SURVEY_ID + TARGET COLUMN
# ============================================================

def extract_survey_id(hhid: int) -> int:
    # 400001 -> 400000
    return (int(hhid) // 1000) * 1000

assert "hhid" in train_feat.columns, "Missing 'hhid' in train features"
assert "hhid" in test_feat.columns,  "Missing 'hhid' in test features"

train_feat["survey_id"] = train_feat["hhid"].apply(extract_survey_id)
test_feat["survey_id"]  = test_feat["hhid"].apply(extract_survey_id)

# detect y column in train_gt
if "cons_ppp17" in train_gt.columns:
    y_col = "cons_ppp17"
elif "per_capita_household_consumption" in train_gt.columns:
    y_col = "per_capita_household_consumption"
else:
    cand = [c for c in train_gt.columns if c.lower() not in ["hhid","survey_id","household_id"]]
    assert len(cand) > 0, "Could not detect target column in train_gt"
    y_col = cand[0]

train = train_feat.merge(train_gt[["hhid", y_col]], on="hhid", how="inner")
print("Using target:", y_col)
print("train merged:", train.shape)
train[["hhid","survey_id", y_col]].head()

Using target: cons_ppp17
train merged: (104234, 89)


Unnamed: 0,hhid,survey_id,cons_ppp17
0,100001,100000,25.258402
1,100002,100000,16.996706
2,100003,100000,13.671848
3,100004,100000,7.189475
4,100005,100000,12.308855


In [5]:
# ============================================================
# 3) WEIGHT COLUMN (if none, use 1.0)
# ============================================================

possible_weight_cols = [c for c in train.columns if re.search(r"(weight|wgt|hh_w|hhweight)", c, re.IGNORECASE)]
weight_col = possible_weight_cols[0] if len(possible_weight_cols) else None
print("Detected weight_col:", weight_col)

if weight_col is None:
    train["__w__"] = 1.0
    test_feat["__w__"] = 1.0
    weight_col = "__w__"

Detected weight_col: weight


In [6]:
# ============================================================
# 4) THRESHOLDS FROM train_rates_gt
# ============================================================

assert "survey_id" in train_rates.columns, "train_rates_gt must contain survey_id"
rate_cols = [c for c in train_rates.columns if c != "survey_id"]

def parse_thr(col):
    m = re.search(r"(\d+(\.\d+)?)$", col)
    return float(m.group(1)) if m else None

thresholds = [parse_thr(c) for c in rate_cols]
if any(t is None for t in thresholds):
    raise ValueError("Could not parse thresholds from train_rates_gt column names")

thr_col_pairs = sorted(zip(thresholds, rate_cols), key=lambda x: x[0])
thresholds = [t for t,_ in thr_col_pairs]
rate_cols  = [c for _,c in thr_col_pairs]

print("thresholds:", len(thresholds), "first:", thresholds[0], "last:", thresholds[-1])

thresholds: 19 first: 3.17 last: 27.37


In [7]:
# ============================================================
# 5) FEATURE SPLIT: categorical vs numerical
# ============================================================

drop_cols = {y_col}
id_cols = {"hhid", "survey_id"}

X_all = train.drop(columns=list(drop_cols), errors="ignore").copy()
X_test = test_feat.copy()

for c in id_cols:
    if c in X_all.columns:  X_all.drop(columns=[c], inplace=True)
    if c in X_test.columns: X_test.drop(columns=[c], inplace=True)

cat_cols = [c for c in X_all.columns if X_all[c].dtype == "object"]
for c in X_all.columns:
    if c in cat_cols:
        continue
    if pd.api.types.is_integer_dtype(X_all[c]) or pd.api.types.is_bool_dtype(X_all[c]):
        if X_all[c].nunique(dropna=True) <= 64:
            cat_cols.append(c)

cat_cols = sorted(set(cat_cols))
num_cols = [c for c in X_all.columns if c not in cat_cols]

print("Categorical:", len(cat_cols))
print("Numerical  :", len(num_cols))

Categorical: 80
Numerical  : 6


In [8]:
# ============================================================
# 6) PREPROCESS: impute + encode categories + scale numericals
# ============================================================

for c in cat_cols:
    X_all[c] = X_all[c].astype("object").fillna("UNK")
    X_test[c] = X_test[c].astype("object").fillna("UNK")

for c in num_cols:
    med = X_all[c].median()
    X_all[c] = X_all[c].fillna(med)
    X_test[c] = X_test[c].fillna(med)

cat_maps = {}
X_all_cat = []
X_test_cat = []

for c in cat_cols:
    vocab = pd.Index(X_all[c].unique())
    vocab = vocab.append(pd.Index(["__UNK__"])).unique()
    mapper = {k:i for i,k in enumerate(vocab)}
    cat_maps[c] = mapper
    X_all_cat.append(X_all[c].map(lambda x: mapper.get(x, mapper["__UNK__"])).astype("int64").values)
    X_test_cat.append(X_test[c].map(lambda x: mapper.get(x, mapper["__UNK__"])).astype("int64").values)

X_all_cat = np.vstack(X_all_cat).T if len(cat_cols) else np.zeros((len(X_all),0), dtype=np.int64)
X_test_cat = np.vstack(X_test_cat).T if len(cat_cols) else np.zeros((len(X_test),0), dtype=np.int64)

scaler = StandardScaler()
X_all_num = scaler.fit_transform(X_all[num_cols].values.astype("float32")) if len(num_cols) else np.zeros((len(X_all),0), dtype=np.float32)
X_test_num = scaler.transform(X_test[num_cols].values.astype("float32")) if len(num_cols) else np.zeros((len(X_test),0), dtype=np.float32)

y = train[y_col].values.astype("float32")
w = train[weight_col].values.astype("float32")

print("X_all_cat:", X_all_cat.shape, "X_all_num:", X_all_num.shape, "y:", y.shape)

  X_all[c] = X_all[c].astype("object").fillna("UNK")
  X_test[c] = X_test[c].astype("object").fillna("UNK")
  X_all[c] = X_all[c].astype("object").fillna("UNK")
  X_test[c] = X_test[c].astype("object").fillna("UNK")
  X_all[c] = X_all[c].astype("object").fillna("UNK")
  X_test[c] = X_test[c].astype("object").fillna("UNK")
  X_all[c] = X_all[c].astype("object").fillna("UNK")
  X_test[c] = X_test[c].astype("object").fillna("UNK")
  X_all[c] = X_all[c].astype("object").fillna("UNK")
  X_test[c] = X_test[c].astype("object").fillna("UNK")
  X_all[c] = X_all[c].astype("object").fillna("UNK")
  X_test[c] = X_test[c].astype("object").fillna("UNK")
  X_all[c] = X_all[c].astype("object").fillna("UNK")
  X_test[c] = X_test[c].astype("object").fillna("UNK")
  X_all[c] = X_all[c].astype("object").fillna("UNK")
  X_test[c] = X_test[c].astype("object").fillna("UNK")
  X_all[c] = X_all[c].astype("object").fillna("UNK")
  X_test[c] = X_test[c].astype("object").fillna("UNK")
  X_all[c] = X_all[c].astype

X_all_cat: (104234, 80) X_all_num: (104234, 6) y: (104234,)


In [9]:
# ============================================================
# 7) METRICS + POVERTY UTILS (SAFE FOR FOLDS)
# ============================================================

EPS = 1e-6

def wmape(y_true, y_pred, weights=None):
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    if weights is None:
        weights = np.ones_like(y_true, dtype=np.float64)
    weights = np.asarray(weights, dtype=np.float64)
    num = np.sum(weights * np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), EPS))
    den = np.sum(weights)
    return 100.0 * (num / max(den, EPS))

def poverty_rates_from_consumption(df_with_pred, pred_col, thresholds, weight_col):
    out = {}
    for sid, g in df_with_pred.groupby("survey_id"):
        ww = g[weight_col].values.astype("float64")
        cc = g[pred_col].values.astype("float64")
        denom = np.sum(ww)
        rates = [np.sum(ww * (cc < t)) / max(denom, EPS) for t in thresholds]
        out[int(sid)] = np.array(rates, dtype=np.float64)
    return out

def ws_wmape_score(df, y_true_col, y_pred_col, weight_col, train_rates_df, thresholds, rate_cols, w_t=None):
    # SAFE: compute only surveys present in df
    if w_t is None:
        w_t = np.ones(len(thresholds), dtype=np.float64)

    present_surveys = df["survey_id"].unique().tolist()
    rates_true = train_rates_df.set_index("survey_id").loc[present_surveys, rate_cols]

    cons_scores = {}
    for sid, g in df.groupby("survey_id"):
        cons_scores[int(sid)] = wmape(g[y_true_col].values, g[y_pred_col].values)

    rates_pred = poverty_rates_from_consumption(df, y_pred_col, thresholds, weight_col)

    rate_scores = {}
    for sid in present_surveys:
        sid = int(sid)
        rt = rates_true.loc[sid].values.astype("float64")
        rh = rates_pred[sid].astype("float64")
        ape = np.abs(rt - rh) / np.maximum(np.abs(rt), EPS)
        rate_scores[sid] = 100.0 * (np.sum(w_t * ape) / np.sum(w_t))

    final_scores = [0.90*rate_scores[int(sid)] + 0.10*cons_scores[int(sid)] for sid in present_surveys]
    return float(np.mean(final_scores)), cons_scores, rate_scores

In [10]:
# ============================================================
# 8) TORCH DATASET
# ============================================================

class TabDataset(Dataset):
    def __init__(self, x_cat, x_num, y=None, w=None, survey_id=None):
        self.x_cat = torch.tensor(x_cat, dtype=torch.long)
        self.x_num = torch.tensor(x_num, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.float32)
        self.w = None if w is None else torch.tensor(w, dtype=torch.float32)
        self.survey_id = None if survey_id is None else torch.tensor(survey_id, dtype=torch.long)

    def __len__(self):
        return len(self.x_num)

    def __getitem__(self, i):
        if self.y is None:
            return self.x_cat[i], self.x_num[i]
        return self.x_cat[i], self.x_num[i], self.y[i], self.w[i], self.survey_id[i]

In [11]:
# ============================================================
# 9) MODEL: MLP + Embeddings (Tabular DL)
# ============================================================

class TabMLP(nn.Module):
    def __init__(self, cat_cardinalities, num_dim, hidden=(512,256,128), dropout=0.25):
        super().__init__()
        self.embeddings = nn.ModuleList()
        emb_out_dim = 0
        for card in cat_cardinalities:
            ed = min(64, max(4, int(round(1.6 * (card ** 0.56)))))
            self.embeddings.append(nn.Embedding(card, ed))
            emb_out_dim += ed

        in_dim = emb_out_dim + num_dim
        layers = []
        prev = in_dim
        for h in hidden:
            layers += [nn.Linear(prev, h), nn.BatchNorm1d(h), nn.ReLU(), nn.Dropout(dropout)]
            prev = h
        layers += [nn.Linear(prev, 1)]
        self.net = nn.Sequential(*layers)

    def forward(self, x_cat, x_num):
        if len(self.embeddings) > 0:
            embs = [emb(x_cat[:,i]) for i, emb in enumerate(self.embeddings)]
            x = torch.cat(embs + [x_num], dim=1)
        else:
            x = x_num
        return self.net(x).squeeze(1)

In [12]:
# ============================================================
# 10) LOSSES: consumption wMAPE + differentiable poverty loss
# ============================================================

def consumption_wmape_loss(y_true, y_pred):
    return torch.mean(torch.abs(y_true - y_pred) / torch.clamp(torch.abs(y_true), min=1e-6))

def soft_poverty_rates(cons, weights, thresholds, tau=0.35):
    t = torch.tensor(thresholds, device=cons.device, dtype=cons.dtype).view(1, -1)
    c = cons.view(-1, 1)
    soft = torch.sigmoid((t - c) / tau)
    wv = weights.view(-1, 1)
    num = torch.sum(wv * soft, dim=0)
    den = torch.sum(wv) + 1e-6
    return num / den

def poverty_wmape_loss_per_survey(cons_pred, weights, survey_id, rates_true_df, thresholds, rate_cols, tau=0.35):
    loss_terms = []
    for sid in torch.unique(survey_id):
        sid_int = int(sid.item())
        if sid_int not in rates_true_df.index:
            continue
        mask = (survey_id == sid)
        cons_s = cons_pred[mask]
        w_s = weights[mask]
        rates_pred = soft_poverty_rates(cons_s, w_s, thresholds, tau=tau)
        rt = torch.tensor(rates_true_df.loc[sid_int, rate_cols].values, device=cons_pred.device, dtype=cons_pred.dtype)
        loss_s = torch.mean(torch.abs(rt - rates_pred) / torch.clamp(torch.abs(rt), min=1e-6))
        loss_terms.append(loss_s)

    if len(loss_terms) == 0:
        return torch.tensor(0.0, device=cons_pred.device)
    return torch.mean(torch.stack(loss_terms))

In [13]:
# ============================================================
# 11) TRAINING: train_one_fold (SAFE: uses the provided df/arrays)
# ============================================================

def train_one_fold(train_idx, val_idx, params, train_df, X_cat, X_num, y_arr, w_arr,
                   epochs_A=25, epochs_B=15, batch_size=1024):

    xtr_cat, xtr_num = X_cat[train_idx], X_num[train_idx]
    ytr, wtr = y_arr[train_idx], w_arr[train_idx]
    sid_tr = train_df.iloc[train_idx]["survey_id"].values.astype("int64")

    xva_cat, xva_num = X_cat[val_idx], X_num[val_idx]
    yva, wva = y_arr[val_idx], w_arr[val_idx]
    sid_va = train_df.iloc[val_idx]["survey_id"].values.astype("int64")

    ds_tr = TabDataset(xtr_cat, xtr_num, ytr, wtr, sid_tr)
    ds_va = TabDataset(xva_cat, xva_num, yva, wva, sid_va)

    dl_tr = DataLoader(ds_tr, batch_size=batch_size, shuffle=True)
    dl_va = DataLoader(ds_va, batch_size=batch_size, shuffle=False)

    cat_card = [len(cat_maps[c]) for c in cat_cols]
    model = TabMLP(cat_card, X_num.shape[1], hidden=params["hidden"], dropout=params["dropout"]).to(device)

    opt = torch.optim.AdamW(model.parameters(), lr=params["lr"], weight_decay=params["wd"])
    sch = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)  # no verbose

    rates_true_df = train_rates.set_index("survey_id")[rate_cols].copy()

    def eval_val_score():
        model.eval()
        preds = []
        with torch.no_grad():
            for xb_cat, xb_num, yb, wb, sidb in dl_va:
                xb_cat = xb_cat.to(device); xb_num = xb_num.to(device)
                preds.append(model(xb_cat, xb_num).cpu().numpy())
        preds = np.concatenate(preds)

        val_df = train_df.iloc[val_idx].copy()
        val_df["pred"] = preds

        score, _, _ = ws_wmape_score(
            val_df,
            y_true_col=y_col,
            y_pred_col="pred",
            weight_col=weight_col,
            train_rates_df=train_rates,
            thresholds=thresholds,
            rate_cols=rate_cols
        )
        return score

    # Stage A
    best_score = float("inf")
    best_state = None
    patience = 5
    bad = 0

    for ep in range(1, epochs_A+1):
        model.train()
        for xb_cat, xb_num, yb, wb, sidb in dl_tr:
            xb_cat = xb_cat.to(device); xb_num = xb_num.to(device); yb = yb.to(device)
            opt.zero_grad()
            pred = model(xb_cat, xb_num)
            loss = consumption_wmape_loss(yb, pred)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

        val_score = eval_val_score()
        sch.step(val_score)

        if val_score < best_score - 1e-6:
            best_score = val_score
            best_state = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    # Stage B
    opt = torch.optim.AdamW(model.parameters(), lr=params["lr_ft"], weight_decay=params["wd"])
    sch = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=2)

    best_score_B = best_score
    best_state_B = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
    bad = 0

    for ep in range(1, epochs_B+1):
        model.train()
        for xb_cat, xb_num, yb, wb, sidb in dl_tr:
            xb_cat = xb_cat.to(device); xb_num = xb_num.to(device)
            yb = yb.to(device); wb = wb.to(device); sidb = sidb.to(device)

            opt.zero_grad()
            pred = model(xb_cat, xb_num)
            loss_cons = consumption_wmape_loss(yb, pred)
            loss_pov  = poverty_wmape_loss_per_survey(pred, wb, sidb, rates_true_df, thresholds, rate_cols, tau=params["tau"])
            loss = 0.10 * loss_cons + 0.90 * loss_pov
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

        val_score = eval_val_score()
        sch.step(val_score)

        if val_score < best_score_B - 1e-6:
            best_score_B = val_score
            best_state_B = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                break

    model.load_state_dict(best_state_B)
    return model, best_score_B

In [14]:
# ============================================================
# 12) SIMPLE TUNING + SAFE CV by survey groups
# ============================================================

valid_surveys = set(train_rates["survey_id"].unique())
mask_valid = train["survey_id"].isin(valid_surveys).values

train_filt = train.loc[mask_valid].reset_index(drop=True)

X_cat_f = X_all_cat[mask_valid]
X_num_f = X_all_num[mask_valid]
y_f     = y[mask_valid]
w_f     = w[mask_valid]
groups_f = train_filt["survey_id"].values.astype("int64")

print("Filtered surveys:", np.unique(groups_f))
print("Filtered shapes:", X_num_f.shape, y_f.shape)

gkf = GroupKFold(n_splits=len(np.unique(groups_f)))

param_grid = [
    {"hidden": (512,256,128), "dropout": 0.25, "lr": 2e-3,  "lr_ft": 1e-3,  "wd": 1e-4, "tau": 0.35},
    {"hidden": (768,384,192), "dropout": 0.30, "lr": 2e-3,  "lr_ft": 8e-4,  "wd": 3e-4, "tau": 0.30},
    {"hidden": (512,512,256), "dropout": 0.20, "lr": 1.5e-3,"lr_ft": 7e-4,  "wd": 1e-4, "tau": 0.40},
]

cv_rows = []
best_overall = (None, float("inf"), None)

for pi, params in enumerate(param_grid):
    print(f"\n{'='*60}\nParams {pi+1}/{len(param_grid)} → {params}\n{'='*60}")
    fold_scores = []
    fold_models = []

    for fi, (tr_idx, va_idx) in enumerate(gkf.split(X_num_f, y_f, groups=groups_f), start=1):
        sid_val = int(train_filt.iloc[va_idx]["survey_id"].iloc[0])
        print(f"\n▶ Fold {fi}/{len(np.unique(groups_f))} | validation survey = {sid_val}")

        try:
            model, score = train_one_fold(
                tr_idx, va_idx, params,
                train_df=train_filt,
                X_cat=X_cat_f, X_num=X_num_f, y_arr=y_f, w_arr=w_f,
                epochs_A=25, epochs_B=15, batch_size=1024
            )
            fold_scores.append(score)
            fold_models.append(model)
            print(f"  Fold score: {score:.6f}")
        except RuntimeError as e:
            print("⚠️ RuntimeError:", e)
            torch.cuda.empty_cache()

    mean_score = float(np.mean(fold_scores)) if len(fold_scores) else float("inf")
    cv_rows.append({"param_id": pi, "params": params, "mean_ws_wmape": mean_score, "fold_scores": fold_scores})
    print(f"\n>> Mean CV wS-wMAPE: {mean_score:.6f}")

    if mean_score < best_overall[1]:
        best_overall = (params, mean_score, fold_models)

cv_df = pd.DataFrame(cv_rows).sort_values("mean_ws_wmape").reset_index(drop=True)
print("\nCV results:")
cv_df

Filtered surveys: [100000 200000 300000]
Filtered shapes: (2997, 6) (2997,)

Params 1/3 → {'hidden': (512, 256, 128), 'dropout': 0.25, 'lr': 0.002, 'lr_ft': 0.001, 'wd': 0.0001, 'tau': 0.35}

▶ Fold 1/3 | validation survey = 300000
  Fold score: 41.009438

▶ Fold 2/3 | validation survey = 200000
  Fold score: 13.108150

▶ Fold 3/3 | validation survey = 100000
  Fold score: 65.443462

>> Mean CV wS-wMAPE: 39.853683

Params 2/3 → {'hidden': (768, 384, 192), 'dropout': 0.3, 'lr': 0.002, 'lr_ft': 0.0008, 'wd': 0.0003, 'tau': 0.3}

▶ Fold 1/3 | validation survey = 300000
  Fold score: 12.721703

▶ Fold 2/3 | validation survey = 200000
  Fold score: 14.431696

▶ Fold 3/3 | validation survey = 100000
  Fold score: 38.297359

>> Mean CV wS-wMAPE: 21.816920

Params 3/3 → {'hidden': (512, 512, 256), 'dropout': 0.2, 'lr': 0.0015, 'lr_ft': 0.0007, 'wd': 0.0001, 'tau': 0.4}

▶ Fold 1/3 | validation survey = 300000
  Fold score: 12.035652

▶ Fold 2/3 | validation survey = 200000
  Fold score: 12.832

Unnamed: 0,param_id,params,mean_ws_wmape,fold_scores
0,2,"{'hidden': (512, 512, 256), 'dropout': 0.2, 'l...",11.927958,"[12.035651638282475, 12.832670834009384, 10.91..."
1,1,"{'hidden': (768, 384, 192), 'dropout': 0.3, 'l...",21.81692,"[12.721703296403717, 14.431696152063815, 38.29..."
2,0,"{'hidden': (512, 256, 128), 'dropout': 0.25, '...",39.853683,"[41.00943817780773, 13.108150062879567, 65.443..."


In [15]:
# ============================================================
# 13) TRAIN FINAL MODEL ON ALL TRAIN (best params)
# ============================================================

best_params, best_cv, _ = best_overall
print("Best params:", best_params)
print("Best CV score:", best_cv)

cat_card = [len(cat_maps[c]) for c in cat_cols]
final_model = TabMLP(cat_card, X_all_num.shape[1], hidden=best_params["hidden"], dropout=best_params["dropout"]).to(device)

ds_all = TabDataset(X_all_cat, X_all_num, y, w, train["survey_id"].values.astype("int64"))
dl_all = DataLoader(ds_all, batch_size=1024, shuffle=True)

# Stage A
opt = torch.optim.AdamW(final_model.parameters(), lr=best_params["lr"], weight_decay=best_params["wd"])
for ep in range(1, 26):
    final_model.train()
    losses = []
    for xb_cat, xb_num, yb, wb, sidb in dl_all:
        xb_cat = xb_cat.to(device); xb_num = xb_num.to(device); yb = yb.to(device)
        opt.zero_grad()
        pred = final_model(xb_cat, xb_num)
        loss = consumption_wmape_loss(yb, pred)
        loss.backward()
        nn.utils.clip_grad_norm_(final_model.parameters(), 1.0)
        opt.step()
        losses.append(loss.item())
    print(f"[Final A] epoch {ep:02d} loss={np.mean(losses):.5f}")

# Stage B
rates_true_df = train_rates.set_index("survey_id")[rate_cols].copy()
opt = torch.optim.AdamW(final_model.parameters(), lr=best_params["lr_ft"], weight_decay=best_params["wd"])
for ep in range(1, 16):
    final_model.train()
    losses = []
    for xb_cat, xb_num, yb, wb, sidb in dl_all:
        xb_cat = xb_cat.to(device); xb_num = xb_num.to(device)
        yb = yb.to(device); wb = wb.to(device); sidb = sidb.to(device)

        opt.zero_grad()
        pred = final_model(xb_cat, xb_num)
        loss = 0.10 * consumption_wmape_loss(yb, pred) + 0.90 * poverty_wmape_loss_per_survey(
            pred, wb, sidb, rates_true_df, thresholds, rate_cols, tau=best_params["tau"]
        )
        loss.backward()
        nn.utils.clip_grad_norm_(final_model.parameters(), 1.0)
        opt.step()
        losses.append(loss.item())
    print(f"[Final B] epoch {ep:02d} loss={np.mean(losses):.5f}")

Best params: {'hidden': (512, 512, 256), 'dropout': 0.2, 'lr': 0.0015, 'lr_ft': 0.0007, 'wd': 0.0001, 'tau': 0.4}
Best CV score: 11.92795814139882
[Final A] epoch 01 loss=0.39133
[Final A] epoch 02 loss=0.27203
[Final A] epoch 03 loss=0.26628
[Final A] epoch 04 loss=0.26344
[Final A] epoch 05 loss=0.26119
[Final A] epoch 06 loss=0.25913
[Final A] epoch 07 loss=0.25759
[Final A] epoch 08 loss=0.25550
[Final A] epoch 09 loss=0.25330
[Final A] epoch 10 loss=0.25345
[Final A] epoch 11 loss=0.25132
[Final A] epoch 12 loss=0.24936
[Final A] epoch 13 loss=0.24907
[Final A] epoch 14 loss=0.24705
[Final A] epoch 15 loss=0.24505
[Final A] epoch 16 loss=0.24481
[Final A] epoch 17 loss=0.24358
[Final A] epoch 18 loss=0.24229
[Final A] epoch 19 loss=0.24112
[Final A] epoch 20 loss=0.23974
[Final A] epoch 21 loss=0.23822
[Final A] epoch 22 loss=0.23729
[Final A] epoch 23 loss=0.23585
[Final A] epoch 24 loss=0.23517
[Final A] epoch 25 loss=0.23365
[Final B] epoch 01 loss=0.35353
[Final B] epoch 02 lo

In [16]:
# ============================================================
# 14) INFERENCE ON TEST
# ============================================================

final_model.eval()
ds_test = TabDataset(X_test_cat, X_test_num, y=None, w=None, survey_id=None)
dl_test = DataLoader(ds_test, batch_size=2048, shuffle=False)

test_preds = []
with torch.no_grad():
    for xb_cat, xb_num in dl_test:
        xb_cat = xb_cat.to(device); xb_num = xb_num.to(device)
        test_preds.append(final_model(xb_cat, xb_num).cpu().numpy())
test_preds = np.concatenate(test_preds).astype("float64")
print("test_preds:", test_preds.shape, test_preds[:5])

test_preds: (103023,) [18.91811562  7.5873661   7.79921818 20.08713341  7.6555295 ]


In [36]:
# ============================================================
# FINAL predicted_household_consumption.csv
# ============================================================

# 1) Φόρτωσε το OFFICIAL sample από το site
sample_cons = pd.read_csv("/content/predicted_household_consumption.csv")

# 2) Καθάρισε τυχόν index στήλη
sample_cons = sample_cons.loc[:, ~sample_cons.columns.str.contains("^Unnamed")]

# 3) Βεβαιώσου ότι οι στήλες είναι ΑΚΡΙΒΩΣ αυτές
sample_cons = sample_cons[["survey_id", "hhid", "cons_ppp17"]]

# 4) Αντικατέστησε ΜΟΝΟ τις τιμές
sample_cons["cons_ppp17"] = test_preds

# 5) Έλεγχοι
assert sample_cons.shape == (103023, 3)
assert list(sample_cons.columns) == ["survey_id", "hhid", "cons_ppp17"]

# 6) Αποθήκευση
sample_cons.to_csv(
    "predicted_household_consumption.csv",
    index=False
)

sample_cons.head()


Unnamed: 0,survey_id,hhid,cons_ppp17
0,400000,400001,18.918116
1,400000,400002,7.587366
2,400000,400003,7.799218
3,400000,400004,20.087133
4,400000,400005,7.655529


In [37]:
# ============================================================
# FINAL predicted_poverty_distribution.csv
# ============================================================

# 1) Φόρτωσε το OFFICIAL sample
sample_pov = pd.read_csv("/content/predicted_poverty_distribution.csv")

# 2) Καθάρισε index στήλη
sample_pov = sample_pov.loc[:, ~sample_pov.columns.str.contains("^Unnamed")]

# 3) Σιγουρέψου για σειρά στηλών
sample_pov = sample_pov[["survey_id"] + rate_cols]

# 4) Υπολόγισε poverty rates από predictions
tmp = test_feat.copy()
tmp["pred_cons"] = test_preds

for i, sid in enumerate(sample_pov["survey_id"]):
    g = tmp[tmp["survey_id"] == sid]
    w = g[weight_col].values.astype("float64")
    c = g["pred_cons"].values.astype("float64")
    denom = np.sum(w)

    for col, t in zip(rate_cols, thresholds):
        sample_pov.loc[i, col] = float(
            np.sum(w * (c < t)) / max(denom, 1e-9)
        )

# 5) Έλεγχοι
assert sample_pov.shape == (3, 20)
assert list(sample_pov.columns)[0] == "survey_id"

# 6) Αποθήκευση
sample_pov.to_csv(
    "predicted_poverty_distribution.csv",
    index=False
)

sample_pov


Unnamed: 0,survey_id,pct_hh_below_3.17,pct_hh_below_3.94,pct_hh_below_4.60,pct_hh_below_5.26,pct_hh_below_5.88,pct_hh_below_6.47,pct_hh_below_7.06,pct_hh_below_7.70,pct_hh_below_8.40,pct_hh_below_9.13,pct_hh_below_9.87,pct_hh_below_10.70,pct_hh_below_11.62,pct_hh_below_12.69,pct_hh_below_14.03,pct_hh_below_15.64,pct_hh_below_17.76,pct_hh_below_20.99,pct_hh_below_27.37
0,400000,0.025915,0.067133,0.110638,0.146265,0.202234,0.267117,0.317589,0.367217,0.447032,0.526179,0.601848,0.646705,0.703078,0.753602,0.805767,0.856553,0.907256,0.960593,0.993873
1,500000,0.027132,0.085268,0.125594,0.153914,0.168865,0.180125,0.206953,0.239298,0.322604,0.426372,0.503472,0.584169,0.631411,0.72314,0.770727,0.834365,0.883489,0.93609,0.989263
2,600000,0.04824,0.144407,0.189668,0.231866,0.261423,0.281731,0.326259,0.370346,0.419658,0.490244,0.543706,0.609122,0.682217,0.731701,0.786597,0.851184,0.895328,0.942774,0.988635


In [38]:
import zipfile

with zipfile.ZipFile("submission.zip", "w", zipfile.ZIP_DEFLATED) as z:
    z.write("predicted_household_consumption.csv")
    z.write("predicted_poverty_distribution.csv")


In [29]:
# ============================================================
# 18) QUICK CHECKS + PLOTS
# ============================================================

assert list(pred_cons.columns) == ["survey_id","household_id","per_capita_household_consumption"]
assert pred_pov.columns[0] == "survey_id"
assert list(pred_pov.columns[1:]) == rate_cols

print("OK: submission format looks correct.")

plt.figure()
plt.hist(pred_cons["per_capita_household_consumption"].values, bins=60)
plt.xlabel("predicted consumption")
plt.ylabel("count")
plt.title("Test predicted consumption distribution")
plt.show()

plt.figure()
for _, row in pred_pov.iterrows():
    sid = int(row["survey_id"])
    plt.plot(thresholds, row[rate_cols].values, marker="o", label=f"survey {sid}")
plt.xlabel("poverty threshold")
plt.ylabel("predicted pct below threshold")
plt.title("Predicted poverty curves (test surveys)")
plt.legend()
plt.show()

AssertionError: 