# Tri-Model v3 (Parallel) — MLP + TabTransformer + FT-Transformer


Trains three models (MLP, TabTransformer, FT-Transformer) one after another with stratified K-fold CV, compares their OOF AUCs, automatically warm-starts from the best prior run when one is found, and writes a mean-ensemble submission.


In [9]:

# --- CONFIG ---
# Data locations and column roles
COMPETITION_NAME = "playground-series-s5e8"
ID_COL = "id"
TARGET_COL = "y"
TRAIN_PATH = "playground-series-s5e8/train.csv"
TEST_PATH = "playground-series-s5e8/test.csv"

# Cross-validation
N_SPLITS = 5
RANDOM_SEED = 2025

# Optimisation
BATCH_SIZE = 4096
EPOCHS = 100
PATIENCE = 12          # early-stopping patience, in epochs
BASE_LR = 1e-3
WEIGHT_DECAY = 1e-5
USE_CLASS_WEIGHTS = True
MIN_LR = 1e-5          # eta_min of the cosine schedule
COSINE_T0 = 10
COSINE_T_MULT = 2
GRAD_CLIP = 1.0

# Stochastic weight averaging
USE_SWA = True
SWA_START_EPOCH = 10
SWA_LR = 5e-4

# Fine-tuned per-arch params
# A_* -> MLP (modelA)
A_HIDDEN = [1024, 512, 256, 128]
A_DROPOUT = 0.25
A_EMB_DROPOUT = 0.05
A_INPUT_DROPOUT = 0.05

# B_* -> TabTransformer (modelB)
B_D_MODEL = 192
B_N_HEAD = 8
B_N_LAYERS = 4
B_DROPOUT = 0.2
B_EMB_DROPOUT = 0.05
B_INPUT_DROPOUT = 0.05
B_MLP_HEAD = [512, 256]

# C_* -> FT-Transformer (modelC)
C_D_MODEL = 256
C_N_HEAD = 8
C_N_LAYERS = 3
C_DROPOUT = 0.2
C_EMB_DROPOUT = 0.05
C_INPUT_DROPOUT = 0.05
C_MLP_HEAD = [512, 256]

# Look for an earlier run under runs/ and reuse its weights when possible
AUTO_WARM_START = True


In [10]:

# --- IMPORTS & ENV ---
import os, gc, time, math, random, json
import numpy as np, pandas as pd
from pathlib import Path
from datetime import datetime
from typing import List
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, RocCurveDisplay

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.swa_utils import AveragedModel, SWALR, update_bn
import multiprocessing as mp

def set_seed(s=42):
    """Seed the Python, NumPy and PyTorch (CPU + all CUDA devices) RNGs with ``s``.

    Also switches cuDNN to deterministic mode and turns its benchmark
    autotuner off.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(s)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
set_seed(RANDOM_SEED)

# Device preference: CUDA first, then Apple MPS, otherwise CPU.
has_cuda = torch.cuda.is_available()
use_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
if has_cuda:
    DEVICE = torch.device("cuda")
elif use_mps:
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")
# Pinned host memory is only requested when running on CUDA.
PIN_MEM = DEVICE.type == "cuda"

# Every run writes into its own timestamped directory under runs/.
RUN_STAMP = time.strftime("%Y-%m-%d_%H-%M-%S")
ROOT_DIR = Path("runs") / RUN_STAMP
ROOT_DIR.mkdir(parents=True, exist_ok=True)
print("Device:", DEVICE, "| pin_memory:", PIN_MEM, "| run:", ROOT_DIR)


Device: mps | pin_memory: False | run: runs/2025-08-12_11-02-40


In [11]:

# --- DATA PREP ---
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
assert TARGET_COL in train and ID_COL in train and ID_COL in test

# Every train column other than the id and the target is a feature,
# and each one must also be present in the test frame.
feature_cols = [c for c in train.columns if c not in [TARGET_COL, ID_COL]]
missing = [c for c in feature_cols if c not in test.columns]
assert not missing, f"Missing in test: {missing}"

# Categorical = object-typed columns plus int-typed columns with at most 30
# distinct training values; everything else is treated as numeric.
obj_cols = [c for c in feature_cols if train[c].dtype == 'object']
lowcard = [
    c for c in feature_cols
    if str(train[c].dtype).startswith('int') and train[c].nunique() <= 30
]
cat_cols = sorted(set(obj_cols) | set(lowcard))
num_cols = sorted(c for c in feature_cols if c not in cat_cols)

RARE_NAME = "__RARE__"
MIN_CAT_COUNT = 25

def apply_rare(s, m=MIN_CAT_COUNT):
    """Return a copy of ``s`` in which values occurring fewer than ``m`` times
    are replaced by ``RARE_NAME``; all other entries are left as they are."""
    counts = s.value_counts()
    infrequent = counts.index[(counts < m).to_numpy()]
    is_rare = s.isin(infrequent)
    return s.mask(is_rare, RARE_NAME)

# Encode categoricals with a vocabulary learned from TRAIN only.
#  * Rare train values are bucketed into RARE_NAME by apply_rare.
#  * A test value is kept only when it survives in the train vocabulary;
#    otherwise (rare in train, or never seen) it maps to RARE_NAME. Bucketing
#    test by its own counts would encode the same category differently in
#    the two frames.
encoders = {}
for c in cat_cols:
    # fillna is a safeguard for pandas versions where astype(str) keeps
    # missing values as NaN instead of turning them into a string.
    tr = apply_rare(train[c].astype(str)).fillna("NA")
    te = test[c].astype(str).fillna("NA")
    te = te.where(te.isin(tr.unique()), RARE_NAME)

    le = LabelEncoder()
    # RARE_NAME is always part of the vocabulary so unseen test values have a code.
    le.fit(pd.concat([tr, pd.Series([RARE_NAME])], ignore_index=True))
    encoders[c] = le
    train[c] = le.transform(tr)
    test[c] = le.transform(te)

# Standardise numeric columns with statistics fitted on train only.
scaler = None
if len(num_cols) > 0:
    scaler = StandardScaler()
    train[num_cols] = scaler.fit_transform(train[num_cols])
    test[num_cols] = scaler.transform(test[num_cols])

# Embedding tables are sized from these values, so they must cover every code
# the encoder can emit. train[c].nunique() can be smaller than that.
cat_cardinalities = [int(len(encoders[c].classes_)) for c in cat_cols]
y = train[TARGET_COL].values.astype(np.float32)

meta = dict(cat_cols=cat_cols, num_cols=num_cols, cat_cardinalities=cat_cardinalities, feature_cols=feature_cols)
with open(ROOT_DIR / 'data_meta.json', 'w') as meta_fh:
    json.dump(meta, meta_fh, indent=2)


In [12]:

# --- MODELS & HELPERS ---
class TabDataset(Dataset):
    """Dataset over a DataFrame, exposing numeric and categorical columns as arrays.

    Items are ``(num_row, cat_row)`` when no targets were given and
    ``(num_row, cat_row, target)`` otherwise. Numeric values are stored as
    float32, categorical codes as int64, targets as float32. When a column
    list is empty or None the corresponding array has zero columns.
    """

    def __init__(self, df, y=None, num_cols=None, cat_cols=None):
        n_rows = len(df)
        if num_cols:
            self.num = df[num_cols].values.astype(np.float32)
        else:
            self.num = np.zeros((n_rows, 0), np.float32)
        if cat_cols:
            self.cat = df[cat_cols].values.astype(np.int64)
        else:
            self.cat = np.zeros((n_rows, 0), np.int64)
        self.y = None if y is None else y.astype(np.float32)

    def __len__(self):
        return len(self.num)

    def __getitem__(self, i):
        item = (self.num[i], self.cat[i])
        if self.y is not None:
            item = item + (self.y[i],)
        return item

class EarlyStopper:
    """Tracks the best metric seen so far and a CPU copy of the matching weights.

    ``mode="max"`` treats larger metrics as better, any other mode treats
    smaller as better. A step only counts as an improvement when it beats the
    best value by more than ``min_delta``.
    """

    def __init__(self, patience=10, mode="max", min_delta=1e-6):
        self.p = patience
        self.mode = mode
        self.md = min_delta
        self.best = -np.inf if mode == "max" else np.inf
        self.count = 0      # consecutive non-improving steps
        self.state = None   # state_dict snapshot taken at the best step

    def step(self, metric, model):
        """Record ``metric``; return True (and snapshot ``model``) on improvement."""
        if self.mode == "max":
            improved = metric > self.best + self.md
        else:
            improved = metric < self.best - self.md
        if not improved:
            self.count += 1
            return False
        self.best = metric
        self.count = 0
        self.state = {name: tensor.cpu().clone() for name, tensor in model.state_dict().items()}
        return True

    def stop(self):
        """True once ``patience`` consecutive steps have failed to improve."""
        return self.count >= self.p

def epoch_loop(model, loader, crit, opt=None, dev=DEVICE, clip=None):
    """Run one pass over ``loader``, training when ``opt`` is given, else evaluating.

    Args:
        model: Module called as ``model(x_num, x_cat)`` and returning logits.
        loader: Iterable of ``(x_num, x_cat, y)`` batches, or ``(x_num, x_cat)``
            batches for unlabeled data (evaluation only).
        crit: Loss called as ``crit(logit, y)``.
        opt: Optimizer; its presence switches the pass to training mode.
        dev: Device the batch tensors are moved to.
        clip: Max gradient norm for clipping, or None to skip clipping.

    Returns:
        ``(mean_loss, preds, y_true)``. ``mean_loss`` is None when no batch had
        targets, ``preds`` holds the sigmoid probabilities of all batches
        concatenated, and ``y_true`` the concatenated targets (None if unlabeled).

    Raises:
        ValueError: if a training batch carries no targets or a batch does not
            have two or three elements.
    """
    is_train = opt is not None
    if is_train:
        model.train()
    else:
        model.eval()

    losses = []; preds = []; targs = []
    for b in loader:
        # Decide labeled vs. unlabeled from the batch arity, not from a bare
        # ``except`` that would also hide unrelated errors.
        if len(b) == 3:
            x_num, x_cat, y = b
        else:
            x_num, x_cat = b
            y = None
        if is_train and y is None:
            raise ValueError("training batches must contain targets: (x_num, x_cat, y)")

        x_num = x_num.to(dev); x_cat = x_cat.to(dev)
        if y is not None:
            y = y.to(dev)

        with torch.set_grad_enabled(is_train):
            logit = model(x_num, x_cat)
            prob = torch.sigmoid(logit)
            loss = crit(logit, y) if y is not None else None

        if is_train:
            opt.zero_grad()
            loss.backward()
            if clip is not None:
                nn.utils.clip_grad_norm_(model.parameters(), clip)
            opt.step()

        if loss is not None:
            losses.append(loss.item())
            targs.append(y.detach().cpu().numpy())
        preds.append(prob.detach().cpu().numpy())

    preds = np.concatenate(preds) if preds else np.array([])
    y_true = np.concatenate(targs) if targs else None
    return (float(np.mean(losses)) if losses else None), preds, y_true

class MLPNet(nn.Module):
    """MLP over concatenated categorical embeddings and raw numeric features.

    Args:
        num_dim: Number of numeric input features (0 disables the numeric branch).
        cats: Cardinality of each categorical feature (empty disables embeddings).
        hidden: Hidden layer widths; each layer is Linear -> BatchNorm1d -> GELU -> Dropout.
        drop: Dropout probability inside the MLP.
        emb_drop: Dropout probability on the concatenated embeddings.
        in_drop: Dropout probability on the numeric inputs.

    ``forward(x_num, x_cat)`` returns one logit per row.
    """

    @staticmethod
    def _emb_width(cardinality):
        # round(1.6 * cardinality ** 0.56), clamped to the range [4, 64]
        return int(min(64, max(4, round(1.6 * (cardinality ** 0.56)))))

    def __init__(self, num_dim, cats, hidden, drop=0.25, emb_drop=0.05, in_drop=0.05):
        super().__init__()
        self.hc = len(cats) > 0
        self.hn = num_dim > 0

        use_in_drop = in_drop > 0 and self.hn
        self.in_drop = nn.Dropout(in_drop) if use_in_drop else nn.Identity()

        if self.hc:
            tables = [nn.Embedding(card, self._emb_width(card)) for card in cats]
            self.embs = nn.ModuleList(tables)
            self.emb_drop = nn.Dropout(emb_drop) if emb_drop > 0 else nn.Identity()
            emb_total = sum(table.embedding_dim for table in self.embs)
        else:
            self.embs = None
            self.emb_drop = nn.Identity()
            emb_total = 0

        width = (num_dim if self.hn else 0) + emb_total
        layers = []
        for h in hidden:
            layers.extend([nn.Linear(width, h), nn.BatchNorm1d(h), nn.GELU(), nn.Dropout(drop)])
            width = h
        layers.append(nn.Linear(width, 1))
        self.mlp = nn.Sequential(*layers)

    def forward(self, x_num, x_cat):
        parts = []
        if self.hc:
            looked_up = [table(x_cat[:, j]) for j, table in enumerate(self.embs)]
            parts.append(self.emb_drop(torch.cat(looked_up, dim=1)))
        if self.hn:
            parts.append(self.in_drop(x_num))
        if len(parts) > 1:
            joined = torch.cat(parts, dim=1)
        else:
            joined = parts[0]
        return self.mlp(joined).squeeze(1)

class TabTransformer(nn.Module):
    """Transformer encoder over categorical embeddings plus a projected numeric vector.

    Categorical features are embedded to ``d_model`` tokens, passed through the
    encoder and mean-pooled. Numeric features are projected to ``d_model`` with
    one linear layer. The available branches are concatenated and fed to an
    MLP head.

    Args:
        num_dim: Number of numeric features (0 disables the numeric branch).
        cats: Cardinality of each categorical feature (empty disables that branch).
        d_model, nhead, nlayers: Transformer encoder dimensions.
        drop: Dropout used by the encoder layers and the head.
        emb_drop: Dropout on the categorical tokens.
        in_drop: Dropout applied to the projected numeric vector.
        head_layers: Hidden widths of the head (Linear -> BatchNorm1d -> GELU -> Dropout).

    ``forward(x_num, x_cat)`` returns one logit per row.
    """

    def __init__(self, num_dim, cats, d_model, nhead, nlayers, drop, emb_drop, in_drop, head_layers):
        super().__init__()
        self.hc = len(cats) > 0
        self.hn = num_dim > 0
        self.in_drop = nn.Dropout(in_drop) if in_drop > 0 and self.hn else nn.Identity()
        if self.hc:
            self.cat_embs = nn.ModuleList([nn.Embedding(c, d_model) for c in cats])
            self.emb_drop = nn.Dropout(emb_drop) if emb_drop > 0 else nn.Identity()
        else:
            self.cat_embs = None
            self.emb_drop = nn.Identity()
        # The encoder is built even without categorical features so that the
        # set of state_dict keys does not depend on the feature mix.
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dropout=drop, batch_first=True, activation='gelu'
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=nlayers)
        self.num_proj = nn.Linear(num_dim, d_model) if self.hn else None

        # Each active branch contributes d_model features to the head input.
        # Counting the categorical branch unconditionally made the head twice
        # as wide as its input for numeric-only data.
        in_dim = (d_model if self.hc else 0) + (d_model if self.hn else 0)
        layers = []
        for h in head_layers:
            layers += [nn.Linear(in_dim, h), nn.BatchNorm1d(h), nn.GELU(), nn.Dropout(drop)]
            in_dim = h
        layers += [nn.Linear(in_dim, 1)]
        self.head = nn.Sequential(*layers)

    def forward(self, x_num, x_cat):
        feats = []
        if self.hc:
            toks = [e(x_cat[:, i]).unsqueeze(1) for i, e in enumerate(self.cat_embs)]
            tok = self.emb_drop(torch.cat(toks, dim=1))
            enc = self.encoder(tok)
            feats.append(enc.mean(dim=1))   # mean-pool over the feature tokens
        if self.hn:
            feats.append(self.in_drop(self.num_proj(x_num)))
        x = torch.cat(feats, dim=1) if len(feats) > 1 else feats[0]
        return self.head(x).squeeze(1)

class FTTransformer(nn.Module):
    """Transformer that turns every feature into a token and classifies from a CLS token.

    Each numeric feature is a scalar projected to ``d_model`` by one shared
    ``Linear(1, d_model)``; each categorical feature has its own embedding
    table. A learned CLS token is prepended, the sequence is encoded, and the
    encoded CLS position is passed to the MLP head.

    ``forward(x_num, x_cat)`` returns one logit per row.
    """

    def __init__(self, num_dim, cats, d_model, nhead, nlayers, drop, emb_drop, in_drop, head_layers):
        super().__init__()
        self.nd = num_dim
        self.cd = len(cats)

        if num_dim > 0:
            self.num_proj = nn.Linear(1, d_model)
        else:
            self.num_proj = None
        self.num_drop = nn.Dropout(in_drop) if in_drop > 0 else nn.Identity()

        if len(cats) > 0:
            self.cat_embs = nn.ModuleList([nn.Embedding(card, d_model) for card in cats])
        else:
            self.cat_embs = None
        self.emb_drop = nn.Dropout(emb_drop) if emb_drop > 0 else nn.Identity()

        self.cls = nn.Parameter(torch.zeros(1, 1, d_model))
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model, nhead=nhead, dropout=drop, batch_first=True, activation='gelu'
            ),
            num_layers=nlayers,
        )

        widths = [d_model] + list(head_layers)
        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks.extend([nn.Linear(fan_in, fan_out), nn.BatchNorm1d(fan_out), nn.GELU(), nn.Dropout(drop)])
        blocks.append(nn.Linear(widths[-1], 1))
        self.head = nn.Sequential(*blocks)

    def forward(self, x_num, x_cat):
        if x_num.ndim > 0:
            batch = x_num.shape[0]
        else:
            batch = x_cat.shape[0]

        groups = []
        if self.nd > 0:
            # (B, n_num) -> (B, n_num, 1) -> (B, n_num, d_model)
            num_tokens = self.num_drop(self.num_proj(x_num.unsqueeze(-1)))
            groups.append(num_tokens)
        if self.cd > 0:
            per_feature = [table(x_cat[:, j]).unsqueeze(1) for j, table in enumerate(self.cat_embs)]
            groups.append(self.emb_drop(torch.cat(per_feature, dim=1)))

        if len(groups) > 1:
            feature_tokens = torch.cat(groups, dim=1)
        else:
            feature_tokens = groups[0]

        cls_tokens = self.cls.expand(batch, 1, -1)
        encoded = self.encoder(torch.cat([cls_tokens, feature_tokens], dim=1))
        return self.head(encoded[:, 0, :]).squeeze(1)


In [13]:

# --- WARM-START SCAN ---
def find_best_prior_run():
    """Scan ``runs/`` for the earlier run with the highest recorded AUC.

    For each sub-directory, the score comes from one of two files:
      * ``oof_predictions.csv`` when it exists: AUC of its ``oof`` column
        against the ``TARGET_COL`` column (skipped if either column is missing);
      * otherwise ``metrics.csv``: the maximum non-null ``best_val_auc``.

    Returns:
        ``(None, None)`` when ``runs/`` does not exist, otherwise
        ``(best_path, best_score)``. ``best_path`` is None and ``best_score``
        is -1.0 when no directory yielded a score.

    A run whose files cannot be read or scored is reported and skipped, so
    one corrupt directory never aborts the scan.
    """
    base = Path("runs")
    if not base.exists():
        return (None, None)

    best = -1.0
    bestp = None
    for p in sorted(base.iterdir()):
        if not p.is_dir():
            continue
        oof = p / "oof_predictions.csv"
        metrics = p / "metrics.csv"
        score = None
        try:
            if oof.exists():
                df = pd.read_csv(oof)
                if "oof" in df and TARGET_COL in df:
                    score = roc_auc_score(df[TARGET_COL], df["oof"])
            elif metrics.exists():
                m = pd.read_csv(metrics)
                if "best_val_auc" in m:
                    score = m["best_val_auc"].dropna().max()
        except Exception as exc:
            # Best effort: say why this run was ignored instead of hiding it.
            print(f"[warm-start] skipping {p}: {exc!r}")
            continue
        # A NaN score compares False here and is therefore ignored.
        if score is not None and score > best:
            best = score
            bestp = p
    return (bestp, best)

# Resolve the warm-start source once; (None, None) means "train from scratch".
if AUTO_WARM_START:
    BEST_PRIOR = find_best_prior_run()
else:
    BEST_PRIOR = (None, None)
print("Best prior run:", BEST_PRIOR)


Best prior run: (PosixPath('runs/2025-08-12_09-12-57'), 0.9625154905879971)


In [16]:

# --- TRAIN FUNCTION (for multiprocessing) ---
def train_model(model_name, arch_cfg, root_dir, device_str, pin_mem, seed=RANDOM_SEED):
    """Train one architecture with stratified K-fold CV and write its artifacts.

    Args:
        model_name: "modelA" builds ``MLPNet``, "modelB" builds ``TabTransformer``,
            anything else builds ``FTTransformer``. Also names the output folder.
        arch_cfg: Keyword arguments for the model constructor (everything after
            ``num_dim`` and ``cats``). When empty or None, the module-level
            ``A_*`` / ``B_*`` / ``C_*`` constants are used instead.
        root_dir: Run directory (``Path``) that already holds ``data_meta.json``.
        device_str: Torch device string such as "cuda", "mps" or "cpu".
        pin_mem: Forwarded to every ``DataLoader`` as ``pin_memory``.
        seed: Seed for ``set_seed`` and for the fold shuffling.

    Reads the preprocessed globals ``train``, ``test``, ``y`` and ``BEST_PRIOR``
    plus the training constants from the config cell.

    Writes, under ``root_dir / model_name``:
        folds/fold_k/{model.pth, train_log.csv, roc_curve.png, loss_curve.png,
        val_auc_curve.png}, oof_predictions.csv, submission.csv, score.json and
        figs/oof_roc_curve.png.
    """
    set_seed(seed)
    # Function-level imports are kept from the original, which was written to
    # be self-contained (its section header mentions multiprocessing).
    import json
    import numpy as np, pandas as pd
    import matplotlib.pyplot as plt
    import torch, torch.nn as nn
    from sklearn.metrics import roc_auc_score, RocCurveDisplay
    from sklearn.model_selection import StratifiedKFold
    from torch.optim.swa_utils import AveragedModel, SWALR
    from torch.utils.data import DataLoader

    # Use the already preprocessed train/test data from global scope
    global train, test, y

    DEVICE = torch.device(device_str)

    with open(root_dir / 'data_meta.json') as meta_fh:
        meta = json.load(meta_fh)
    cat_cols = meta["cat_cols"]
    num_cols = meta["num_cols"]
    cat_cardinalities = meta["cat_cardinalities"]
    feature_cols = meta["feature_cols"]

    train_data = train.copy()
    test_data = test.copy()
    y_data = y.copy()

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
    mdir = root_dir / model_name
    (mdir / "folds").mkdir(parents=True, exist_ok=True)
    (mdir / "figs").mkdir(exist_ok=True)

    # pos_weight = (#negatives / #positives), guarded against division by zero.
    pos_weight = None
    if USE_CLASS_WEIGHTS:
        pr = train_data[TARGET_COL].mean()
        pw = max(1e-6, (1.0 - pr) / max(1e-6, pr))
        pos_weight = torch.tensor([pw], dtype=torch.float32, device=DEVICE)

    oof = np.zeros(len(train_data), dtype=np.float32)
    test_preds = np.zeros((len(test_data), N_SPLITS), dtype=np.float32)

    # Prefer <prior>/<model_name>/folds, fall back to <prior>/folds.
    warm_dir = None
    if AUTO_WARM_START and BEST_PRIOR and BEST_PRIOR[0] is not None:
        warm_dir = BEST_PRIOR[0] / model_name / "folds"
        if not warm_dir.exists():
            wd = BEST_PRIOR[0] / "folds"
            warm_dir = wd if wd.exists() else None

    def build_model():
        """Instantiate the requested architecture, honoring ``arch_cfg`` when given."""
        n_num = len(num_cols)
        if model_name == "modelA":
            kwargs = arch_cfg or dict(hidden=A_HIDDEN, drop=A_DROPOUT,
                                      emb_drop=A_EMB_DROPOUT, in_drop=A_INPUT_DROPOUT)
            net = MLPNet(n_num, cat_cardinalities, **kwargs)
        elif model_name == "modelB":
            kwargs = arch_cfg or dict(d_model=B_D_MODEL, nhead=B_N_HEAD, nlayers=B_N_LAYERS,
                                      drop=B_DROPOUT, emb_drop=B_EMB_DROPOUT,
                                      in_drop=B_INPUT_DROPOUT, head_layers=B_MLP_HEAD)
            net = TabTransformer(n_num, cat_cardinalities, **kwargs)
        else:
            kwargs = arch_cfg or dict(d_model=C_D_MODEL, nhead=C_N_HEAD, nlayers=C_N_LAYERS,
                                      drop=C_DROPOUT, emb_drop=C_EMB_DROPOUT,
                                      in_drop=C_INPUT_DROPOUT, head_layers=C_MLP_HEAD)
            net = FTTransformer(n_num, cat_cardinalities, **kwargs)
        return net.to(DEVICE)

    # The test set is identical for every fold, so build its loader once.
    te_ds = TabDataset(test_data, None, num_cols, cat_cols)
    te_loader = DataLoader(te_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=pin_mem)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(train_data[feature_cols], train_data[TARGET_COL])):
        tr_df = train_data.iloc[tr_idx].reset_index(drop=True)
        va_df = train_data.iloc[va_idx].reset_index(drop=True)
        tr_ds = TabDataset(tr_df, tr_df[TARGET_COL].values, num_cols, cat_cols)
        va_ds = TabDataset(va_df, va_df[TARGET_COL].values, num_cols, cat_cols)
        tr_loader = DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=pin_mem)
        va_loader = DataLoader(va_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=pin_mem)

        model = build_model()

        crit = nn.BCEWithLogitsLoss(pos_weight=pos_weight) if pos_weight is not None else nn.BCEWithLogitsLoss()
        opt = torch.optim.AdamW(model.parameters(), lr=BASE_LR, weight_decay=WEIGHT_DECAY)
        sch = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(opt, T_0=COSINE_T0, T_mult=COSINE_T_MULT, eta_min=MIN_LR)
        swa_model = AveragedModel(model) if USE_SWA else None
        swa_sch = SWALR(opt, swa_lr=SWA_LR) if USE_SWA else None

        # Warm start
        if warm_dir is not None and (warm_dir / f"fold_{fold}" / "model.pth").exists():
            try:
                state = torch.load(warm_dir / f"fold_{fold}" / "model.pth", map_location="cpu")
                model.load_state_dict(state, strict=False)
                print(f"[{model_name}] Warm-started fold {fold}")
            except Exception as e:
                # e.g. the prior run used different layer sizes; train from scratch.
                print(f"[{model_name}] Warm-start failed:", e)

        es = EarlyStopper(patience=PATIENCE, mode="max")
        best = -np.inf
        logs = []
        swa_updates = 0
        for epoch in range(1, EPOCHS + 1):
            tr_loss, _, _ = epoch_loop(model, tr_loader, crit, opt, DEVICE, GRAD_CLIP)
            va_loss, va_pred, va_true = epoch_loop(model, va_loader, crit, None, DEVICE)
            va_auc = roc_auc_score(va_true, va_pred)
            # Exactly one scheduler drives the LR per epoch: cosine restarts
            # before SWA starts, SWALR afterwards. Stepping both makes them
            # overwrite each other's learning rate.
            if USE_SWA and epoch >= SWA_START_EPOCH:
                swa_model.update_parameters(model)
                swa_sch.step()
                swa_updates += 1
            else:
                sch.step(epoch + fold)
            logs.append(dict(epoch=epoch, train_loss=tr_loss, val_loss=va_loss,
                             val_auc=float(va_auc), lr=float(opt.param_groups[0]['lr'])))
            if es.step(va_auc, model):
                best = va_auc
            if es.stop():
                print(f"[{model_name}] fold {fold} early stop @ {epoch}, best={best:.6f}")
                break

        # The loop leaves `model` at its last epoch; go back to the snapshot
        # that actually produced `best` before saving or predicting.
        if es.state is not None:
            model.load_state_dict(es.state)

        # Only consider SWA when at least one average was accumulated.
        if USE_SWA and swa_updates > 0:
            # Custom BN update for tabular models: one forward pass over the
            # training data in train mode (the model takes two inputs, so
            # torch's update_bn helper cannot be used directly).
            swa_model.train()
            with torch.no_grad():
                for x_num, x_cat, _ in tr_loader:
                    swa_model(x_num.to(DEVICE), x_cat.to(DEVICE))

            # Evaluate SWA model
            _, swa_pred, swa_true = epoch_loop(swa_model, va_loader, crit, None, DEVICE)
            auc_swa = roc_auc_score(swa_true, swa_pred)
            if auc_swa >= best:
                # AveragedModel prefixes its keys with "module." and adds
                # "n_averaged"; the wrapped module has the plain keys.
                model.load_state_dict(swa_model.module.state_dict())
                best = auc_swa
                print(f"[{model_name}] SWA kept/improved {best:.6f}")

        fold_dir = mdir / "folds" / f"fold_{fold}"
        fold_dir.mkdir(parents=True, exist_ok=True)
        log_df = pd.DataFrame(logs)
        log_df.to_csv(fold_dir / "train_log.csv", index=False)
        torch.save(model.state_dict(), fold_dir / "model.pth")

        _, va_pred, va_true = epoch_loop(model, va_loader, crit, None, DEVICE)
        oof[va_idx] = va_pred.squeeze()
        _, te_pred, _ = epoch_loop(model, te_loader, crit, None, DEVICE)
        test_preds[:, fold] = te_pred.squeeze()

        fig, ax = plt.subplots()
        RocCurveDisplay.from_predictions(va_true, va_pred, ax=ax)
        ax.set_title(f"{model_name} Fold {fold} ROC")
        fig.savefig(fold_dir / "roc_curve.png", bbox_inches="tight"); plt.close(fig)

        fig, ax = plt.subplots()
        ax.plot(log_df["epoch"], log_df["train_loss"], label="train")
        ax.plot(log_df["epoch"], log_df["val_loss"], label="val")
        ax.legend(); ax.set_title(f"{model_name} Fold {fold} Loss")
        fig.savefig(fold_dir / "loss_curve.png", bbox_inches="tight"); plt.close(fig)

        fig, ax = plt.subplots()
        ax.plot(log_df["epoch"], log_df["val_auc"], label="val_auc")
        ax.legend(); ax.set_title(f"{model_name} Fold {fold} AUC")
        fig.savefig(fold_dir / "val_auc_curve.png", bbox_inches="tight"); plt.close(fig)

    oof_auc = roc_auc_score(y_data, oof)
    print(f"[{model_name}] OOF AUC:", oof_auc)
    pd.DataFrame({ID_COL: train_data[ID_COL].values, "oof": oof, TARGET_COL: y_data}).to_csv(mdir / "oof_predictions.csv", index=False)
    # Test prediction = mean over the fold models.
    sub = pd.DataFrame({ID_COL: test_data[ID_COL].values, TARGET_COL: test_preds.mean(axis=1)})
    sub.to_csv(mdir / "submission.csv", index=False)

    fig, ax = plt.subplots()
    RocCurveDisplay.from_predictions(y_data, oof, ax=ax)
    ax.set_title(f"{model_name} OOF ROC")
    fig.savefig(mdir / "figs" / "oof_roc_curve.png", bbox_inches="tight"); plt.close(fig)
    with open(mdir / "score.json", "w") as score_fh:
        json.dump({"oof_auc": float(oof_auc)}, score_fh)


In [17]:

# --- SEQUENTIAL TRAINING (Notebook-Safe) ---
# Constructor keyword arguments for each architecture, taken from the config cell.
archA=dict(hidden=A_HIDDEN, drop=A_DROPOUT, emb_drop=A_EMB_DROPOUT, in_drop=A_INPUT_DROPOUT)
archB=dict(d_model=B_D_MODEL,nhead=B_N_HEAD,nlayers=B_N_LAYERS,drop=B_DROPOUT,emb_drop=B_EMB_DROPOUT,in_drop=B_INPUT_DROPOUT,head_layers=B_MLP_HEAD)
archC=dict(d_model=C_D_MODEL,nhead=C_N_HEAD,nlayers=C_N_LAYERS,drop=C_DROPOUT,emb_drop=C_EMB_DROPOUT,in_drop=C_INPUT_DROPOUT,head_layers=C_MLP_HEAD)

arch_by_model = [("modelA", archA), ("modelB", archB), ("modelC", archC)]

# Record each architecture next to the run's outputs; the context manager
# guarantees the file is flushed and closed.
for name, cfg in arch_by_model:
    with open(ROOT_DIR / f"arch_{name}.json", "w") as arch_fh:
        json.dump(cfg, arch_fh, indent=2)

# Train models sequentially (safer for notebooks)
for name, cfg in arch_by_model:
    print(f"\n🚀 Starting {name}...")
    train_model(name, cfg, ROOT_DIR, str(DEVICE), PIN_MEM, RANDOM_SEED)
    print(f"✅ {name} completed!")

print("🎉 All models finished!")



🚀 Starting modelA...
[modelA] Warm-start failed: Error(s) in loading state_dict for MLPNet:
	size mismatch for mlp.0.weight: copying a param with shape torch.Size([512, 47]) from checkpoint, the shape in current model is torch.Size([1024, 47]).
	size mismatch for mlp.0.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([1024]).
	size mismatch for mlp.9.weight: copying a param with shape torch.Size([1, 128]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for mlp.9.bias: copying a param with shape torch.Size([1]) from checkpoint, the shape in current model is torch.Size([256]).
[modelA] Warm-start failed: Error(s) in loading state_dict for MLPNet:
	size mismatch for mlp.0.weight: copying a param with shape torch.Size([512, 47]) from checkpoint, the shape in current model is torch.Size([1024, 47]).
	size mismatch for mlp.0.bias: copying a param with shape torch.Size([512]) from checkpoint, the sha

KeyboardInterrupt: 

In [None]:

# --- COMPARISON + ENSEMBLE ---
# Collect each model's OOF score, test predictions and OOF predictions.
rows=[]; subs=[]; oofs=[]
for name in ["modelA","modelB","modelC"]:
    mdir=ROOT_DIR/name
    score_path=mdir/"score.json"
    if not score_path.exists():
        # Typically means training for this model was interrupted or never ran.
        raise FileNotFoundError(f"{score_path} not found - finish training {name} before running this cell")
    with open(score_path) as score_fh:
        sc=json.load(score_fh)
    rows.append({"model":name,"oof_auc":sc["oof_auc"],"path":mdir.as_posix()})
    model_sub=pd.read_csv(mdir/"submission.csv").set_index(ID_COL).rename(columns={TARGET_COL:f"y_{name}"})
    subs.append(model_sub)
    model_oof=pd.read_csv(mdir/"oof_predictions.csv")[[ID_COL,"oof"]].set_index(ID_COL).rename(columns={"oof":f"oof_{name}"})
    oofs.append(model_oof)

cmp=pd.DataFrame(rows).sort_values("oof_auc",ascending=False)
display(cmp)
cmp.to_csv(ROOT_DIR/"model_comparison.csv",index=False)

# Test ensemble: unweighted mean of the per-model predictions, aligned on id.
ens=subs[0]
for model_sub in subs[1:]:
    ens=ens.join(model_sub,how="inner")
# Name the prediction column after TARGET_COL so the header matches the
# per-model submission files.
ens[TARGET_COL]=ens.mean(axis=1)
ens[[TARGET_COL]].reset_index().to_csv(ROOT_DIR/"submission_ensemble_mean.csv",index=False)

# OOF ensemble, scored against the raw training labels.
oof_ens=oofs[0]
for model_oof in oofs[1:]:
    oof_ens=oof_ens.join(model_oof,how="inner")
oof_ens["oof"]=oof_ens.mean(axis=1)
# Read the label file once, only the two columns needed.
base=pd.read_csv(TRAIN_PATH,usecols=[ID_COL,TARGET_COL]).set_index(ID_COL)
full=base.join(oof_ens,how="inner"); auc_ens=roc_auc_score(full[TARGET_COL].values, full["oof"].values)
print("Ensemble OOF AUC:", auc_ens)
full.reset_index()[[ID_COL,"oof",TARGET_COL]].to_csv(ROOT_DIR/"oof_ensemble.csv",index=False)

# ROOT_DIR/"figs" is not created anywhere else (train_model only creates the
# per-model figs folders), so make it before saving.
figs_dir=ROOT_DIR/"figs"
figs_dir.mkdir(parents=True, exist_ok=True)
fig,ax=plt.subplots(); RocCurveDisplay.from_predictions(full[TARGET_COL].values, full["oof"].values, ax=ax)
ax.set_title("Ensemble OOF ROC"); fig.savefig(figs_dir/"oof_ensemble_roc.png",bbox_inches="tight"); plt.close(fig)


FileNotFoundError: [Errno 2] No such file or directory: 'runs/2025-08-12_11-01-44/modelA/score.json'