load OOF/test arrays + labels

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from scipy.stats import rankdata

# ROOT is the parent of the current working directory; data and reports
# folders are resolved relative to it.
ROOT = Path.cwd().resolve().parents[0]
DATA_DIR = ROOT / "data" / "raw"
REPORTS_DIR = ROOT / "reports"

train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")
sub = pd.read_csv(DATA_DIR / "sample_submission.csv")

# The sample submission supplies the column names: first column is used as
# the id column, second as the target column.
id_col = sub.columns[0]
target_col = sub.columns[1]
# Binary labels: 1 where the target equals the string "Presence", else 0.
y = (train[target_col] == "Presence").astype(int).values

# Saved out-of-fold (OOF) and test predictions
# (presumably written by the CatBoost / LightGBM training runs -- verify).
oof_cat = np.load(REPORTS_DIR / "oof_catboost.npy")
test_cat = np.load(REPORTS_DIR / "test_catboost.npy")

oof_lgb = np.load(REPORTS_DIR / "oof_lgbm.npy")
test_lgb = np.load(REPORTS_DIR / "test_lgbm.npy")

# Fail fast with a readable message if a stale or mismatched .npy file is
# picked up, instead of failing later inside the metric or the blend.
assert len(oof_cat) == len(y), f"oof_catboost.npy has {len(oof_cat)} rows, expected {len(y)}"
assert len(oof_lgb) == len(y), f"oof_lgbm.npy has {len(oof_lgb)} rows, expected {len(y)}"
assert len(test_cat) == len(test), f"test_catboost.npy has {len(test_cat)} rows, expected {len(test)}"
assert len(test_lgb) == len(test), f"test_lgbm.npy has {len(test_lgb)} rows, expected {len(test)}"

print("Loaded:", oof_cat.shape, oof_lgb.shape, test_cat.shape, test_lgb.shape)
print("AUC cat OOF:", roc_auc_score(y, oof_cat))
print("AUC lgb OOF:", roc_auc_score(y, oof_lgb))


Loaded: (630000,) (630000,) (270000,) (270000,)
AUC cat OOF: 0.9554845267311664
AUC lgb OOF: 0.9551498401438248


weight search (prob + rank)

In [2]:
def rank01(x: np.ndarray) -> np.ndarray:
    """Convert values to average ranks divided by the number of elements.

    Tied values share the mean of the ranks they occupy
    (``method="average"``). Ranks start at 1, so the result lies in (0, 1]
    and a unique maximum maps to 1.0.
    """
    ranks = rankdata(x, method="average")
    n_items = len(ranks)
    return ranks / n_items

def best_weight(oof_a, oof_b, y, mode="prob"):
    """Grid-search the blend weight that maximises ROC AUC of two predictors.

    The blend is ``w * a + (1 - w) * b`` evaluated for ``w`` on a 201-point
    grid over [0, 1] (step 0.005).

    Parameters
    ----------
    oof_a, oof_b : array-like
        Predictions to blend; ``w`` is the weight applied to ``oof_a``.
    y : array-like
        Labels passed to ``roc_auc_score`` as the ground truth.
    mode : str, default "prob"
        ``"prob"`` blends the inputs as given. Any other value blends their
        ``rank01`` transforms instead.

    Returns
    -------
    tuple
        ``(best_auc, best_w)``. On ties the first (smallest) weight on the
        grid that reached the best AUC is kept.
    """
    # The rank transform does not depend on w, so compute it once up front
    # rather than re-ranking both arrays at every grid point.
    if mode == "prob":
        a, b = oof_a, oof_b
    else:
        a, b = rank01(oof_a), rank01(oof_b)

    best_auc = -1.0
    best_w = None
    for w in np.linspace(0, 1, 201):  # step 0.005
        auc = roc_auc_score(y, w * a + (1 - w) * b)
        # Strict '>' keeps the earliest weight when several tie.
        if auc > best_auc:
            best_auc = auc
            best_w = w
    return best_auc, best_w

# Run the weight search once per blending mode (raw values, then ranks).
auc_prob, w_prob = best_weight(oof_cat, oof_lgb, y, mode="prob")
auc_rank, w_rank = best_weight(oof_cat, oof_lgb, y, mode="rank")

# Report both results in the same format: AUC to 6 decimals, weight to 3.
for _header, _auc, _w in (
    ("Best PROB: AUC=", auc_prob, w_prob),
    ("Best RANK: AUC=", auc_rank, w_rank),
):
    print(_header, round(float(_auc), 6), "w_cat=", round(float(_w), 3))


Best PROB: AUC= 0.955485 w_cat= 0.99
Best RANK: AUC= 0.955485 w_cat= 0.975


That result tells us something very clear:

CatBoost and LGBM are extremely similar (highly correlated)

The best blend is basically CatBoost-only:

Prob blend: best at w_cat = 0.99

Rank blend: best at w_cat = 0.975

And the best blended OOF AUC is the same for both blend types (0.955485) and matches CatBoost alone (0.9554845…) to six decimal places — meaning blending isn’t adding signal, just tiny noise.

So: don’t expect blending to beat your CatBoost GPU leaderboard (LB) score by much (if at all).
Still, it’s worth submitting the best blend once because LB can behave slightly differently than OOF.

Prob blend (w_cat=0.99)

In [3]:
import pandas as pd
from pathlib import Path
import numpy as np

# Folders are resolved relative to the parent of the working directory.
ROOT = Path.cwd().resolve().parents[0]
REPORTS_DIR = ROOT / "reports"
DATA_DIR = ROOT / "data" / "raw"

test = pd.read_csv(DATA_DIR / "test.csv")
sub = pd.read_csv(DATA_DIR / "sample_submission.csv")
# Column names come from the sample submission: (id, target).
id_col, target_col = sub.columns[0], sub.columns[1]

# Saved test-set predictions of the two models, loaded in a fixed order.
test_cat, test_lgb = (
    np.load(REPORTS_DIR / fname)
    for fname in ("test_catboost.npy", "test_lgbm.npy")
)

# Weight on the CatBoost predictions, copied from the OOF weight-search
# output ("Best PROB ... w_cat= 0.99"); LightGBM gets the remainder.
w_cat = 0.99
pred = test_cat * w_cat + test_lgb * (1 - w_cat)

# Write the id column next to the blended prediction, without the index.
out_path = REPORTS_DIR / "sub_oofbest_prob_wcat_99.csv"
test[[id_col]].assign(**{target_col: pred}).to_csv(out_path, index=False)
print("Saved:", out_path)


Saved: C:\Dev\kaggle-ps-s6e2-heart\reports\sub_oofbest_prob_wcat_99.csv


Rank blend (w_cat=0.975)

In [4]:
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.stats import rankdata

# Folders are resolved relative to the parent of the working directory.
ROOT = Path.cwd().resolve().parents[0]
REPORTS_DIR = ROOT / "reports"
DATA_DIR = ROOT / "data" / "raw"

test = pd.read_csv(DATA_DIR / "test.csv")
sub = pd.read_csv(DATA_DIR / "sample_submission.csv")
# Column names come from the sample submission: (id, target).
id_col, target_col = sub.columns[0], sub.columns[1]

# Saved test-set predictions of the two models, loaded in a fixed order.
test_cat, test_lgb = (
    np.load(REPORTS_DIR / fname)
    for fname in ("test_catboost.npy", "test_lgbm.npy")
)

def rank01(x):
    """Return the average ranks of ``x`` divided by the number of elements.

    Tied values receive the mean of the ranks they span; ranks start at 1,
    so the output lies in (0, 1].
    """
    scaled = rankdata(x, method="average")
    scaled = scaled / len(scaled)
    return scaled

# Weight on the CatBoost ranks, copied from the OOF weight-search output
# ("Best RANK ... w_cat= 0.975"); LightGBM gets the remainder.
w_cat = 0.975
# Blend the rank-transformed predictions rather than the raw values.
pred = rank01(test_cat) * w_cat + rank01(test_lgb) * (1 - w_cat)

# Write the id column next to the blended prediction, without the index.
out_path = REPORTS_DIR / "sub_oofbest_rank_wcat_97_5.csv"
test[[id_col]].assign(**{target_col: pred}).to_csv(out_path, index=False)
print("Saved:", out_path)


Saved: C:\Dev\kaggle-ps-s6e2-heart\reports\sub_oofbest_rank_wcat_97_5.csv
