# In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ============================================================
# 1. LOAD DATA
# ============================================================

# Labelled training rows and the unlabelled rows that need predictions.
train = pd.read_csv("aluminum_coldRoll_train.csv")
test  = pd.read_csv("aluminum_coldRoll_testNoY.csv")

target_col = "y_passXtremeDurability"
id_col = "ID"

# Features = everything except the target AND the row identifier.  An ID left
# in X becomes a model input that the trees can split on, so the models could
# fit to row numbering instead of the process variables.  errors="ignore"
# keeps this line working if the training file has no ID column.
X = train.drop(columns=[target_col]).drop(columns=[id_col], errors="ignore")
y = train[target_col]

# X_test keeps its ID column on purpose: it is read back when the submission
# frame is built, and aligning the encoded test columns to the encoded train
# columns removes it from the model inputs.
X_test = test.copy()

# ============================================================
# 2. ONE-HOT ENCODE CATEGORICALS (FOR BOTH MODELS)
# ============================================================

# Columns expanded into one indicator column per observed level.
cat_cols = [
    "alloy", "cutTemp", "rollTemp", "topEdgeMicroChipping",
    "blockSource", "machineRestart", "contourDefNdx",
]

# Encode train and test independently; columns not listed in cat_cols are
# passed through by get_dummies unchanged.
X_enc, X_test_enc = (
    pd.get_dummies(frame, columns=cat_cols) for frame in (X, X_test)
)

# Force the test frame onto the train layout: same columns, same order.
# Indicator columns seen only in train are added as all-zero; columns seen
# only in test are discarded.
X_test_enc = X_test_enc.reindex(columns=X_enc.columns, fill_value=0)

# ============================================================
# 3. TRAIN/VAL SPLIT
# ============================================================

# Hold out 30% of the labelled rows for validation.  Stratifying on y keeps
# the class proportions the same in both parts; the fixed random_state makes
# the split reproducible.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_enc,
    y,
    stratify=y,
    random_state=101,
    test_size=0.3,
)

# ============================================================
# 4. MODEL FACTORIES WITH YOUR TUNED-ish SETTINGS
# ============================================================

def make_xgb(seed):
    """Build an unfitted XGBoost classifier for the blend.

    Args:
        seed: Value passed as ``random_state``; it is the only setting that
            differs between ensemble members.

    Returns:
        A new ``XGBClassifier`` configured with the fixed settings below.
    """
    settings = {
        # Task definition and training backend.
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "hist",
        # Boosting schedule and tree size.
        "n_estimators": 450,
        "learning_rate": 0.1,
        "max_depth": 3,
        "min_child_weight": 3,
        # Row / column sampling.
        "subsample": 1.0,
        "colsample_bytree": 0.8,
        # Regularisation.
        "reg_alpha": 0.1,
        "reg_lambda": 1,
        "random_state": seed,
    }
    return XGBClassifier(**settings)

def make_lgbm(seed):
    """Build an unfitted LightGBM classifier for the blend.

    Args:
        seed: Value passed as ``random_state``; it is the only setting that
            differs between ensemble members.

    Returns:
        A new ``LGBMClassifier`` configured with the fixed settings below.
    """
    settings = {
        "objective": "binary",
        # Boosting schedule and tree size.
        "n_estimators": 450,
        "learning_rate": 0.1,
        "max_depth": 3,
        # Row / column sampling.
        "subsample": 1.0,
        "colsample_bytree": 0.8,
        # Regularisation.
        "reg_alpha": 0.1,
        "reg_lambda": 1,
        "random_state": seed,
        "n_jobs": -1,
    }
    return LGBMClassifier(**settings)

# One model of each type is trained per seed: 101, 202, 303, 404, 505.
seeds = [101 * k for k in range(1, 6)]

# ============================================================
# 5. TRAIN ENSEMBLES PER MODEL TYPE (XGB & LGBM)
# ============================================================

print("Fitting XGBoost and LightGBM models on train split...")

# One positive-class probability vector per seed, collected per model type.
val_preds_xgb, val_preds_lgb = [], []

for seed in seeds:
    # XGBoost member for this seed
    xgb = make_xgb(seed)
    xgb.fit(X_tr, y_tr)
    val_preds_xgb.append(xgb.predict_proba(X_val)[:, 1])

    # LightGBM member for this seed
    lgbm = make_lgbm(seed)
    lgbm.fit(X_tr, y_tr)
    val_preds_lgb.append(lgbm.predict_proba(X_val)[:, 1])

# Stack the per-seed vectors into (n_seeds, n_val) matrices.
val_preds_xgb = np.stack(val_preds_xgb)
val_preds_lgb = np.stack(val_preds_lgb)

# Seed-averaged validation probabilities for each model type.
val_mean_xgb = np.mean(val_preds_xgb, axis=0)
val_mean_lgb = np.mean(val_preds_lgb, axis=0)

# ============================================================
# 6. SEARCH BEST BLEND WEIGHT ON VALIDATION LOG-LOSS
#    blend = w * XGB + (1 - w) * LGBM
# ============================================================

# Candidate XGB weights: 0.00, 0.02, ..., 1.00 (51 evenly spaced values).
candidate_weights = np.linspace(0.0, 1.0, 51)

best_w, best_ll = None, float("inf")
for w in candidate_weights:
    ll = log_loss(y_val, w * val_mean_xgb + (1.0 - w) * val_mean_lgb)
    # Strict "<" keeps the first weight that reaches the minimum on ties.
    if ll < best_ll:
        best_w, best_ll = w, ll

print(f"\nBest blend weight w for XGB: {best_w:.3f}")
print(f"Validation log-loss with blend: {best_ll:.6f}")

# ============================================================
# 7. TRAIN ON FULL DATA AND APPLY SAME BLENDING TO TEST
# ============================================================

print("\nFitting XGBoost and LightGBM models on FULL data...")

# One positive-class probability vector per seed, collected per model type.
test_preds_xgb, test_preds_lgb = [], []

for seed in seeds:
    # XGBoost refit on every labelled row
    xgb = make_xgb(seed)
    xgb.fit(X_enc, y)
    test_preds_xgb.append(xgb.predict_proba(X_test_enc)[:, 1])

    # LightGBM refit on every labelled row
    lgbm = make_lgbm(seed)
    lgbm.fit(X_enc, y)
    test_preds_lgb.append(lgbm.predict_proba(X_test_enc)[:, 1])

# Stack to (n_seeds, n_test), then average over seeds per model type.
test_preds_xgb = np.stack(test_preds_xgb)
test_preds_lgb = np.stack(test_preds_lgb)
test_mean_xgb = np.mean(test_preds_xgb, axis=0)
test_mean_lgb = np.mean(test_preds_lgb, axis=0)

# Blend with the weight chosen on the validation split, then keep every
# probability strictly inside (0, 1) by clipping to [eps, 1 - eps].
eps = 1e-6
test_blend = np.clip(
    best_w * test_mean_xgb + (1.0 - best_w) * test_mean_lgb,
    eps,
    1 - eps,
)

# Two-column submission: the test row ID and its blended probability.
submission = pd.DataFrame(
    {"ID": X_test[id_col], "y_passXtremeDurability": test_blend}
)

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv("team16_xgb_lgbm_blend.csv", index=False)

# Confirmation message followed by a preview of the first rows.
print(
    "\nSaved submission as team16_xgb_lgbm_blend.csv",
    submission.head(),
    sep="\n",
)


# Recorded cell output: ModuleNotFoundError: No module named 'numpy'
# (numpy was not installed in the environment this cell was run in).