In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

# Column names referenced throughout the rest of the script.
id_col = "ID"
target_col = "y_passXtremeDurability"

# Read the labelled training set and the unlabelled test set from the
# current working directory.
train = pd.read_csv("aluminum_coldRoll_train.csv")
test = pd.read_csv("aluminum_coldRoll_testNoY.csv")

# Split the label off the training frame. X still contains the ID column at
# this point. X_test is an independent copy of the loaded test frame.
y = train[target_col]
X = train.drop(columns=[target_col])
X_test = test.copy()

# Feature typing: the columns below are one-hot encoded; every remaining
# feature column other than the ID is handed to the model unchanged.
cat_cols = [
    "alloy",
    "cutTemp",
    "rollTemp",
    "topEdgeMicroChipping",
    "blockSource",
    "machineRestart",
    "contourDefNdx",
]

# Build the exclusion set once instead of re-concatenating lists per column.
_not_numeric = set(cat_cols) | {id_col}
num_cols = [col for col in X.columns if col not in _not_numeric]

# handle_unknown="ignore": categories not seen during fit do not raise at
# transform time.
# NOTE(review): the ID column is listed in neither group; with
# ColumnTransformer's default `remainder` it should not reach the model --
# confirm against the scikit-learn documentation.
preprocess = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore"),
            cat_cols,
        ),
        (
            "num",
            "passthrough",
            num_cols,
        ),
    ]
)

def make_model(seed: int) -> "Pipeline":
    """Build one unfitted ensemble member: preprocessing followed by XGBoost.

    The hyperparameters are fixed (the tuned configuration); ``seed`` is the
    only thing that differs between ensemble members.

    Args:
        seed: Passed to ``XGBClassifier`` as ``random_state``.

    Returns:
        An unfitted ``Pipeline`` with the steps ``"preprocess"`` and ``"xgb"``.
    """
    xgb = XGBClassifier(
        n_estimators=450,
        max_depth=3,
        learning_rate=0.1,
        subsample=1.0,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1,
        min_child_weight=3,
        objective="binary:logistic",
        eval_metric="logloss",
        tree_method="hist",   # "exact" is an alternative worth trying
        random_state=seed,
    )

    # Pipeline keeps the step objects it is given rather than copying them.
    # Handing the module-level `preprocess` to every pipeline would make all
    # ensemble members share one transformer, so fitting a later member would
    # silently refit the encoder inside earlier ones. Give each pipeline its
    # own unfitted copy instead.
    return Pipeline(steps=[
        ("preprocess", clone(preprocess)),
        ("xgb", xgb),
    ])

# Hold out 30% of the labelled rows (stratified on the target) so the seed
# ensemble can be scored before it is refit on all of the data.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=101,
)

seeds = [101, 202, 303, 404, 505]

# Fit one model per seed on the training split and collect its predictions
# for the same hold-out rows.
val_preds = []
for seed in seeds:
    print(f"Fitting model with seed {seed} on train split...")
    model = make_model(seed).fit(X_tr, y_tr)
    # Keep column 1 of predict_proba, i.e. the probability of the second class.
    val_preds.append(model.predict_proba(X_val)[:, 1])

# Stack to shape (n_models, n_val_samples), then average over the model axis.
val_preds = np.stack(val_preds, axis=0)
val_mean_prob = np.mean(val_preds, axis=0)

# Score the averaged probabilities against the hold-out labels.
val_logloss = log_loss(y_val, val_mean_prob)
print(f"\nEnsemble validation log-loss: {val_logloss:.6f}")

# Refit one model per seed on every labelled row and predict the test set.
test_preds = []
for seed in seeds:
    print(f"Fitting model with seed {seed} on FULL training data...")
    model = make_model(seed).fit(X, y)
    test_preds.append(model.predict_proba(X_test)[:, 1])

# Shape (n_models, n_test_samples).
test_preds = np.stack(test_preds, axis=0)

# Average across models, then clip into [eps, 1 - eps] so that no submitted
# probability is exactly 0 or 1.
eps = 1e-6
test_mean_prob = np.clip(np.mean(test_preds, axis=0), eps, 1 - eps)

# Assemble the submission: test IDs alongside the ensemble probability.
submission = pd.DataFrame(
    {
        "ID": X_test[id_col],
        "y_passXtremeDurability": test_mean_prob,
    }
)

submission.to_csv("team16_xgb_ensemble.csv", index=False)
print("\nSaved submission as team16_xgb_ensemble.csv")
print(submission.head())
