In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Load the competition data; the relative paths are resolved against the
# current working directory.
train = pd.read_csv("aluminum_coldRoll_train.csv")
test = pd.read_csv("aluminum_coldRoll_testNoY.csv")
# The example submission is not needed for modeling; it is loaded only so the
# expected submission format can be checked.
example_sub = pd.read_csv("aluminum_coldRoll_example.csv")

# Analysis showed no missing values in train or test.
target_col = "y_passXtremeDurability"
id_col = "ID"

# Features are every training column except the target; labels are the target.
X = train.drop(columns=[target_col])
y = train[target_col]

# Work on a copy so the raw test frame is left untouched.
X_test = test.copy()

# Columns handled as categorical features (one-hot encoded below).
cat_cols = [
    "alloy",
    "cutTemp",
    "rollTemp",
    "topEdgeMicroChipping",
    "blockSource",
    "machineRestart",
    "contourDefNdx",
]
# Every remaining column except the ID is handled as numerical; the order of
# X.columns is preserved.
non_numeric = {*cat_cols, id_col}
num_cols = [col for col in X.columns if col not in non_numeric]

# Preprocessing step: one-hot encode the categorical columns (categories not
# seen during fit are ignored at transform time) and pass the numerical
# columns through unchanged.
preprocess = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

# Tuned XGBoost hyperparameters.
# Teammates: feel free to change these to try different settings and look for
# better performance.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 5,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "hist",
    "random_state": 101,
}
xgb = XGBClassifier(**xgb_params)

# Full model: preprocessing followed by the classifier, so the same
# transformations are applied at fit and predict time.
model = Pipeline(
    [
        ("preprocess", preprocess),
        ("xgb", xgb),
    ]
)

# Hold out 30% of the training data, stratified on the target, for a local
# performance estimate.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=101,
)

# Fit on the training part only, then score the held-out part.
model.fit(X_tr, y_tr)

# Column 1 of predict_proba is the probability of the second class
# (presumably the "pass" label -- confirm against the target encoding).
val_proba = model.predict_proba(X_val)
val_pred_prob = val_proba[:, 1]
val_auc = roc_auc_score(y_val, val_pred_prob)
print(f"Validation AUC: {val_auc:.4f}")

# Refit the same pipeline on all labelled rows to obtain the final model.
model.fit(X, y)

# Predicted probability (column 1 of predict_proba) for every test row.
test_pred_prob = model.predict_proba(X_test)[:, 1]

# Safeguard: keep probabilities strictly inside (0, 1), since exact 0 or 1
# predictions can be problematic for log-loss.
eps = 1e-6
test_pred_prob = test_pred_prob.clip(min=eps, max=1 - eps)

# Submission frame: one row per test ID with its predicted probability.
submission = pd.DataFrame(
    {
        "ID": X_test[id_col],
        "y_passXtremeDurability": test_pred_prob,
    }
)

print(submission.head())

# Write the submission CSV. Do not change the file name -- it is required for
# submission.
submission.to_csv("team16_xgb_submission.csv", index=False)


Validation AUC: 0.8822
       ID  y_passXtremeDurability
0  160001                0.004856
1  160002                0.052003
2  160003                0.883085
3  160004                0.709794
4  160005                0.816278
