In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# ======================================================
# LOAD DATA
# ======================================================
# Read the labelled training set and the unlabelled test set from the
# current working directory.
train = pd.read_csv("aluminum_coldRoll_train.csv")
test = pd.read_csv("aluminum_coldRoll_testNoY.csv")

# Separate the target column from the feature columns.
X = train.drop(columns=["y_passXtremeDurability"])
y = train["y_passXtremeDurability"]

# Encode categorical variables.
# Each encoder is fitted on the train and test values together: an encoder
# fitted on train alone raises ValueError in transform() as soon as the test
# file contains a category that never occurs in training. When the test set
# has no new categories, the codes are the same as with a train-only fit.
for col in X.select_dtypes(include="object").columns:
    le = LabelEncoder()
    le.fit(pd.concat([X[col], test[col]], ignore_index=True))
    X[col] = le.transform(X[col])
    test[col] = le.transform(test[col])

# Scale features. The scaler statistics come from the training features only.
# The test frame is indexed by X's columns so it is transformed with exactly
# the fitted columns in the fitted order, even if the test file carries extra
# columns or a different column order.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test[X.columns])

# ======================================================
# YOUR BEST HYPERPARAMETERS
# ======================================================
# Hyperparameters read by both the cross-validation models and the final
# model. Per-key notes follow the XGBoost parameter documentation.
best_params = {
    "eta": 0.05,              # learning rate (step-size shrinkage)
    "max_depth": 4,           # maximum depth of each tree
    "subsample": 0.8,         # fraction of rows sampled per boosting round
    "colsample_bytree": 1.0,  # fraction of columns sampled per tree (all)
    "gamma": 2,               # minimum loss reduction required to split
    "min_child_weight": 5,    # minimum summed instance weight in a child
    "n_estimators": 800,      # number of boosting rounds
}

# ======================================================
# 5-FOLD CROSS-VALIDATION LOG-LOSS
# ======================================================
# Stratified 5-fold splitter; shuffling uses a fixed seed so the fold
# assignment is the same on every run.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
losses = []

for train_idx, val_idx in kf.split(X_scaled, y):
    # A fresh, unfitted classifier per fold. best_params holds exactly the
    # keyword arguments the classifier is configured with.
    model = XGBClassifier(
        **best_params,
        eval_metric="logloss",
        use_label_encoder=False,
    )

    # X_scaled is a NumPy array (plain indexing); y is a pandas Series
    # (positional indexing through .iloc).
    X_train, y_train = X_scaled[train_idx], y.iloc[train_idx]
    X_val, y_val = X_scaled[val_idx], y.iloc[val_idx]

    model.fit(X_train, y_train)

    # Score the held-out fold on column 1 of the predicted probabilities.
    preds = model.predict_proba(X_val)[:, 1]
    losses.append(log_loss(y_val, preds))

# Unweighted mean of the per-fold log-losses.
avg_loss = np.mean(losses)
print(f"\nAverage Log-Loss = {avg_loss:.5f}")

# ======================================================
# TRAIN FINAL MODEL ON FULL TRAINING DATA
# ======================================================
# Build the final classifier from the shared hyperparameter dict and fit it
# on every training row (no hold-out split here).
final_model = XGBClassifier(
    **best_params,
    eval_metric="logloss",
    use_label_encoder=False,
)

final_model.fit(X_scaled, y)

# ======================================================
# EXPORT CSV
# ======================================================
# Column 1 of the predicted probabilities for every test row.
test_preds = final_model.predict_proba(test_scaled)[:, 1]

# Single-column frame named after the target; written without the row index.
output = pd.DataFrame(test_preds, columns=["y_passXtremeDurability"])
output.to_csv("final.csv", index=False)

print("\nCSV file created: final.csv")
