# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# ======================================================
# DATA LOADING AND CATEGORICAL ENCODING
# ======================================================
TARGET_COLUMN = "y_passXtremeDurability"

# Read the training and test CSVs from the working directory.
train = pd.read_csv("aluminum_coldRoll_train.csv")
test = pd.read_csv("aluminum_coldRoll_testNoY.csv")

# Separate the target from the feature columns (train itself is not modified).
y = train[TARGET_COLUMN]
X = train.drop(columns=[TARGET_COLUMN])

# Stack the training features on top of the test rows so that each
# object-typed column is integer-coded with one mapping shared by both sets.
combined = pd.concat([X, test])
object_columns = combined.select_dtypes(include="object").columns
for col in object_columns:
    as_text = combined[col].astype(str)
    le = LabelEncoder()
    le.fit(as_text)
    combined[col] = le.transform(as_text)

# The first len(train) rows of the stacked frame are the training features.
X_encoded = combined.iloc[: len(train)]

# ======================================================
# KNN MODEL
# ======================================================
def _take_rows(data, idx):
    """Return the rows of *data* at integer positions *idx* (pandas or array)."""
    if hasattr(data, "iloc"):
        return data.iloc[idx]
    return data[idx]


def run_knn(X, y, k=5):
    """Cross-validate a k-nearest-neighbours classifier and report its log-loss.

    Runs 5-fold stratified cross-validation (shuffled, ``random_state=42``).
    Inside every fold the features are standardised with a ``StandardScaler``
    fitted on that fold's training rows only, so no statistics from the
    validation rows leak into the model that is being scored.

    Args:
        X: Numeric feature matrix (DataFrame or array), one row per sample.
        y: Binary target aligned with ``X`` (Series or array).
        k: Number of neighbours passed to ``KNeighborsClassifier``.

    Returns:
        The mean validation log-loss across the folds. The same value is
        also printed.
    """
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    losses = []

    for train_idx, val_idx in kf.split(X, y):
        # Fit the scaler on the training part of the fold only; fitting it on
        # the full data before splitting would leak validation statistics.
        scaler = StandardScaler().fit(_take_rows(X, train_idx))
        X_train = scaler.transform(_take_rows(X, train_idx))
        X_val = scaler.transform(_take_rows(X, val_idx))
        y_train, y_val = _take_rows(y, train_idx), _take_rows(y, val_idx)

        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train, y_train)
        # Column 1 holds the probability of the second class in
        # model.classes_ (the positive class for a binary target).
        preds = model.predict_proba(X_val)[:, 1]

        # Passing the label set keeps log_loss well-defined even if a
        # validation fold happens to contain a single class.
        losses.append(log_loss(y_val, preds, labels=model.classes_))

    mean_loss = float(np.mean(losses))
    print(f"KNN (k={k}) Log-Loss: {mean_loss:.6f}")
    return mean_loss

# Baseline: cross-validated log-loss for KNN with k=5 on the encoded
# training features.
run_knn(X_encoded, y, k=5)
