# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# ======================================================
# LOAD + PREPROCESS
# ======================================================
# Read the labelled training file and the unlabelled test file from disk.
train = pd.read_csv("aluminum_coldRoll_train.csv")
test = pd.read_csv("aluminum_coldRoll_testNoY.csv")

# Separate the target column from the predictor columns.
y = train["y_passXtremeDurability"]
X = train.drop(columns=["y_passXtremeDurability"])

# Stack the train and test rows so that every object-typed column is encoded
# with a single integer mapping covering the values seen in either file.
combined = pd.concat([X, test], axis=0)
object_columns = combined.select_dtypes(include="object").columns
for col in object_columns:
    le = LabelEncoder()
    # Cast to str first so every value in the column is encoded as text.
    combined[col] = le.fit_transform(combined[col].astype(str))

# The training rows were stacked first, so they are the leading len(train)
# rows of the combined frame.
X_encoded = combined.iloc[: len(train)]

# ======================================================
# RANDOM FOREST
# ======================================================
def run_rf(X, y, *, n_splits=5, random_state=42):
    """Cross-validate a random forest and report its mean log-loss.

    The data is split with a shuffled ``StratifiedKFold``; a fresh
    ``RandomForestClassifier`` is fitted on each training fold and scored
    with ``log_loss`` on the matching validation fold.

    Args:
        X: Feature table (DataFrame or array-like); converted with
            ``np.array`` before splitting.
        y: Class labels, one per row of ``X`` (Series or array-like).
        n_splits: Number of stratified folds.
        random_state: Seed used for both the fold shuffling and the forest.

    Returns:
        The mean of the per-fold log-loss values as a float. The same value
        is also printed with six decimal places.
    """
    X_np = np.array(X)
    # Work on a plain array so positional fold indices apply to any
    # array-like y, not only to a pandas Series.
    y_np = np.asarray(y)

    kf = StratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )
    losses = []

    for train_idx, val_idx in kf.split(X_np, y_np):
        model = RandomForestClassifier(
            n_estimators=500,
            max_depth=None,
            min_samples_split=4,
            min_samples_leaf=2,
            max_features="sqrt",
            random_state=random_state,
            n_jobs=-1,
        )
        model.fit(X_np[train_idx], y_np[train_idx])

        # Score the full probability matrix and state the label order
        # explicitly: this matches the positive-class-only form for binary
        # targets, also handles more than two classes, and does not fail
        # when a validation fold happens to contain a single class.
        proba = model.predict_proba(X_np[val_idx])
        losses.append(log_loss(y_np[val_idx], proba, labels=model.classes_))

    mean_loss = float(np.mean(losses))
    print(f"Random Forest Log-Loss: {mean_loss:.6f}")
    return mean_loss

# Evaluate the random forest on the encoded training features.
run_rf(X=X_encoded, y=y)
