In [1]:
import pandas as pd 

test = pd.read_csv("data/aluminum_coldRoll_testNoY.csv")
train = pd.read_csv("data/aluminum_coldRoll_train.csv")
alloy = pd.read_csv("data/ally.csv")

In [2]:
# Ensure alloy column is string (important!)
train["alloy"] = train["alloy"].astype(str)
test["alloy"] = test["alloy"].astype(str)
alloy["alloy"] = alloy["alloy"].astype(str)

# Merge the composition columns into train/test
train = train.merge(alloy, on="alloy", how="left")
test = test.merge(alloy, on="alloy", how="left")
test_ids = test["ID"]

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160000 entries, 0 to 159999
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ID                      160000 non-null  int64  
 1   alloy                   160000 non-null  object 
 2   cutTemp                 160000 non-null  object 
 3   rollTemp                160000 non-null  object 
 4   firstPassRollPressure   160000 non-null  int64  
 5   secondPassRollPressure  160000 non-null  int64  
 6   topEdgeMicroChipping    160000 non-null  object 
 7   blockSource             160000 non-null  object 
 8   machineRestart          160000 non-null  object 
 9   contourDefNdx           160000 non-null  int64  
 10  clearPassNdx            160000 non-null  float64
 11  y_passXtremeDurability  160000 non-null  int64  
 12  Cu                      160000 non-null  float64
 13  Mg                      160000 non-null  float64
 14  Mn                  

In [7]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss


# =========================================================
# 1. PREP DATA
# =========================================================

df = train.copy()

target = "y_passXtremeDurability"
categorical_cols = ["alloy", "cutTemp", "rollTemp",
                    "topEdgeMicroChipping", "blockSource", "machineRestart"]

# Label encode categoricals
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))


X = df.drop(columns=[target, "ID"])
y = df[target]


# =========================================================
# 2. LIGHTGBM PARAMS
# =========================================================

params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "learning_rate": 0.03,
    "num_leaves": 64,
    "max_depth": -1,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "min_data_in_leaf": 50,
    "cat_l2": 10,
    "cat_smooth": 10,
    "seed": 42,
    "verbosity": -1
}


# =========================================================
# 3. STRATIFIED 5-FOLD CV
# =========================================================

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_pred = np.zeros(len(df))
fold_losses = []

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"\n==================== FOLD {fold+1} ====================")

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    train_data = lgb.Dataset(
        X_train,
        label=y_train,
        categorical_feature=categorical_cols
    )

    valid_data = lgb.Dataset(
        X_valid,
        label=y_valid,
        categorical_feature=categorical_cols
    )

    model = lgb.train(
        params,
        train_data,
        valid_sets=[valid_data],
        callbacks=[lgb.log_evaluation(period=200)]
    )

    # Predict on validation fold
    valid_pred = model.predict(X_valid)
    oof_pred[valid_idx] = valid_pred

    # Compute fold log loss
    fold_loss = log_loss(y_valid, valid_pred)
    fold_losses.append(fold_loss)
    print(f"Fold {fold+1} LogLoss = {fold_loss:.5f}")


# =========================================================
# 4. RESULTS
# =========================================================

print("\n==================== LIGHTGBM 5-FOLD RESULTS ====================")
print("Fold LogLoss:", fold_losses)
print("Mean LogLoss:", np.mean(fold_losses))
print("StdDev:", np.std(fold_losses))


# Add OOF predictions for downstream analysis
df["oof_pred"] = oof_pred
print("\nOOF predictions added to df as column 'oof_pred'")



Fold 1 LogLoss = 0.44789

Fold 2 LogLoss = 0.44810

Fold 3 LogLoss = 0.44406

Fold 4 LogLoss = 0.44989

Fold 5 LogLoss = 0.44817

Fold LogLoss: [0.4478901439376981, 0.4480969221955147, 0.44406000485417335, 0.449891202244508, 0.4481741252794832]
Mean LogLoss: 0.4476224797022755
StdDev: 0.001920384424127764

OOF predictions added to df as column 'oof_pred'


In [10]:
import optuna
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss


# =========================================================
# 1. PREP DATA
# =========================================================

df = train.copy()

target = "y_passXtremeDurability"
categorical_cols = ["alloy", "cutTemp", "rollTemp",
                    "topEdgeMicroChipping", "blockSource", "machineRestart"]

# Label encode categoricals
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

X = df.drop(columns=[target, "ID"])
y = df[target]


# =========================================================
# 2. OPTUNA OBJECTIVE FUNCTION
# =========================================================

def objective(trial):

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "seed": 42,
        
        # Hyperparameters to tune â†“
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 120),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "cat_l2": trial.suggest_float("cat_l2", 0.0, 20.0),
        "cat_smooth": trial.suggest_float("cat_smooth", 0.0, 20.0),
    }

    # 5-fold CV
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_losses = []

    for train_idx, valid_idx in skf.split(X, y):

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_data = lgb.Dataset(
            X_train, y_train, categorical_feature=categorical_cols
        )
        valid_data = lgb.Dataset(
            X_valid, y_valid, categorical_feature=categorical_cols
        )

        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False)
            ],
        )

        preds = model.predict(X_valid)
        loss = log_loss(y_valid, preds)
        fold_losses.append(loss)

    return np.mean(fold_losses)


# =========================================================
# 3. RUN OPTUNA STUDY
# =========================================================

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=40, show_progress_bar=True)

print("\n==================== BEST PARAMETERS ====================")
print(study.best_params)
print("\nBest CV LogLoss =", study.best_value)


[I 2025-11-29 11:29:09,422] A new study created in memory with name: no-name-8795fd82-4f3e-4625-9021-0c1e28b2a5cc


  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-11-29 11:29:15,846] Trial 0 finished with value: 0.4956714076543567 and parameters: {'learning_rate': 0.014929821183551656, 'num_leaves': 63, 'max_depth': 11, 'feature_fraction': 0.7990188904449786, 'bagging_fraction': 0.7347601473022568, 'bagging_freq': 7, 'min_data_in_leaf': 24, 'lambda_l1': 0.00035880580984464427, 'lambda_l2': 4.372876788456346e-05, 'cat_l2': 1.9553739164928396, 'cat_smooth': 19.703160445068367}. Best is trial 0 with value: 0.4956714076543567.
[I 2025-11-29 11:29:18,003] Trial 1 finished with value: 0.5343254322937263 and parameters: {'learning_rate': 0.016609641913127336, 'num_leaves': 155, 'max_depth': 3, 'feature_fraction': 0.8381617417075253, 'bagging_fraction': 0.8151863440293132, 'bagging_freq': 5, 'min_data_in_leaf': 35, 'lambda_l1': 2.4570458486997784, 'lambda_l2': 0.0012727180573610755, 'cat_l2': 2.810506706950755, 'cat_smooth': 5.177589216843952}. Best is trial 0 with value: 0.4956714076543567.
[I 2025-11-29 11:29:19,312] Trial 2 finished with valu

In [11]:
best_params = study.best_params
best_params.update({
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "verbosity": -1,
    "seed": 42
})

train_data_full = lgb.Dataset(X, y, categorical_feature=categorical_cols)

final_model = lgb.train(
    best_params,
    train_data_full,
    num_boost_round=2000,
    callbacks=[lgb.log_evaluation(period=200)]
)

print("\nFinal model trained on full dataset.")



Final model trained on full dataset.


In [17]:
# =============================
# FIX: Encode test categorical columns
# =============================

test_df_fixed = test.copy()   # keep original safe

for col in categorical_cols:
    le = LabelEncoder()
    le.fit(train[col].astype(str))   # fit on **train only**
    test_df_fixed[col] = le.transform(test_df_fixed[col].astype(str))

# =============================
# Prepare test features
# =============================
X_test = test_df_fixed.drop(columns=["ID"])

# =============================
# Predict
# =============================
test_pred = final_model.predict(X_test)

# =============================
# Build Submission
# =============================
submission = pd.DataFrame({
    "ID": test_df_fixed["ID"],
    "y_passXtremeDurability": test_pred
})

submission.to_csv("submission_lightgbm.csv", index=False)

print("Saved submission_lightgbm.csv")


Saved submission_lightgbm.csv
