In [1]:
import pandas as pd 

test = pd.read_csv("data/aluminum_coldRoll_testNoY.csv")
train = pd.read_csv("data/aluminum_coldRoll_train.csv")
alloy = pd.read_csv("data/ally.csv")

In [2]:
# Ensure alloy column is string (important!)
train["alloy"] = train["alloy"].astype(str)
test["alloy"] = test["alloy"].astype(str)
alloy["alloy"] = alloy["alloy"].astype(str)

# Merge the composition columns into train/test
train = train.merge(alloy, on="alloy", how="left")
test = test.merge(alloy, on="alloy", how="left")
test_ids = test["ID"]

In [13]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160000 entries, 0 to 159999
Data columns (total 21 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ID                      160000 non-null  int64  
 1   alloy                   160000 non-null  object 
 2   cutTemp                 160000 non-null  object 
 3   rollTemp                160000 non-null  object 
 4   firstPassRollPressure   160000 non-null  int64  
 5   secondPassRollPressure  160000 non-null  int64  
 6   topEdgeMicroChipping    160000 non-null  object 
 7   blockSource             160000 non-null  object 
 8   machineRestart          160000 non-null  object 
 9   contourDefNdx           160000 non-null  int64  
 10  clearPassNdx            160000 non-null  float64
 11  Cu                      160000 non-null  float64
 12  Mg                      160000 non-null  float64
 13  Mn                      160000 non-null  float64
 14  Si                  

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160000 entries, 0 to 159999
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ID                      160000 non-null  int64  
 1   alloy                   160000 non-null  object 
 2   cutTemp                 160000 non-null  object 
 3   rollTemp                160000 non-null  object 
 4   firstPassRollPressure   160000 non-null  int64  
 5   secondPassRollPressure  160000 non-null  int64  
 6   topEdgeMicroChipping    160000 non-null  object 
 7   blockSource             160000 non-null  object 
 8   machineRestart          160000 non-null  object 
 9   contourDefNdx           160000 non-null  int64  
 10  clearPassNdx            160000 non-null  float64
 11  y_passXtremeDurability  160000 non-null  int64  
 12  Cu                      160000 non-null  float64
 13  Mg                      160000 non-null  float64
 14  Mn                  

In [6]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

# -------------------------
# Identify features
# -------------------------

target = "y_passXtremeDurability"

categorical_cols = [
    "alloy",
    "cutTemp",
    "rollTemp",
    "topEdgeMicroChipping",
    "blockSource",
    "machineRestart"
]

numeric_cols = [
    "firstPassRollPressure",
    "secondPassRollPressure",
    "contourDefNdx",
    "clearPassNdx",
    "Cu","Mg","Mn","Si","Zn","Cr","Fe","Ti","Ag","Zr"
]

features = categorical_cols + numeric_cols

X = train[features]
y = train[target]

test_X = test[features]

# -------------------------
# CatBoost Model Settings
# -------------------------

cat_model_params = {
    "loss_function": "Logloss",
    "eval_metric": "Logloss",
    "random_seed": 42,
    "iterations": 3000,
    "learning_rate": 0.03,
    "depth": 8,
    "l2_leaf_reg": 3,
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 1,
    "task_type": "CPU",
    "verbose": 200
}

# -------------------------
# K-Fold CV
# -------------------------

kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_pred = np.zeros(len(train))
fold_losses = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X), 1):
    print(f"\n===== Fold {fold} =====")

    X_train, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    train_pool = Pool(X_train, y_train, cat_features=categorical_cols)
    val_pool = Pool(X_val, y_val, cat_features=categorical_cols)

    model = CatBoostClassifier(**cat_model_params)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    # Predict probability of class 1
    val_pred = model.predict_proba(val_pool)[:, 1]

    # Log loss for this fold
    loss = log_loss(y_val, val_pred)
    fold_losses.append(loss)
    print(f"Fold {fold} LogLoss = {loss:.5f}")

    # Store OOF predictions
    oof_pred[valid_idx] = val_pred

# -------------------------
# Overall CV Performance
# -------------------------

print("\n===== CatBoost CV Results =====")
print("Fold LogLosses:", fold_losses)
print("Mean LogLoss:", np.mean(fold_losses))
print("StdDev:", np.std(fold_losses))

# -------------------------
# Train final model on full data
# -------------------------

full_pool = Pool(X, y, cat_features=categorical_cols)
final_model = CatBoostClassifier(**cat_model_params)
final_model.fit(full_pool, verbose=200)





===== Fold 1 =====
0:	learn: 0.6790796	test: 0.6791940	best: 0.6791940 (0)	total: 138ms	remaining: 6m 53s
200:	learn: 0.4230294	test: 0.4330748	best: 0.4330748 (200)	total: 8.91s	remaining: 2m 4s
400:	learn: 0.4147764	test: 0.4302714	best: 0.4302559 (394)	total: 17.1s	remaining: 1m 50s
600:	learn: 0.4082924	test: 0.4302811	best: 0.4300474 (520)	total: 25.9s	remaining: 1m 43s
800:	learn: 0.4026065	test: 0.4304950	best: 0.4300474 (520)	total: 34.8s	remaining: 1m 35s
1000:	learn: 0.3973703	test: 0.4310673	best: 0.4300474 (520)	total: 43.7s	remaining: 1m 27s
1200:	learn: 0.3923364	test: 0.4317640	best: 0.4300474 (520)	total: 52.6s	remaining: 1m 18s
1400:	learn: 0.3876160	test: 0.4324086	best: 0.4300474 (520)	total: 1m 2s	remaining: 1m 10s
1600:	learn: 0.3827519	test: 0.4329412	best: 0.4300474 (520)	total: 1m 10s	remaining: 1m 2s
1800:	learn: 0.3782314	test: 0.4334240	best: 0.4300474 (520)	total: 1m 20s	remaining: 53.6s
2000:	learn: 0.3739311	test: 0.4341491	best: 0.4300474 (520)	total: 1m

<catboost.core.CatBoostClassifier at 0x11247eb40>

In [7]:
# -------------------------
# Test set predictions
# -------------------------

test_pool = Pool(test_X, cat_features=categorical_cols)
test_pred = final_model.predict_proba(test_pool)[:, 1]

submission = pd.DataFrame({
    "ID": test["ID"],
    "y_passXtremeDurability": test_pred
})

submission.to_csv("submission_catboost.csv", index=False)
print("Saved submission_catboost.csv")

Saved submission_catboost.csv


In [9]:
import optuna
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

# -------------------------
# Data setup
# -------------------------

target = "y_passXtremeDurability"

categorical_cols = [
    "alloy",
    "cutTemp",
    "rollTemp",
    "topEdgeMicroChipping",
    "blockSource",
    "machineRestart",
]

numeric_cols = [
    "firstPassRollPressure",
    "secondPassRollPressure",
    "contourDefNdx",
    "clearPassNdx",
    "Cu","Mg","Mn","Si","Zn","Cr","Fe","Ti","Ag","Zr"
]

features = categorical_cols + numeric_cols

X = train[features]
y = train[target]

test_X = test[features]

# -------------------------
# Optuna objective
# -------------------------

def objective(trial):
    # Hyperparameter search space
    params = {
    "loss_function": "Logloss",
    "eval_metric": "Logloss",
    "random_seed": 42,
    "task_type": "CPU",
    "verbose": False,
    "iterations": 3000,
    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
    "depth": trial.suggest_int("depth", 4, 10),
    "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0, log=True),
    "random_strength": trial.suggest_float("random_strength", 0.0, 5.0),
    "border_count": trial.suggest_int("border_count", 32, 255),
    "bootstrap_type": trial.suggest_categorical(
        "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
    ),
}

    # Only valid for Bayesian
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0.0, 5.0)

    # Only valid for Bernoulli
    if params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_losses = []

    for train_idx, valid_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[valid_idx]

        train_pool = Pool(X_train, y_train, cat_features=categorical_cols)
        val_pool = Pool(X_val, y_val, cat_features=categorical_cols)

        model = CatBoostClassifier(**params)
        model.fit(
            train_pool,
            eval_set=val_pool,
            use_best_model=True,
            early_stopping_rounds=200,
        )

        val_pred = model.predict_proba(val_pool)[:, 1]
        loss = log_loss(y_val, val_pred)
        fold_losses.append(loss)

    mean_loss = float(np.mean(fold_losses))
    return mean_loss

# -------------------------
# Run Optuna study
# -------------------------

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)  # bump n_trials up if you have compute

print("Best LogLoss:", study.best_value)
print("Best Params:", study.best_params)

best_params = study.best_params.copy()
# Add fixed params that we didn't tune
best_params.update({
    "loss_function": "Logloss",
    "eval_metric": "Logloss",
    "random_seed": 42,
    "task_type": "CPU",
    "iterations": 3000,
    "verbose": 200
})

# -------------------------
# Train final model on full data
# -------------------------

full_pool = Pool(X, y, cat_features=categorical_cols)
final_model = CatBoostClassifier(**best_params)
final_model.fit(
    full_pool,
    eval_set=None,  # you could hold out a small validation set if you want
)

# -------------------------
# Predict on test set
# -------------------------

test_pool = Pool(test_X, cat_features=categorical_cols)
test_pred = final_model.predict_proba(test_pool)[:, 1]

submission = pd.DataFrame({
    "ID": test["ID"],
    "y_passXtremeDurability": test_pred
})

submission.to_csv("submission_catboost_optuna.csv", index=False)
print("Saved submission_catboost_optuna.csv")


[I 2025-11-28 11:41:09,623] A new study created in memory with name: no-name-2877643a-445d-4644-bac4-679ca7671d0a
[I 2025-11-28 11:43:47,161] Trial 0 finished with value: 0.4239247188465523 and parameters: {'learning_rate': 0.048005750653714645, 'depth': 6, 'l2_leaf_reg': 3.230836366495558, 'random_strength': 4.208508416515103, 'border_count': 227, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.4057120085882898}. Best is trial 0 with value: 0.4239247188465523.
[I 2025-11-28 11:51:47,362] Trial 1 finished with value: 0.4257792653953011 and parameters: {'learning_rate': 0.019457905977363302, 'depth': 10, 'l2_leaf_reg': 1.0426508318286716, 'random_strength': 1.5938452630299549, 'border_count': 207, 'bootstrap_type': 'Bernoulli', 'subsample': 0.9873706067290364}. Best is trial 0 with value: 0.4239247188465523.
[I 2025-11-28 11:54:49,742] Trial 2 finished with value: 0.4237145044394953 and parameters: {'learning_rate': 0.0457663924533457, 'depth': 5, 'l2_leaf_reg': 5.350196121005697

Best LogLoss: 0.42344609050982107
Best Params: {'learning_rate': 0.05160866875760279, 'depth': 4, 'l2_leaf_reg': 3.61016777833069, 'random_strength': 2.838580404393465, 'border_count': 148, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.1598689013819204}
0:	learn: 0.6738835	total: 32.5ms	remaining: 1m 37s
200:	learn: 0.4361478	total: 6.28s	remaining: 1m 27s
400:	learn: 0.4244302	total: 12.7s	remaining: 1m 22s
600:	learn: 0.4222583	total: 19.4s	remaining: 1m 17s
800:	learn: 0.4212344	total: 26.2s	remaining: 1m 11s
1000:	learn: 0.4204892	total: 32.9s	remaining: 1m 5s
1200:	learn: 0.4198466	total: 39.7s	remaining: 59.4s
1400:	learn: 0.4192226	total: 46.5s	remaining: 53.1s
1600:	learn: 0.4185725	total: 53.4s	remaining: 46.7s
1800:	learn: 0.4179942	total: 1m	remaining: 40.2s
2000:	learn: 0.4174657	total: 1m 7s	remaining: 33.6s
2200:	learn: 0.4169293	total: 1m 14s	remaining: 27s
2400:	learn: 0.4164808	total: 1m 21s	remaining: 20.3s
2600:	learn: 0.4160194	total: 1m 28s	remaining: 13.5

In [14]:
# Get best 10 trials
top_trials = sorted(study.trials, key=lambda t: t.value)[:10]

print("Top 10 trial scores:")
for t in top_trials:
    print(t.number, t.value)


Top 10 trial scores:
11 0.42344609050982107
12 0.42350705026150504
24 0.423557194462875
14 0.4235618266638001
22 0.42357177233762433
19 0.42362547499153785
2 0.4237145044394953
16 0.42381970663562407
21 0.4238532133669176
29 0.4238703580388797


In [15]:
# Base fixed params
base_params = {
    "loss_function": "Logloss",
    "eval_metric": "Logloss",
    "iterations": 3000,
    "random_seed": 42,
    "task_type": "CPU",
    "verbose": False
}

models = []

for trial in top_trials:
    params = trial.params.copy()
    params.update(base_params)
    
    model = CatBoostClassifier(**params)
    model.fit(Pool(X, y, cat_features=categorical_cols))
    models.append(model)

print(f"Trained {len(models)} models from top Optuna trials.")


Trained 10 models from top Optuna trials.


In [18]:
print("Train columns:", list(X.columns))
print("Test_X columns:", list(test_X.columns))


Train columns: ['ID', 'alloy', 'cutTemp', 'rollTemp', 'firstPassRollPressure', 'secondPassRollPressure', 'topEdgeMicroChipping', 'blockSource', 'machineRestart', 'contourDefNdx', 'clearPassNdx', 'Cu', 'Mg', 'Mn', 'Si', 'Zn', 'Cr', 'Fe', 'Ti', 'Ag', 'Zr']
Test_X columns: ['alloy', 'cutTemp', 'rollTemp', 'topEdgeMicroChipping', 'blockSource', 'machineRestart', 'firstPassRollPressure', 'secondPassRollPressure', 'contourDefNdx', 'clearPassNdx', 'Cu', 'Mg', 'Mn', 'Si', 'Zn', 'Cr', 'Fe', 'Ti', 'Ag', 'Zr']


In [20]:
test_X_with_ID = pd.concat([test["ID"], test[features]], axis=1)


In [21]:
test_pool = Pool(test_X_with_ID, cat_features=categorical_cols)


In [22]:
all_preds = []

for model in models:
    p = model.predict_proba(test_pool)[:, 1]
    all_preds.append(p)

ensemble_pred = np.mean(np.column_stack(all_preds), axis=1)


In [23]:
submission = pd.DataFrame({
    "ID": test["ID"],
    "y_passXtremeDurability": ensemble_pred
})

submission.to_csv("submission_top10_optuna_ensemble.csv", index=False)
print("Saved submission_top10_optuna_ensemble.csv")


Saved submission_top10_optuna_ensemble.csv
