## Loading Required Libraries/Packages

In [1]:
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

## Preprocessing

In [7]:
# Read the labelled training set and the unlabelled test set from the working directory
train_df = pd.read_csv("aluminum_coldRoll_train.csv")
test_df = pd.read_csv("aluminum_coldRoll_testNoY.csv")

# Separate the target and the "ID" column from the feature columns.
# The test IDs are kept aside so they can be attached to the predictions later.
y = train_df["y_passXtremeDurability"]
X = train_df.drop(["y_passXtremeDurability", "ID"], axis = 1)
test_ids = test_df["ID"]
X_test = test_df.drop("ID", axis = 1)

# Cast every object-dtype feature to pandas' category dtype so LightGBM
# (one of the two boosting models used) can treat it as categorical.
# The column list is derived from the training features and applied to both frames.
cat_cols = list(X.select_dtypes(include = "object").columns)
category_dtypes = dict.fromkeys(cat_cols, "category")
X = X.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)

## LightGBM Preprocessing

In [8]:
# Hold out a stratified 20% of the training rows for validation (fixed random_state for repeatability)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 35, stratify = y)

# Creating our LightGBM train and validation datasets.
# The validation Dataset names the training Dataset as its reference, as the LightGBM
# documentation asks for validation data.
lgb_train = lgb.Dataset(X_train, label = y_train, categorical_feature = cat_cols)
lgb_val = lgb.Dataset(X_val, label = y_val, categorical_feature = cat_cols, reference = lgb_train)

### Defining LightGBM Hyperparameters

In [9]:
# LightGBM settings: binary objective scored by log loss, small trees (5 leaves),
# 80% feature/row subsampling with bagging applied every iteration, and L2 regularisation.
params_lgb = dict(
    objective = "binary",
    metric = "binary_logloss",
    learning_rate = 0.05,
    num_leaves = 5,
    feature_fraction = 0.8,
    bagging_fraction = 0.8,
    bagging_freq = 1,
    min_data_in_leaf = 125,
    lambda_l2 = 1.2,
    seed = 42,
    verbose = -1,
)

### Training Our Model

In [17]:
# Train on the training split while monitoring log loss on the validation split.
# Training runs for at most 5000 rounds and stops once the validation score has not
# improved for 100 rounds; progress is logged every 100 rounds.
# `valid_sets` and `valid_names` are passed as lists, which is the form lgb.train documents.
lgb_model = lgb.train(
    params_lgb,
    lgb_train,
    num_boost_round = 5000,
    valid_sets = [lgb_val],
    valid_names = ["val"],
    callbacks = [
        lgb.early_stopping(stopping_rounds = 100),
        lgb.log_evaluation(period = 100)
    ]
)

# Score the held-out rows using only the trees up to the best iteration
val_predictions = lgb_model.predict(X_val, num_iteration = lgb_model.best_iteration)
print("LightGBM Validation Set Logloss:", log_loss(y_val, val_predictions))

Training until validation scores don't improve for 100 rounds
[100]	val's binary_logloss: 0.484646
[200]	val's binary_logloss: 0.451004
[300]	val's binary_logloss: 0.438361
[400]	val's binary_logloss: 0.432899
[500]	val's binary_logloss: 0.430429
[600]	val's binary_logloss: 0.429258
[700]	val's binary_logloss: 0.4287
[800]	val's binary_logloss: 0.428536
[900]	val's binary_logloss: 0.428356
[1000]	val's binary_logloss: 0.428282
[1100]	val's binary_logloss: 0.428232
Early stopping, best iteration is:
[1082]	val's binary_logloss: 0.428227
LightGBM Validation Set Logloss: 0.4282273429164156


### Fitting Our LightGBM Model

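Now that we have obtained a validation-set logloss for one particular seed (42), let's refit the model on the full training set with several different seeds, using the best iteration count found above, and average the resulting test predictions in an attempt to reduce variance.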

In [20]:
# Number of boosting rounds chosen by early stopping in the validation run
best_iteration = lgb_model.best_iteration

seeds = [0, 1, 2, 3, 4, 5, 6, 7]
lgb_test_predictions = []

for s in seeds:
    # Same hyperparameters as the validation run; only the seed differs
    params_lgb_copy = {**params_lgb, "seed": s}

    print(f"Training LGB Model with seed {s}")
    # Refit on every training row (no hold-out) for the fixed number of rounds
    full_train_set = lgb.Dataset(X, label = y, categorical_feature = cat_cols)
    lgb_full_model = lgb.train(
        params_lgb_copy,
        full_train_set,
        num_boost_round = best_iteration,
    )

    test_prediction = lgb_full_model.predict(X_test, num_iteration = best_iteration)
    lgb_test_predictions.append(test_prediction)

# Element-wise average of the per-seed prediction vectors
lgb_seed_ensemble_predictions = np.asarray(lgb_test_predictions).mean(axis = 0)

Training LGB Model with seed 0
Training LGB Model with seed 1
Training LGB Model with seed 2
Training LGB Model with seed 3
Training LGB Model with seed 4
Training LGB Model with seed 5
Training LGB Model with seed 6
Training LGB Model with seed 7


## Defining Hyperparameters for CatBoost

In [21]:
# CatBoost settings: depth-4 trees, 1150 iterations, log loss as both the training
# loss and the reported metric, with progress printed every 100 iterations.
params_cat = dict(
    depth = 4,
    learning_rate = 0.05,
    iterations = 1150,
    l2_leaf_reg = 4,
    random_strength = 5,
    loss_function = "Logloss",
    eval_metric = "Logloss",
    random_seed = 42,
    verbose = 100,
)

### Fitting Our CatBoost Model

In [26]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7]

# The pools depend only on the data, not on the seed, so they are built once
# here rather than on every pass through the loop.
train_pool = Pool(X, y, cat_features = cat_cols)
test_pool = Pool(X_test, cat_features = cat_cols)

cat_test_preds = []

for s in seeds:
    print(f"Training CatBoost with seed {s}")
    # Same hyperparameters for every run; only the random seed differs
    cat_params_copy = params_cat.copy()
    cat_params_copy["random_seed"] = s

    # Fit on the full training set (no hold-out)
    cat_model = CatBoostClassifier(**cat_params_copy)
    cat_model.fit(train_pool)

    # Keep column 1 of predict_proba (presumably the positive-class probability; confirm class order)
    cat_probabilities = cat_model.predict_proba(test_pool)[:, 1]
    cat_test_preds.append(cat_probabilities)

Training CatBoost with seed 0
0:	learn: 0.6745550	total: 398ms	remaining: 7m 37s
100:	learn: 0.4596262	total: 18.4s	remaining: 3m 11s
200:	learn: 0.4402235	total: 34.9s	remaining: 2m 44s
300:	learn: 0.4297828	total: 52.7s	remaining: 2m 28s
400:	learn: 0.4255617	total: 1m 11s	remaining: 2m 13s
500:	learn: 0.4237595	total: 1m 29s	remaining: 1m 55s
600:	learn: 0.4229638	total: 1m 46s	remaining: 1m 36s
700:	learn: 0.4223474	total: 2m 4s	remaining: 1m 19s
800:	learn: 0.4218932	total: 2m 26s	remaining: 1m 3s
900:	learn: 0.4215605	total: 2m 47s	remaining: 46.4s
1000:	learn: 0.4212723	total: 3m 9s	remaining: 28.1s
1100:	learn: 0.4209971	total: 3m 29s	remaining: 9.34s
1149:	learn: 0.4208827	total: 3m 39s	remaining: 0us
Training CatBoost with seed 1
0:	learn: 0.6764131	total: 250ms	remaining: 4m 47s
100:	learn: 0.4590692	total: 18.3s	remaining: 3m 10s
200:	learn: 0.4400427	total: 36.7s	remaining: 2m 53s
300:	learn: 0.4295037	total: 55.5s	remaining: 2m 36s
400:	learn: 0.4254775	total: 1m 15s	rema

In [27]:
# Element-wise average of the per-seed CatBoost probability vectors
cat_seed_ensemble_predictions = np.asarray(cat_test_preds).mean(axis = 0)

### Blending Our Models to Generate Final Predictions

In [28]:
# Weight given to the CatBoost ensemble in the blend; the LightGBM ensemble gets the remainder
CATBOOST_BLEND_WEIGHT = 0.8

# Fixed-weight average of the two seed-averaged prediction vectors
final_probs = (
    CATBOOST_BLEND_WEIGHT * cat_seed_ensemble_predictions
    + (1 - CATBOOST_BLEND_WEIGHT) * lgb_seed_ensemble_predictions
)

In [29]:
# Pair each test ID with its blended probability, in submission column order
submission_columns = {
    "ID": test_ids,
    "y_passXtremeDurability": final_probs,
}
submission = pd.DataFrame(submission_columns)

In [30]:
# Preview the first five rows to check the submission layout
submission.head(n = 5)

Unnamed: 0,ID,y_passXtremeDurability
0,160001,0.003709
1,160002,0.051257
2,160003,0.937125
3,160004,0.715209
4,160005,0.808375


Example submission: `submission.to_csv("Team20_CatBoost+LightGBMAlpha080+SeedBoth_Dec1#3.csv", index=False)`