In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import optuna

## Load data

In [32]:
df = pd.read_csv("../data/train.csv", index_col="PassengerId")

In [33]:
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## EDA

In [34]:
# checking numeric variables
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [35]:
# checking non-numeric variables, how many distinct values they have
df.describe(include="O")

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [36]:
# check how many missing values are there
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [37]:
TARGET = "Survived"
NUM_FEATURES = df.select_dtypes("number").drop(TARGET, axis=1).columns.tolist()
CAT_FEATURES = ["Sex", "Embarked"]
FEATURES = NUM_FEATURES + CAT_FEATURES

In [38]:
df[CAT_FEATURES] = df[CAT_FEATURES].astype("category")

## Split data

In [39]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=23)

In [41]:
df_train.shape, df_test.shape

((712, 11), (179, 11))

## Hyperparameter tuning

In [49]:
study = optuna.create_study(direction="maximize", storage="sqlite:///optuna.db", pruner=optuna.pruners.HyperbandPruner())

[I 2025-01-11 10:08:25,633] A new study created in RDB with name: no-name-d44c280f-087b-43cf-86d7-0e9caab7ca2f


In [50]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.00001, 0.5, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 40),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 100),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "verbose": -1,
        "random_state": 23,
    }

    aucs = []
    for i, (train_idx, val_idx) in enumerate(KFold(n_splits=5, random_state=23, shuffle=True).split(df_train)):
        train, val = df.iloc[train_idx], df.iloc[val_idx]

        model = LGBMClassifier(**params)
        model.fit(train[FEATURES], train[TARGET])
        y_pred = model.predict_proba(val[FEATURES])[:, 1]

        auc = roc_auc_score(val[TARGET], y_pred)
        trial.report(auc, i)

        if trial.should_prune():
            raise optuna.TrialPruned()
        
        aucs.append(auc)

    return sum(aucs) / len(aucs)

In [52]:
study.optimize(objective, n_trials=100)

[I 2025-01-11 10:09:56,444] Trial 100 pruned. 
[I 2025-01-11 10:09:57,159] Trial 101 pruned. 
[I 2025-01-11 10:09:59,068] Trial 102 pruned. 
[I 2025-01-11 10:09:59,218] Trial 103 pruned. 
[I 2025-01-11 10:09:59,583] Trial 104 finished with value: 0.8496301855166722 and parameters: {'n_estimators': 71, 'learning_rate': 0.12054913338705231, 'max_depth': 14, 'num_leaves': 109, 'min_child_samples': 58, 'subsample': 0.9843643970882175, 'colsample_bytree': 0.7595513239058082}. Best is trial 22 with value: 0.85765229174194.
[I 2025-01-11 10:10:00,469] Trial 105 pruned. 
[I 2025-01-11 10:10:02,206] Trial 106 finished with value: 0.8528369118435941 and parameters: {'n_estimators': 485, 'learning_rate': 0.05258018200207243, 'max_depth': 6, 'num_leaves': 138, 'min_child_samples': 67, 'subsample': 0.8588106975595923, 'colsample_bytree': 0.9832777276565996}. Best is trial 22 with value: 0.85765229174194.
[I 2025-01-11 10:10:02,855] Trial 107 pruned. 
[I 2025-01-11 10:10:03,885] Trial 108 finished w

In [53]:
optuna.visualization.plot_parallel_coordinate(study)

## Train

In [59]:
model = LGBMClassifier(**study.best_params, verbose=-1, random_state=23)

In [60]:
model.fit(df_train[FEATURES], df_train[TARGET])

## Evaluate

In [61]:
df_train["pred"] = model.predict_proba(df_train[FEATURES])[:, 1]
df_test["pred"] = model.predict_proba(df_test[FEATURES])[:, 1]

In [62]:
roc_auc_score(df_train[TARGET], df_train["pred"]), roc_auc_score(df_test[TARGET], df_test["pred"])

(np.float64(0.9717120312966216), np.float64(0.8512228260869565))