In [43]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [44]:
# Feature Engineering
target = 'Transported'

train[["Deck", "CabinNum", "Side"]] = train["Cabin"].str.split("/", expand=True)
train["TotalSpend"] = train[
    ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]
].sum(axis=1)

test[["Deck", "CabinNum", "Side"]] = test["Cabin"].str.split("/", expand=True)
test["TotalSpend"] = test[
    ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]
].sum(axis=1)

X = train.drop(columns=['PassengerId', 'Name', target])
y = train[target]
# 10 folds
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
])

cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor_tree = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
])

## Random Forest

In [45]:
rf_model = Pipeline(steps=[
    ("preprocessor", preprocessor_tree),
    ("classifier", RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        n_jobs=-1,
    ))
])

rf_scores = cross_val_score(
    rf_model, X, y, cv=10, scoring="accuracy", n_jobs=-1
)
print("Random Forest CV:", rf_scores.mean())

Random Forest CV: 0.7908707326428845


## Gradient Boosting

In [46]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = Pipeline(steps=[
    ("preprocessor", preprocessor_tree),
    ("classifier", GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3,
        random_state=42,
    ))
])

gb_scores = cross_val_score(
    gb_model, X, y, cv=5, scoring="accuracy", n_jobs=-1
)
print("Gradient Boosting CV:", gb_scores.mean())

Gradient Boosting CV: 0.7947800112626399


## LightGBM

In [47]:
from lightgbm import LGBMClassifier

lgb_model = Pipeline(steps=[
    ("preprocessor", preprocessor_tree),
    ("classifier", LGBMClassifier(
        n_estimators=300,
        learning_rate=0.01,
        num_leaves=20,
        random_state=42,
    ))
])

lgb_scores = cross_val_score(
    lgb_model, X, y, cv=5, scoring="accuracy", n_jobs=-1
)
print("LightGBM CV:", lgb_scores.mean())



LightGBM CV: 0.799726242414096




In [48]:
print(f"Random Forest: {rf_scores.mean():.4f}")
print(f"Gradient Boosting: {gb_scores.mean():.4f}")
print(f"LightGBM: {lgb_scores.mean():.4f}")

Random Forest: 0.7909
Gradient Boosting: 0.7948
LightGBM: 0.7997


In [49]:
lgb_model.fit(X, y)
X_test = test.drop(columns=['PassengerId', 'Name'])

[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002051 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1664
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495


In [50]:
test_preds = lgb_model.predict(X_test)



In [52]:
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": test_preds
})

submission.to_csv("data/submission_lgbm.csv", index=False)