In [None]:
import time

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
def _group_size(passenger_ids):
    """Return, for each passenger, how many passengers share their group prefix.

    The group key is the part of the identifier before the first underscore.

    Parameters
    ----------
    passenger_ids : pd.Series
        String identifiers whose text before the first ``_`` is the group key.

    Returns
    -------
    pd.Series
        Per-row count of identifiers in the same group, aligned with the
        index of ``passenger_ids``.
    """
    group_key = passenger_ids.str.split('_').str[0]
    return passenger_ids.groupby(group_key).transform('count')


# Keep the label (cast to 0/1) and the test identifiers before the feature
# frames are modified further down.
train_y = train[["Transported"]].astype(int)
test_y = test[["PassengerId"]]

# Group size is computed independently within each split.
train['GroupSize'] = _group_size(train['PassengerId'])
test['GroupSize'] = _group_size(test['PassengerId'])

def get_cabin_side(cabin):
    """Encode the last '/'-separated part of a cabin string as a number.

    Parameters
    ----------
    cabin : str or missing value
        Cabin string whose final '/'-separated component is the side letter.

    Returns
    -------
    int or None
        0 for side 'P', 1 for side 'S', and None when ``cabin`` is missing
        or the side letter is anything else.
    """
    if pd.isna(cabin):
        return None
    side_codes = {'P': 0, 'S': 1}
    # dict.get yields None for any unrecognised side letter.
    return side_codes.get(cabin.split('/')[-1])

# Add the numeric cabin-side column (see get_cabin_side) to both splits.
for split_df in (train, test):
    split_df['CabinSide'] = split_df['Cabin'].apply(get_cabin_side)

def get_cabin_deck(cabin):
    """Return the deck component (text before the first '/') of a cabin string.

    Parameters
    ----------
    cabin : str or missing value
        Cabin string whose first '/'-separated component is the deck.

    Returns
    -------
    str or None
        The deck component, or None when ``cabin`` is missing or is not a
        value that can be split on '/'.
    """
    if pd.isna(cabin):
        return None
    try:
        return cabin.split('/')[0]
    except (AttributeError, TypeError):
        # Non-string input is treated like a missing cabin. Only these two
        # errors are caught so that unrelated exceptions (for example
        # KeyboardInterrupt) are no longer silently swallowed.
        return None

# Add the deck component of the cabin string (see get_cabin_deck) to both splits.
for split_df in (train, test):
    split_df['CabinDeck'] = split_df['Cabin'].apply(get_cabin_deck)

# Missing VIP / CryoSleep values are filled with 0, then both columns are
# cast to int.
for split_df in (train, test):
    for flag_col in ("VIP", "CryoSleep"):
        split_df[flag_col] = split_df[flag_col].fillna(0).astype(int)

# Remove the columns that are not passed on to the preprocessing step.
# "Transported" exists only in the training frame; it was copied into
# train_y earlier.
train = train.drop(columns=["Name", "PassengerId", "Transported", "Cabin"])
test = test.drop(columns=["Name", "PassengerId", "Cabin"])

In [6]:
# Column groups are chosen by dtype: int64/float64 columns are scaled,
# object columns are one-hot encoded.
num_cols = [cname for cname in train.columns if train[cname].dtype in ["int64", "float64"]]
cat_cols = [cname for cname in train.columns if train[cname].dtype == "object"]

num_transformer = Pipeline(steps=[("scaler", StandardScaler())])

cat_transformer = Pipeline(
    steps=[
        (
            "onehot",
            OneHotEncoder(drop='if_binary', handle_unknown="ignore")
        )
    ]
)

ct = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols),
    ],
    remainder="passthrough"
)

# Fit the scaler and encoder on the training data only, then apply that same
# fitted transformer to the test data. Re-fitting on the test set would scale
# it with different statistics and could yield a different one-hot column
# layout than the one the model is trained on.
train = ct.fit_transform(train)
test = ct.transform(test)

print(train.shape, test.shape)

(8693, 27) (4277, 27)


In [46]:
# Models to cross-validate, keyed by the label used in the log output.
classifiers = {
    "Forest": RandomForestClassifier(
        n_estimators=400,
        max_depth=13,
        min_samples_split=10,
        min_samples_leaf=3,
        random_state=1,
    ),
}

FOLDS = 10

# train_y is a single-column frame, so .values has shape (n_samples, 1).
# scikit-learn classifiers expect a 1-D target and warn on every fit when
# given a column vector, so flatten it once here.
y = train_y.values.ravel()

# Accumulators for predicted probabilities; the CV loop adds to them and they
# are averaged afterwards.
preds = np.zeros(test.shape[0])
preds_train = np.zeros(train.shape[0])

In [47]:
# Stratified K-fold cross-validation for every configured classifier.
# Each fold's model adds its positive-class probabilities to `preds` (test
# set) and `preds_train` (full training set).
for key, classifier in classifiers.items():
    start = time.time()

    cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=0)

    score = 0
    for fold, (train_idx, val_idx) in enumerate(cv.split(train, y)):
        X_train, X_valid = train[train_idx], train[val_idx]
        y_train, y_valid = y[train_idx], y[val_idx]

        # The same estimator object is refit on every fold.
        clf = classifier
        clf.fit(X_train, y_train)

        preds += clf.predict_proba(test)[:, 1]
        # NOTE(review): this predicts on the whole training set, including the
        # rows the fold model was just fitted on, so preds_train is not an
        # out-of-fold estimate - confirm this is intended before using it.
        preds_train += clf.predict_proba(train)[:, 1]

        # Score the held-out fold once and reuse the value for both the
        # running total and the log line.
        fold_score = clf.score(X_valid, y_valid)
        score += fold_score
        print("Average infold validation accuracy: ", key, fold, np.round(100 * fold_score, 2))

    score = score / FOLDS
    stop = time.time()
    # Mean validation accuracy across folds, in percent.
    print(np.round(100 * score, 2))
    # Elapsed wall-clock time in minutes, rounded to two decimals.
    print(np.round((stop - start) / 60, 2))

  return fit_method(estimator, *args, **kwargs)


Average infold validation accuracy:  Forest 0 81.26


  return fit_method(estimator, *args, **kwargs)


Average infold validation accuracy:  Forest 1 80.57


  return fit_method(estimator, *args, **kwargs)


Average infold validation accuracy:  Forest 2 81.15


  return fit_method(estimator, *args, **kwargs)


Average infold validation accuracy:  Forest 3 80.44


  return fit_method(estimator, *args, **kwargs)


Average infold validation accuracy:  Forest 4 82.05


  return fit_method(estimator, *args, **kwargs)


Average infold validation accuracy:  Forest 5 80.09


  return fit_method(estimator, *args, **kwargs)


Average infold validation accuracy:  Forest 6 79.4


  return fit_method(estimator, *args, **kwargs)


Average infold validation accuracy:  Forest 7 78.94


  return fit_method(estimator, *args, **kwargs)


Average infold validation accuracy:  Forest 8 81.36


  return fit_method(estimator, *args, **kwargs)


Average infold validation accuracy:  Forest 9 79.75
80.5
0.38333333333333336 2


In [48]:
# Average the accumulated probabilities over every (classifier, fold) fit.
n_fits = FOLDS * len(classifiers)
preds = preds / n_fits
preds_train = preds_train / n_fits

# Label a passenger as transported when the averaged probability exceeds 0.5,
# then write the result next to the saved test identifiers.
output = test_y[["PassengerId"]].assign(Transported=preds > 0.5)
output.to_csv("submission.csv", index=False)