In [469]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [521]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [522]:
train_y = train[["Transported"]].astype(int)
test_y = test[["PassengerId"]]

In [None]:
# train['Group'] = train['PassengerId'].apply(lambda x: x.split('_')[0])

# def extract_last_name(name):
#     if isinstance(name, str):
#         return name.split(' ')[-1].strip()
#     else:
#         return None

# train['LastName'] = train['Name'].apply(extract_last_name)

# def get_family_size(row):
#     group = row['Group']
#     lastName = row['LastName']
#     family_size = len(train[(train['Group'] == group) & (train['LastName'] == lastName)])
#     return family_size

# train['FamilySize'] = train.apply(get_family_size, axis=1)
# train['Alone'] = train['FamilySize'].apply(lambda x: 1 if x == 1 else 0)
# train = train.drop(['Group', 'LastName'], axis=1)

In [None]:
# test['Group'] = test['PassengerId'].apply(lambda x: x.split('_')[0])
# test['LastName'] = test['Name'].apply(extract_last_name)

# def get_family_size_test(row):
#     group = row['Group']
#     lastName = row['LastName']
#     family_size = len(test[(test['Group'] == group) & (test['LastName'] == lastName)])
#     return family_size

# test['FamilySize'] = test.apply(get_family_size_test, axis=1)
# test['Alone'] = test['FamilySize'].apply(lambda x: 1 if x == 1 else 0)
# test = test.drop(['Group', 'LastName'], axis=1)

In [None]:

train['Group'] = train['PassengerId'].apply(lambda x: x.split('_')[0])
train['GroupSize'] = train.groupby('Group')['PassengerId'].transform('count')
train = train.drop('Group', axis=1)

test['Group'] = test['PassengerId'].apply(lambda x: x.split('_')[0])
test['GroupSize'] = test.groupby('Group')['PassengerId'].transform('count')
test = test.drop('Group', axis=1)

def get_cabin_side(cabin):
    if pd.isna(cabin):  
        return None  
    side = cabin.split('/')[-1]
    if side == 'P':
        return 0
    elif side == 'S':
        return 1
    else:
        return None

train['CabinSide'] = train['Cabin'].apply(get_cabin_side)
test['CabinSide'] = test['Cabin'].apply(get_cabin_side)

def get_cabin_deck(cabin):
    if pd.isna(cabin): 
        return None 
    try:
        # deck = cabin.split('/')[0]
        # return deck
        parts = cabin.split('/')
        return f"{parts[0]}/{parts[1]}"
    except:
        return None

train['CabinDeck'] = train['Cabin'].apply(get_cabin_deck)
test['CabinDeck'] = test['Cabin'].apply(get_cabin_deck)



In [526]:
train['VIP'] = train['VIP'].fillna(0)
train["VIP"] = train["VIP"].astype(int)

train['CryoSleep'] = train['CryoSleep'].fillna(0)
train["CryoSleep"] = train["CryoSleep"].astype(int)

test['VIP'] = test['VIP'].fillna(0)
test["VIP"] = test["VIP"].astype(int)

test['CryoSleep'] = test['CryoSleep'].fillna(0)
test["CryoSleep"] = test["CryoSleep"].astype(int)

# train['RoomService'] = train['RoomService'].fillna(0.0)
# train['FoodCourt'] = train['FoodCourt'].fillna(0.0)
# train['ShoppingMall'] = train['ShoppingMall'].fillna(0.0)
# train['Spa'] = train['Spa'].fillna(0.0)
# train['VRDeck'] = train['VRDeck'].fillna(0.0)

train = train.drop(columns=["Name"])
train = train.drop(columns=["PassengerId"])
train = train.drop(columns=["Transported"])

test = test.drop(columns=["Name"])
test = test.drop(columns=["PassengerId"])

train = train.drop(columns=["Cabin"])
test = test.drop(columns=["Cabin"])



In [527]:
train[["RoomService_log"]] = np.log10(1 + train[["RoomService"]])
train[["FoodCourt_log"]] = np.log10(1 + train[["FoodCourt"]])
train[["ShoppingMall_log"]] = np.log10(1 + train[["ShoppingMall"]])
train[["Spa_log"]] = np.log10(1 + train[["Spa"]])
train[["VRDeck_log"]] = np.log10(1 + train[["VRDeck"]])

test[["RoomService_log"]] = np.log10(1 + test[["RoomService"]])
test[["FoodCourt_log"]] = np.log10(1 + test[["FoodCourt"]])
test[["ShoppingMall_log"]] = np.log10(1 + test[["ShoppingMall"]])
test[["Spa_log"]] = np.log10(1 + test[["Spa"]])
test[["VRDeck_log"]] = np.log10(1 + test[["VRDeck"]])

train = train.drop(columns=["RoomService"])
train = train.drop(columns=["FoodCourt"])
train = train.drop(columns=["ShoppingMall"])
train = train.drop(columns=["Spa"])
train = train.drop(columns=["VRDeck"])

test = test.drop(columns=["RoomService"])
test = test.drop(columns=["FoodCourt"])
test = test.drop(columns=["ShoppingMall"])
test = test.drop(columns=["Spa"])
test = test.drop(columns=["VRDeck"])




In [528]:
num_cols = [cname for cname in train.columns if train[cname].dtype in ["int64", "float64"]]
cat_cols = [cname for cname in train.columns if train[cname].dtype == "object"]

In [529]:
from sklearn.pipeline import Pipeline
num_transformer = Pipeline(steps=[("scaler", StandardScaler())])

cat_transformer = Pipeline(
    steps=[
        (
            "onehot",
            OneHotEncoder(drop='if_binary', handle_unknown="ignore")
        )
    ]
)

In [530]:
ct = ColumnTransformer(
    transformers = [
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols),
    ],
    remainder="passthrough"
)

In [531]:
train = ct.fit_transform(train)
test = ct.fit_transform(test)

# combined_transformed = ct.fit_transform(combined)

# train = combined_transformed[:train_len]
# test = combined_transformed[train_len:]


In [532]:
print(train.shape, test.shape)

(8693, 29) (4277, 29)


In [533]:
classifiers = {
    "LGBM": LGBMClassifier(
        learning_rate=0.01, max_depth=8, n_estimators=500, random_state=0, num_leaves=31
    ),
}

In [534]:

FOLDS = 10
y = train_y.values

In [535]:
preds = np.zeros(test.shape[0])
preds_train = np.zeros(train.shape[0])

In [536]:
for key, classifier in classifiers.items():
    start = time.time()

    cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=0)

    score = 0
    for fold, (train_idx, val_idx) in enumerate(cv.split(train, y)):
        X_train, X_valid = train[train_idx], train[val_idx]
        y_train, y_valid = y[train_idx], y[val_idx]

        clf = classifier
        clf.fit(X_train, y_train)

        preds += clf.predict_proba(test)[:, 1]
        preds_train += clf.predict_proba(train)[:, 1]
        score += clf.score(X_valid, y_valid)
        print("Average infold validation accuracy: ", key, fold, np.round(100 * clf.score(X_valid, y_valid), 2))
        
    score = score / FOLDS
    stop = time.time()
    print(np.round(100 * score, 2))
    print(np.round(stop - start) / 60, 2)



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 0 80.46


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 1 80.0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 2 81.38


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 3 79.06


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 4 81.47


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 5 80.78


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 6 80.9


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 7 78.94


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 8 81.01
Average infold validation accuracy:  LGBM 9 79.75
80.37
0.1 2




In [537]:
preds = preds / (FOLDS * len(classifiers))
preds_train = preds_train / (FOLDS * len(classifiers))

In [538]:
output = pd.DataFrame(
    {
        "PassengerId": test_y["PassengerId"],
        "Transported": (preds > 0.5)
    }
)
output.to_csv("submission.csv", index=False)