In [163]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split

In [164]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [165]:
train_y = train[["Transported"]].astype(int)
test_y = test[["PassengerId"]]

In [None]:

train['Group'] = train['PassengerId'].apply(lambda x: x.split('_')[0])
train['GroupSize'] = train.groupby('Group')['PassengerId'].transform('count')
train = train.drop('Group', axis=1)

test['Group'] = test['PassengerId'].apply(lambda x: x.split('_')[0])
test['GroupSize'] = test.groupby('Group')['PassengerId'].transform('count')
test = test.drop('Group', axis=1)

def get_cabin_side(cabin):
    if pd.isna(cabin):  
        return None 
    side = cabin.split('/')[-1]
    if side == 'P':
        return 0
    elif side == 'S':
        return 1
    else:
        return None

train['CabinSide'] = train['Cabin'].apply(get_cabin_side)
test['CabinSide'] = test['Cabin'].apply(get_cabin_side)

def get_cabin_deck(cabin):
    if pd.isna(cabin): 
        return None
    try:
        parts = cabin.split('/')
        return f"{parts[0]}"
    except:
        return None

train['CabinDeck'] = train['Cabin'].apply(get_cabin_deck)
test['CabinDeck'] = test['Cabin'].apply(get_cabin_deck)



In [None]:
train['VIP'] = train['VIP'].fillna(0)
train["VIP"] = train["VIP"].astype(int)

train['CryoSleep'] = train['CryoSleep'].fillna(0)
train["CryoSleep"] = train["CryoSleep"].astype(int)

test['VIP'] = test['VIP'].fillna(0)
test["VIP"] = test["VIP"].astype(int)

test['CryoSleep'] = test['CryoSleep'].fillna(0)
test["CryoSleep"] = test["CryoSleep"].astype(int)

train = train.drop(columns=["Name", "PassengerId", "Transported", "Cabin"])
test = test.drop(columns=["Name", "PassengerId", "Cabin"])


In [168]:
train[["RoomService_log"]] = np.log10(1 + train[["RoomService"]])
train[["FoodCourt_log"]] = np.log10(1 + train[["FoodCourt"]])
train[["ShoppingMall_log"]] = np.log10(1 + train[["ShoppingMall"]])
train[["Spa_log"]] = np.log10(1 + train[["Spa"]])
train[["VRDeck_log"]] = np.log10(1 + train[["VRDeck"]])

test[["RoomService_log"]] = np.log10(1 + test[["RoomService"]])
test[["FoodCourt_log"]] = np.log10(1 + test[["FoodCourt"]])
test[["ShoppingMall_log"]] = np.log10(1 + test[["ShoppingMall"]])
test[["Spa_log"]] = np.log10(1 + test[["Spa"]])
test[["VRDeck_log"]] = np.log10(1 + test[["VRDeck"]])

train = train.drop(["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"], axis=1)
test = test.drop(["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"], axis=1)


In [169]:
num_cols = [cname for cname in train.columns if train[cname].dtype in ["int64", "float64"]]
cat_cols = [cname for cname in train.columns if train[cname].dtype == "object"]

In [170]:
from sklearn.pipeline import Pipeline
num_transformer = Pipeline(steps=[("scaler", StandardScaler())])

cat_transformer = Pipeline(
    steps=[
        (
            "onehot",
            OneHotEncoder(drop='if_binary', handle_unknown="ignore")
        )
    ]
)

In [171]:
ct = ColumnTransformer(
    transformers = [
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols),
    ],
    remainder="passthrough"
)

In [172]:
train = ct.fit_transform(train)
test = ct.fit_transform(test)


In [173]:
print(train.shape, test.shape)

(8693, 27) (4277, 27)


In [None]:
classifiers = {
    "LGBM": LGBMClassifier(
        learning_rate=0.00974924788169623, max_depth=9, n_estimators=474, random_state=0, num_leaves=19
    ),
}

In [175]:

FOLDS = 10
y = train_y.values

In [None]:

X_train_opt, X_test_opt, y_train_opt, y_test_opt = train_test_split(train, train_y, test_size=0.2, random_state=42)

def optimize_lgbm(trial):
    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'random_state': 0
    }

    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(X_train_opt, y_train_opt)

    y_pred = lgb_model.predict(X_test_opt)
    accuracy = accuracy_score(y_test_opt, y_pred)

    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(optimize_lgbm, n_trials=100)

print('Лучшие гиперпараметры:', study.best_params)
print('Лучшая оценка:', study.best_value)

classifiers = {
    "LGBM": lgb.LGBMClassifier(
        learning_rate=study.best_params['learning_rate'],
        max_depth=study.best_params['max_depth'],
        n_estimators=study.best_params['n_estimators'],
        num_leaves=study.best_params['num_leaves'],
        random_state=0
    ),
}

In [177]:
preds = np.zeros(test.shape[0])
preds_train = np.zeros(train.shape[0])

In [178]:
for key, classifier in classifiers.items():
    start = time.time()

    cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=0)

    score = 0
    for fold, (train_idx, val_idx) in enumerate(cv.split(train, y)):
        X_train, X_valid = train[train_idx], train[val_idx]
        y_train, y_valid = y[train_idx], y[val_idx]

        clf = classifier
        clf.fit(X_train, y_train)

        preds += clf.predict_proba(test)[:, 1]
        preds_train += clf.predict_proba(train)[:, 1]
        score += clf.score(X_valid, y_valid)
        print("Average infold validation accuracy: ", key, fold, np.round(100 * clf.score(X_valid, y_valid), 2))
        
    score = score / FOLDS
    stop = time.time()
    print(np.round(100 * score, 2))
    print(np.round(stop - start) / 60, 2)



[LightGBM] [Info] Number of positive: 3940, number of negative: 3883
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000431 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1406
[LightGBM] [Info] Number of data points in the train set: 7823, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503643 -> initscore=0.014573
[LightGBM] [Info] Start training from score 0.014573


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 0 80.23
[LightGBM] [Info] Number of positive: 3940, number of negative: 3883
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1407
[LightGBM] [Info] Number of data points in the train set: 7823, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503643 -> initscore=0.014573
[LightGBM] [Info] Start training from score 0.014573


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 1 80.34
[LightGBM] [Info] Number of positive: 3940, number of negative: 3883
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1406
[LightGBM] [Info] Number of data points in the train set: 7823, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503643 -> initscore=0.014573
[LightGBM] [Info] Start training from score 0.014573


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 2 81.03
[LightGBM] [Info] Number of positive: 3941, number of negative: 3883
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1407
[LightGBM] [Info] Number of data points in the train set: 7824, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503707 -> initscore=0.014826
[LightGBM] [Info] Start training from score 0.014826


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 3 79.29
[LightGBM] [Info] Number of positive: 3941, number of negative: 3883
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1406
[LightGBM] [Info] Number of data points in the train set: 7824, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503707 -> initscore=0.014826
[LightGBM] [Info] Start training from score 0.014826


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 4 80.67
[LightGBM] [Info] Number of positive: 3940, number of negative: 3884
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1406
[LightGBM] [Info] Number of data points in the train set: 7824, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503579 -> initscore=0.014315
[LightGBM] [Info] Start training from score 0.014315


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 5 80.78
[LightGBM] [Info] Number of positive: 3940, number of negative: 3884
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1407
[LightGBM] [Info] Number of data points in the train set: 7824, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503579 -> initscore=0.014315
[LightGBM] [Info] Start training from score 0.014315


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 6 80.09
[LightGBM] [Info] Number of positive: 3940, number of negative: 3884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000329 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1407
[LightGBM] [Info] Number of data points in the train set: 7824, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503579 -> initscore=0.014315
[LightGBM] [Info] Start training from score 0.014315


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 7 79.63
[LightGBM] [Info] Number of positive: 3940, number of negative: 3884
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1407
[LightGBM] [Info] Number of data points in the train set: 7824, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503579 -> initscore=0.014315
[LightGBM] [Info] Start training from score 0.014315


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Average infold validation accuracy:  LGBM 8 82.05
[LightGBM] [Info] Number of positive: 3940, number of negative: 3884
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1407
[LightGBM] [Info] Number of data points in the train set: 7824, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503579 -> initscore=0.014315
[LightGBM] [Info] Start training from score 0.014315
Average infold validation accuracy:  LGBM 9 80.44
80.46
0.08333333333333333 2




In [179]:
preds = preds / (FOLDS * len(classifiers))
preds_train = preds_train / (FOLDS * len(classifiers))

In [180]:
output = pd.DataFrame(
    {
        "PassengerId": test_y["PassengerId"],
        "Transported": (preds > 0.5)
    }
)
output.to_csv("submission.csv", index=False)