## Installation

In [None]:
%%writefile requirements.txt
pandas
xgboost
lightgbm
catboost
scikit-learn
optuna
matplotlib
numpy

In [None]:
%pip install -U --user -r requirements.txt

## Import libraries

In [None]:
import pandas as pd
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import Pool
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, log_loss, roc_curve
import optuna
import matplotlib.pyplot as plt
import numpy as np
import warnings

## Settings

In [None]:
# Silence every warning category for the rest of the session so that library
# notices do not clutter cell output.
warnings.simplefilter(action='ignore')

## Fetch Dataset

### Check if kaggle command works

In [None]:
# Sanity check: printing the CLI help shows whether the `kaggle` command can be run from this kernel.
! kaggle -h

### Download dataset

In [None]:
# Download the "spaceship-titanic" competition files; the unzip cells below expect
# the result as spaceship-titanic.zip in the current working directory.
! kaggle competitions download -c spaceship-titanic

### Unzip the dataset on Windows

In [None]:
# Windows only: expand the archive into ./data through PowerShell.
# Skip this cell on other platforms.
! call powershell -command "Expand-Archive spaceship-titanic.zip data"

### Unzip the dataset on macOS

In [None]:
# Non-Windows shells: extract the archive into ./data with `unzip`.
# Skip this cell if the PowerShell cell was used instead.
! unzip spaceship-titanic.zip -d data

## Read data from csv

In [None]:
# Load the train and test tables from the extracted ./data directory.
df_train, df_test = (pd.read_csv(path) for path in ("data/train.csv", "data/test.csv"))

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

## Fill in missing values

In [None]:
# Number of missing values in each training column.
df_train.isna().sum()

In [None]:
# Impute missing values in the training table, one column family at a time.

# Age: mean of the observed training ages.
df_train["Age"] = df_train["Age"].fillna(df_train["Age"].mean())

# Text categories: copy the value from the previous row. `fillna(method=...)` is
# deprecated in pandas 2.x, so `ffill()` is called directly. The trailing `bfill()`
# covers a gap in the very first row(s), which a forward fill alone leaves as NaN
# (and which would later break the string-based Cabin encoding).
for col in ["Cabin", "Destination", "HomePlanet"]:
    df_train[col] = df_train[col].ffill().bfill()

# Spending columns: median of the observed training values.
for col in ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]:
    df_train[col] = df_train[col].fillna(df_train[col].median())

# Boolean flags: a missing flag is treated as False. The explicit cast makes the
# resulting dtype boolean regardless of whether pandas downcasts on fillna.
for col in ["CryoSleep", "VIP"]:
    df_train[col] = df_train[col].fillna(False).astype(bool)

In [None]:
# Impute missing values in the test table. Numeric fill values are statistics of
# the TRAINING table, so both tables are imputed with the same constants.

# Age: mean of the training ages.
df_test["Age"] = df_test["Age"].fillna(df_train["Age"].mean())

# Text categories: copy the value from the previous test row. `fillna(method=...)`
# is deprecated in pandas 2.x, so `ffill()` is called directly. The trailing
# `bfill()` covers a gap in the very first row(s), which a forward fill alone
# leaves as NaN (and which would later break the string-based Cabin encoding).
for col in ["Cabin", "Destination", "HomePlanet"]:
    df_test[col] = df_test[col].ffill().bfill()

# Spending columns: median of the corresponding training column.
for col in ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]:
    df_test[col] = df_test[col].fillna(df_train[col].median())

# Boolean flags: a missing flag is treated as False. The explicit cast makes the
# resulting dtype boolean regardless of whether pandas downcasts on fillna.
for col in ["CryoSleep", "VIP"]:
    df_test[col] = df_test[col].fillna(False).astype(bool)

## Encode categorical variables

### HomePlanet

In [None]:
# Print the category counts before and after integer-encoding HomePlanet.
print(df_train["HomePlanet"].value_counts())
print("=====================================")
hp_le = LabelEncoder()
# Learn the label -> integer mapping from the training column only.
df_train['HomePlanet'] = hp_le.fit_transform(df_train['HomePlanet'])
# Apply that same mapping to the test column. Refitting on the test column would
# derive the codes from the test labels alone, so a label could receive a
# different integer than it has in the training data.
df_test['HomePlanet'] = hp_le.transform(df_test['HomePlanet'])
print(df_train["HomePlanet"].value_counts())

### Destination

In [None]:
# Print the category counts before and after integer-encoding Destination.
print(df_train["Destination"].value_counts())
print("=====================================")
de_le = LabelEncoder()
# Learn the label -> integer mapping from the training column only.
df_train['Destination'] = de_le.fit_transform(df_train['Destination'])
# Apply that same mapping to the test column. Refitting on the test column would
# derive the codes from the test labels alone, so a label could receive a
# different integer than it has in the training data.
df_test['Destination'] = de_le.transform(df_test['Destination'])
print(df_train["Destination"].value_counts())

### Cabin

In [None]:
# Lookup orders for the two non-numeric parts of a cabin string; a value's
# position in its list is the integer code used for it.
decks = ["A", "B", "C", "D", "E", "F", "G", "T"]
sides = ["P", "S"]


def EncodeCabin(value):
    """Turn a "deck/num/side" cabin string into ``[deck_code, num, side_code]``.

    ``deck_code`` and ``side_code`` are the positions of the deck and side in
    ``decks`` and ``sides``; ``num`` is the middle part parsed as an integer.
    Raises ValueError when the string does not have exactly three "/"-separated
    parts, when the deck or side is not listed, or when ``num`` is not an integer.
    """
    deck_letter, cabin_number, ship_side = value.split("/")
    deck_code = decks.index(deck_letter)
    number = int(cabin_number)
    side_code = sides.index(ship_side)
    return [deck_code, number, side_code]
# Replace every cabin string with its encoded [deck, num, side] list, in both tables.
for frame in (df_train, df_test):
    frame["Cabin"] = frame["Cabin"].map(EncodeCabin)

### Id

In [None]:
# Count, over train and test together, how many passenger ids share each group
# prefix (the part of PassengerId before the first underscore).
group_count_dict = {}
for frame in (df_train, df_test):
    for p_id in frame["PassengerId"]:
        group = str(p_id).split("_")[0]
        group_count_dict[group] = group_count_dict.get(group, 0) + 1


def EncodeId(value):
    """Return how many passenger ids share the group prefix of ``value``."""
    prefix, _, _ = value.partition("_")
    return group_count_dict[prefix]


# Replace each PassengerId with the size of its group, in both tables.
for frame in (df_train, df_test):
    frame["PassengerId"] = frame["PassengerId"].map(EncodeId)

## Create feature

In [None]:
# Total spend per passenger: the five spending columns added left to right.
spend_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
for frame in (df_train, df_test):
    running_total = frame[spend_columns[0]]
    for spend_column in spend_columns[1:]:
        running_total = running_total + frame[spend_column]
    frame["Amount_of_money"] = running_total

## Learning

### Prepare data

In [None]:
# Build the model inputs by column NAME, so the result does not depend on the
# position or the number of columns in df_train. VIP and Name are not used as
# features; Transported is the label.
# Every "Cabin" cell holds a [deck, num, side] list (produced by the Cabin
# encoding step), which is spliced into the row as three separate features.
train_lead_rows = df_train[["PassengerId", "HomePlanet", "CryoSleep"]].to_numpy().tolist()
train_tail_rows = df_train[
    ["Destination", "Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Amount_of_money"]
].to_numpy().tolist()
dataX = [
    lead + list(cabin) + tail
    for lead, cabin, tail in zip(train_lead_rows, df_train["Cabin"], train_tail_rows)
]

# One 0/1 label per row, kept as a flat list: the estimators take a 1-D target,
# and a list of single-element lists is only accepted after a shape conversion
# (with a warning that the global warnings filter would hide).
dataY = [int(trans) for trans in df_train["Transported"]]

### GBT

In [None]:
# Manual cross-validation of GradientBoostingClassifier over contiguous,
# unshuffled blocks of `test_data_num` rows.
num_block = 4
test_data_num = len(dataX) // num_block

accs = []
for i in range(len(dataX) // test_data_num):
    # Split: rows [start, stop) are held out, all remaining rows are used for training
    start = test_data_num * i
    stop = start + test_data_num
    validX, validY = dataX[start:stop], dataY[start:stop]
    trainX = dataX[:start] + dataX[stop:]
    trainY = dataY[:start] + dataY[stop:]

    # Train
    gbt_model = GradientBoostingClassifier(
        n_estimators=1000,
        learning_rate=0.00457544,
        max_depth=19,
        random_state=0,
    )
    gbt_model.fit(trainX, trainY)

    # Predict on the held-out block and record its accuracy
    predY = gbt_model.predict(validX)
    accs.append(accuracy_score(validY, predY))

print("acc : ", np.average(accs))

### XGBoost

In [None]:
# Manual cross-validation of XGBClassifier over contiguous, unshuffled blocks of
# `test_data_num` rows.
num_block = 4
test_data_num = len(dataX) // num_block

accs = []
for i in range(len(dataX) // test_data_num):
    # Split: rows [start, stop) are held out, all remaining rows are used for training
    start = test_data_num * i
    stop = start + test_data_num
    validX, validY = dataX[start:stop], dataY[start:stop]
    trainX = dataX[:start] + dataX[stop:]
    trainY = dataY[:start] + dataY[stop:]

    # Train
    xgb_model = XGBClassifier(eta=0.1)
    xgb_model.fit(trainX, trainY)

    # Predict on the held-out block and record its accuracy
    predY = xgb_model.predict(validX)
    accs.append(accuracy_score(validY, predY))

print("acc : ", np.average(accs))

### LightGBM

In [None]:
# Manual cross-validation of LGBMClassifier over contiguous, unshuffled blocks of
# `test_data_num` rows.
num_block = 4
test_data_num = len(dataX) // num_block

accs = []
for i in range(len(dataX) // test_data_num):
    # Split: rows [start, stop) are held out, all remaining rows are used for training
    start = test_data_num * i
    stop = start + test_data_num
    validX, validY = dataX[start:stop], dataY[start:stop]
    trainX = dataX[:start] + dataX[stop:]
    trainY = dataY[:start] + dataY[stop:]

    # Train
    lgb_model = lgb.LGBMClassifier(
        metric="logloss",
        n_estimators=1200,
        learning_rate=0.00457544,
        max_depth=19,
        num_leaves=26,
        min_child_samples=5,
    )
    lgb_model.fit(trainX, trainY)

    # Predict on the held-out block and record its accuracy
    predY = lgb_model.predict(validX)
    accs.append(accuracy_score(validY, predY))

print("acc : ", np.average(accs))
# Feature importances of the model trained in the last iteration of the loop.
print(lgb_model.feature_importances_)

### CatBoost

In [None]:
# Manual cross-validation of CatBoostClassifier over contiguous, unshuffled
# blocks of `test_data_num` rows.
num_block = 4
test_data_num = len(dataX) // num_block

accs = []
for i in range(len(dataX) // test_data_num):
    # Split: rows [start, stop) are held out, all remaining rows are used for training
    start = test_data_num * i
    stop = start + test_data_num
    validX, validY = dataX[start:stop], dataY[start:stop]
    trainX = dataX[:start] + dataX[stop:]
    trainY = dataY[:start] + dataY[stop:]
    train_pool = Pool(trainX, label=trainY)
    test_pool = Pool(validX, label=validY)

    # Train
    cat_model = CatBoostClassifier(
        eval_metric="Logloss",
        num_boost_round=60,
        logging_level='Silent',
    )
    cat_model.fit(train_pool)

    # Predict on the held-out block and record its accuracy
    predY = cat_model.predict(test_pool, prediction_type='Class')
    accs.append(accuracy_score(validY, predY))

print("acc : ", np.average(accs))

### RandomForest

In [None]:
# Manual cross-validation of a majority-vote ensemble of random forests over
# contiguous, unshuffled blocks of `test_data_num` rows.
num_block = 4
test_data_num = len(dataX) // num_block
num_forests = 5  # odd, so the majority vote below cannot tie

accs = []
for i in range(len(dataX) // test_data_num):
    # Split: rows [start, stop) are held out, all remaining rows are used for training
    start = test_data_num * i
    stop = start + test_data_num
    validX, validY = dataX[start:stop], dataY[start:stop]
    trainX = dataX[:start] + dataX[stop:]
    trainY = dataY[:start] + dataY[stop:]

    # Train: the forests share their hyper-parameters and differ only in the
    # random seed. Fixing one seed per forest makes the reported accuracy
    # repeatable from run to run.
    rfc_models = [
        RandomForestClassifier(n_estimators=600, max_depth=40, max_features='log2', random_state=seed)
        for seed in range(num_forests)
    ]
    for rfc_model in rfc_models:
        rfc_model.fit(trainX, trainY)

    # Predict: a row is labelled 1 when a strict majority of the forests say 1
    predYs = [rfc_model.predict(validX) for rfc_model in rfc_models]
    majority = len(rfc_models) // 2 + 1
    predY = [1 if sum(votes) >= majority else 0 for votes in zip(*predYs)]

    # Evaluate
    accs.append(accuracy_score(validY, predY))

print("acc : ", np.average(accs))

## Tuning Hyperparameters

### LightGBM

In [None]:
def optuna_objective(trial, num_folds=4):
    """Optuna objective: mean cross-validated accuracy of an LGBMClassifier.

    Args:
        trial: the Optuna trial from which the hyper-parameters are sampled.
        num_folds: number of contiguous blocks that ``dataX`` / ``dataY`` are cut
            into for validation. The default matches the ``num_block = 4`` used by
            the cross-validation cells.

    Returns:
        The average validation accuracy over the folds (to be maximised).
    """
    # Hyper-parameters under optimisation. The learning-rate range spans four
    # orders of magnitude, so it is sampled on a log scale; linear sampling would
    # put almost every trial in the top decade of the range.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    n_estimators = trial.suggest_int('n_estimators', 300, 2000)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    num_leaves = trial.suggest_int('num_leaves', 5, 100)
    min_child_samples = trial.suggest_int('min_child_samples', 3, 30)

    # The fold size is computed here instead of being read from a global left
    # behind by an earlier cell, so this function only needs dataX and dataY.
    fold_size = len(dataX) // num_folds

    accs = []
    for i in range(len(dataX) // fold_size):
        # Split: rows [start, stop) are held out, all remaining rows are used for training
        start = fold_size * i
        stop = start + fold_size
        trainX = dataX[:start] + dataX[stop:]
        trainY = dataY[:start] + dataY[stop:]
        validX = dataX[start:stop]
        validY = dataY[start:stop]

        # Train LightGBM with the sampled hyper-parameters
        lgb_model = lgb.LGBMClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            num_leaves=num_leaves,
            min_child_samples=min_child_samples,
        )
        lgb_model.fit(trainX, trainY)

        # Predict on the held-out block and record its accuracy
        predY = lgb_model.predict(validX)
        accs.append(accuracy_score(validY, predY))

    return np.average(accs)

# Search for the hyper-parameters that maximise the objective. TPESampler is
# Optuna's default sampler; passing it explicitly with a fixed seed makes the
# sequence of sampled hyper-parameters repeatable from run to run.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
# Each of the 1000 trials trains one LightGBM model per validation fold.
study.optimize(optuna_objective, n_trials=1000)

## Submission

### Read data

In [None]:
# The sample submission is the template whose "Transported" column is overwritten below.
df_sub = pd.read_csv(filepath_or_buffer="data/sample_submission.csv")

### Learn model

#### GBT

In [None]:
# Fit gradient boosting on the complete training set, using the same settings as
# the cross-validation cell.
gbt_params = dict(n_estimators=1000, learning_rate=0.00457544, max_depth=19, random_state=0)
gbt_model = GradientBoostingClassifier(**gbt_params)
gbt_model.fit(dataX, dataY)

#### XGBoost

In [None]:
# Fit XGBoost on the complete training set, using the same settings as the
# cross-validation cell.
xgb_params = dict(eta=0.1)
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(dataX, dataY)

#### LightGBM

In [None]:
# Fit LightGBM on the complete training set, using the same settings as the
# cross-validation cell.
lgb_params = dict(
    metric="logloss",
    n_estimators=1200,
    learning_rate=0.00457544,
    max_depth=19,
    num_leaves=26,
    min_child_samples=5,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(dataX, dataY)

#### CatBoost

In [None]:
# Fit CatBoost on the complete training set, using the same settings as the
# cross-validation cell.
data_pool = Pool(dataX, label=dataY)
cat_params = dict(eval_metric="Logloss", num_boost_round=60, logging_level='Silent')
cat_model = CatBoostClassifier(**cat_params)
cat_model.fit(data_pool)

#### RandomForest

In [None]:
# Fit seven random forests on the complete training set. They share their
# hyper-parameters and differ only in the random seed; fixing one seed per forest
# makes the submission repeatable from run to run. The count is odd so that a
# majority vote over the forests cannot tie.
rfc_models = [
    RandomForestClassifier(n_estimators=600, max_depth=40, max_features='log2', random_state=seed)
    for seed in range(7)
]
for rfc_model in rfc_models:
    rfc_model.fit(dataX, dataY)

### Prepare test data

In [None]:
# Build the test inputs by column NAME, in the same feature order as the training
# rows, so the result does not depend on the position or the number of columns in
# df_test. VIP and Name are not used as features.
# Every "Cabin" cell holds a [deck, num, side] list (produced by the Cabin
# encoding step), which is spliced into the row as three separate features.
test_lead_rows = df_test[["PassengerId", "HomePlanet", "CryoSleep"]].to_numpy().tolist()
test_tail_rows = df_test[
    ["Destination", "Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Amount_of_money"]
].to_numpy().tolist()
testX = [
    lead + list(cabin) + tail
    for lead, cabin, tail in zip(test_lead_rows, df_test["Cabin"], test_tail_rows)
]

### Predict and ensemble

#### GBT

In [None]:
# Gradient-boosting predictions for the test rows, as Python booleans.
gbt_predY = [bool(label) for label in gbt_model.predict(testX)]

#### XGBoost

In [None]:
# XGBoost predictions for the test rows, as Python booleans.
xgb_predY = [bool(label) for label in xgb_model.predict(testX)]

#### LightGBM

In [None]:
# LightGBM predictions for the test rows, as Python booleans.
lgb_predY = [bool(label) for label in lgb_model.predict(testX)]

#### CatBoost

In [None]:
# CatBoost predictions for the test rows, as Python booleans.
test_pool = Pool(testX)
cat_labels = cat_model.predict(test_pool, prediction_type='Class')
cat_predY = [bool(label) for label in cat_labels]

#### RandomForest

In [None]:
# Collect one prediction vector per fitted forest, then take a strict majority
# vote per test row. The vote uses however many forests `rfc_models` holds,
# instead of hard-coding seven list indices and a threshold of four.
predYs = [rfc_model.predict(testX) for rfc_model in rfc_models]
majority = len(rfc_models) // 2 + 1
rfc_predY = [1 if sum(votes) >= majority else 0 for votes in zip(*predYs)]

#### Ensemble

In [None]:
# Final label per test row: True when at least three of the five model families
# (GBT, XGBoost, LightGBM, CatBoost, random-forest vote) predict a positive.
model_votes = zip(gbt_predY, xgb_predY, lgb_predY, cat_predY, rfc_predY)
df_sub["Transported"] = [bool(sum(votes) >= 3) for votes in model_votes]

### Output submission file

In [None]:
# Write the submission next to the input data, without the DataFrame index column.
df_sub.to_csv(path_or_buf="data/submission.csv", index=False)