## Installation

In [None]:
%%writefile requirements.txt
pandas
lightgbm
scikit-learn
optuna
matplotlib
numpy
kaggle

In [None]:
%pip install -U --user -r requirements.txt

## Import libraries

In [34]:
import warnings
import zipfile

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, log_loss, roc_curve
from sklearn.preprocessing import LabelEncoder

## Settings

In [35]:
# Silence all warnings raised from here on.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries used below — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

## Fetch Dataset

In [36]:
# Download the spaceship-titanic competition archive with the Kaggle CLI.
# The recorded output shows the CLI skips the download when a more recently
# modified local copy exists (use --force to override).
! kaggle competitions download -c spaceship-titanic

spaceship-titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# Unpack the competition archive into ./data with the standard library instead
# of shelling out to Windows PowerShell, so the cell runs on any OS.
# extractall creates the target directory if needed and overwrites files that
# already exist, which keeps the cell safe to re-run.
with zipfile.ZipFile("spaceship-titanic.zip") as archive:
    archive.extractall("data")

## Read data from csv

In [41]:
# Load the extracted train and test tables, in that order.
df_train, df_test = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

In [42]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## Fill in missing values

In [43]:
# Count the missing values in each training column.
df_train.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [44]:
# Impute missing values in the training frame.

# Age: mean imputation.
df_train["Age"] = df_train["Age"].fillna(df_train["Age"].mean())

# Categorical columns: carry the previous row's value forward. `.ffill()`
# replaces the deprecated `fillna(method='ffill')`; the trailing `.bfill()`
# covers a missing value in the very first row, which a forward fill alone
# cannot reach.
for col in ["Cabin", "Destination", "HomePlanet"]:
    df_train[col] = df_train[col].ffill().bfill()

# Spending columns: median imputation.
for col in ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]:
    df_train[col] = df_train[col].fillna(df_train[col].median())

# Boolean flags: a missing flag is treated as False. The explicit cast keeps
# the column a real bool dtype instead of relying on pandas' implicit
# downcasting of object columns.
for col in ["CryoSleep", "VIP"]:
    df_train[col] = df_train[col].fillna(False).astype(bool)

In [45]:
# Impute missing values in the test frame. Numeric fill values are computed
# from the training frame so both frames are filled with the same statistics.

# Age: training mean.
df_test["Age"] = df_test["Age"].fillna(df_train["Age"].mean())

# Categorical columns: carry the previous row's value forward. `.ffill()`
# replaces the deprecated `fillna(method='ffill')`; the trailing `.bfill()`
# covers a missing value in the very first row, which a forward fill alone
# cannot reach.
for col in ["Cabin", "Destination", "HomePlanet"]:
    df_test[col] = df_test[col].ffill().bfill()

# Spending columns: training median.
for col in ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]:
    df_test[col] = df_test[col].fillna(df_train[col].median())

# Boolean flags: a missing flag is treated as False. The explicit cast keeps
# the column a real bool dtype instead of relying on pandas' implicit
# downcasting of object columns.
for col in ["CryoSleep", "VIP"]:
    df_test[col] = df_test[col].fillna(False).astype(bool)

## Encode categorical variables

### HomePlanet

In [46]:
# Show the category counts before encoding.
print(df_train["HomePlanet"].value_counts())
print("=====================================")
# Fit the encoder on the training categories only and reuse that fitted mapping
# for the test frame. Re-fitting on the test column would derive a separate
# mapping, so the same planet could receive different codes in the two frames
# whenever their category sets differ. `transform` raises on a category that
# was not seen during fitting instead of silently mis-encoding it.
hp_le = LabelEncoder()
df_train['HomePlanet'] = hp_le.fit_transform(df_train['HomePlanet'])
df_test['HomePlanet'] = hp_le.transform(df_test['HomePlanet'])
# Show the counts again, now keyed by the integer codes.
print(df_train["HomePlanet"].value_counts())

Earth     4720
Europa    2177
Mars      1796
Name: HomePlanet, dtype: int64
0    4720
1    2177
2    1796
Name: HomePlanet, dtype: int64


### Destination

In [47]:
# Show the category counts before encoding.
print(df_train["Destination"].value_counts())
print("=====================================")
# Fit the encoder on the training categories only and reuse that fitted mapping
# for the test frame. Re-fitting on the test column would derive a separate
# mapping, so the same destination could receive different codes in the two
# frames whenever their category sets differ. `transform` raises on a category
# that was not seen during fitting instead of silently mis-encoding it.
de_le = LabelEncoder()
df_train['Destination'] = de_le.fit_transform(df_train['Destination'])
df_test['Destination'] = de_le.transform(df_test['Destination'])
# Show the counts again, now keyed by the integer codes.
print(df_train["Destination"].value_counts())

TRAPPIST-1e      6037
55 Cancri e      1840
PSO J318.5-22     816
Name: Destination, dtype: int64
2    6037
0    1840
1     816
Name: Destination, dtype: int64


### Cabin

In [48]:
# Deck letters and ship sides; each value's position in its list is the integer
# code assigned to it.
decks = ["A", "B", "C", "D", "E", "F", "G", "T"]
sides = ["P", "S"]


def EncodeCabin(value):
    """Convert a "deck/num/side" cabin string into ``[deck_code, num, side_code]``.

    Args:
        value: cabin string made of exactly three "/"-separated parts.

    Returns:
        A three-element list: index of the deck letter in ``decks``, the cabin
        number as an int, and the index of the side letter in ``sides``.

    Raises:
        ValueError: if the string does not split into three parts, the number
            is not an integer, or the deck/side letter is not in its list.
    """
    deck_letter, cabin_number, side_letter = value.split("/")
    deck_code = decks.index(deck_letter)
    number = int(cabin_number)
    side_code = sides.index(side_letter)
    return [deck_code, number, side_code]
# Replace each cabin string with its [deck, number, side] code list in both frames.
for frame in (df_train, df_test):
    frame["Cabin"] = frame["Cabin"].map(EncodeCabin)

### Id

In [49]:
# Count how many passengers (train and test combined) share each group prefix,
# i.e. the part of PassengerId before the first "_".
group_count_dict = {}
all_passenger_ids = list(df_train["PassengerId"]) + list(df_test["PassengerId"])
for passenger_id in all_passenger_ids:
    group_key = str(passenger_id).split("_")[0]
    group_count_dict[group_key] = group_count_dict.get(group_key, 0) + 1
def EncodeId(value):
    """Return the passenger count recorded for the group prefix of ``value``.

    The prefix is the text before the first "_" in the id; the count is looked
    up in the module-level ``group_count_dict``.
    """
    group_key, _, _ = value.partition("_")
    return group_count_dict[group_key]
# Replace each PassengerId with the size of the group it belongs to, in both frames.
for frame in (df_train, df_test):
    frame["PassengerId"] = frame["PassengerId"].map(EncodeId)

## Learning

In [50]:
# Convert the frame to rows once and reuse it for both features and labels.
train_rows = df_train.to_numpy()

# Feature rows: group size, home planet, cryo-sleep flag, the three cabin codes,
# destination, age and the five spending columns. VIP and Name are unpacked but
# deliberately left out of the feature list.
dataX = [
    [p_id, hp, cs] + cabin + [des, age, rs, fc, sm, spa, vr]
    for p_id, hp, cs, cabin, des, age, vip, rs, fc, sm, spa, vr, name, trans in train_rows
]

# Labels as a flat list of 0/1 ints (Transported is the last column).
# scikit-learn-style estimators expect a 1-D target; a list of one-element
# lists is a column vector and is only accepted after an implicit reshape.
dataY = [int(row[-1]) for row in train_rows]

In [51]:
# Number of contiguous cross-validation folds.
num_block = 4
# Rows held out per fold. Rows past num_block * test_data_num (the remainder of
# the division) are never held out and stay in every training split.
test_data_num = len(dataX) // num_block

accs = []
# Run exactly num_block folds. The previous bound, len(dataX) // test_data_num,
# yields more folds than num_block whenever the remainder reaches test_data_num
# (e.g. 11 rows with 4 blocks gives 5 folds).
for i in range(num_block):
    valid_start = test_data_num * i
    valid_end = valid_start + test_data_num

    # Split the data: fold i is the validation slice, the rest is training data.
    trainX = dataX[:valid_start] + dataX[valid_end:]
    trainY = dataY[:valid_start] + dataY[valid_end:]
    validX = dataX[valid_start:valid_end]
    validY = dataY[valid_start:valid_end]

    # Train
    lgb_model = lgb.LGBMClassifier(metric="logloss", n_estimators=1250, learning_rate=0.00457544, max_depth=19, num_leaves=26, min_child_samples=5)
    lgb_model.fit(trainX, trainY)

    # Predict
    predY = lgb_model.predict(validX)

    # Evaluate
    accs.append(accuracy_score(validY, predY))

# Report the mean validation accuracy over the folds.
print("acc : ", np.average(accs))

acc :  0.8034974689369535


In [52]:
# Feature importances of `lgb_model`, which at this point is the model fitted
# in the last iteration of the cross-validation loop above (not a model trained
# on all rows).
lgb_model.feature_importances_

array([ 841, 1637,  767, 2857, 4569, 1924,  859, 2784, 2487, 3003, 2099,
       3524, 3899])

## Tuning Hyperparameters

In [53]:
def optuna_objective(trial):
    """Optuna objective: mean validation accuracy of a LightGBM classifier.

    Samples one hyperparameter set from ``trial``, trains a model on each of
    the contiguous folds of the module-level ``dataX``/``dataY`` and scores it
    on the held-out slice.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The average accuracy over the folds.
    """
    # Hyperparameters to optimise. The learning-rate range spans four orders of
    # magnitude, so it is sampled on a log scale; a uniform draw would put
    # roughly 90% of the trials in [1e-2, 1e-1].
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    n_estimators = trial.suggest_int('n_estimators', 300, 2000)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    num_leaves = trial.suggest_int('num_leaves', 5, 100)
    min_child_samples = trial.suggest_int('min_child_samples', 3, 30)

    accs = []
    # Run exactly num_block folds; len(dataX) // test_data_num can exceed
    # num_block when the division remainder reaches test_data_num.
    for i in range(num_block):
        valid_start = test_data_num * i
        valid_end = valid_start + test_data_num

        # Split the data: fold i is the validation slice, the rest is training data.
        trainX = dataX[:valid_start] + dataX[valid_end:]
        trainY = dataY[:valid_start] + dataY[valid_end:]
        validX = dataX[valid_start:valid_end]
        validY = dataY[valid_start:valid_end]

        # Train LightGBM with the sampled hyperparameters.
        lgb_model = lgb.LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, num_leaves=num_leaves, min_child_samples=min_child_samples)
        lgb_model.fit(trainX, trainY)

        # Predict
        predY = lgb_model.predict(validX)

        # Evaluate
        accs.append(accuracy_score(validY, predY))

    return np.average(accs)

# Maximise the objective's mean validation accuracy. The sampler is seeded so
# the sequence of suggested hyperparameters is the same on every run.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
# NOTE(review): each trial trains one model per fold, so 1000 trials is a long
# run; the recorded output shows the run being interrupted by hand.
study.optimize(optuna_objective, n_trials=1000)

[32m[I 2022-11-11 15:02:46,760][0m A new study created in memory with name: no-name-716891bc-f33d-4328-b570-999b446beae9[0m
[32m[I 2022-11-11 15:02:58,692][0m Trial 0 finished with value: 0.7857800276115968 and parameters: {'learning_rate': 0.029850915286675453, 'n_estimators': 1632, 'max_depth': 8, 'num_leaves': 52, 'min_child_samples': 19}. Best is trial 0 with value: 0.7857800276115968.[0m
[32m[I 2022-11-11 15:03:02,074][0m Trial 1 finished with value: 0.7990105844454671 and parameters: {'learning_rate': 0.027622384080804387, 'n_estimators': 423, 'max_depth': 22, 'num_leaves': 36, 'min_child_samples': 14}. Best is trial 1 with value: 0.7990105844454671.[0m
[33m[W 2022-11-11 15:03:21,808][0m Trial 2 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "C:\Users\osusi\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Us

## Submission

In [54]:
# Load the sample submission; its "Transported" column is overwritten with
# predictions further down.
df_sub = pd.read_csv("data/sample_submission.csv")

In [55]:
# Train the submission model on every training row.
lgb_model = lgb.LGBMClassifier(
    metric="logloss",
    n_estimators=1250,
    learning_rate=0.00457544,
    max_depth=19,
    num_leaves=26,
    min_child_samples=5,
)
lgb_model.fit(dataX, dataY)

In [56]:
# Build test feature rows in the same column order as the training features;
# VIP and Name are unpacked but not used.
testX = [
    [group_size, home_planet, cryo_sleep]
    + cabin_codes
    + [destination, age, room_service, food_court, shopping_mall, spa, vr_deck]
    for (
        group_size, home_planet, cryo_sleep, cabin_codes, destination, age, _vip,
        room_service, food_court, shopping_mall, spa, vr_deck, _name,
    ) in df_test.to_numpy()
]

In [57]:
# Predict the test labels and convert each one to a Python bool.
predY = [bool(label) for label in lgb_model.predict(testX)]

In [58]:
# Write the boolean predictions into the submission frame.
df_sub["Transported"] = predY

In [59]:
# Save the submission without the DataFrame index column.
df_sub.to_csv("data/submission.csv", index=False)