# Load

In [1]:
import numpy as np
import pandas as pd

import datatable as dt
import gc
import re

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

import warnings
warnings.filterwarnings('ignore')

import shap
from scipy import stats
from tqdm import tqdm
import optuna
from optuna.samplers import TPESampler
import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMModel

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from collections import defaultdict
import joblib
import json

# these imports are used to convert the tree to PNG
from cairosvg import svg2png
from PIL import Image
from io import BytesIO

from pandas_profiling import ProfileReport

sns.set(style='whitegrid', font_scale=1.5)

INPUT_DIR = '../input/titanic'

TARGET = 'Survived'
RANDOM_STATE = 997
VERSION = 1

In [2]:
full_train_df = pd.read_csv(f'{INPUT_DIR}/train.csv')
full_test_df = pd.read_csv(f'{INPUT_DIR}/test.csv')

# Train set

In [3]:
full_train_df.info()
full_train_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
ProfileReport(full_train_df, progress_bar=False).to_widgets()

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

# Train LGBM

In [5]:
train_df = full_train_df.copy()
test_df = full_test_df.copy()

In [6]:
CATEGORIAL_FEATURES = [
    'Pclass',
    'Name',
    'Sex',
    'Ticket',
    'Cabin',
    'Embarked'
]
FEATURES = train_df.columns.drop(['PassengerId', TARGET])
FEATURES

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

## Define FE

In [7]:
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Embarked'] = df['Embarked'].fillna(
        pd.Series(np.random.choice(['S', 'C', 'Q'], p=[0.724, 0.189, 0.087], size=len(df))))
    df['Cabin'].fillna('?', inplace=True)
    
    for col_ in CATEGORIAL_FEATURES:
        df[col_] = df[col_].astype('category')
    
    return df

## Train FE

In [8]:
train_df = feature_engineering(train_df)

train_df.info()
train_df.isnull().sum() / len(train_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    category
 3   Name         891 non-null    category
 4   Sex          891 non-null    category
 5   Age          891 non-null    float64 
 6   SibSp        891 non-null    int64   
 7   Parch        891 non-null    int64   
 8   Ticket       891 non-null    category
 9   Fare         891 non-null    float64 
 10  Cabin        891 non-null    category
 11  Embarked     891 non-null    category
dtypes: category(6), float64(2), int64(4)
memory usage: 128.5 KB


PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Cabin          0.0
Embarked       0.0
dtype: float64

## Test FE

In [9]:
test_df = feature_engineering(test_df)

test_df.info()
test_df.isnull().sum() / len(test_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  418 non-null    int64   
 1   Pclass       418 non-null    category
 2   Name         418 non-null    category
 3   Sex          418 non-null    category
 4   Age          418 non-null    float64 
 5   SibSp        418 non-null    int64   
 6   Parch        418 non-null    int64   
 7   Ticket       418 non-null    category
 8   Fare         417 non-null    float64 
 9   Cabin        418 non-null    category
 10  Embarked     418 non-null    category
dtypes: category(6), float64(2), int64(3)
memory usage: 59.2 KB


PassengerId    0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.002392
Cabin          0.000000
Embarked       0.000000
dtype: float64

## Tune

In [10]:
def objective(trial):
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_df[FEATURES],
        train_df[TARGET],
        test_size=len(full_test_df) / len(full_train_df),
        random_state=RANDOM_STATE
    )
    lgb_train = lgb.Dataset(
        X_train, y_train, categorical_feature=CATEGORIAL_FEATURES, free_raw_data=False)
    lgb_valid = lgb.Dataset(
        X_valid, y_valid, categorical_feature=CATEGORIAL_FEATURES, free_raw_data=False)

    param = {
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "learning_rate": trial.suggest_uniform("learning_rate", 0.01, 1),
        "num_leaves": trial.suggest_int("num_leaves", 5, 127),
        "max_depth": trial.suggest_int("max_depth", 3, 32),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 16, 127),
        'random_state': RANDOM_STATE,
    }

    model = lgb.train(param,
                      lgb_train,
                      num_boost_round=200,
                      early_stopping_rounds=10,
                      verbose_eval=10,
                      valid_sets=[lgb_train, lgb_valid])
    y_pred = model.predict(X_valid) >= 0.5
    accuracy = accuracy_score(y_valid, y_pred)

    return accuracy


study = optuna.create_study(
    direction="maximize", sampler=TPESampler(seed=RANDOM_STATE))
study.optimize(objective, n_trials=20)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-12-08 18:20:53,374][0m A new study created in memory with name: no-name-5132e447-77e6-49da-91f1-43d625345c59[0m
[32m[I 2020-12-08 18:20:53,457][0m Trial 0 finished with value: 0.7918660287081339 and parameters: {'learning_rate': 0.451148188357339, 'num_leaves': 106, 'max_depth': 12, 'min_data_in_leaf': 93}. Best is trial 0 with value: 0.7918660287081339.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.405539	valid_1's binary_logloss: 0.473334
[20]	training's binary_logloss: 0.381259	valid_1's binary_logloss: 0.464568
[30]	training's binary_logloss: 0.367217	valid_1's binary_logloss: 0.463153
[40]	training's binary_logloss: 0.357683	valid_1's binary_logloss: 0.463152
Early stopping, best iteration is:
[31]	training's binary_logloss: 0.366282	valid_1's binary_logloss: 0.462803
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.466946	valid_1's binary_logloss: 0.501731
[20]	training's binary_logloss: 0.431692	valid_1's binary_logloss: 0.48465
[30]	training's binary_logloss: 0.409889	valid_1's binary_logloss: 0.475057
[40]	training's binary_logloss: 0.395859	valid_1's binary_logloss: 0.471372
[50]	training's binary_logloss: 0.386485	valid_1's binary_logloss: 0.468994
[60]	training's binary_logloss: 0.378952	valid_1's binary_logloss: 0.467235
[70]	tra

[32m[I 2020-12-08 18:20:53,648][0m Trial 1 finished with value: 0.8086124401913876 and parameters: {'learning_rate': 0.09735105884703202, 'num_leaves': 57, 'max_depth': 9, 'min_data_in_leaf': 79}. Best is trial 1 with value: 0.8086124401913876.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.3129	valid_1's binary_logloss: 0.440529


[32m[I 2020-12-08 18:20:53,761][0m Trial 2 finished with value: 0.8133971291866029 and parameters: {'learning_rate': 0.4795064893506527, 'num_leaves': 25, 'max_depth': 16, 'min_data_in_leaf': 36}. Best is trial 2 with value: 0.8133971291866029.[0m


Early stopping, best iteration is:
[8]	training's binary_logloss: 0.328189	valid_1's binary_logloss: 0.436141


[32m[I 2020-12-08 18:20:53,856][0m Trial 3 finished with value: 0.8205741626794258 and parameters: {'learning_rate': 0.34627320123520855, 'num_leaves': 43, 'max_depth': 14, 'min_data_in_leaf': 44}. Best is trial 3 with value: 0.8205741626794258.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.356074	valid_1's binary_logloss: 0.448992
[20]	training's binary_logloss: 0.307897	valid_1's binary_logloss: 0.438952
Early stopping, best iteration is:
[13]	training's binary_logloss: 0.336757	valid_1's binary_logloss: 0.437365
Training until validation scores don't improve for 10 rounds


[32m[I 2020-12-08 18:20:53,966][0m Trial 4 finished with value: 0.8157894736842105 and parameters: {'learning_rate': 0.5292233869945261, 'num_leaves': 84, 'max_depth': 5, 'min_data_in_leaf': 31}. Best is trial 3 with value: 0.8205741626794258.[0m


[10]	training's binary_logloss: 0.311581	valid_1's binary_logloss: 0.444752
[20]	training's binary_logloss: 0.250242	valid_1's binary_logloss: 0.451891
Early stopping, best iteration is:
[13]	training's binary_logloss: 0.288103	valid_1's binary_logloss: 0.440216


[32m[I 2020-12-08 18:20:54,066][0m Trial 5 finished with value: 0.8205741626794258 and parameters: {'learning_rate': 0.22133454130736518, 'num_leaves': 76, 'max_depth': 26, 'min_data_in_leaf': 38}. Best is trial 3 with value: 0.8205741626794258.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.383984	valid_1's binary_logloss: 0.459488
[20]	training's binary_logloss: 0.325238	valid_1's binary_logloss: 0.439788
[30]	training's binary_logloss: 0.291695	valid_1's binary_logloss: 0.4407
Early stopping, best iteration is:
[29]	training's binary_logloss: 0.294056	valid_1's binary_logloss: 0.438284
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.400506	valid_1's binary_logloss: 0.468192
[20]	training's binary_logloss: 0.348716	valid_1's binary_logloss: 0.45222
[30]	training's binary_logloss: 0.321762	valid_1's binary_logloss: 0.443229
[40]	training's binary_logloss: 0.306243	valid_1's binary_logloss: 0.441721
[50]	training's binary_logloss: 0.292916	valid_1's binary_logloss: 0.442701
Early stopping, best iteration is:
[42]	training's binary_logloss: 0.303062	valid_1's binary_logloss: 0.437851


[32m[I 2020-12-08 18:20:54,212][0m Trial 6 finished with value: 0.8133971291866029 and parameters: {'learning_rate': 0.2364543025896401, 'num_leaves': 15, 'max_depth': 10, 'min_data_in_leaf': 53}. Best is trial 3 with value: 0.8205741626794258.[0m
[32m[I 2020-12-08 18:20:54,298][0m Trial 7 finished with value: 0.7631578947368421 and parameters: {'learning_rate': 0.25673775652603426, 'num_leaves': 110, 'max_depth': 26, 'min_data_in_leaf': 117}. Best is trial 3 with value: 0.8205741626794258.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.447694	valid_1's binary_logloss: 0.500242
[20]	training's binary_logloss: 0.423217	valid_1's binary_logloss: 0.491847
[30]	training's binary_logloss: 0.410514	valid_1's binary_logloss: 0.487445
Early stopping, best iteration is:
[28]	training's binary_logloss: 0.411933	valid_1's binary_logloss: 0.487404


[32m[I 2020-12-08 18:20:54,383][0m Trial 8 finished with value: 0.8157894736842105 and parameters: {'learning_rate': 0.6347726791357056, 'num_leaves': 99, 'max_depth': 12, 'min_data_in_leaf': 19}. Best is trial 3 with value: 0.8205741626794258.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.198067	valid_1's binary_logloss: 0.484918
Early stopping, best iteration is:
[3]	training's binary_logloss: 0.335696	valid_1's binary_logloss: 0.422909
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.332263	valid_1's binary_logloss: 0.448087
[20]	training's binary_logloss: 0.292535	valid_1's binary_logloss: 0.466153
Early stopping, best iteration is:
[10]	training's binary_logloss: 0.332263	valid_1's binary_logloss: 0.448087


[32m[I 2020-12-08 18:20:54,462][0m Trial 9 finished with value: 0.7918660287081339 and parameters: {'learning_rate': 0.7549816605025265, 'num_leaves': 121, 'max_depth': 15, 'min_data_in_leaf': 58}. Best is trial 3 with value: 0.8205741626794258.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.396938	valid_1's binary_logloss: 0.486664
[20]	training's binary_logloss: 0.366999	valid_1's binary_logloss: 0.47504
[30]	training's binary_logloss: 0.346591	valid_1's binary_logloss: 0.466027


[32m[I 2020-12-08 18:20:54,573][0m Trial 10 finished with value: 0.80622009569378 and parameters: {'learning_rate': 0.9524733607334697, 'num_leaves': 46, 'max_depth': 21, 'min_data_in_leaf': 100}. Best is trial 3 with value: 0.8205741626794258.[0m


[40]	training's binary_logloss: 0.332963	valid_1's binary_logloss: 0.467013
Early stopping, best iteration is:
[33]	training's binary_logloss: 0.342175	valid_1's binary_logloss: 0.462616
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.533696	valid_1's binary_logloss: 0.554027
[20]	training's binary_logloss: 0.47207	valid_1's binary_logloss: 0.507983
[30]	training's binary_logloss: 0.438155	valid_1's binary_logloss: 0.489344
[40]	training's binary_logloss: 0.418499	valid_1's binary_logloss: 0.479131
[50]	training's binary_logloss: 0.399471	valid_1's binary_logloss: 0.467525
[60]	training's binary_logloss: 0.384325	valid_1's binary_logloss: 0.457944
[70]	training's binary_logloss: 0.373301	valid_1's binary_logloss: 0.452207
[80]	training's binary_logloss: 0.363775	valid_1's binary_logloss: 0.449566
[90]	training's binary_logloss: 0.356079	valid_1's binary_logloss: 0.447198
[100]	training's binary_logloss: 0.34927	valid_1's binary_logloss: 0.

[32m[I 2020-12-08 18:20:54,747][0m Trial 11 finished with value: 0.8181818181818182 and parameters: {'learning_rate': 0.04415589139449394, 'num_leaves': 77, 'max_depth': 32, 'min_data_in_leaf': 47}. Best is trial 3 with value: 0.8205741626794258.[0m
[32m[I 2020-12-08 18:20:54,856][0m Trial 12 finished with value: 0.8301435406698564 and parameters: {'learning_rate': 0.2925468727817966, 'num_leaves': 40, 'max_depth': 24, 'min_data_in_leaf': 19}. Best is trial 12 with value: 0.8301435406698564.[0m
[32m[I 2020-12-08 18:20:54,981][0m Trial 13 finished with value: 0.8253588516746412 and parameters: {'learning_rate': 0.3572847855787248, 'num_leaves': 37, 'max_depth': 21, 'min_data_in_leaf': 16}. Best is trial 12 with value: 0.8301435406698564.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.294721	valid_1's binary_logloss: 0.42692
Early stopping, best iteration is:
[9]	training's binary_logloss: 0.307309	valid_1's binary_logloss: 0.420314
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.247296	valid_1's binary_logloss: 0.437
Early stopping, best iteration is:
[6]	training's binary_logloss: 0.308599	valid_1's binary_logloss: 0.419741


[32m[I 2020-12-08 18:20:55,088][0m Trial 14 finished with value: 0.8349282296650717 and parameters: {'learning_rate': 0.3732247365582667, 'num_leaves': 32, 'max_depth': 20, 'min_data_in_leaf': 16}. Best is trial 14 with value: 0.8349282296650717.[0m
[32m[I 2020-12-08 18:20:55,190][0m Trial 15 finished with value: 0.8110047846889952 and parameters: {'learning_rate': 0.6414210847422301, 'num_leaves': 5, 'max_depth': 28, 'min_data_in_leaf': 19}. Best is trial 14 with value: 0.8349282296650717.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.236024	valid_1's binary_logloss: 0.449451
Early stopping, best iteration is:
[5]	training's binary_logloss: 0.324139	valid_1's binary_logloss: 0.418594
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.313903	valid_1's binary_logloss: 0.440649
[20]	training's binary_logloss: 0.275001	valid_1's binary_logloss: 0.457022
Early stopping, best iteration is:
[11]	training's binary_logloss: 0.308977	valid_1's binary_logloss: 0.440104


[32m[I 2020-12-08 18:20:55,368][0m Trial 16 finished with value: 0.8157894736842105 and parameters: {'learning_rate': 0.12533859661068678, 'num_leaves': 26, 'max_depth': 21, 'min_data_in_leaf': 68}. Best is trial 14 with value: 0.8349282296650717.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.446363	valid_1's binary_logloss: 0.487458
[20]	training's binary_logloss: 0.408662	valid_1's binary_logloss: 0.469484
[30]	training's binary_logloss: 0.387784	valid_1's binary_logloss: 0.461618
[40]	training's binary_logloss: 0.371726	valid_1's binary_logloss: 0.455402
[50]	training's binary_logloss: 0.36248	valid_1's binary_logloss: 0.454495
[60]	training's binary_logloss: 0.353431	valid_1's binary_logloss: 0.452632
[70]	training's binary_logloss: 0.345122	valid_1's binary_logloss: 0.449106
[80]	training's binary_logloss: 0.336116	valid_1's binary_logloss: 0.446365
[90]	training's binary_logloss: 0.329722	valid_1's binary_logloss: 0.444173
[100]	training's binary_logloss: 0.32458	valid_1's binary_logloss: 0.442857
[110]	training's binary_logloss: 0.319448	valid_1's binary_logloss: 0.442695
Early stopping, best iteration is:
[104]	training's binary_logloss: 0.322341	valid_1's binary_logloss: 

[32m[I 2020-12-08 18:20:55,481][0m Trial 17 finished with value: 0.8157894736842105 and parameters: {'learning_rate': 0.3736069622263566, 'num_leaves': 62, 'max_depth': 32, 'min_data_in_leaf': 23}. Best is trial 14 with value: 0.8349282296650717.[0m
[32m[I 2020-12-08 18:20:55,575][0m Trial 18 finished with value: 0.7679425837320574 and parameters: {'learning_rate': 0.5875014374669518, 'num_leaves': 6, 'max_depth': 23, 'min_data_in_leaf': 127}. Best is trial 14 with value: 0.8349282296650717.[0m
[32m[I 2020-12-08 18:20:55,672][0m Trial 19 finished with value: 0.8325358851674641 and parameters: {'learning_rate': 0.75218139012201, 'num_leaves': 27, 'max_depth': 19, 'min_data_in_leaf': 29}. Best is trial 14 with value: 0.8349282296650717.[0m


Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.422336	valid_1's binary_logloss: 0.488821
[20]	training's binary_logloss: 0.407925	valid_1's binary_logloss: 0.487643
Early stopping, best iteration is:
[19]	training's binary_logloss: 0.40849	valid_1's binary_logloss: 0.485803
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.235477	valid_1's binary_logloss: 0.479031
Early stopping, best iteration is:
[3]	training's binary_logloss: 0.350908	valid_1's binary_logloss: 0.435238
Number of finished trials: 20
Best trial:
  Value: 0.8349282296650717
  Params: 
    learning_rate: 0.3732247365582667
    num_leaves: 32
    max_depth: 20
    min_data_in_leaf: 16


## Start training

In [11]:
% % time
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.2,
    'max_depth': 9,
    'num_leaves': 117,
    'min_data_in_leaf': 127,
    'random_state': RANDOM_STATE,
    'verbose': -1
}

params.update(trial.params)

X_train, X_valid, y_train, y_valid = train_test_split(
    train_df[FEATURES],
    train_df[TARGET],
    test_size=len(full_test_df) / len(full_train_df),
    random_state=RANDOM_STATE
)
lgb_train = lgb.Dataset(
    X_train, y_train, categorical_feature=CATEGORIAL_FEATURES, free_raw_data=False)
lgb_valid = lgb.Dataset(
    X_valid, y_valid, categorical_feature=CATEGORIAL_FEATURES, free_raw_data=False)

model = lgb.train(params,
                  lgb_train,
                  num_boost_round=10000,
                  early_stopping_rounds=20,
                  verbose_eval=10,
                  valid_sets=[lgb_train, lgb_valid])

y_pred = model.predict(X_valid) >= 0.5

accuracy = round(accuracy_score(y_valid, y_pred), 4)
print('\n\n>> LGBM accuracy:', accuracy)

UsageError: Line magic function `%` not found.


## Feature Importance

In [None]:
lgb.plot_importance(model, figsize=(15, 10), importance_type='split')
plt.show()

In [None]:
lgb.plot_importance(model, figsize=(15, 10), importance_type='gain')
plt.show()

In [None]:
# show tree and save as png
def save_tree_diagraph(model):
    tree_digraph = lgb.create_tree_digraph(model, show_info=['split_gain', 'internal_count'])

    tree_png = svg2png(tree_digraph._repr_svg_(), output_width=3840)
    tree_png = Image.open(BytesIO(tree_png))

    tree_png.save('tree_digraph.png')

    display(tree_png)
    
save_tree_diagraph(model)

In [None]:
shap.initjs()

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_valid)

shap.summary_plot(shap_values, X_valid, plot_type="bar")

# Submit

In [None]:
test_df[TARGET] = model.predict(test_df[FEATURES]) >= 0.5
test_df[TARGET] = test_df[TARGET].astype(np.int8)
submission_df = test_df[['PassengerId', TARGET]]
submission_df.to_csv('output.csv', index=False)
submission_df