# Titanic survivor prediction model

## import Modules & Set parameters

In [1]:
import sys
sys.path.append('..')
import multiprocessing

import time
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')
%matplotlib inline
from IPython.display import display

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

import optuna

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from boruta import BorutaPy
from sklearn.feature_selection import RFECV

from functools import partial

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


In [2]:
N_CPU = multiprocessing.cpu_count()
N_JOBS=N_CPU - 1

RANDOM_STATE=123
N_SPLITS = 10
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
N_TRIALS = 10

## Load datasets

In [3]:
# Path of datasets
path_train = '../data/train_features.csv'
path_test = '../data/test_features.csv'

# Create dataframe for training data and test data
train = pd.read_csv(path_train)
test = pd.read_csv(path_test)

train_orig = train.copy()
test_orig = test.copy()

In [4]:
train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 49 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Title          891 non-null object
LastName       891 non-null object
TicketTitle    891 non-null object
SameTicket     891 non-null int64
AgeIsnull      891 non-null int64
FamilySize     891 non-null int64
IsAlone        891 non-null int64
IsChild        891 non-null int64
LogFare        891 non-null float64
Sex_male       891 non-null int64
Em_Q           891 non-null int64
Em_C           891 non-null int64
Tit_Mrs        891 non-null int64
Tit_Miss       891 non-null int64
Tit_Rare       891 non-null int64
Tit_Mr         891 non-null int64

(None, None)

In [5]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Tick_A,Tick_4,Tick_9,Tick_C,Tick_W,Tick_6,Tick_2,Tick_L,Tick_8,Tick_F
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,X,...,1,0,0,0,0,0,0,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,...,0,0,0,0,0,0,0,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,X,...,0,0,0,0,0,0,0,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,...,0,0,0,0,0,0,0,0,0,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,X,...,0,0,0,0,0,0,0,0,0,0


In [6]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Title,...,Tick_A,Tick_4,Tick_9,Tick_C,Tick_W,Tick_6,Tick_2,Tick_L,Tick_8,Tick_F
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,X,Mr,...,0,0,0,0,0,0,0,0,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,X,Mrs,...,0,0,0,0,0,0,0,0,0,0
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,X,Mr,...,0,0,0,0,0,0,1,0,0,0
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,X,Mr,...,0,0,0,0,0,0,0,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,X,Mrs,...,0,0,0,0,0,0,0,0,0,0


In [7]:
columns_object = train.dtypes[train.dtypes=="object"].index.to_list()

In [8]:
# Create X vector and y vector
X_train = train.drop(['Survived','PassengerId','Fare'], axis=1).drop(columns_object,axis=1)
y_train = train['Survived']

X_test = test.drop(['PassengerId','Fare'], axis=1).drop(columns_object,axis=1)

In [9]:
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,SameTicket,AgeIsnull,FamilySize,IsAlone,IsChild,LogFare,...,Tick_A,Tick_4,Tick_9,Tick_C,Tick_W,Tick_6,Tick_2,Tick_L,Tick_8,Tick_F
0,3,22.0,1,0,1,0,2,0,0,0.860338,...,1,0,0,0,0,0,0,0,0,0
1,1,38.0,1,0,2,0,2,0,0,1.852988,...,0,0,0,0,0,0,0,0,0,0
2,3,26.0,0,0,1,0,1,1,0,0.898999,...,0,0,0,0,0,0,0,0,0,0
3,1,35.0,1,0,2,0,2,0,0,1.725095,...,0,0,0,0,0,0,0,0,0,0
4,3,35.0,0,0,1,0,1,1,0,0.905796,...,0,0,0,0,0,0,0,0,0,0


In [10]:
X_test.tail()

Unnamed: 0,Pclass,Age,SibSp,Parch,SameTicket,AgeIsnull,FamilySize,IsAlone,IsChild,LogFare,...,Tick_A,Tick_4,Tick_9,Tick_C,Tick_W,Tick_6,Tick_2,Tick_L,Tick_8,Tick_F
413,3,25.0,0,0,1,1,1,1,0,0.905796,...,1,0,0,0,0,0,0,0,0,0
414,1,39.0,0,0,3,0,1,1,0,2.037028,...,0,0,0,0,0,0,0,0,0,0
415,3,38.5,0,0,1,0,1,1,0,0.860338,...,0,0,0,0,0,0,0,0,0,0
416,3,25.0,0,0,1,1,1,1,0,0.905796,...,0,0,0,0,0,0,0,0,0,0
417,3,7.0,1,1,3,1,3,0,1,1.349439,...,0,0,0,0,0,0,1,0,0,0


In [11]:
X_train.info(), X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 40 columns):
Pclass        891 non-null int64
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
SameTicket    891 non-null int64
AgeIsnull     891 non-null int64
FamilySize    891 non-null int64
IsAlone       891 non-null int64
IsChild       891 non-null int64
LogFare       891 non-null float64
Sex_male      891 non-null int64
Em_Q          891 non-null int64
Em_C          891 non-null int64
Tit_Mrs       891 non-null int64
Tit_Miss      891 non-null int64
Tit_Rare      891 non-null int64
Tit_Mr        891 non-null int64
Cab_G         891 non-null int64
Cab_T         891 non-null int64
Cab_A         891 non-null int64
Cab_C         891 non-null int64
Cab_X         891 non-null int64
Cab_E         891 non-null int64
Cab_F         891 non-null int64
Cab_D         891 non-null int64
Tick_7        891 non-null int64
Tick_1        891 non-null int64


(None, None)

In [12]:
X_train.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'SameTicket', 'AgeIsnull',
       'FamilySize', 'IsAlone', 'IsChild', 'LogFare', 'Sex_male', 'Em_Q',
       'Em_C', 'Tit_Mrs', 'Tit_Miss', 'Tit_Rare', 'Tit_Mr', 'Cab_G', 'Cab_T',
       'Cab_A', 'Cab_C', 'Cab_X', 'Cab_E', 'Cab_F', 'Cab_D', 'Tick_7',
       'Tick_1', 'Tick_5', 'Tick_3', 'Tick_P', 'Tick_A', 'Tick_4', 'Tick_9',
       'Tick_C', 'Tick_W', 'Tick_6', 'Tick_2', 'Tick_L', 'Tick_8', 'Tick_F'],
      dtype='object')

## Feature Selection
特徴量を選択する。ここでは、borutaを用いた場合とRFECVを用いた場合とで特徴量を選択し、
あるモデル（RandomForest)で分類精度が高い方の特徴量を採用する

### Feature selection by boruta

In [13]:
estimator = RandomForestClassifier(n_jobs=-2,max_depth=5)
feat_selector_boruta = BorutaPy(estimator, n_estimators='auto', two_step=True,perc=100,verbose=0, random_state=42)
feat_selector_boruta.fit(X_train.values,y_train.values)

BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                          criterion='gini', max_depth=5,
                                          max_features='auto',
                                          max_leaf_nodes=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=97, n_jobs=-2,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x139F4D160,
                                          verbose=0, warm_start=False),
         max_iter=100, n_estimators='auto', perc=100,
         random_state=R

In [14]:
X_train_selected_boruta = X_train.iloc[:,feat_selector_boruta.support_]
X_test_selected_boruta = X_test.iloc[:,feat_selector_boruta.support_]

In [15]:
display(X_train_selected_boruta.head(), X_test_selected_boruta.head())

Unnamed: 0,Pclass,Age,SameTicket,FamilySize,LogFare,Sex_male,Tit_Mrs,Tit_Miss,Tit_Mr,Cab_X
0,3,22.0,1,2,0.860338,1,0,0,1,1
1,1,38.0,2,2,1.852988,0,1,0,0,0
2,3,26.0,1,1,0.898999,0,0,1,0,1
3,1,35.0,2,2,1.725095,0,1,0,0,0
4,3,35.0,1,1,0.905796,1,0,0,1,1


Unnamed: 0,Pclass,Age,SameTicket,FamilySize,LogFare,Sex_male,Tit_Mrs,Tit_Miss,Tit_Mr,Cab_X
0,3,34.5,1,1,0.893717,1,0,0,1,1
1,3,47.0,1,2,0.845098,0,1,0,0,1
2,2,62.0,1,1,0.986212,1,0,0,1,1
3,3,27.0,1,1,0.937643,1,0,0,1,1
4,3,22.0,2,3,1.089464,0,1,0,0,1


### Feature selection by RFECV

In [16]:
estimator = RandomForestClassifier(n_jobs=-2, n_estimators=100)
feat_selector_rfecv = RFECV(estimator, step=1, cv=5, verbose=0)
feat_selector_rfecv.fit(X_train, y_train)

RFECV(cv=5,
      estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                       criterion='gini', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100, n_jobs=-2,
                                       oob_score=False, random_state=None,
                                       verbose=0, warm_start=False),
      min_features_to_select=1, n_jobs=None, scoring=None, step=1, verbose=0)

In [17]:
X_train_selected_rfecv = X_train.iloc[:,feat_selector_rfecv.support_]
X_test_selected_rfecv = X_test.iloc[:,feat_selector_rfecv.support_]

In [18]:
display(X_train_selected_rfecv.head(),X_test_selected_rfecv.head())

Unnamed: 0,Pclass,Age,SibSp,SameTicket,FamilySize,LogFare,Sex_male,Em_C,Tit_Mrs,Tit_Miss,Tit_Mr,Cab_X,Tick_1,Tick_3
0,3,22.0,1,1,2,0.860338,1,0,0,0,1,1,0,0
1,1,38.0,1,2,2,1.852988,0,1,1,0,0,0,0,0
2,3,26.0,0,1,1,0.898999,0,0,0,1,0,1,0,0
3,1,35.0,1,2,2,1.725095,0,0,1,0,0,0,1,0
4,3,35.0,0,1,1,0.905796,1,0,0,0,1,1,0,1


Unnamed: 0,Pclass,Age,SibSp,SameTicket,FamilySize,LogFare,Sex_male,Em_C,Tit_Mrs,Tit_Miss,Tit_Mr,Cab_X,Tick_1,Tick_3
0,3,34.5,0,1,1,0.893717,1,0,0,0,1,1,0,1
1,3,47.0,1,1,2,0.845098,0,0,1,0,0,1,0,1
2,2,62.0,0,1,1,0.986212,1,0,0,0,1,1,0,0
3,3,27.0,0,1,1,0.937643,1,0,0,0,1,1,0,1
4,3,22.0,1,2,3,1.089464,0,0,1,0,0,1,0,1


### Rank of feature importance

In [19]:
feat_df_boruta =pd.DataFrame(feat_selector_boruta.ranking_,index=X_train.columns,columns=["rank_boruta"]).sort_values(by="rank_boruta")
feat_df_rfecv = pd.DataFrame(feat_selector_rfecv.ranking_,index=X_train.columns,columns=["rank_rfecv"]).sort_values(by="rank_rfecv")

In [20]:
pd.concat([feat_df_boruta, feat_df_rfecv],axis=1,sort=False)

Unnamed: 0,rank_boruta,rank_rfecv
Pclass,1,1
Age,1,1
SameTicket,1,1
Cab_X,1,1
FamilySize,1,1
Tit_Miss,1,1
Tit_Mr,1,1
LogFare,1,1
Sex_male,1,1
Tit_Mrs,1,1


## Effect Evaluation of feature selection
Random Forest ClassifierでのCVスコアがどの程度変化するかで効果を見てみる。ここでは、optunaを用いてハイパーパラメータの最適化を行う

In [21]:
def objective_RFC(X, y, trial):
    '''
    trial:set of hyperparameter    
    '''
    # hyper param
    max_depth = trial.suggest_int('max_depth', 3, 10) 
    n_estimators = trial.suggest_int('n_estimators', 10, 100) 
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'auto', 'log2'])
 
    # model    
    classifier = RandomForestClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        max_features=max_features,
        random_state=RANDOM_STATE,
        verbose=0)
    
    model = Pipeline([("scaler", None), ("clf",classifier)])
    scores = cross_validate(model, X=X, y=y, cv=kfold, n_jobs=N_JOBS)
    return 1.0 - scores['test_score'].mean()

In [22]:
dict_feature={
    'X_train':X_train,
    'X_train_selected_boruta':X_train_selected_boruta,
    'X_train_selected_rfecv':X_train_selected_rfecv
}

for name, X in dict_feature.items():
    f = partial(objective_RFC, X, y_train)
    # 最適化のセッションを作る
    study = optuna.create_study()
    study.optimize(f, n_trials=N_TRIALS,n_jobs=1)
    
    # vector種別を表示
    print("===================")
    print("Vector:",name)

    # 最適化したパラメータを出力する
    print('Best params:', study.best_params)
    print("Best Score: ", 1.0 - study.best_value)

[32m[I 2019-11-22 10:37:53,454][0m Finished trial#0 resulted in value: 0.1817410055612303. Current best value is 0.1817410055612303 with parameters: {'max_depth': 9, 'n_estimators': 51, 'max_features': 'auto'}.[0m
[32m[I 2019-11-22 10:37:55,545][0m Finished trial#1 resulted in value: 0.1806296107138804. Current best value is 0.1806296107138804 with parameters: {'max_depth': 10, 'n_estimators': 93, 'max_features': 'sqrt'}.[0m
[32m[I 2019-11-22 10:37:57,286][0m Finished trial#2 resulted in value: 0.16934258313471795. Current best value is 0.16934258313471795 with parameters: {'max_depth': 4, 'n_estimators': 56, 'max_features': 'auto'}.[0m
[32m[I 2019-11-22 10:37:58,834][0m Finished trial#3 resulted in value: 0.17381199636817635. Current best value is 0.16934258313471795 with parameters: {'max_depth': 4, 'n_estimators': 56, 'max_features': 'auto'}.[0m
[32m[I 2019-11-22 10:38:00,733][0m Finished trial#4 resulted in value: 0.16707013959822947. Current best value is 0.167070139

Vector: X_train
Best params: {'max_depth': 5, 'n_estimators': 78, 'max_features': 'log2'}
Best Score:  0.8329298604017705


[32m[I 2019-11-22 10:38:11,088][0m Finished trial#0 resulted in value: 0.1772962773805471. Current best value is 0.1772962773805471 with parameters: {'max_depth': 10, 'n_estimators': 53, 'max_features': 'sqrt'}.[0m
[32m[I 2019-11-22 10:38:13,369][0m Finished trial#1 resulted in value: 0.17162751106571328. Current best value is 0.17162751106571328 with parameters: {'max_depth': 7, 'n_estimators': 73, 'max_features': 'log2'}.[0m
[32m[I 2019-11-22 10:38:15,013][0m Finished trial#2 resulted in value: 0.16932981500397237. Current best value is 0.16932981500397237 with parameters: {'max_depth': 4, 'n_estimators': 36, 'max_features': 'auto'}.[0m
[32m[I 2019-11-22 10:38:16,587][0m Finished trial#3 resulted in value: 0.17498552945182166. Current best value is 0.16932981500397237 with parameters: {'max_depth': 4, 'n_estimators': 36, 'max_features': 'auto'}.[0m
[32m[I 2019-11-22 10:38:18,383][0m Finished trial#4 resulted in value: 0.16820621949835424. Current best value is 0.1682062

Vector: X_train_selected_boruta
Best params: {'max_depth': 5, 'n_estimators': 86, 'max_features': 'auto'}
Best Score:  0.8362881625241176


[32m[I 2019-11-22 10:38:28,782][0m Finished trial#0 resulted in value: 0.172763874702077. Current best value is 0.172763874702077 with parameters: {'max_depth': 8, 'n_estimators': 89, 'max_features': 'sqrt'}.[0m
[32m[I 2019-11-22 10:38:30,822][0m Finished trial#1 resulted in value: 0.17273890591306318. Current best value is 0.17273890591306318 with parameters: {'max_depth': 5, 'n_estimators': 75, 'max_features': 'log2'}.[0m
[32m[I 2019-11-22 10:38:32,563][0m Finished trial#2 resulted in value: 0.18172823743048472. Current best value is 0.17273890591306318 with parameters: {'max_depth': 5, 'n_estimators': 75, 'max_features': 'log2'}.[0m
[32m[I 2019-11-22 10:38:34,462][0m Finished trial#3 resulted in value: 0.17501191692202922. Current best value is 0.17273890591306318 with parameters: {'max_depth': 5, 'n_estimators': 75, 'max_features': 'log2'}.[0m
[32m[I 2019-11-22 10:38:36,428][0m Finished trial#4 resulted in value: 0.17609777550788785. Current best value is 0.1727389059

Vector: X_train_selected_rfecv
Best params: {'max_depth': 8, 'n_estimators': 99, 'max_features': 'auto'}
Best Score:  0.8339652139371241


- 特徴量選択してもスコアは大きく低下しないことを確認
- 以降では、borutaで抽出した特徴量を用いることとする

## Define Model
- X_train_selected_borutaを用いる
- 複数のアルゴリズムでのスコアを確認し、上位のアルゴリズムを選別する
- ハイパーパラメータのチューニングはGridsearchを利用して実施

In [23]:
X_train_selected = X_train_selected_boruta
X_test_selected = X_test_selected_boruta

In [24]:
N_FEATURES=X_train_selected.columns.shape[0]
N_FEATURES

10

### GaussianNB (Gaussian Naive Bayes)

In [25]:
# Define model
model_NB = Pipeline([("scaler",StandardScaler()),("clf",GaussianNB())])

# Hyper parameters
param_grid_NB = {}

### KNeighborsClassifier

In [26]:
# Define model
model_kNN = Pipeline([("scaler",StandardScaler()),("clf",KNeighborsClassifier())])

# Hyper parameters
param_grid_kNN = {'clf__n_neighbors' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                 'clf__weights' : ['uniform','distance']
                }

### Logistic Regression

In [27]:
# Define model
model_LR = Pipeline([("scaler",StandardScaler()),("clf",LogisticRegression(random_state=RANDOM_STATE))])

# Hyper parameters
param_grid_LR = {'clf__C' : [0.01, 0.05, 0.1, 0.5, 1.0, 5, 10, 50, 100],
                 'clf__solver' : ['newton-cg','lbfgs','liblinear','sag','saga']
                }

### DecisionTreeClassifier

In [28]:
# Define model
model_DT = Pipeline([("scaler",None),("clf",DecisionTreeClassifier(random_state=RANDOM_STATE))])

# Hyper parameters
param_grid_DT = {'clf__max_depth' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                 'clf__max_features' : ['sqrt', 'auto', 'log2'],
                }

### ExtraTreesClassifier

In [29]:
# Define model
model_ET = Pipeline([("scaler",None),("clf",ExtraTreesClassifier(random_state=RANDOM_STATE))])

# Hyper parameters
param_grid_ET = {'clf__max_depth' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                 'clf__n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                 'clf__max_features' : ['sqrt', 'auto', 'log2'],
                }

### RFC (Random Forest Classifier)

In [30]:
# Define model
model_RFC = Pipeline([("scaler",None),("clf",RandomForestClassifier(random_state=RANDOM_STATE))])

# Hyper parameters
param_grid_RFC = {'clf__max_depth' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                 'clf__n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                 'clf__max_features' : ['sqrt', 'auto', 'log2'],
                }

### LinearSVC

In [31]:
# Define model
model_LinearSVC = Pipeline([("scaler",StandardScaler()),("clf",LinearSVC(max_iter=10000,random_state=RANDOM_STATE))])

# Hyper parameters
param_grid_LinearSVC = {
    'clf__C' : [0.1, 0.2, 0.4, 0.8, 1, 2, 4, 8, 10],
    'clf__loss' : ['hinge','squared_hinge'],
}

### SVC

In [32]:
# Define model
model_SVC = Pipeline([("scaler",StandardScaler()),("clf",SVC(random_state=RANDOM_STATE))])

# Hyper parameters
param_grid_SVC = {
    'clf__C' : [0.1, 0.2, 0.4, 0.8, 1, 2, 4, 8, 10],
    'clf__gamma' : ['scale',0.001,0.01,0.1,1.0],
    'clf__kernel' : ['rbf','poly','sigmoid'],
}

### AdaBoost

In [33]:
# Define model
model_AdB = Pipeline([("scaler",StandardScaler()),("clf",AdaBoostClassifier(random_state=RANDOM_STATE))])

# Hyper parameters
param_grid_AdB = {
    'clf__algorithm' : ['SAMME', 'SAMME.R'],
    'clf__learning_rate' : [0.001, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0],
    'clf__n_estimators' : [10,20, 30, 40, 50, 60, 70, 80, 90, 100],
}

### LGBMClassifier

In [34]:
# Define model
model_LGBM = Pipeline([("scaler",None),("clf",LGBMClassifier(random_state=RANDOM_STATE))])

# Hyper parameters
param_grid_LGBM = {
    'clf__learning_rate' : [0.001, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0],
    'clf__n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
}

### XGBoost

In [35]:
# Define model
model_XGB = Pipeline([("scaler",None),("clf",XGBClassifier(random_state=RANDOM_STATE))])

# Hyper parameters
param_grid_XGB = {
    'clf__learning_rate' : [0.001, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0],
    'clf__n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
}

### GradientBoostingClassifier

In [36]:
# Define model
model_GBC = Pipeline([("scaler",None),("clf",GradientBoostingClassifier(random_state=RANDOM_STATE))])

# Hyper parameters
param_grid_GBC = {
    'clf__learning_rate' : [0.001, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0],
    'clf__n_estimators' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
}

### Deep Neural Network model using keras

In [37]:
# Keras model
def dnn_model(activation="relu", optimizer='adam', num_input=N_FEATURES):
    
    # Initializing our DNN
    model = Sequential()
    
    # Adding the input layer and the first hidden layer of our ANN with dropout
    model.add(Dense(units=32, kernel_initializer='glorot_uniform', activation=activation, input_shape=(num_input,)))
    
    # Add other layers, it is not necessary to pass the shape because there is a layer before
    model.add(Dense(units=64, kernel_initializer='glorot_uniform', activation=activation))
    model.add(Dropout(rate=0.5))
    
    model.add(Dense(units=64, kernel_initializer='glorot_uniform', activation=activation))
    model.add(Dropout(rate=0.5))
    
    # Adding the output layer
    model.add(Dense(units=1, kernel_initializer='glorot_uniform', activation='sigmoid'))
    
    # Compiling the DNN
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model


# Define model
model_DNN = Pipeline([
    ("scaler",StandardScaler()),
    ("clf",KerasClassifier(build_fn=dnn_model,verbose=0))
])

# Hyper parameters
param_grid_DNN = {
    'clf__epochs' : [10, 30, 50],
    'clf__batch_size' : [16, 32],
}

## Model Comparison

In [38]:
models =[
    {'name':'GaussianNB','model':model_NB, 'param_grid':param_grid_NB},
    {'name':'KNeighborsClassifier','model':model_kNN, 'param_grid':param_grid_kNN},
    {'name':'LogisticRegression','model':model_LR, 'param_grid':param_grid_LR},
    {'name':'DecisionTreeClassifier','model':model_DT, 'param_grid':param_grid_DT},
    {'name':'ExtraTreesClassifier','model':model_ET, 'param_grid':param_grid_ET},
    {'name':'RandomForestClassifier','model':model_RFC, 'param_grid':param_grid_RFC},
    {'name':'LinearSVC','model':model_LinearSVC, 'param_grid':param_grid_LinearSVC},
    {'name':'SVC','model':model_SVC, 'param_grid':param_grid_SVC},
    {'name':'AdaBoostClassifier','model':model_AdB, 'param_grid':param_grid_AdB},
    {'name':'LGBMClassifier','model':model_LGBM, 'param_grid':param_grid_LGBM},    
    {'name':'XGBClassifier','model':model_XGB, 'param_grid':param_grid_XGB},  
    {'name':'GradientBoostingClassifier','model':model_GBC, 'param_grid':param_grid_GBC},
    {'name':'DeepNeuralNetwork','model':model_DNN, 'param_grid':param_grid_DNN},
]

df_ModelEvaluation = pd.DataFrame({'ModelName':[],'BestParameters': [],'BestScore': [], 'BestModel':[]})

# search best hyper parameters
for index, dict_model in enumerate(models):
    model_name = dict_model['name']
    model = dict_model['model']
    param_grid = dict_model['param_grid']
 
    print('========================================')
    print('Model Name: ', model_name)
    
    # grid search
    t_start = time.time()    
    
    if model_name == 'DeepNeuralNetwork':
        gs = GridSearchCV(model, param_grid = param_grid, cv=kfold, scoring="accuracy", n_jobs= 1, verbose = 0)
    else:
        gs = GridSearchCV(model, param_grid = param_grid, cv=kfold, scoring="accuracy", n_jobs= N_JOBS, verbose = 0)

    gs.fit(X_train_selected,y_train)
    t_end = time.time() 
    
    # extract best model
    params_best = gs.best_params_
    score_best = gs.best_score_
    model_best = gs.best_estimator_
    print('Best params:', params_best)
    print("Best Score: ", score_best)
    print("Best model: ", model_best)
    print("Elaspled time: ", t_end-t_start)
    
    # 最適化したパラメータとスコアをdataframeに格納する
    df_ModelEvaluation.loc[index]=[model_name, params_best, score_best, model_best]


Model Name:  GaussianNB
Best params: {}
Best Score:  0.792368125701459
Best model:  Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf', GaussianNB(priors=None, var_smoothing=1e-09))],
         verbose=False)
Elaspled time:  1.4769668579101562
Model Name:  KNeighborsClassifier
Best params: {'clf__n_neighbors': 7, 'clf__weights': 'uniform'}
Best Score:  0.8271604938271605
Best model:  Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=7, p=2,
                                      weights='uniform'))],
         verbose=False)
Elaspled time:  1.9998209476470947
Model Name:  LogisticRegressio



Best params: {'clf__max_depth': 6, 'clf__max_features': 'sqrt', 'clf__n_estimators': 50}
Best Score:  0.835016835016835
Best model:  Pipeline(memory=None,
         steps=[('scaler', None),
                ('clf',
                 ExtraTreesClassifier(bootstrap=False, class_weight=None,
                                      criterion='gini', max_depth=6,
                                      max_features='sqrt', max_leaf_nodes=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=1, min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      n_estimators=50, n_jobs=None,
                                      oob_score=False, random_state=123,
                                      verbose=0, warm_start=False))],
         verbose=False)
Elaspled time:  88.45047330856323
Model Name:  RandomF



Best params: {'clf__max_depth': 5, 'clf__max_features': 'sqrt', 'clf__n_estimators': 20}
Best Score:  0.8395061728395061
Best model:  Pipeline(memory=None,
         steps=[('scaler', None),
                ('clf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=5,
                                        max_features='sqrt',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=20, n_jobs=None,
                                        oob_score=False, random_state=123,
                                        verbose=0, warm_start=False))],
         verbose=F



Best params: {'clf__C': 0.4, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}
Best Score:  0.8338945005611672
Best model:  Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 SVC(C=0.4, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma='scale',
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=123, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)
Elaspled time:  117.16552495956421
Model Name:  AdaBoostClassifier




Best params: {'clf__algorithm': 'SAMME.R', 'clf__learning_rate': 0.2, 'clf__n_estimators': 90}
Best Score:  0.8249158249158249
Best model:  Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=0.2, n_estimators=90,
                                    random_state=123))],
         verbose=False)
Elaspled time:  64.84659814834595
Model Name:  LGBMClassifier
Best params: {'clf__learning_rate': 0.02, 'clf__n_estimators': 100}
Best Score:  0.8305274971941639
Best model:  Pipeline(memory=None,
         steps=[('scaler', None),
                ('clf',
                 LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                colsample_bytree=1.0, importance_type='split',
                                learning_rate=0.02, max_depth=-1,
       



Best params: {'clf__learning_rate': 0.05, 'clf__n_estimators': 60}
Best Score:  0.8327721661054994
Best model:  Pipeline(memory=None,
         steps=[('scaler', None),
                ('clf',
                 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                            learning_rate=0.05, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=60,
                                            n_iter_no_change=None,
                                  

In [39]:
df_ModelEvaluation.sort_values(by='BestScore', ascending=False)

Unnamed: 0,ModelName,BestParameters,BestScore,BestModel
5,RandomForestClassifier,"{'clf__max_depth': 5, 'clf__max_features': 'sq...",0.839506,"(None, (DecisionTreeClassifier(class_weight=No..."
10,XGBClassifier,"{'clf__learning_rate': 0.5, 'clf__n_estimators...",0.836139,"(None, XGBClassifier(base_score=0.5, booster='..."
4,ExtraTreesClassifier,"{'clf__max_depth': 6, 'clf__max_features': 'sq...",0.835017,"(None, (ExtraTreeClassifier(class_weight=None,..."
7,SVC,"{'clf__C': 0.4, 'clf__gamma': 'scale', 'clf__k...",0.833895,"(StandardScaler(copy=True, with_mean=True, wit..."
11,GradientBoostingClassifier,"{'clf__learning_rate': 0.05, 'clf__n_estimator...",0.832772,"(None, ([DecisionTreeRegressor(criterion='frie..."
9,LGBMClassifier,"{'clf__learning_rate': 0.02, 'clf__n_estimator...",0.830527,"(None, LGBMClassifier(boosting_type='gbdt', cl..."
12,DeepNeuralNetwork,"{'clf__batch_size': 16, 'clf__epochs': 10}",0.830527,"(StandardScaler(copy=True, with_mean=True, wit..."
1,KNeighborsClassifier,"{'clf__n_neighbors': 7, 'clf__weights': 'unifo...",0.82716,"(StandardScaler(copy=True, with_mean=True, wit..."
8,AdaBoostClassifier,"{'clf__algorithm': 'SAMME.R', 'clf__learning_r...",0.824916,"(StandardScaler(copy=True, with_mean=True, wit..."
3,DecisionTreeClassifier,"{'clf__max_depth': 5, 'clf__max_features': 'sq...",0.823793,"(None, DecisionTreeClassifier(class_weight=Non..."


## Voting Classifier

上位３つのモデル（RFC,XGB, ET）の多数決モデルを作成する

In [40]:
# Define Hybrid model
model_RFC = df_ModelEvaluation[df_ModelEvaluation['ModelName']=='RandomForestClassifier'].BestModel.values[0]
model_XGB = df_ModelEvaluation[df_ModelEvaluation['ModelName']=='XGBClassifier'].BestModel.values[0]
model_ET = df_ModelEvaluation[df_ModelEvaluation['ModelName']=='ExtraTreesClassifier'].BestModel.values[0]

VC_model = VotingClassifier(
    estimators=[
        ('RFC', model_RFC),
        ('XGB', model_XGB),
        ('ET', model_ET),
    ],
    voting='hard',
    weights=[1,1,1]
)

# Evaluate Hybrid model
score = cross_val_score(VC_model, X=X_train_selected, y=y_train, cv=kfold, scoring = "accuracy",n_jobs=N_JOBS)
print('Hybrid model score: ', score.mean(),score.std())


Hybrid model score:  0.8385481216660992 0.043581205573152766


#### 考察
- Hybrid modelのスコアはRandomForestベースのモデルよりもスコアが低い
- RandomForestベースのモデルで推論することにする

## Best model

In [41]:
model_best = model_RFC

## Prediction

In [42]:
X_test_selected.columns, X_train_selected.columns

(Index(['Pclass', 'Age', 'SameTicket', 'FamilySize', 'LogFare', 'Sex_male',
        'Tit_Mrs', 'Tit_Miss', 'Tit_Mr', 'Cab_X'],
       dtype='object'),
 Index(['Pclass', 'Age', 'SameTicket', 'FamilySize', 'LogFare', 'Sex_male',
        'Tit_Mrs', 'Tit_Miss', 'Tit_Mr', 'Cab_X'],
       dtype='object'))

In [43]:
y_predict = model_best.predict(X_test_selected)

In [44]:
result_df = test.copy()
result_df['Survived'] = y_predict.astype(int)
result_df

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Title,...,Tick_4,Tick_9,Tick_C,Tick_W,Tick_6,Tick_2,Tick_L,Tick_8,Tick_F,Survived
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,X,Mr,...,0,0,0,0,0,0,0,0,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0000,X,Mrs,...,0,0,0,0,0,0,0,0,0,0
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,X,Mr,...,0,0,0,0,0,1,0,0,0,0
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,X,Mr,...,0,0,0,0,0,0,0,0,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,X,Mrs,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",25.0,0,0,A.5. 3236,8.0500,X,Mr,...,0,0,0,0,0,0,0,0,0,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",39.0,0,0,PC 17758,108.9000,C105,Dona,...,0,0,0,0,0,0,0,0,0,1
415,1307,3,"Saether, Mr. Simon Sivertsen",38.5,0,0,SOTON/O.Q. 3101262,7.2500,X,Mr,...,0,0,0,0,0,0,0,0,0,0
416,1308,3,"Ware, Mr. Frederick",25.0,0,0,359309,8.0500,X,Mr,...,0,0,0,0,0,0,0,0,0,0


## Create Kaggle Submission File and Submit

In [47]:
result_df.to_csv('../submission/submission.csv', columns=['PassengerId', 'Survived'], index=False)

In [None]:
!cat '../submission/submission.csv'

In [None]:
!kaggle competitions submit -c titanic -f '../submission/submission.csv' -m "test"

In [None]:
!kaggle competitions submissions -c titanic

## Result of public score
public score 0.77990