In [9]:
import pandas as pd 

df_titanic = pd.read_csv('data/titanic.csv', index_col='PassengerId')
df_titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
df_titanic.isna().sum().to_frame().T

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,0,0,0,177,0,0,0,0,687,2


Categorical columns의 결측치는 **mode**로 처리한다

In [11]:
df_titanic['Embarked'].mode()[0] 

'S'

In [12]:
df_titanic = pd.get_dummies(
    df_titanic.assign(
        Embarked=lambda x: x['Embarked'].fillna(x['Embarked'].mode().iloc[0]),
        Age=lambda x: x['Age'].fillna(x['Age'].mean()),
    ), columns=['Pclass', 'Embarked']
).assign(
    Sex=lambda x: x['Sex'] == 'female'
)

X_num = ['Age', 'SibSp', 'Parch', 'Fare']
X_dum = [i for i in df_titanic.columns if i.startswith('Pclass') or i.startswith('Sex') or i.startswith('Embarked')]
X_all = X_num + X_dum
X_all 

['Age',
 'SibSp',
 'Parch',
 'Fare',
 'Sex',
 'Pclass_1',
 'Pclass_2',
 'Pclass_3',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [29]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_titanic, train_size=0.8, stratify=df_titanic['Survived'], random_state=123)

In [13]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(max_depth=1)
model.fit(df_titanic[X_all], df_titanic['Survived'])

DecisionTreeClassifier(max_depth=1)

In [30]:
from sklearn.model_selection import GridSearchCV
import numpy as np

params = {
    'max_depth': np.arange(2, 13),
    'max_leaf_nodes': np.arange(2, 10)**2,
    'min_samples_leaf': np.arange(2, 64, 4),
    'criterion': ['gini', 'entropy']
}

In [31]:
%%time
model = DecisionTreeClassifier(random_state=123)
gs = GridSearchCV(estimator=model, param_grid=params, cv=4, n_jobs=-1, verbose=1, scoring='f1')
gs.fit(df_train[X_all], df_train['Survived'])

Fitting 4 folds for each of 2816 candidates, totalling 11264 fits
Wall time: 17.6 s


GridSearchCV(cv=4, estimator=DecisionTreeClassifier(random_state=123),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12]),
                         'max_leaf_nodes': array([ 4,  9, 16, 25, 36, 49, 64, 81], dtype=int32),
                         'min_samples_leaf': array([ 2,  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62])},
             scoring='f1', verbose=1)

In [32]:
score_df = pd.DataFrame(gs.cv_results_)
score_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_leaf_nodes,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00525,0.000434,0.003499,0.000502,gini,2,4,2,"{'criterion': 'gini', 'max_depth': 2, 'max_lea...",0.788321,0.618182,0.65625,0.7,0.690688,0.063368,1697
1,0.006752,0.001482,0.005249,0.003343,gini,2,4,6,"{'criterion': 'gini', 'max_depth': 2, 'max_lea...",0.788321,0.618182,0.65625,0.7,0.690688,0.063368,1697
2,0.00475,0.000432,0.0035,0.000499,gini,2,4,10,"{'criterion': 'gini', 'max_depth': 2, 'max_lea...",0.788321,0.618182,0.65625,0.7,0.690688,0.063368,1697
3,0.00575,0.001479,0.00375,0.00083,gini,2,4,14,"{'criterion': 'gini', 'max_depth': 2, 'max_lea...",0.788321,0.618182,0.65625,0.7,0.690688,0.063368,1697
4,0.005751,0.001919,0.004498,0.001117,gini,2,4,18,"{'criterion': 'gini', 'max_depth': 2, 'max_lea...",0.791367,0.618182,0.65625,0.7,0.69145,0.064544,1681


In [33]:
topk_df = score_df.nlargest(5, 'mean_test_score')
topk_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_leaf_nodes,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
1328,0.0055,0.002062,0.002251,0.0004331881,gini,12,25,2,"{'criterion': 'gini', 'max_depth': 12, 'max_le...",0.816,0.748092,0.779412,0.78125,0.781188,0.024033,1
1200,0.00475,0.000433,0.002499,0.0005008581,gini,11,25,2,"{'criterion': 'gini', 'max_depth': 11, 'max_le...",0.816,0.748092,0.761194,0.78125,0.776634,0.025613,2
944,0.005249,0.001639,0.003251,0.0004326719,gini,9,25,2,"{'criterion': 'gini', 'max_depth': 9, 'max_lea...",0.825397,0.748092,0.740157,0.777778,0.772856,0.033419,3
2464,0.0045,0.0005,0.00325,0.0004333945,entropy,10,16,2,"{'criterion': 'entropy', 'max_depth': 10, 'max...",0.848485,0.763359,0.722689,0.754098,0.772158,0.046574,4
2592,0.006499,0.003202,0.003002,2.598106e-07,entropy,11,16,2,"{'criterion': 'entropy', 'max_depth': 11, 'max...",0.848485,0.763359,0.722689,0.754098,0.772158,0.046574,4


In [34]:
topk_df.iloc[0]['params']

{'criterion': 'gini',
 'max_depth': 12,
 'max_leaf_nodes': 25,
 'min_samples_leaf': 2}

In [35]:
dt_best = gs.best_estimator_
dt_best

DecisionTreeClassifier(max_depth=12, max_leaf_nodes=25, min_samples_leaf=2,
                       random_state=123)

In [38]:
from sklearn.metrics import classification_report

print(classification_report(dt_best.predict(df_test[X_all]), df_test['Survived']))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85       112
           1       0.74      0.76      0.75        67

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [41]:
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score

roc_auc_score(dt_best.predict(df_test[X_all]), df_test['Survived'])

0.8002398720682303

In [42]:
confusion_matrix(dt_best.predict(df_test[X_all]), df_test['Survived'])

array([[94, 18],
       [16, 51]], dtype=int64)

In [43]:
accuracy_score(dt_best.predict(df_test[X_all]), df_test['Survived'])

0.8100558659217877