## 3.1.3 의사결정 트리

In [1]:
import pandas as pd
df = pd.read_csv('./titanic/train.csv', index_col='PassengerId')
print(df.head())

             Survived  Pclass  \
PassengerId                     
1                   0       3   
2                   1       1   
3                   1       3   
4                   1       1   
5                   0       3   

                                                          Name     Sex   Age  \
PassengerId                                                                    
1                                      Braund, Mr. Owen Harris    male  22.0   
2            Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   
3                                       Heikkinen, Miss. Laina  female  26.0   
4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   
5                                     Allen, Mr. William Henry    male  35.0   

             SibSp  Parch            Ticket     Fare Cabin Embarked  
PassengerId                                                          
1                1      0         A/5 21171   7.2500   NaN        S

In [2]:
df = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived']]
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df = df.dropna() 
X = df.drop('Survived', axis=1)
y = df['Survived']

### Split train/test dataset

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### DecisionTreeClassifier

In [14]:
import time
start = time.time()

from sklearn import tree
model = tree.DecisionTreeClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.3f}'.format(accuracy))

print('Elased time: %0.2fs' % (time.time()-start))

Accuracy: 0.832
Elased time: 0.01s


### confusion_matrix

In [15]:
from sklearn.metrics import confusion_matrix
pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['Predicted Not Survival', 'Predicted Survival'],
    index=['True Not Survival', 'True Survival']
)

Unnamed: 0,Predicted Not Survival,Predicted Survival
True Not Survival,98,14
True Survival,16,51


### classification_report

In [16]:
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred = y_predict))

              precision    recall  f1-score   support

           0       0.85      0.86      0.85       112
           1       0.76      0.75      0.75        67

    accuracy                           0.82       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.82      0.82      0.82       179



### Hyperparameter Optimization

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

def randomized_search_clf(params, runs=20, clf=DecisionTreeClassifier(random_state=2)):
    rand_clf = RandomizedSearchCV(clf, params, n_iter=runs, cv=5, n_jobs=-1, random_state=2)
    
    rand_clf.fit(X_train, y_train)

    best_model = rand_clf.best_estimator_    
    
    best_score = rand_clf.best_score_
    print("Best Score: {:.3f}".format(best_score))

    y_pred = best_model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.3f}'.format(accuracy))
        
    return best_model

In [18]:
from sklearn.metrics import accuracy_score

best_model = randomized_search_clf(
    params={
        'criterion':['entropy', 'gini'],
        'splitter':['random', 'best'],
        'min_samples_split':[2, 3, 4, 5, 6, 8, 10],
        'min_samples_leaf':[1, 0.01, 0.02, 0.03, 0.04],
        'min_impurity_decrease':[0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
        'max_leaf_nodes':[10, 15, 20, 25, 30, 35, 40, 45, 50, None],
        'max_features':['sqrt', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
        'max_depth':[None, 2,4,6,8],
        'min_weight_fraction_leaf':[0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05]
    })

Best Score: 0.789
Accuracy: 0.838


In [9]:
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.3f}'.format(accuracy))

leaf_node_count = 0
tree = best_model.tree_
for i in range(tree.node_count):
    if (tree.children_left[i] == -1) and (tree.children_right[i] == -1):
        leaf_node_count += 1
print('# of Leaf Nodes:', leaf_node_count)

Accuracy: 0.838
# of Leaf Nodes: 16


### Verify the new hyperparameters

In [10]:
params = best_model.get_params(deep=True)
print(params)

criterion = params['criterion']
splitter = params['splitter']
min_samples_split = params['min_samples_split']
min_samples_leaf = params['min_samples_leaf']
min_impurity_decrease = params['min_impurity_decrease']
max_leaf_nodes = params['max_leaf_nodes']
max_features = params['max_features']
max_depth = params['max_depth']
min_weight_fraction_leaf = params['min_weight_fraction_leaf']
# random_state = params['random_state']

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 4, 'max_features': 0.75, 'max_leaf_nodes': 25, 'min_impurity_decrease': 0.0005, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0075, 'random_state': 2, 'splitter': 'best'}


In [11]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(
    criterion=criterion, splitter=splitter, min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf, min_impurity_decrease=min_impurity_decrease,
    max_leaf_nodes=max_leaf_nodes, max_features=max_features, max_depth=max_depth, 
    min_weight_fraction_leaf=min_weight_fraction_leaf)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.3f}'.format(accuracy))

leaf_node_count = 0
tree = model.tree_
for i in range(tree.node_count):
    if (tree.children_left[i] == -1) and (tree.children_right[i] == -1):
        leaf_node_count += 1
print('# of Leaf Nodes:', leaf_node_count)

Accuracy: 0.821
# of Leaf Nodes: 15


In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred = y_predict))

              precision    recall  f1-score   support

           0       0.85      0.86      0.85       112
           1       0.76      0.75      0.75        67

    accuracy                           0.82       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.82      0.82      0.82       179

