# Decision Tree - 의사결정나무

---

In [None]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# 1 Decision Tree

- Gini Index
- Entropy Index
- Information Gain

### 1.1 결정 트리 복잡도 제어 - Overfitting 방지

#### 1.1.1 max_depth 조절

In [None]:
# Visual Python: Machine Learning > Data Sets
from sklearn.datasets import load_breast_cancer

# Load the dataset object; its .data, .feature_names and .target attributes are used below.
ldata = load_breast_cancer()

# One column per feature, then the class label appended as the last column 'target'.
df_ldata = pd.DataFrame(ldata.data, columns=ldata.feature_names)
df_ldata['target'] = ldata.target

# Bare expression: shows the frame as the notebook cell's output.
df_ldata

In [None]:
# Visual Python: Machine Learning > Data Split
from sklearn.model_selection import train_test_split

# Features are every column except the label. Dropping 'target' selects the same
# columns, in the same order, as the hand-written list of 30 names it replaces,
# without having to keep that list in sync with the dataset.
# random_state pins the shuffle so the train/test scores computed from this split
# are reproducible from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(
    df_ldata.drop(columns=['target']),
    df_ldata['target'],
    random_state=42,
)

In [None]:
# Visual Python: Machine Learning > Classifier
from sklearn.tree import DecisionTreeClassifier

# Tree with default settings (no max_depth given). random_state fixes the random
# feature permutation scikit-learn applies at each split, so refitting on the
# same data yields the same tree and the same scores.
tree = DecisionTreeClassifier(random_state=42)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Train the default-settings tree on the training split.
tree.fit(X=X_train, y=y_train)

In [None]:
# Visual Python: Machine Learning > Model Info
# Score of the default-settings tree on the data it was trained on.
scores = tree.score(X=X_train, y=y_train)
scores

In [None]:
# Visual Python: Machine Learning > Model Info
# Score of the default-settings tree on the held-out test split.
scores_test = tree.score(X=X_test, y=y_test)
scores_test

#### max_depth = 4

In [None]:
# Visual Python: Machine Learning > Classifier
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree (at most 4 levels) to curb overfitting. random_state fixes
# the random feature permutation scikit-learn applies at each split, so
# refitting on the same data yields the same tree and the same scores.
tree_max4 = DecisionTreeClassifier(max_depth=4, random_state=42)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Train the depth-4 tree on the training split.
tree_max4.fit(X=X_train, y=y_train)

In [None]:
# Visual Python: Machine Learning > Model Info
# Score of the depth-4 tree on the data it was trained on.
# NOTE: rebinds `scores`, replacing whatever value an earlier cell stored there.
scores = tree_max4.score(X=X_train, y=y_train)
scores

In [None]:
# Visual Python: Machine Learning > Model Info
# Score of the depth-4 tree on the held-out test split.
# NOTE: rebinds `scores_test`, replacing whatever value an earlier cell stored there.
scores_test = tree_max4.score(X=X_test, y=y_test)
scores_test

### 1.3 결정 트리의 특성 중요도

#### Feature Importance - Tree 계열 모델에만 적용 가능
#### Permutation Feature Importance - 모든 모델에 적용 가능

In [None]:
# Visual Python: Machine Learning > Model Info
def vp_create_feature_importances(model, X_train=None, sort=False):
    """Tabulate a fitted model's feature importances.

    Args:
        model: Object exposing a ``feature_importances_`` sequence with one
            value per feature.
        X_train: Optional DataFrame whose column names label the rows. Any
            other value (including ``None``) falls back to generated names
            ``X0``, ``X1``, ...
        sort: When true, order the rows by ``Feature_importance``, largest
            first.

    Returns:
        A DataFrame indexed by feature name, rounded to 2 decimals, with two
        columns: ``Feature_importance`` (the raw value) and ``Percentage``
        (the value scaled so that the largest importance equals 100). If the
        largest importance is 0, ``Percentage`` is 0 for every row.
    """
    importances = model.feature_importances_

    if isinstance(X_train, pd.DataFrame):
        feature_names = X_train.columns
    else:
        feature_names = ['X{}'.format(i) for i in range(len(importances))]

    df_i = pd.DataFrame(importances, index=feature_names, columns=['Feature_importance'])

    largest = df_i['Feature_importance'].max()
    if largest == 0:
        # Every importance is zero (e.g. a tree that never split): scaling by
        # the maximum would be 0/0 and fill the column with NaN.
        df_i['Percentage'] = 0.0
    else:
        df_i['Percentage'] = 100 * (df_i['Feature_importance'] / largest)

    if sort:
        df_i = df_i.sort_values(by='Feature_importance', ascending=False)

    return df_i.round(2)

In [None]:
# Visual Python: Machine Learning > Model Info
def vp_plot_feature_importances(model, X_train=None, sort=False, top_count=0):
    """Draw a horizontal bar chart of a model's relative feature importances.

    The values plotted are the ``Percentage`` column produced by
    ``vp_create_feature_importances(model, X_train, sort)``.

    Args:
        model: Passed through to ``vp_create_feature_importances``.
        X_train: Passed through to ``vp_create_feature_importances``.
        sort: When true, bars are plotted in ascending order of percentage.
            When false, they keep the order of the importance table.
        top_count: Only used when ``sort`` is true: if greater than 0, only
            the ``top_count`` largest percentages are plotted.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    percentages = vp_create_feature_importances(model, X_train, sort)['Percentage']

    if sort:
        # Ascending order, so the tail of the series holds the largest values.
        percentages = percentages.sort_values()
        if top_count > 0:
            percentages = percentages.tail(top_count)

    percentages.plot(kind='barh')
    plt.xlabel('Feature importance Percentage')
    plt.ylabel('Features')

    plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
# Chart the 10 largest feature importances of `tree` (the classifier created
# without max_depth), labelled with the column names of X_train.
vp_plot_feature_importances(model=tree, X_train=X_train, sort=True, top_count=10)

---

In [None]:
# End of file