## Used Packages

matplotlib : 시각화를 위한 그래프 라이브러리

          graph library for visualization

numpy(numerical python) : 선형대수(배열,행렬) 연산에 효과적인 라이브러리

          An effective library for linear algebra(arrays, matrices)

pandas : 구조화된 데이터를 가공하는데 효과적인 라이브러리 (dataframe)

          An effective library for structured data processing

scikit-learn : 데이터 분석을 위한 라이브러리 (numpy, scipy, matplotlib 기반)

          library for data analysis

statsmodels : 통계적 모델 추정을 위한 라이브러리

          library for statistical model estimation
          
seaborn : 다양한 색상 테마와 통계용 차트 등의 기능이 추가된 시각화 라이브러리 (matplotlib 기반)

          Visualization library with added features such as various color themes and statistical charts 

## Used Dataset
### Iris Data

종속변수(dependent variable) : 꽃의 종

독립변수(independent variable) : Petal.Length, Petal.Width

In [6]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()

# Features: only columns 2 and 3 of iris.data (labelled as the petal
# measurements below).  Target: the integer class codes.
dfx = pd.DataFrame(iris.data[:, [2, 3]])
dfy = pd.DataFrame(iris.target)

# Put features and target side by side and give the columns readable names.
dfiris = pd.concat([dfx, dfy], axis=1)
dfiris.columns = ['Petal.Length', 'Petal.Width', 'Species']

# Swap the integer class codes for the species names.
dfiris['Species'] = dfiris['Species'].map(
    {
        0: "setosa",
        1: "versicolor",
        2: "virginica",
    }
)
dfiris

Unnamed: 0,Petal.Length,Petal.Width,Species
0,1.4,0.2,setosa
1,1.4,0.2,setosa
2,1.3,0.2,setosa
3,1.5,0.2,setosa
4,1.4,0.2,setosa
5,1.7,0.4,setosa
6,1.4,0.3,setosa
7,1.5,0.2,setosa
8,1.4,0.2,setosa
9,1.5,0.1,setosa


## Split dataset

In [7]:
from sklearn.model_selection import train_test_split # 데이터를 train과 test로 분리

# Hold out 30% of the samples as a test set (fixed random_state=1), using
# only feature columns 2 and 3.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data[:, [2, 3]],
    iris.target,
    test_size=0.3,
    random_state=1,
)

# Wrap every array in a DataFrame so features and labels can be joined.
dfXtrain = pd.DataFrame(x_train)
dfXtest = pd.DataFrame(x_test)
dfytrain = pd.DataFrame(y_train)
dfytest = pd.DataFrame(y_test)

dftrain = pd.concat([dfXtrain, dfytrain], axis=1)
dftest = pd.concat([dfXtest, dfytest], axis=1)

# Both tables get the same three column labels.
dftrain.columns = dftest.columns = ['Petal.Length', 'Petal.Width', 'Species']

dftrain

Unnamed: 0,Petal.Length,Petal.Width,Species
0,6.9,2.3,2
1,1.7,0.3,0
2,1.4,0.2,0
3,1.4,0.3,0
4,3.9,1.4,1
5,1.5,0.2,0
6,1.3,0.2,0
7,6.7,2.2,2
8,5.4,2.1,2
9,6.3,1.8,2


## Decision Tree (Categorical)

In [8]:
from sklearn.tree import DecisionTreeClassifier

# Gini-impurity classifier limited to a single level (max_depth=1),
# trained on the training split.
tree1 = DecisionTreeClassifier(criterion='gini', max_depth=1, random_state=0)
tree1 = tree1.fit(x_train, y_train)

## Visualization

In [11]:
%matplotlib inline

#notebook을 실행한 브라우저에서 바로 그림을 볼 수 있음

import io
import pydot #파이썬에서 dot스크립트 언어를 파싱하기 위하 도구
import graphviz # 다이어그램 형태의 그림을 생성하기 위한 도구
from IPython.core.display import Image 
from sklearn.tree import export_graphviz 


def draw_decision_tree(model, feature_names=None, class_names=None):
    """Render a fitted decision tree as a PNG diagram for the notebook.

    The tree is exported as Graphviz DOT text into an in-memory buffer,
    parsed with pydot and rendered to PNG.  Rendering requires the Graphviz
    ``dot`` executable on PATH; when it is missing the call fails with
    ``FileNotFoundError: "dot" not found in path``.

    Args:
        model: Fitted tree estimator accepted by ``export_graphviz``.
        feature_names: Labels for the features the tree was trained on.
            Defaults to ``iris.feature_names[2:]`` (the previous hard-coded
            value), so existing one-argument calls behave as before.
        class_names: Labels for the target classes.  Defaults to
            ``iris.target_names``.

    Returns:
        An IPython ``Image`` built from the PNG bytes.
    """
    # Fall back to the values that used to be hard-coded so trees trained on
    # other feature subsets can be drawn by passing explicit labels.
    if feature_names is None:
        feature_names = iris.feature_names[2:]
    if class_names is None:
        class_names = iris.target_names

    # StringIO is a file-like text buffer: export_graphviz writes the DOT
    # source into it instead of a file on disk.
    dot_buf = io.StringIO()
    export_graphviz(model, out_file=dot_buf, feature_names=feature_names,
                    class_names=class_names, filled=True, rounded=True)

    # graph_from_dot_data returns a list of graphs; the first one is used.
    graph = pydot.graph_from_dot_data(dot_buf.getvalue())[0]
    image = graph.create_png()
    return Image(image)




In [12]:
# Show the diagram of the depth-1 classifier as this cell's output.
draw_decision_tree(tree1)

FileNotFoundError: [Errno 2] "dot" not found in path.

In [None]:
tree1.feature_importances_  # check the feature importances

In [None]:
# Gini-impurity classifier with up to two levels (max_depth=2), trained on
# the training split and then drawn.
tree2 = DecisionTreeClassifier(criterion='gini', max_depth=2, random_state=0)
tree2 = tree2.fit(x_train, y_train)
draw_decision_tree(tree2)

## Result

In [None]:
from sklearn.metrics import classification_report

# Classification report for tree2's predictions on the training data.
print(classification_report(y_train, tree2.predict(x_train)))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of tree2's predictions on the training data.
confusion_matrix(y_train, tree2.predict(x_train))


In [None]:
# Predict labels for the held-out test features; the bare name on the last
# line makes the notebook display the predictions.
tree2_y_pred=tree2.predict(x_test)
tree2_y_pred

In [None]:
# Confusion matrix of tree2's predictions on the test data.
confusion_matrix(y_test, tree2.predict(x_test))

In [None]:
# Classification report for tree2's predictions on the test data.
print(classification_report(y_test, tree2.predict(x_test)))

## Combined code

In [None]:
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import graphviz
import matplotlib.pyplot as plt

iris = load_iris()

# Split every feature column of iris.data, holding out 40% for testing.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.4,
    random_state=17,
)

# min_impurity_decrease=0: keep splitting as long as a split reduces the
# impurity by at least this value.
clf = tree.DecisionTreeClassifier(
    random_state=17,
    min_impurity_decrease=0,
).fit(x_train, y_train)

# Evaluate on the held-out data.
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred, target_names=iris.target_names))
print('\nAccuracy: {0:.4f}'.format(accuracy_score(y_test, y_pred)))

# out_file=None makes export_graphviz return the DOT source as a string,
# which graphviz.Source then renders as the cell output.
dot = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

graphviz.Source(dot)

In [None]:
# Confusion matrix of clf's predictions on the test data.
# NOTE(review): confusion_matrix is not imported in the "Combined code" cell;
# this relies on the earlier `from sklearn.metrics import confusion_matrix`
# cell having been run.
confusion_matrix(y_test, clf.predict(x_test))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 

# Wrap the confusion matrix in a DataFrame whose rows and columns are both
# labelled with the species names.
cm = pd.DataFrame(confusion_matrix(y_test, clf.predict(x_test)), columns=iris.target_names, index=iris.target_names)
# Draw it as a heatmap with the value written in every cell (annot=True).
sns.heatmap(cm, annot=True)

## Used Dataset
### 당뇨병 환자 442명의 검사 데이터 from sklearn  (442 diabetic patients data from sklearn)
종속변수(dependent variable) : 1년 후의 당뇨병 진행도 (diabetes progression after 1 year)

독립변수(independent variable) : 나이, 성별, BMI지수, 혈압 등 10개의 변수 (10 variables including age, sex, BMI, blood pressure, etc.)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import statsmodels.formula.api as sm
from sklearn.model_selection import train_test_split


diabetes = datasets.load_diabetes()

# Features and target as separate frames, then joined column-wise.
dfxx = pd.DataFrame(diabetes.data)
dfyy = pd.DataFrame(diabetes.target)
dfdiabetes = pd.concat([dfxx, dfyy], axis=1)

# Ten feature columns followed by the target column 'Y'.
dfdiabetes.columns = [
    'AGE', 'SEX', 'BMI', 'BP',
    'S1', 'S2', 'S3', 'S4', 'S5', 'S6',
    'Y',
]
dfdiabetes

In [None]:
# Hold out 30% of the diabetes samples as a test set (fixed random_state=1).
xx_train, xx_test, yy_train, yy_test = train_test_split(
    diabetes.data,
    diabetes.target,
    test_size=0.3,
    random_state=1,
)

# Wrap every array in a DataFrame so features and target can be joined.
dfxxtrain = pd.DataFrame(xx_train)
dfxxtest = pd.DataFrame(xx_test)
dfyytrain = pd.DataFrame(yy_train)
dfyytest = pd.DataFrame(yy_test)

# NOTE: dftrain and dftest are rebound here to the diabetes tables.
dftrain = pd.concat([dfxxtrain, dfyytrain], axis=1)
dftest = pd.concat([dfxxtest, dfyytest], axis=1)

# Both tables get the same labels: ten features followed by the target 'Y'.
dftrain.columns = dftest.columns = [
    'AGE', 'SEX', 'BMI', 'BP',
    'S1', 'S2', 'S3', 'S4', 'S5', 'S6',
    'Y',
]
dftrain

## Decision Tree (Numerical)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree with up to two levels (max_depth=2), trained on the
# diabetes training split.
tree3 = DecisionTreeRegressor(max_depth=2, random_state=0)
tree3 = tree3.fit(xx_train, yy_train)

## Visualization

In [None]:
%matplotlib inline

#notebook을 실행한 브라우저에서 바로 그림을 볼 수 있음

import io
import pydot
import graphviz 
from IPython.core.display import Image 
from sklearn.tree import export_graphviz 


def draw_decision_tree(model, feature_names=None, class_names=None):
    """Render a fitted decision tree as a PNG diagram for the notebook.

    The tree is exported as Graphviz DOT text into an in-memory buffer,
    parsed with pydot and rendered to PNG.  Rendering requires the Graphviz
    ``dot`` executable to be available on PATH.

    Args:
        model: Fitted tree estimator accepted by ``export_graphviz``.
        feature_names: Labels for the features the tree was trained on.
            Defaults to ``diabetes.feature_names`` (the previous hard-coded
            value), so existing one-argument calls behave as before.
        class_names: Optional labels for the target classes, forwarded to
            ``export_graphviz``.  The default ``None`` matches the previous
            behaviour, where no class names were passed.

    Returns:
        An IPython ``Image`` built from the PNG bytes.
    """
    # Fall back to the value that used to be hard-coded so trees trained on
    # other feature sets can be drawn by passing explicit labels.
    if feature_names is None:
        feature_names = diabetes.feature_names

    # StringIO is a file-like text buffer: export_graphviz writes the DOT
    # source into it instead of a file on disk.
    dot_buf = io.StringIO()
    export_graphviz(model, out_file=dot_buf, feature_names=feature_names,
                    class_names=class_names, filled=True, rounded=True)

    # graph_from_dot_data returns a list of graphs; the first one is used.
    graph = pydot.graph_from_dot_data(dot_buf.getvalue())[0]
    image = graph.create_png()
    return Image(image)



In [None]:
# Show the diagram of the depth-2 regression tree as this cell's output.
draw_decision_tree(tree3)

In [None]:
# Predict the target for the held-out diabetes features; the bare name on the
# last line makes the notebook display the predictions.
tree3_y_pred = tree3.predict(xx_test)
tree3_y_pred

## Result

In [None]:
# Check the feature importances of the regression tree.
tree3.feature_importances_

In [None]:
# Error metrics of tree3's test-set predictions, each shown to two decimals.
print('MeanSquaredError: %.2f' % mean_squared_error(yy_test, tree3_y_pred))
print('MeanAbsoluteError: %.2f' % mean_absolute_error(yy_test, tree3_y_pred))
# The extra '\n' argument leaves a blank line after the last metric.
print('R Square: %.2f' % r2_score(yy_test, tree3_y_pred), '\n')