# 5장 트리 알고리즘

In [None]:
import pandas as pd

# Load the wine dataset; the CSV is fetched over the network.
wine = pd.read_csv('https://bit.ly/wine_csv_data')

# DataFrame.info() writes its summary to stdout by itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
wine.info()

# Descriptive statistics of the columns.
print(wine.describe())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Feature matrix (alcohol, sugar, pH) and label vector (class) as NumPy arrays.
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
target = wine['class'].to_numpy()

# Hold out 20% of the samples as the test set; the seed is fixed so the
# split is reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply that same
# transformation to both splits.
ss = StandardScaler().fit(train_input)
train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)


In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression trained on the standardized features.
lr = LogisticRegression()
lr.fit(train_scaled, train_target)

# Score on the training split first, then on the test split.
print(
    lr.score(train_scaled, train_target),
    lr.score(test_scaled, test_target),
    sep='\n',
)

# Learned coefficients and intercept of the fitted model.
print(lr.coef_, lr.intercept_)

위는 로지스틱 회귀로 분류한 결과인데, 학습된 가중치(계수)와 절편이 각각 무엇을 의미하는지 직관적으로 이해하기 어렵다.
이러한 관점에서 보면 지금부터 살펴볼 결정 트리 방식이 더 직관적이다. 그럼 진행해 보자.

In [None]:
# Decision tree
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree

# No depth limit is given here; the seed is fixed so the fit is reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(train_scaled, train_target)

# Score on the training split first, then on the test split.
print(
    dt.score(train_scaled, train_target),
    dt.score(test_scaled, test_target),
    sep='\n',
)

# Plot the whole fitted tree.
plt.figure(figsize=(10, 7))
plot_tree(dt)
plt.show()

In [None]:
# Plot the current tree limited to max_depth=2, with filled nodes and the
# feature names used as split labels.
plt.figure(figsize=(10, 7))
plot_tree(
    dt,
    max_depth=2,
    filled=True,
    feature_names=['alcohol', 'sugar', 'pH'],
)
plt.show()

In [None]:
# Refit with the tree depth capped at 3, still on the standardized features.
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(train_scaled, train_target)

# Score on the training split first, then on the test split.
print(
    dt.score(train_scaled, train_target),
    dt.score(test_scaled, test_target),
    sep='\n',
)

# Plot the full depth-3 tree on a larger canvas.
plt.figure(figsize=(20, 15))
plot_tree(
    dt,
    filled=True,
    feature_names=['alcohol', 'sugar', 'pH'],
)
plt.show()

In [None]:
# Using the features as they are, without scaling, makes the tree easier
# to interpret.
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(train_input, train_target)

# Score on the training split first, then on the test split.
print(
    dt.score(train_input, train_target),
    dt.score(test_input, test_target),
    sep='\n',
)

plt.figure(figsize=(20, 15))
plot_tree(
    dt,
    filled=True,
    feature_names=['alcohol', 'sugar', 'pH'],
)
plt.show()

# Importance of each input feature, in column order (alcohol, sugar, pH).
print(dt.feature_importances_)

In [None]:
# Validation set: the data is divided into three parts
# (sub-training, validation and test).

# First split: 80% training / 20% test.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# Second split: carve 20% of the training data off as the validation set.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

In [None]:
import numpy as np
from sklearn.model_selection import cross_validate

# Train on the sub-training set and compare against the validation set.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

print(
    dt.score(sub_input, sub_target),
    dt.score(val_input, val_target),
    sep='\n',
)

# Cross-validate on the full training set with the default cv setting
# (the original note says this uses 5 validation folds).
scores = cross_validate(dt, train_input, train_target)
print(scores)

# Average of the per-fold validation scores.
print(np.mean(scores['test_score']))

In [None]:
from sklearn.model_selection import StratifiedKFold

# Cross-validation with an explicitly supplied stratified splitter
# (default settings).
scores = cross_validate(
    dt,
    train_input,
    train_target,
    cv=StratifiedKFold(),
)
print(np.mean(scores['test_score']))

# Same again with 10 shuffled folds; the seed keeps the shuffling reproducible.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(
    dt,
    train_input,
    train_target,
    cv=splitter,
)
print(np.mean(scores['test_score']))

In [None]:
# Grid search
from sklearn.model_selection import GridSearchCV

# Candidate values for a single hyperparameter.
params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}

gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs=1,
)
gs.fit(train_input, train_target)

# Best model found by the search, scored on the training set.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

# Winning parameter combination.
print(gs.best_params_)

# Mean cross-validation score of every candidate.
print(gs.cv_results_['mean_test_score'])

In [None]:
# Search three hyperparameters jointly. The grid is the Cartesian product of
# the three value lists, i.e. more than a thousand candidate combinations,
# each of which is cross-validated.
params = {
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
    'max_depth': range(5, 20, 1),
    'min_samples_split': range(2, 100, 10),
}

# n_jobs=-1 uses all available CPU cores. With this many fits, running them
# serially (n_jobs=1) is needlessly slow, and the outcome does not depend on
# the number of workers because the estimator's seed is fixed.
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

# Best model found by the search, scored on the training set.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

# Winning parameter combination.
print(gs.best_params_)

# Mean cross-validation score of every candidate.
print(gs.cv_results_['mean_test_score'])

In [None]:
# Choosing probability distributions

from scipy.stats import uniform, randint

# Every sample below is printed explicitly: a bare expression in the middle
# of a notebook cell (or anywhere in a script) is evaluated and then silently
# discarded, so without print() these results would never be shown.

# randint(0, 10): integers drawn from 0..9 (the upper bound is excluded).
rgen = randint(0, 10)
print(rgen.rvs(10))

# How often each integer appears in 1000 draws.
print(np.unique(rgen.rvs(1000), return_counts=True))

# uniform(0, 1): real numbers drawn from the interval [0, 1].
ugen = uniform(0, 1)
print(ugen.rvs(10))



In [None]:
# Random search
from sklearn.model_selection import RandomizedSearchCV

# Sampling distributions / candidate ranges for each hyperparameter.
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale], and
# scipy.stats.randint(low, high) samples integers from [low, high).
params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
    'min_samples_split': range(2, 25),
    'min_samples_leaf': range(1, 25),
}

# Draw 100 parameter settings, using all CPU cores; the seed makes the
# sampled settings reproducible.
gs = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

# Winning parameters and the highest mean cross-validation score.
print(gs.best_params_)
print(np.max(gs.cv_results_['mean_test_score']))

# Score the best model on the held-out test set.
dt = gs.best_estimator_
print(dt.score(test_input, test_target))