In [1]:
# Warning 생략
import warnings
warnings.filterwarnings('ignore')
# Pandas
import pandas as pd
# Numpy
import numpy as np
# ML 저장
import joblib
# 그래프
import matplotlib.pyplot as plt
# Seaborn
import seaborn as sns

# ML Data 나누기
from sklearn.model_selection import train_test_split
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# SVM
from sklearn import svm
from sklearn.svm import SVC
# Neural Net
from sklearn.neural_network import MLPClassifier
# QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
# Gaussian Process
from sklearn.gaussian_process import GaussianProcessClassifier
# KNN
from sklearn.neighbors import KNeighborsClassifier
# 회귀모델
from sklearn.neighbors import KNeighborsRegressor
# 선형회귀
from sklearn.linear_model import LinearRegression
# 표준화
from sklearn.preprocessing import StandardScaler
# 로지스틱 회귀 모델
from sklearn.linear_model import LogisticRegression
# KFold
from sklearn.model_selection import StratifiedKFold

# 평균 절대값 오차
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import cross_val_score

# 확률적 경사하강법의 분류 알고리즘 적용
from sklearn.linear_model import SGDClassifier

# 교차 검증
from sklearn.model_selection import cross_validate


# Cluster
from sklearn.cluster import KMeans
# Voting
from sklearn.ensemble import VotingClassifier
# 정확도 측정
from sklearn.metrics import accuracy_score

# 한글 폰트 문제 해결 
# matplotlib은 한글 폰트를 지원하지 않음
# os정보
import platform

# font_manager : 폰트 관리 모듈
# rc : 폰트 변경 모듈
from matplotlib import font_manager, rc
# Use the plain hyphen for negative tick labels instead of the Unicode minus glyph
plt.rcParams['axes.unicode_minus'] = False

# Select a font for Korean text depending on the operating system
os_name = platform.system()
if os_name == 'Darwin':
    # macOS
    rc('font', family='AppleGothic')
elif os_name == 'Windows':
    # Windows: resolve the family name from the font file itself
    path = 'c:/Windows/Fonts/malgun.ttf'
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    # Any other OS: no font is configured, only a notice is printed
    print("Unknown System")

### 교차 검증과 그리드 서치
- **머신러닝** 모델의 성능을 안정적으로 평가하고 하이퍼파라미터를 고를 때 널리 사용하는 방법
- **딥러닝**에서는 데이터가 크고 학습 시간이 길어, 교차 검증 대신 별도의 검증 세트를 한 번 떼어 내어 사용하는 경우가 많다.

In [3]:
# Load the wine dataset (path is relative to the notebook) and preview the first 5 rows
wine = pd.read_csv(filepath_or_buffer="../Data/wine.csv")
wine.head(5)

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [4]:
# Features: alcohol, sugar, pH. Target: class. Both converted to NumPy arrays.
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()
target = wine.loc[:, 'class'].to_numpy()

### 검증 세트 추가
- 훈련(64%), 검증(16%), 테스트(20%) — 전체의 20%를 테스트로 떼고, 남은 80% 중 20%(전체의 16%)를 검증 세트로 사용

In [6]:
# Hold out 20% of the data as the test set; the remaining 80% is the training split
(train_input, test_input,
 train_target, test_target) = train_test_split(
    data, target, test_size=0.2, random_state=42
)

In [7]:
# Split the 80% training portion again: 20% of it (16% of all data) becomes the
# validation set, leaving 64% of all data (sub_*) for fitting
(sub_input, val_input,
 sub_target, val_target) = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42
)

In [8]:
# Report the (rows, columns) shape of each split
for caption, subset in (
    ("Train :", sub_input),
    ("Valid :", val_input),
    ("Test :", test_input),
):
    print(caption, subset.shape)

Train : (4157, 3)
Valid : (1040, 3)
Test : (1300, 3)


In [9]:
# Fit a decision tree on the sub-training set, then compare accuracy on the
# data it was fitted on against the held-out validation set
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

train_accuracy = dt.score(sub_input, sub_target)
valid_accuracy = dt.score(val_input, val_target)
print("Train Score :", train_accuracy)
print("Valid Score :", valid_accuracy)

Train Score : 0.9971133028626413
Valid Score : 0.864423076923077


In [10]:
# Cross-validate the tree on the full training split (train_input, i.e. the 80%
# portion, not the 64% sub-split); the default setting yields 5 folds
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
scores

{'fit_time': array([0.00661922, 0.00444794, 0.00482011, 0.00460792, 0.00459409]),
 'score_time': array([0.00082397, 0.000489  , 0.00056291, 0.00050402, 0.00050282]),
 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}

In [11]:
# The cross-validated accuracy is the mean of the per-fold test scores
scores['test_score'].mean()

0.855300214703487

### KFold를 이용한 방법

In [12]:
# Pass an explicit StratifiedKFold splitter with default settings
# (StratifiedKFold is already imported in the first cell)
splitter = StratifiedKFold()
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=splitter)
scores

{'fit_time': array([0.00565934, 0.00517988, 0.00478506, 0.00477791, 0.00419211]),
 'score_time': array([0.00123382, 0.00069094, 0.00054002, 0.00047588, 0.00033879]),
 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}

In [13]:
# Mean accuracy across the folds of the explicit StratifiedKFold run
scores['test_score'].mean()

0.855300214703487

In [14]:
# 10-fold stratified cross-validation; rows are shuffled before splitting,
# with a fixed seed so the folds are reproducible
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=splitter)
scores

{'fit_time': array([0.01180196, 0.00525594, 0.00528502, 0.0049448 , 0.00502968,
        0.00512195, 0.0049181 , 0.00509787, 0.00477886, 0.00454092]),
 'score_time': array([0.00113392, 0.00052714, 0.00041485, 0.00043821, 0.00047517,
        0.00041795, 0.00042295, 0.00036716, 0.00029182, 0.00023293]),
 'test_score': array([0.83461538, 0.87884615, 0.85384615, 0.85384615, 0.84615385,
        0.87307692, 0.85961538, 0.85549133, 0.85163776, 0.86705202])}

In [15]:
# Mean accuracy over the 10 shuffled folds; in the recorded run it is slightly
# higher than the 5-fold result (0.8574 vs 0.8553)
scores['test_score'].mean()

0.8574181117533719

---
### 그리드 서치(Grid Search)를 이용한 최적의 Hyper Parameter 값 찾기
- 단계별 서치

In [17]:
from sklearn.model_selection import GridSearchCV

# Candidate values of the tree's min_impurity_decrease hyperparameter to search over
params = dict(
    min_impurity_decrease=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
)

In [18]:
# n_jobs=-1 lets the search run in parallel on all available CPU cores
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

In [19]:
# Run the grid search over `params` on the 80% training split
gs.fit(train_input, train_target)

In [20]:
# Take the best model found by the search (this rebinds `dt`) and report its
# accuracy on the training split it was searched on -- not a held-out score
dt = gs.best_estimator_
train_accuracy = dt.score(train_input, train_target)
print(train_accuracy)

0.9615162593804117


In [21]:
# Hyperparameter combination selected by the grid search
gs.best_params_

{'min_impurity_decrease': 0.0001}

In [22]:
# Mean cross-validated test score for each candidate in `params`, in the recorded
# run the first entry (0.0001) is the highest, matching best_params_
gs.cv_results_['mean_test_score']

array([0.86819297, 0.86453617, 0.86492226, 0.86780891, 0.86761605])