#### 머신러닝 개념(ML)
- 데이터를 기반으로 패턴을 학습하고 결과를 추론하는 알고리즘 기법
- 지도학습(Supervised Learning), 비지도학습(Unsupervised Learning)
- 지도학습(분류, 회귀), 비지도학습(클러스터링, 차원축소)

#### 머신러닝 용어(데이터의 형식 : DataFrame)
- 피처(feature) : 데이터의 일반 속성
- 레이블, 클래스, 타겟 값, 결정 값 : 정답데이터

In [1]:
import numpy as np
import pandas as pd

In [None]:
# Report the NumPy and pandas versions available in this environment.
for _caption, _module in (('numpy version - ', np), ('pandas version - ', pd)):
    print(_caption, _module.__version__)

In [2]:
import sklearn
from sklearn.datasets import load_iris

# Show which scikit-learn release is installed.
installed_sklearn = sklearn.__version__
print('sklearn version - ', installed_sklearn)

sklearn version -  0.24.2


In [3]:
# Load the bundled iris dataset and inspect the container that comes back.
iris = load_iris()
print('type :', type(iris))
print('keys :', iris.keys())

# Show the feature names and the type of the object that stores them.
feature_names = iris.feature_names
print('feature_names : ', feature_names)
print('feature_names type : ', type(feature_names))

type : <class 'sklearn.utils.Bunch'>
keys : dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
feature_names :  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
feature_names type :  <class 'list'>


In [None]:
# Print the feature values, then the type of the object holding them.
feature_values = iris.data
print('data : ', feature_values)
print('data type : ', type(feature_values))

In [None]:
# Print the type of the label container first, then its contents.
target_values = iris.target
print('target :', type(target_values))
print('target :', target_values)

In [None]:
print('feature, target를 이용해서 데이터 프레이믈 만들어보자 -')
print()
# One column per feature name, with the labels appended as a 'target' column.
iris_frm = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
# Bare expression: the notebook shows the frame as this cell's output.
iris_frm

### 지도학습 - 분류(classification)
 - step 01. 데이터 분리(training data, test data)
 - step 02. 학습데이터를 기반으로 ML 알고리즘을 적용해 학습 모델 생성
 - step 03. 테스트데이터를 기반으로 분류 예측을 수행
 - step 04. 모델의 성능평가

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.tree            import DecisionTreeClassifier
from sklearn.metrics         import accuracy_score


In [None]:
# Every column except the last goes to the feature frame; the last column
# (the 'target' column appended earlier) becomes the label series.
iris_feature_frm, iris_target_frm = iris_frm.iloc[:, :-1], iris_frm.iloc[:, -1]

In [None]:
# Bare expression: the notebook shows the feature frame as this cell's output.
iris_feature_frm

In [None]:
print('step 01.')
print()
# Hold out 20% of the rows for testing; shuffling uses a fixed seed so the
# same split is produced on every run.
split_options = dict(test_size=0.2, shuffle=True, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(
    iris_feature_frm, iris_target_frm, **split_options
)

In [None]:
# Shapes of the four pieces produced by the split, shown as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Type of the label object selected with iloc, shown as the cell output.
type(iris_target_frm)

In [None]:
# Dump the training features followed by the training labels.
for _caption, _values in (('train data :', X_train), ('train target :', y_train)):
    print(_caption, _values)

In [None]:
# Dump the held-out features followed by the held-out labels.
for _caption, _values in (('test data :', X_test), ('test target :', y_test)):
    print(_caption, _values)

In [None]:
print('step 02. fit()')
print()
# Train a decision tree with default settings on the training split.
# fit() returns the estimator itself, so the call can be chained.
iris_dtc_model = DecisionTreeClassifier().fit(X_train, y_train)
# Bare expression: the notebook shows the fitted estimator as the cell output.
iris_dtc_model

In [None]:
print('step 03. predict()')
print()

# Predict labels for the held-out rows and print them after the true labels.
y_pred = iris_dtc_model.predict(X_test)
for _caption, _values in (('y_test : ', y_test), ('y_pred : ', y_pred)):
    print(_caption, _values)

In [None]:
print('step 04. 예측정확도 : ')
print()
# Score the test-set predictions against the true test labels.
test_accuracy = accuracy_score(y_test, y_pred)
print('acc : ', test_accuracy)

In [None]:
# Blank separator line, then the type of the assembled frame.
print()
frame_type = type(iris_frm)
print('type : ', frame_type)

In [None]:
print('데이터 프레임 형식에서 학습데이터와 테스트데이터를 분리한다면?')
print()
# Select the four feature columns and the label column by explicit position.
feature_positions = [0, 1, 2, 3]
target_position = 4
iris_feature_frm = iris_frm.iloc[:, feature_positions]
iris_target_frm = iris_frm.iloc[:, target_position]

In [None]:
# Predict on the same rows the model was trained on, for comparison with the
# held-out result.
y_pred = iris_dtc_model.predict(X_train)
# The value printed is y_train, so the caption says y_train (it used to say y_test).
print('y_train : ', y_train)
print('y_pred : ', y_pred, type(y_pred))

In [None]:
# 과적합 발생
acc = accuracy_score(y_train, y_pred)
print('acc : ',acc)

#### 교차검증(cross validation) - 회귀 X, 분류 O
- 과적합(overfitting)을 방지하기 위한 방법
- 데이터의 편중을 막기 위해서
- KFold 방식

In [5]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_validate

In [6]:
# Reload iris and keep the raw arrays so folds can be selected by index.
fold_iris = load_iris()
features, label = fold_iris.data, fold_iris.target

In [None]:
# Bare expression: the notebook shows the feature array as this cell's output.
features

In [12]:
# Create the splitter here so the cell also works in a top-to-bottom run;
# it uses the same 5-fold setting as the loop that follows.
kfold = KFold(n_splits=5)
# split() is lazy: this prints a generator object, not the index arrays.
print(kfold.split(features))

<generator object _BaseKFold.split at 0x000002904FE83890>


In [14]:
print('5개의 폴더 세트를 분리하여 각 폴더 세트별 정확도를 확인해보자')
cv_acc = []
# KFold is created without shuffling, so the folds follow the original row order.
kfold = KFold(n_splits=5)

fold_dct_model = DecisionTreeClassifier()

for train_idx, test_idx in kfold.split(features):
    print('train idx : ', train_idx)
    print(type(train_idx))
    print('test idx : ', test_idx)

    # Pick this fold's training and validation rows by index.
    X_train, y_train = features[train_idx], label[train_idx]
    X_val, y_val = features[test_idx], label[test_idx]

    # Refit on the training rows, then score predictions for the held-out fold.
    fold_pred = fold_dct_model.fit(X_train, y_train).predict(X_val)
    acc = accuracy_score(y_val, fold_pred)
    print('acc : ', acc)
    cv_acc.append(acc)

print('교차검증 평균 정확도 : ', np.mean(cv_acc))

5개의 폴더 세트를 분리하여 각 폴더 세트별 정확도를 확인해보자
train idx :  [ 30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47
  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149]
<class 'numpy.ndarray'>
test idx :  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
acc :  1.0
train idx :  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  60  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 1

In [None]:
print('기존 KFold 방식의 문제점 확인 :')
print()

# Rebuild the frame with a 'target' column, then count the rows per class.
fold_iris_frm = pd.DataFrame(fold_iris.data, columns=fold_iris.feature_names)
fold_iris_frm['target'] = fold_iris.target
# Bare expression: the per-class counts are shown as the cell output.
fold_iris_frm['target'].value_counts()

In [None]:
# Three folds from a KFold created without shuffling, split over the rows of
# fold_iris_frm.
bad_fold_iris = KFold(n_splits=3)

# Not used in this cell; kept so the name is rebound to a fresh model as before.
fold_dct_model = DecisionTreeClassifier()

target_column = fold_iris_frm['target']
for n_iter, (train_idx, test_idx) in enumerate(bad_fold_iris.split(fold_iris_frm), start=1):
    # Take the label column on BOTH sides. The validation side used to select
    # whole frame rows, so the two printouts were not comparable.
    label_train = target_column.iloc[train_idx]
    label_val = target_column.iloc[test_idx]
    print('교차검증 횟수 :', n_iter)
    print()
    # The captions announce a distribution, so print per-class counts instead
    # of the raw label rows.
    print('학습 레이블 데이터 분포 : \n', label_train.value_counts())
    print('검증 레이블 데이터 분포 : \n', label_val.value_counts())
    

In [None]:
# Render the splitter object itself.
# NOTE(review): display is not imported in this file — presumably the
# IPython/Jupyter built-in; confirm if this is run outside a notebook.
display(bad_fold_iris)

#### [실습]
- 아이리스 데이터를 이용하여 StratifiedKFold 교차검증을 진행해 보자
- random_state = 200
- StratifiedKFold(3,5) 평균 정확도 확인

In [None]:
fold_iris = load_iris()
features = fold_iris.data
label    = fold_iris.target

print('레이블 불균형을 해결하면서 교차검증 진행')
cv_acc = []
# StratifiedKFold builds folds from the labels so each fold keeps the class
# proportions. It only shuffles when shuffle=True is passed, so the rows are
# NOT shuffled here.
kfold = StratifiedKFold(n_splits=3)

# The seed is given to the tree, not to the splitter.
fold_dct_model = DecisionTreeClassifier(random_state=200)

# split() receives the labels too, because the folds are stratified on them.
# enumerate replaces the hand-maintained counter; n_iter ends at the same value.
for n_iter, (train_idx, test_idx) in enumerate(kfold.split(features, label), start=1):
    X_train, X_val = features[train_idx], features[test_idx]
    y_train, y_val = label[train_idx], label[test_idx]

    # Refit on this fold's training rows and predict the held-out rows.
    fold_dct_model.fit(X_train, y_train)
    fold_pred = fold_dct_model.predict(X_val)

    acc = accuracy_score(y_val, fold_pred)
    print('검증횟수 {}, 교차검증 정확도 {}, 학습데이터 크기 {}, 검증데이터 크기{}'.format(n_iter, acc, len(X_train), len(y_val)))
    cv_acc.append(acc)

print()
print('*'*50)
print('교차검증 평균 정확도 : ', np.mean(cv_acc))

- 위 과정을 한번에 수행하는 함수 : cross_val_score(). 분류 모델에 cv를 정수로 주면 내부적으로 StratifiedKFold를 사용하며, 기본값으로는 셔플하지 않음
- 인자로 예측모델, 피처세트, 레이블, 성능평가 지표(scoring), 폴드 수(cv)

In [19]:
# Fresh copy of the data plus a seeded tree for the cross-validation helpers.
fold_iris = load_iris()
n_iter = 0
features, label = fold_iris.data, fold_iris.target

dt_model = DecisionTreeClassifier(random_state=100)

In [20]:
print('성능평가 acc, 교차검증 5회 수행')
print('cross_val_score() :')
print()

# Five-fold cross-validation scored by accuracy; the result is stored in scores.
scores = cross_val_score(estimator=dt_model, X=features, y=label, scoring='accuracy', cv=5)

성능평가 acc, 교차검증 5회 수행
cross_val_score() :



In [24]:
# Type, per-fold values and the mean (rounded to two decimals) of the scores.
mean_score = np.round(np.mean(scores), 2)
print('type : ', type(scores))
print('data : ', scores)
print('mean : ', mean_score)

type :  <class 'numpy.ndarray'>
data :  [0.96666667 0.96666667 0.9        0.93333333 1.        ]
mean :  0.95


- cross_validate() : 결과를 dict로 반환 (fit_time, score_time, test_score)

In [25]:
# Same evaluation through cross_validate; the result replaces the previous scores.
scores = cross_validate(estimator=dt_model, X=features, y=label, scoring='accuracy', cv=5)

In [31]:
print('type : ', type(scores))
# Iterating the result directly yields its keys.
for key in scores:
    print(key)

# Print each result array, then the mean test score rounded to two decimals.
for _caption, _field in (
    ('fit_time :', 'fit_time'),
    ('score_time :', 'score_time'),
    ('test_score :', 'test_score'),
):
    print(_caption, scores[_field])
mean_test_score = np.round(np.mean(scores['test_score']), 2)
print('mean : ', mean_test_score)

type :  <class 'dict'>
fit_time
score_time
test_score
fit_time : [0.00100183 0.00100088 0.00100017 0.00100112 0.        ]
score_time : [0.         0.         0.         0.         0.00100088]
test_score : [0.96666667 0.96666667 0.9        0.93333333 1.        ]
mean :  0.95
