<a href="https://colab.research.google.com/github/leo-contigo/ML/blob/main/07_%EC%98%A4%EC%B0%A8%ED%96%89%EB%A0%AC(%EC%9D%B4%EC%A7%84%EB%B6%84%EB%A5%98)Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### ML_08_수업자료.html
- GridSearchCV 그리드 서치

- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
# Jupyter notebook environment setup
import warnings
# Both calls below install an 'ignore' filter, i.e. every warning is suppressed.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# NOTE(review): IPython.display.set_matplotlib_formats is reported as deprecated in
# newer IPython releases — confirm against the installed version.
from IPython.display import set_matplotlib_formats, Image
set_matplotlib_formats("retina")  # request the "retina" inline figure format

import numpy as np
import pandas as pd
import os

In [None]:
# scikit-learn estimators used in this notebook
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Model-selection helpers and the accuracy metric (each name imported once)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Toy dataset loader
from sklearn.datasets import load_iris

# GridSearchCV
- 그리드 서치의 목적 - 모델의 최적의 하이퍼파라미터를 찾는다.

#### 학습과정

- (1) 모델 생성

- (2) 모델에서 필요한 하이퍼 파라미터(옵션)을 딕셔너리 형태로 지정 
    - parameters = {'max_depth':[1, 2, 3], 'min_samples_split':[2,3]}


- (3) GridSearchCV 메서드 적용 
```
임포트 위치 
    - sklearn.model_selection.GridSearchCV

    - grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=3, refit=True, return_train_score=True)

그리드 서치 객체 = GridSearchCV함수명( 모델명 , 
                        param_grid=parameters ,
                        cv  =  교차검증 횟수 ,  refit = 재학습 여부 (기본값 True),
                        return_train_score = 학습 점수 포함 여부 (기본값 False) )
```

- (4) 학습 → 그리드 서치 객체.fit() 
- (5) 평가

In [None]:
# 0) Load the iris toy dataset and pull out the feature matrix / target vector
iris = load_iris()
X, y = iris.data, iris.target

# Sanity-check the array shapes
X.shape, y.shape

((150, 4), (150,))

In [None]:
# iris

In [None]:
# 1) Create the model
model_knn = KNeighborsClassifier()

# Show the default hyperparameter values
model_knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [None]:
# 2) Build the GridSearchCV object
#
# The search space is a dict of {hyperparameter name: [candidate values, ...]}.
# Only odd n_neighbors values are tried: 3, 5, 7, 9, 11.
parameters = {'n_neighbors': [3, 5, 7, 9, 11]}

# 5-fold cross-validation; refit=True retrains the best candidate afterwards,
# and return_train_score=True adds the training scores to cv_results_.
model_knn_grid = GridSearchCV(
    estimator=model_knn,
    param_grid=parameters,
    cv=5,
    refit=True,
    return_train_score=True,
    verbose=1,
)

model_knn_grid

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [3, 5, 7, 9, 11]},
             return_train_score=True, verbose=1)

In [None]:
# 3) Fit the grid-search object: grid_search.fit(...)

# Hold out 20% of the samples as a test set (fixed random_state for repeatable splits)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)

# Run the cross-validated search on the training split only
model_knn_grid.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [3, 5, 7, 9, 11]},
             return_train_score=True, verbose=1)

In [None]:
# Inspect what the fitted search object exposes. Attributes of interest:
#   best_params_, best_score_, best_estimator_, cv_results_
#
# Only public names are listed: the underscore-prefixed internals returned by
# dir() bury the useful attributes in noise. The list is the cell's last
# expression, so the notebook displays it without print().
[name for name in dir(model_knn_grid) if not name.startswith('_')]

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_feature_names', '_check_n_features', '_check_refit_for_multimetric', '_estimator_type', '_format_results', '_get_param_names', '_get_tags', '_more_tags', '_pairwise', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_run_search', '_select_best_index', '_validate_data', 'best_estimator_', 'best_index_', 'best_params_', 'best_score_', 'classes_', 'cv', 'cv_results_', 'decision_function', 'error_score', 'estimator', 'fit', 'get_params', 'inverse_transform', 'multimetric_', 'n_features_in_', 'n_jobs', 'n_splits_', 'param_grid', 'pre_dispatch', '

In [None]:
# 4) Evaluate: report the best parameters, best CV score and the winning model's settings
for label, value in (
    ('최적의 파라미터 => ', model_knn_grid.best_params_),
    ('최고 평가 점수 => ', model_knn_grid.best_score_),
    ('최적의 모델 => ', model_knn_grid.best_estimator_.get_params()),
):
    print(label, value)

최적의 파라미터 =>  {'n_neighbors': 5}
최고 평가 점수 =>  0.9583333333333334
최적의 모델 =>  {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [None]:
# Per-candidate cross-validation results: cv_results_ is a dict, shown here as a DataFrame
df_scores = pd.DataFrame.from_dict(model_knn_grid.cv_results_)
df_scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.000406,0.000498,0.00139,0.000496,3,{'n_neighbors': 3},0.916667,0.875,0.958333,1.0,...,0.95,0.048591,2,0.96875,0.979167,0.96875,0.96875,0.958333,0.96875,0.006588
1,0.000615,0.000469,0.001332,0.000377,5,{'n_neighbors': 5},0.916667,0.958333,0.958333,0.958333,...,0.958333,0.026352,1,0.96875,0.979167,0.979167,0.979167,0.958333,0.972917,0.008333
2,0.000399,0.000488,0.001199,0.000398,7,{'n_neighbors': 7},0.875,0.958333,0.958333,0.958333,...,0.95,0.040825,2,0.979167,0.96875,0.96875,0.979167,0.96875,0.972917,0.005103
3,0.000374,0.000459,0.001601,0.000482,9,{'n_neighbors': 9},0.875,0.958333,1.0,0.916667,...,0.95,0.048591,2,0.979167,0.979167,0.96875,0.979167,0.96875,0.975,0.005103
4,0.000598,0.000488,0.00103,6.6e-05,11,{'n_neighbors': 11},0.875,0.916667,1.0,0.916667,...,0.941667,0.05,5,0.979167,0.979167,0.947917,0.958333,0.96875,0.966667,0.012148


In [None]:
# 5) Score the best model on the held-out test set.
# The search object was created with refit=True, so best_estimator_ has already
# been retrained on X_train with the best parameters; fitting it a second time is
# redundant. predict() on the search object uses that refitted best estimator.
pred = model_knn_grid.predict(X_test)
accuracy_score(y_test, pred)

0.9666666666666667

In [None]:
# Baseline: train a plain KNN directly (no grid search) and score it on the test set
model_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

pred = model_knn.predict(X_test)
accuracy_score(y_test, pred)

0.9666666666666667

----------------------------------------------------------

# 분류(Classification) 성능 평가 지표


- 오차 행렬(Confusion Matrix)


- 오차 행렬의 결과값으로 ↓ 

- 정확도(Accuracy)
- 정밀도(Precision)
- 재현율(Recall)
- F1 Score(F-measure)
- ROC(Receiver Operating Characteristic curve)
- AUC(Area Under Curve)


<img src="https://images.velog.io/images/ohxhxs/post/6730406e-3c39-4fbb-a5a5-59be60607ecd/%EC%98%A4%EC%B0%A8%ED%96%89%EB%A0%AC.png">

<img src="https://images.velog.io/images/cleansky/post/5fd08ca6-7fa7-44fd-9186-48c911e6f759/image.png">

- TP (True Positive): True(진짜) Positive(양성): 양성으로 예측했고, 그게 맞다
- TN (True Negative) : True(진짜) Negative(음성): 음성으로 예측했고, 그게 맞다
- FP (False Positive) : False(가짜) Positive(양성): 양성으로 예측했고, 그게 틀리다
- FN (False Negative) : False(가짜) Negative(음성): 음성으로 예측했고, 그게 틀리다

## 정확도(Accuracy)
- 예측 결과가 실제 값과 동일한 데이터 건수 / 전체 예측 데이터 건수
- 직관적으로 모델 예측 성능을 나타내는 평가지표


## 정밀도(Precision)와 재현율(Recall)
- 정밀도 = TP / (FP + TP)
- 재현율 = TP / (FN + TP)


## F1 Score(F-measure)
- 정밀도와 재현율을 결합한 지표
- 정밀도와 재현율이 어느 한쪽으로 치우치지 않을 때 높은 값을 가짐
- F1 = 2 * (정밀도 * 재현율) / (정밀도 + 재현율)


------------------------------

## 오차행렬(Confusion Matrix)

- 예측 오류가 얼마인지와 더불어 어떠한 유형의 예측 오류가 발생하고 있는지를 함께 나타낸다.
- 혼동행렬이라고도 함
- 사이킷런의 confusion_matrix 함수 이용
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [None]:
# Evaluation tools from sklearn.metrics
#   accuracy_score        : accuracy (already imported earlier in the notebook)
#   confusion_matrix      : confusion matrix
#   precision_score       : precision
#   recall_score          : recall
#   f1_score              : F1 score
#   mean_squared_error    : mean squared error (coincides with the misclassification
#                           rate only when the labels are 0/1)
#   classification_report : text table of per-class metrics

from sklearn.metrics import confusion_matrix, precision_score,recall_score, f1_score, mean_squared_error
from sklearn.metrics import classification_report


# Label encoder used to turn categorical columns into integer codes
from sklearn.preprocessing import LabelEncoder

## 이진분류 - 데이터 불러오기

In [None]:
# 1) Load the data
# Titanic passenger data, a standard binary-classification example

df_t = pd.read_csv('data/titanic_book.csv')
df_t.info()  # check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Fill missing values in Age, Cabin and Embarked.
# fillna returns a new frame that is assigned back to df_t. Calling
# fillna(inplace=True) on a selected column is chained assignment: it emits a
# FutureWarning in pandas 2.x and no longer updates df_t once Copy-on-Write is on.
df_t = df_t.fillna({
    'Age': df_t['Age'].mean(),   # column mean
    'Cabin': 'N',                # separate placeholder value
    'Embarked': 'S',             # most frequent value
})

In [None]:
df_t['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [None]:
df_t.info()  # check for missing values again after filling

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Columns to label-encode into numeric codes: Sex, Cabin, Embarked

df_t['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [None]:
# Create the label encoder and encode Sex as integers.
# LabelEncoder numbers the classes in sorted order, so female -> 0 and male -> 1.
encoder = LabelEncoder()

df_t['Sex'] = encoder.fit_transform(df_t['Sex'])
df_t['Sex'].value_counts()

1    577
0    314
Name: Sex, dtype: int64

In [None]:
df_t['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [None]:
# Re-fit the shared encoder on Embarked: C -> 0, Q -> 1, S -> 2
df_t['Embarked'] = encoder.fit_transform(df_t['Embarked'])
df_t['Embarked'].value_counts()

2    646
0    168
1     77
Name: Embarked, dtype: int64

In [None]:
# Keep only the first character of each Cabin value; it is label-encoded
# into a numeric code in a later cell.
df_t['Cabin'] = df_t['Cabin'].str.get(0)

In [None]:
df_t['Cabin'].value_counts()

N    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64

In [None]:
# Re-fit the shared encoder on the Cabin letters and replace them with integer codes
df_t['Cabin'] = encoder.fit_transform(df_t['Cabin'])
df_t['Cabin'].value_counts()

7    687
2     59
1     47
3     33
4     32
0     15
5     13
6      4
8      1
Name: Cabin, dtype: int64

In [None]:
df_t.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [None]:
# Drop the columns that are not used as features.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run once the columns are already gone.
df_t = df_t.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')

In [None]:
df_t.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')

In [None]:
df_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     891 non-null    int32  
 8   Embarked  891 non-null    int32  
dtypes: float64(2), int32(3), int64(4)
memory usage: 52.3 KB


In [None]:
df_t.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,7,2
1,1,1,0,38.0,1,0,71.2833,2,0
2,1,3,0,26.0,0,0,7.925,7,2
3,1,1,0,35.0,1,0,53.1,2,2
4,0,3,1,35.0,0,0,8.05,7,2


In [None]:
# 2) Separate the target from the features (model creation and training follow)
y = df_t['Survived']
X = df_t.drop('Survived', axis=1).copy()
X.columns , y

(Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked'], dtype='object'),
 0      0
 1      1
 2      1
 3      1
 4      0
       ..
 886    0
 887    1
 888    0
 889    1
 890    0
 Name: Survived, Length: 891, dtype: int64)

In [None]:
# Hold out 20% of the rows for testing (fixed random_state for repeatable splits)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)

# Default KNN classifier trained on the training split
model_knn = KNeighborsClassifier().fit(X_train, y_train)
model_knn

KNeighborsClassifier()

In [None]:
# 3) Print the confusion matrix
# Usage: confusion_matrix( y_test, model.predict( X_test ))

# Predict labels for the test set
pred = model_knn.predict(X_test)
pred

array([1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1], dtype=int64)

In [None]:
# Confusion matrix of true labels vs. the predictions stored in pred
# (equivalent to passing model_knn.predict(X_test) directly)
cm = confusion_matrix(y_true=y_test, y_pred=pred)
cm

array([[99, 19],
       [27, 34]], dtype=int64)

In [None]:
# Unpack the 2x2 confusion matrix into its four counts.
# Row-major flattening yields cm[0,0], cm[0,1], cm[1,0], cm[1,1] in that order.
TN, FP, FN, TP = cm.ravel()

In [None]:
TN , FP , FN , TP

(99, 19, 27, 34)

<img src="https://images.velog.io/images/cleansky/post/5fd08ca6-7fa7-44fd-9186-48c911e6f759/image.png">

## 정확도

In [None]:
# Accuracy via accuracy_score()
# (pred was computed earlier as model_knn.predict(X_test))

accuracy_score(y_test, pred)

0.7430167597765364

In [None]:
# Accuracy from the confusion matrix: correct predictions over all predictions.
# cm.sum() is the total count TP + FP + FN + TN.
(TP + TN) / cm.sum()

0.7430167597765364

## 에러율

In [None]:
# Error rate = share of test samples whose prediction differs from the true label.
# Counting mismatches directly stays correct for any label encoding, whereas
# mean_squared_error only coincides with the error rate when the labels are 0/1.
# (pred was computed earlier as model_knn.predict(X_test))

float((y_test != pred).mean())

0.2569832402234637

In [None]:
# Error rate from the confusion matrix: wrong predictions over all predictions.
# cm.sum() is the total count TP + FP + FN + TN.
(FP + FN) / cm.sum()

0.2569832402234637

## 정밀도 precision

In [None]:
# Precision via precision_score()
# (pred was computed earlier as model_knn.predict(X_test))

precision_score(y_test, pred)

0.6415094339622641

In [None]:
# Precision from the confusion-matrix counts: TP / (TP + FP)

TP / (TP + FP )

0.6415094339622641

## 재현율 Recall

In [None]:
# Recall (also called sensitivity) via recall_score()
# (pred was computed earlier as model_knn.predict(X_test))

recall_score(y_test, pred)

0.5573770491803278

In [None]:
# Recall from the confusion-matrix counts: TP / (TP + FN)
TP / (TP + FN )

0.5573770491803278

## F1 스코어

In [None]:
# F1 score via f1_score()
# (pred was computed earlier as model_knn.predict(X_test))
# F1 = ((precision * recall) / (precision + recall)) * 2

f1_score(y_test, pred)

0.5964912280701753

In [None]:
# F1 computed by hand from its two ingredients
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)

# F1 = 2 * (precision * recall) / (precision + recall)
2 * ((precision * recall) / (precision + recall))

0.5964912280701753

In [None]:
# Print the per-class evaluation table (precision, recall, f1-score, support)

print( classification_report(y_test, pred) )

              precision    recall  f1-score   support

           0       0.79      0.84      0.81       118
           1       0.64      0.56      0.60        61

    accuracy                           0.74       179
   macro avg       0.71      0.70      0.70       179
weighted avg       0.74      0.74      0.74       179



- 이런 오차 행렬은 병원에서 주로 쓰이는 방식

- 참고 블로그 

https://frhyme.github.io/machine-learning/clf_%ED%8F%89%EA%B0%80%ED%95%98%EA%B8%B0/

In [None]:
df_t.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')

In [None]:
df_t['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64