## 1. 작동 메커니즘

#### 1. 데이터 준비 (Prepare Data)

In [None]:
# Import the required packages
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
# Use a font with Korean glyphs for plot text.
# NOTE(review): 'Malgun Gothic' is presumably only installed on Windows — confirm on other platforms.
plt.rcParams['font.family'] = 'Malgun Gothic'
# Draw the minus sign as an ASCII hyphen rather than the Unicode minus glyph
plt.rcParams['axes.unicode_minus'] = False 

In [None]:
# Load the dataset and preview its first rows
data = pd.read_csv('../data/size_korea_small.csv')
data.head()

In [None]:
# Count missing values in each column
data.isnull().sum()

In [None]:
# Inspect the '키' (height) column
data['키']

In [None]:
# Inspect the '몸무게' (weight) column
data['몸무게']

In [None]:
# Inspect the '체지방율' (body-fat percentage) column
data['체지방율']

In [None]:
# 이상치 발견

In [None]:
(data['체지방율']>80)

In [None]:
data[]

In [None]:
# Draw the histograms of both sexes on a single plot

In [None]:
# Heights ('키') of the rows where '성별' == '남', renamed to '남' and re-indexed from 0.
# NOTE(review): only the male series is built here; no plotting call is present in this cell.
data[data['성별']=='남']['키'].rename('남').reset_index(drop=True)

In [None]:
# Replace the outlier with the body-fat percentage of similarly built people

In [None]:
# Squared Euclidean distance over ('키', '몸무게') from every female row to the
# point (1570.0, 56.6); rows are sorted by distance and the labels ranked
# 2nd to 4th are kept.
# NOTE(review): presumably (1570.0, 56.6) is the outlier's own height/weight,
# so the skipped closest row is the outlier itself — confirm.
near = ((data[data['성별']=='여'][['키','몸무게']] - [1570.0, 56.6])**2).sum(axis=1).sort_values().index[1:4]

In [None]:
# Mean body-fat percentage of those three neighbours
data.loc[near,'체지방율'].mean()

In [None]:
# Overwrite every value above 80 with the neighbours' mean
data.loc[data['체지방율']>80, '체지방율'] = data.loc[near,'체지방율'].mean()

In [None]:
# Check the distributions: a quick exploratory analysis.
# Height vs. body-fat percentage; '남' rows are drawn blue ('b'), '여' rows red ('r').
plt.scatter(data['키'],data['체지방율'],c=data['성별'].replace({'남':'b','여':'r'}),alpha=0.2)

In [None]:
# Height vs. weight, same colour coding
plt.scatter(data['키'],data['몸무게'],c=data['성별'].replace({'남':'b','여':'r'}),alpha=0.2)

In [None]:
# Weight vs. body-fat percentage, same colour coding
plt.scatter(data['몸무게'],data['체지방율'],c=data['성별'].replace({'남':'b','여':'r'}),alpha=0.2)

In [None]:
# Clean and transform the data.
# Features: every column except '성별' (sex). Target: '남' -> 0, '여' -> 1.
# `map` is used instead of `replace` so the result is a numeric column without
# relying on replace's deprecated silent downcasting.
X = data.drop('성별', axis=1)
y = data['성별'].map({'남': 0, '여': 1})

#### 2. 클래스 선택 (Choose class of model)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#### 3. 알고리즘 결정 (Choose hyperparameters)

In [None]:
# Seed NumPy's global random generator (stated intent: identical prediction results between runs)
np.random.seed(0)
knn = KNeighborsClassifier(n_neighbors=3)            # 3. Choose the algorithm (set the hyperparameters)


#### 4. 모델 학습 (Fit model to data)

In [None]:
# `fit` returns the estimator itself, so `model` and `knn` name the same object
model = knn.fit(X, y)                                # 4. Create (fit) the model by training

#### 5. 모델 활용 (Apply model to new data)

In [None]:
# NOTE: both calls are made on X, the same data the model was fitted on
y_predict = model.predict(X)                         # 5. Predict class labels (positive / negative)
y_probas = model.predict_proba(X)                    # Predicted class probabilities

In [None]:
# Compute the accuracy of those predictions against y
from sklearn.metrics import accuracy_score
accuracy_score(y, y_predict)

## 2. 교차 검증(cross validation)

#### 정확도(accuracy) 계산 - 2개로 분할한 경우

In [None]:
# 데이터 1/2로 무작위 분할
from sklearn.model_selection import train_test_split
X1, X2, y1, y2 = train_test_split(X, y, train_size=0.5, random_state=0)

In [None]:
# 데이터 크기 확인
X1.shape, X2.shape, y1.shape, y2.shape
y1.value_counts(), y2.value_counts()                     # y1.value_counts(normalize=True)

In [None]:
# Compute accuracy: train on one half, score on the other, then swap.
# Both scores are printed (only the last one used to be displayed), and the
# builtin round() is used because the score may be a plain Python float,
# which has no .round() method.
print(round(knn.fit(X1, y1).score(X2, y2), 4))
print(round(knn.fit(X2, y2).score(X1, y1), 4))

In [None]:
# Same computation using sklearn's accuracy_score
from sklearn.metrics import accuracy_score
y2_predict = knn.fit(X1, y1).predict(X2)
y1_predict = knn.fit(X2, y2).predict(X1)
# Print both accuracies; builtin round() works for NumPy and Python floats alike
print(round(accuracy_score(y2, y2_predict), 4))
print(round(accuracy_score(y1, y1_predict), 4))

In [None]:
# 참고: knn.score 계산 과정
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
 
# confusion_matrix 만들기
cm = confusion_matrix(y2, knn.predict(X2))
cm

In [None]:
# knn.score가 계산
cm.diagonal().sum() / cm.sum()

In [None]:
# confusion_matri를 heatmap 형태로 그리기
plot_confusion_matrix(knn, X2, y2)

#### 정확도(accuracy) 계산 – 전체 데이터를 K개로 분할한 경우

In [None]:
# Cross-validation with K-Fold (K=5)
from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=5, shuffle=True, random_state=0)
# Run the cross-validation once and reuse the scores; previously it was run
# three times and only the standard deviation was displayed.
scores = cross_val_score(model, X, y, cv=kf)
print(scores)
print(scores.mean(), scores.std())

In [None]:
# Reference: run each K-Fold split by hand and collect the results
accuracy_list = []
for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, so select rows with .iloc
    # (.loc only worked while the index happened to be 0..n-1)
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    # Class counts of train vs. test; sort_index keeps the class rows aligned
    # because value_counts orders by frequency, not by label
    print(np.c_[y_train.value_counts().sort_index(), y_test.value_counts().sort_index()])
    y_predict = knn.fit(X_train, y_train).predict(X_test)
    accuracy_list.append(accuracy_score(y_test, y_predict))
print(accuracy_list)
print(sum(accuracy_list)/len(accuracy_list))

In [None]:
# Reference: inspect the K-Fold indices.
# Materialise the splits once, then key the train/test index arrays by fold number.
folds = list(kf.split(X))
train_idx = {fold: train for fold, (train, _) in enumerate(folds)}
test_idx = {fold: test for fold, (_, test) in enumerate(folds)}
train_idx

## 3. 예측 성능

#### Confusion matrix(혼동표) 

In [None]:
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(model, X_test, y_test)

In [None]:

plot_confusion_matrix(model, X_test, y_test, 
                      normalize='true')

#### Classification report(분류 보고서) 

In [None]:
from sklearn.metrics import classification_report
# Predict the current test split and print per-class precision / recall / F1
y_predict = knn.predict(X_test)
print(classification_report(y_test, y_predict))

## 실습_00

#### 예측 모델 성능지표(performance index)

In [None]:
from sklearn.metrics import confusion_matrix
y_true = ["cat", "ant", "cat", "cat", "ant", "bird", "bird", "cat"]
y_pred = ["ant", "ant", "cat", "cat", "ant",  "cat", "bird", "cat"]
cm = confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"])

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay(confusion_matrix=cm,
                      display_labels=["ant", "bird", "cat"]).plot()

In [None]:
# TODO(exercise): incomplete import — name the metric functions to import before running
from sklearn.metrics import 


In [None]:
# TODO(exercise): incomplete import — name the metric functions to import before running
from sklearn.metrics import 


In [None]:
# accuracy


In [None]:
# precision


In [None]:
# recall



In [None]:
# F1



In [None]:
# Macro Accuracy



In [None]:
# Weighted Accuracy



## 실습_01

#### titanic data 

데이터 준비

In [None]:
from seaborn import load_dataset
from sklearn.model_selection import train_test_split, KFold

In [None]:
titanic = load_dataset('titanic')[['survived', 'pclass', 'sex', 'age', 'sibsp', 'fare']].dropna()

In [None]:
X_t = titanic.drop('survived', axis=1)
X_t['sex'] = X_t['sex'].replace({'male':0, 'female':1})
y_t = titanic['survived']

In [None]:
X_t_train, X_t_test, y_t_train, y_t_test = train_test_split( )

모델 학습

In [None]:
# The three statements were left blank; filled in to match the rest of the file:
# a default KNN classifier and the same shuffled 5-fold splitter.
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
from sklearn.model_selection import cross_val_score
# Cross-validated scores of `model` on the Titanic data using the splitter `kf`
cross_val_score(model, X_t, y_t, cv=kf)

In [None]:
# NOTE(review): this cell and the next repeat the call above unchanged —
# presumably placeholders for trying other models or splitters; confirm.
cross_val_score(model, X_t, y_t, cv=kf)

In [None]:
cross_val_score(model, X_t, y_t, cv=kf)

In [None]:
# plot_confusion_matrix no longer exists in scikit-learn (removed in 1.2), so
# importing it aborts the cell; import its replacement instead.
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
# Fit on the training split
model.fit(X_t_train, y_t_train)

성능 확인

## 4. 하이퍼파라미터 튜닝

#### GridSearchCV

데이터 준비 (Prepare Data)

In [None]:
import pandas as pd
# 데이터 불러오기
data = pd.read_csv('../data/size_korea_small.csv')
# 데이터 정제 및 변환
X = data.drop('성별', axis=1)
y = data['성별'].replace({'남':0,'여':1})

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

클래스와 알고리즘 선택

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Base estimator for the search (the assignment was left blank); its
# hyperparameters are supplied by the search grid.
knn = KNeighborsClassifier()

Hyperparameter별 값 범위 지정

In [None]:
# Candidate values per hyperparameter: 5 x 2 x 4 = 40 combinations in total
hyperparameters = {'n_neighbors': [1, 2, 3, 11, 17],
                   'weights': ['uniform', 'distance'],
                   'leaf_size': [4, 6, 8, 10]}

Hyperparameter 선정과 CV 방식 결정

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over `hyperparameters` with 5-fold CV, scored by accuracy
# (the constructor call was left blank; the settings mirror the
# RandomizedSearchCV call used later in this file)
grid_search = GridSearchCV(knn, param_grid=hyperparameters, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

결과 Hyperparameter 확인

In [None]:
# Report the search result. The f-string fields were empty, which is a syntax
# error; they now show the best CV score, the held-out score and the best
# parameter set.
print()
print(f'optimal train score: {grid_search.best_score_:.3f}')
print(f'test score: {grid_search.score(X_test, y_test):.3f}')
print(f'optimal parameter: {grid_search.best_params_}')
print()

모델 성능 확인

In [None]:
# plot_confusion_matrix was removed from scikit-learn (1.2); use ConfusionMatrixDisplay
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(grid_search, X_test, y_test)
# Predictions must come from the fitted search; assigning the search object
# itself made classification_report fail
y_predict = grid_search.predict(X_test)
y_probas = grid_search.predict_proba(X_test)
print(classification_report(y_test, y_predict))

## 4. 하이퍼파라미터 튜닝

#### 직접 KNN  구현

거리 계산 함수

In [None]:
def knn_manual(x, k=5, X=None, y=None):
    """Predict the label of one sample by majority vote of its k nearest rows.

    The two loose expressions that stood here referenced the undefined names
    `x` and `idx`, and `knn_manual` (applied to X_test further down) was never
    defined; this function packages the same two steps.

    Args:
        x: one sample (a row Series whose labels match the columns of X).
        k: number of neighbours that vote (default 5, as in the original).
        X: reference feature table; defaults to the notebook's X_train.
        y: labels sharing X's index; defaults to the notebook's y_train.

    Returns:
        The most frequent label among the k rows of X closest to x.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    # Squared Euclidean distance is enough for ranking neighbours
    idx = (((X - x) ** 2)
           .sum(axis=1)
           .sort_values()
           .head(k).index)
    # Majority vote among the selected neighbours
    return y.loc[idx].value_counts().idxmax()

예측

In [None]:
# Call knn_manual once per row of X_test (axis=1 passes each row to the function)
y_pred_manual = X_test.apply(knn_manual, axis=1)

예측 정확도 계산

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# All three metrics take (y_true, y_pred) in that order; accuracy_score was
# called with the arguments swapped, which was harmless for accuracy but
# inconsistent with the two calls above it.
print(classification_report(y_test, y_pred_manual))
print(confusion_matrix(y_test, y_pred_manual))
print('accuracy is',accuracy_score(y_test, y_pred_manual))

## 4. 하이퍼파라미터 튜닝

#### RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, plot_confusion_matrix
randomized_search = RandomizedSearchCV(model, param_distributions= hyperparameters, cv=5, 
                                       n_iter=30, verbose=2, scoring='accuracy')
randomized_search.fit(X_train, y_train)

모델 학습

In [None]:
print(randomized_search.best_estimator_)
print(randomized_search.best_estimator_.get_params())
print('optimal train score: {:.3f}'.format(randomized_search.best_score_))
print('optimal test score: {:.3f}'.format(randomized_search.score(X_test, y_test)))
print('optimal parameter: {}'.format(randomized_search.best_params_))

성능 확인

In [None]:
plot_confusion_matrix(randomized_search, X_test, y_test)
y_predict = randomized_search.predict(X_test)
y_probas = randomized_search.predict_proba(X_test)
print(classification_report(y_test, y_predict))

In [None]:
randomized_search.cv_results_

In [None]:
pd.DataFrame(randomized_search.cv_results_).columns
(pd.DataFrame(randomized_search.cv_results_)[['params', 'mean_test_score', 'rank_test_score']]
 .sort_values(by='rank_test_score').head(10))

## 실습_02

#### iris data + RandomizedSearchCV 

In [None]:
from sklearn.datasets import load_iris
# The import list was left blank; these are the names this exercise uses below
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

데이터 준비

In [None]:
# Load iris and take the feature matrix and target from the returned bunch
iris = load_iris()
X_r, y_r = iris['data'], iris['target']

In [None]:
# Stratified 80/20 split with a fixed seed
X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(X_r, y_r, test_size=0.2, stratify=y_r, random_state=0)

In [None]:
# The import was left blank; the exercise tunes a KNN classifier, and `model`
# is the estimator the later cross-validation cell scores
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

모델 학습

In [None]:
hyperparameters = {'n_neighbors': list(range(5,20,2)),                  # model.get_params().keys()
                   'weights': ['uniform', 'distance'],
                   'leaf_size': list(range(4,10)),
                   'algorithm': ['ball_tree', 'kd_tree'],  'p': [1,2]}

In [None]:
randomized_search = RandomizedSearchCV(, hyperparameters, cv=5, n_iter=20, verbose=True)
randomized_search.fit(X_r_train, y_r_train)

성능 확인

In [None]:
y_r_pred = randomized_search.predict(X_r_test)
randomized_search.predict_proba(X_r_test)
print(classification_report(y_r_test, y_r_pred))

In [None]:
from sklearn.metrics import plot_confusion_matrix, classification_report
plot_confusion_matrix(randomized_search, X_r_test, y_r_test)

## 실습_02

#### iris data + 소비자행태자료

훈련과 검증데이터 분리와 모델 정확성

In [None]:
import numpy as np
accuracy = []
for i in np.arange(0, 20) :
    X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(X_r, y_r, test_size= 0.2, random_state=i)
    accuracy.append(cross_val_score(model, X_r_train, y_r_train, cv=5).mean())
    
pd.DataFrame(accuracy).plot(kind= 'bar', rot='70', figsize=(20,4))

In [None]:
pd.DataFrame(accuracy).plot.hist()

## 수신자운영특성 곡선 
(Receiver Operating Characteristic Curve, ROC)

#### 5. 각종 성능지표

데이터 및 모델 준비

In [None]:
import pandas as pd
# 데이터 불러오기
data = pd.read_csv('../data/size_korea_small.csv')
# 데이터 정제 및 변환
X = data.drop('성별', axis=1)
y = data['성별'].replace({'남':0,'여':1})

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

모델 학습

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Base estimator; 'n_neighbors' is also in the search space below, so the
# value 3 is replaced for every candidate the search evaluates
model = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Search space for the randomized search
hyperparameters = {'n_neighbors': list(range(5,20,2)),                  # model.get_params().keys()
                   'weights': ['uniform', 'distance'],
                   'leaf_size': list(range(4,10)),
                   'algorithm': ['ball_tree', 'kd_tree'],  'p': [1,2]}

In [None]:
# 20 sampled candidates, 5-fold CV each.
# NOTE(review): RandomizedSearchCV is not imported in this section; it relies on an earlier cell's import.
randomized_search = RandomizedSearchCV(model, hyperparameters, cv=5, n_iter=20, verbose=True)
randomized_search.fit(X_train, y_train)

분류임계값(decision threshold)에 따른 재현율과 정밀도

In [None]:
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Malgun Gothic'
# Class probabilities for the test set; column 1 is the second class
y_probas = randomized_search.predict_proba(X_test)
second_class_proba = y_probas[:, 1]
# Cumulative histogram of that probability
plt.figure(figsize=(10, 4))
plt.hist(second_class_proba, bins=20, cumulative=True)
# Number of samples whose probability is at least 0.5
n_at_least_half = (second_class_proba >= 0.5).sum()
print('50% 이상의 확률 예측 자료는 {} 개'.format(n_at_least_half))

In [None]:
# Predicted labels for the test set and how often each label occurs
y_predict = randomized_search.predict(X_test)
unique, counts = np.unique(y_predict, return_counts=True)
print(unique, counts)

In [None]:
# Probabilities (rounded to 2 decimals) next to the predicted label.
# Column 0 is named '남' and column 1 '여', matching the 0/1 target encoding.
result = pd.DataFrame(y_probas, columns =['남', '여']).round(2)
result['분류'] = y_predict
result

In [None]:
# Side-by-side histograms of the two probability columns
fig, axs = plt.subplots(1,2, figsize=(12,4))
axs[0].hist(y_probas[:,0], color ='blue', label='남'); axs[0].legend()
axs[1].hist(y_probas[:,1], color ='red', label='여'); axs[1].legend()

In [None]:
# 정밀도, 재현율 그려보기
from sklearn.metrics import precision_recall_curve
p, r, thresholds = precision_recall_curve(y_test, y_probas[:, 1])
pd.DataFrame(np.c_[p, r].T, index=['정밀도', '재현율']).round(3).T.plot()

참고: pandas로 confusion_matrix, ROC 구현하기

In [None]:
# Build a confusion matrix by hand with pandas.
# Columns: both class probabilities, the predicted label and the true label.
result = pd.DataFrame(np.c_[y_probas, y_predict, y_test], 
                      columns =['남', '여', 'pred', 'test'])
# Rows: whether the '여' probability exceeds 0.5; columns: true label
result.groupby(result['여']>0.5)['test'].value_counts().unstack()

In [None]:
# Build a ROC curve by hand with pandas
# (this first expression repeats the table above; its value is not displayed here)
result.groupby(result['여']>0.5)['test'].value_counts().unstack()

roc = []
# Sweep 100 thresholds from 0 to 1
for th in np.linspace(0,1,100):
    # Counts keyed by (probability above threshold?, true label)
    temp = result.groupby(result['여']>th)['test'].value_counts()
    # True-positive rate: positives above the threshold / all positives (0 when none are above)
    tpr = (temp[True, 1] if (True, 1) in temp.index else 0) / temp[:, 1].sum()
    # False-positive rate: negatives above the threshold / all negatives
    fpr = (temp[True, 0] if (True, 0) in temp.index else 0) / temp[:, 0].sum()
    roc.append([fpr, tpr])

# x axis: false-positive rate, y axis: true-positive rate
plt.plot(np.array(roc)[:,0], np.array(roc)[:,1])

참고: 양성(1=여성)기준 수신자운영특성곡선과 정밀도 재현율 곡선

In [None]:
from sklearn.metrics import RocCurveDisplay, roc_curve
# Use the best estimator found by the search; score = probability of the second class
clf = randomized_search.best_estimator_
y_score = clf.predict_proba(X_test)[:, 1]
# ROC points, treating the second class as positive
fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=clf.classes_[1])

In [None]:
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve
# Precision/recall points for the same positive class
prec, recall, _ = precision_recall_curve(y_test, y_score, pos_label=clf.classes_[1])

In [None]:
# ROC curve on the left axis, precision-recall curve on the right
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=axs[0])
PrecisionRecallDisplay(precision=prec, recall=recall).plot(ax=axs[1])

#### 기타 성능지표 - lift, log_loss

In [None]:
# Lift curve from the true labels and the class probabilities.
# NOTE(review): scikitplot is a separate third-party package — confirm it is
# installed and compatible with the environment's SciPy/scikit-learn versions.
import scikitplot as skplt
skplt.metrics.plot_lift_curve(y_test, y_probas)

In [None]:

# Cumulative gain chart from the same inputs
skplt.metrics.plot_cumulative_gain(y_test, y_probas)

In [None]:
from sklearn.metrics import log_loss
# Log loss of the predicted probabilities against the true labels
log_loss(y_test, y_probas)

## 실습_03

#### Titanic RandomizedSearchCV,  성능 지표 구하기

데이터 준비

In [None]:
from seaborn import load_dataset
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import plot_confusion_matrix, classification_report

In [None]:
titanic = load_dataset('titanic')[['survived', 'pclass', 'sex', 'age', 'sibsp', 'fare']].dropna()
X_t = titanic.drop('survived', axis=1)
X_t['sex'] = X_t['sex'].replace({'male':0, 'female':1})
y_t = titanic['survived']

In [None]:
X_t_train,X_t_test,y_t_train,y_t_test = train_test_split(X_t,y_t,test_size=0.2,stratify=y_t,random_state=0)

모델 학습

In [None]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [None]:
hyperparameters = {'n_neighbors': list(range(5,20,2)),                  # model.get_params().keys()
                   'weights': ['uniform', 'distance'],
                   'leaf_size': list(range(4,10)),
                   'algorithm': ['ball_tree', 'kd_tree'],  'p': [1,2]}

In [None]:
randomized_search = RandomizedSearchCV(model, hyperparameters, cv=5, n_iter=20, verbose=True)
randomized_search.fit(X_t_train, y_t_train)

성능 확인

In [None]:
y_t_pred = randomized_search.predict(X_t_test)
y_t_probas = randomized_search.predict_proba(X_t_test)
plot_confusion_matrix(randomized_search, X_t_test, y_t_test)
print(classification_report(y_t_test, y_t_pred))

In [None]:
from sklearn.metrics import log_loss
log_loss(y_t_test, y_t_probas)

## 6. 파이프라인

#### 데이터 준비

In [None]:
# 필요 패키지 import
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False 

In [None]:
# 데이터 불러오기
data = pd.read_csv('../data/size_korea_small.csv')
# 데이터 정제 및 변환
X = data.drop('성별', axis=1)
y = data['성별'].replace({'남':0,'여':1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

클래스와 알고리즘 선택

In [None]:
# Plan: reduce the number of features first, then fit the classifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination driven by a logistic regression: keep 25
# features, with step=20 controlling how many are removed per iteration.
# NOTE(review): assumes the data has more than 25 feature columns — confirm.
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=25, step=20)
# Final classifier trained on the selected features
lr = LogisticRegression()

#### 파이프라인으로 연결

In [None]:
from sklearn.pipeline import Pipeline
# Chain feature selection and classification into a single estimator
clf = Pipeline(steps=[('feature_selector', rfe_selector),
                      ('classifier', lr)])

적합 및 성능 확인

In [None]:
# Fit both steps on the training split, then report the score on the test split
clf.fit(X_train, y_train)
print(f'model score: {clf.score(X_test, y_test):.3f}')

모델 저장

In [None]:
from joblib import dump, load
# Persist the fitted pipeline to disk.
# NOTE(review): presumably '../working/' must already exist — confirm.
dump(clf, '../working/clf.joblib')

In [None]:
# Select features with RFE
# Load the variant of the data that contains nulls and drop incomplete rows
data = pd.read_csv('../data/size_korea_w_null.csv').dropna()
X = data.drop(['성별','기초대사량평가'], axis=1)
# `map` gives a numeric target without relying on replace's deprecated downcasting
y = data['성별'].map({'남': 0, '여': 1})
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=5, step=20)
rfe_selector.fit(X, y)
# Boolean mask over the columns of X: True for the features RFE kept
rfe_support = rfe_selector.get_support()
# Names of the kept columns
rfe_feature = X.columns[rfe_support].tolist()
print(rfe_feature)

## 7. 모델 저장 및 활용

In [None]:
# Load the previously trained and saved model
# NOTE: joblib files are pickle-based; only load files from a trusted source
from joblib import dump, load
model = load('../working/clf.joblib')

In [None]:
# Apply the previously trained model directly
y_pred = model.predict(X_test)

In [None]:
# Compare with the score the model had when it was trained
print(f"model score: {model.score(X_test, y_test):.3f}")