# 2.1 핵심 개념
* k-최근접 이웃법 : 각 데이터들 간의 거리를 측정하여 가까운 k개의 다른 데이터들의 레이블을 참조하여 분류하는 방법
* 거리 측정 방법 : 유클리드 거리(Euclidean distance) 계산법, 민코프스키 거리(Minkowski distance) 계산법
* 중요 하이퍼파라미터 : '몇 개의 케이스들을 기준으로 동일 범주, 동일 값을 분류 혹은 예측할 것인가'
* K 찾는 방법
   * sqrt(데이터 수)
* K 값이 작을수록 정교한 분류와 예측 가능→과대적합
* K 값이 클수록 주변에 많은 케이스들의 평균적인 군집과 평균값으로 분류와 예측→과소적합

# 2.3 분석 코드

In [3]:
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including scikit-learn data-conversion and deprecation warnings.
warnings.filterwarnings('ignore')

import pandas as pd

# Load the classification data set from the local CSV file.
data = pd.read_csv("../data/breast-cancer-wisconsin.csv", encoding='utf-8')

# Features: every column except the first and the last one
# (presumably an identifier and the label -- confirm against the CSV).
X = data[data.columns[1:-1]]
# Target as a 1-D Series. A single-column DataFrame (data[['Class']]) makes
# KNeighborsClassifier.fit emit DataConversionWarning, which the blanket
# filter above would silently hide.
y = data['Class']

from sklearn.model_selection import train_test_split

# Stratify on the label so both splits keep the class ratio; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

## Part1. 분류 (Classification)

In [5]:
# Fit a baseline k-NN classifier with the library's default hyperparameters.
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and fitting can be chained.
model = KNeighborsClassifier().fit(X_train, y_train)
pred_train = model.predict(X_train)

# Training-split score, displayed as the cell result.
model.score(X=X_train, y=y_train)

0.984375

In [6]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the training-split predictions
# (rows: true classes, columns: predicted classes).
confusion_train = confusion_matrix(y_true=y_train, y_pred=pred_train)
print("훈련데이터 오차행렬 : \n", confusion_train)

훈련데이터 오차행렬 : 
 [[331   2]
 [  6 173]]


In [7]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 summary for the training-split predictions.
cfreport_train = classification_report(y_true=y_train, y_pred=pred_train)
print("분류예측 레포트 : \n", cfreport_train)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       333
           1       0.99      0.97      0.98       179

    accuracy                           0.98       512
   macro avg       0.99      0.98      0.98       512
weighted avg       0.98      0.98      0.98       512



In [8]:
# Predict the held-out split with the fitted baseline model.
pred_test = model.predict(X_test)

# Held-out score, displayed as the cell result.
model.score(X=X_test, y=y_test)

0.9532163742690059

In [9]:
# Confusion matrix of the held-out predictions
# (rows: true classes, columns: predicted classes).
confusion_test = confusion_matrix(y_true=y_test, y_pred=pred_test)
print("테스트데이터 오차행렬 : \n", confusion_test)

테스트데이터 오차행렬 : 
 [[106   5]
 [  3  57]]


In [10]:
# Per-class precision / recall / F1 summary for the held-out predictions.
cfreport_test = classification_report(y_true=y_test, y_pred=pred_test)
print("분류예측 레포트 : \n", cfreport_test)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       0.97      0.95      0.96       111
           1       0.92      0.95      0.93        60

    accuracy                           0.95       171
   macro avg       0.95      0.95      0.95       171
weighted avg       0.95      0.95      0.95       171



In [16]:
# IPython-only help query (trailing `?`): shows the KNeighborsClassifier
# documentation in the notebook. This is not valid syntax in plain Python.
KNeighborsClassifier?

In [15]:
# Grid search: cross-validate every odd neighbour count from 1 to 9 (5 folds).
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={'n_neighbors': list(range(1, 10, 2))},
    cv=5,
    return_train_score=True,
)
# Last expression of the cell: fits the search and displays the fitted object.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 9]},
             return_train_score=True)

In [21]:
# Report the winning setting, its mean cross-validation score, and the score
# of the refitted best model on the held-out split.
print(
    f"Best Parameter : {grid_search.best_params_}",
    f"Best Score : {grid_search.best_score_:.4f}",
    f"TestSet Score : {grid_search.score(X_test, y_test):.4f}",
    sep="\n",
)

Best Parameter : {'n_neighbors': 3}
Best Score : 0.9824
TestSet Score : 0.9532


In [27]:
# Randomized search: draw 100 candidate neighbour counts from the integers
# 1..19 (scipy's randint excludes `high`) and cross-validate each with 5 folds.
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    KNeighborsClassifier(),
    param_distributions={'n_neighbors': randint(low=1, high=20)},
    n_iter=100,
    cv=5,
    return_train_score=True,
    # Seed the candidate sampler so the same settings are drawn on every run,
    # in line with the fixed seed already used for the train/test split.
    random_state=42,
)
# Last expression of the cell: fits the search and displays the fitted object.
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_iter=100,
                   param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000022841D98908>},
                   return_train_score=True)

In [28]:
# Report the winning setting, its mean cross-validation score, and the score
# of the refitted best model on the held-out split (full float precision).
print(
    f"Best Parameter : {random_search.best_params_}",
    f"Best Score : {random_search.best_score_}",
    f"TestSet Score : {random_search.score(X_test, y_test)}",
    sep="\n",
)

Best Parameter : {'n_neighbors': 3}
Best Score : 0.9823910146582906
TestSet Score : 0.9532163742690059


## Part2. 회귀(Regression)

In [30]:
# Prepare the data for the regression analysis.
data2 = pd.read_csv("../data/house_price.csv", encoding='utf-8')

# Features are the columns at positions 1-4; the target is kept as a
# single-column DataFrame.
X = data2.loc[:, data2.columns[1:5]]
y = data2.loc[:, ['house_value']]

# Default split proportions with a fixed seed for reproducibility.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [34]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits (train first, then test).
scaler_minmax = MinMaxScaler().fit(X_train)
X_scaled_minmax_train, X_scaled_minmax_test = (
    scaler_minmax.transform(split) for split in (X_train, X_test)
)

In [36]:
# Fit a baseline k-NN regressor with the library's default hyperparameters.
from sklearn.neighbors import KNeighborsRegressor

# fit() returns the estimator itself, so construction and fitting can be chained.
model = KNeighborsRegressor().fit(X_scaled_minmax_train, y_train)
pred_train = model.predict(X_scaled_minmax_train)

# Training-split score, displayed as the cell result.
model.score(X=X_scaled_minmax_train, y=y_train)

0.6804607237174459

In [38]:
# Predict the scaled held-out split with the fitted baseline regressor.
pred_test = model.predict(X_scaled_minmax_test)

# Held-out score, displayed as the cell result.
model.score(X=X_scaled_minmax_test, y=y_test)

0.5541889571372401

In [40]:
# RMSE: square root of the mean squared error on each split.
import numpy as np
from sklearn.metrics import mean_squared_error

MSE_train = mean_squared_error(y_true=y_train, y_pred=pred_train)
MSE_test = mean_squared_error(y_true=y_test, y_pred=pred_test)
print(
    f"훈련 테이터 RMSE : {np.sqrt(MSE_train):.4f}",
    f"테스트 데이터 RMSE : {np.sqrt(MSE_test):.4f}",
    sep="\n",
)

훈련 테이터 RMSE : 53952.6980
테스트 데이터 RMSE : 63831.9166


In [45]:
# Grid search: cross-validate every odd neighbour count from 1 to 11 (5 folds)
# on the min-max scaled training features.
grid_search = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid={'n_neighbors': list(range(1, 12, 2))},
    cv=5,
    return_train_score=True,
)
# Last expression of the cell: fits the search and displays the fitted object.
grid_search.fit(X_scaled_minmax_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11]},
             return_train_score=True)

In [47]:
# Report the winning setting, its mean cross-validation score, and the score
# of the refitted best model on the scaled held-out split.
print(
    f"Best Parameter : {grid_search.best_params_}",
    f"Best Score : {grid_search.best_score_:.4f}",
    f"TestSet Score : {grid_search.score(X_scaled_minmax_test, y_test):.4f}",
    sep="\n",
)

Best Parameter : {'n_neighbors': 11}
Best Score : 0.5638
TestSet Score : 0.5880


In [58]:
# Randomized search: draw 20 candidate neighbour counts from the integers
# 1..19 (scipy's randint excludes `high`) and cross-validate each with 5 folds
# on the min-max scaled training features.
random_search = RandomizedSearchCV(
    KNeighborsRegressor(),
    param_distributions={'n_neighbors': randint(low=1, high=20)},
    cv=5,
    n_iter=20,
    return_train_score=True,
    # Seed the candidate sampler so the same settings are drawn on every run,
    # in line with the fixed seed already used for the train/test split.
    random_state=42,
)
# Last expression of the cell: fits the search and displays the fitted object.
random_search.fit(X_scaled_minmax_train, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsRegressor(), n_iter=20,
                   param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000022846D845C0>},
                   return_train_score=True)

In [59]:
# Report the winning setting, its mean cross-validation score, and the score
# of the refitted best model on the scaled held-out split.
print(
    f"Best Parameter : {random_search.best_params_}",
    f"Best Score : {random_search.best_score_:.4f}",
    f"TestSet Score : {random_search.score(X_scaled_minmax_test, y_test):.4f}",
    sep="\n",
)

Best Parameter : {'n_neighbors': 19}
Best Score : 0.5777
TestSet Score : 0.6004
