
# KNN (분류)

## data load/ library import

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
data=pd.read_csv("breast-cancer-wisconsin.csv")
x=data[data.columns[1:10]]
y=data[['Class']]

In [3]:
print(x.info())
print(y.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Clump_Thickness              683 non-null    int64
 1   Cell_Size                    683 non-null    int64
 2   Cell_Shape                   683 non-null    int64
 3   Marginal_Adhesion            683 non-null    int64
 4   Single_Epithelial_Cell_Size  683 non-null    int64
 5   Bare_Nuclei                  683 non-null    int64
 6   Bland_Chromatin              683 non-null    int64
 7   Normal_Nucleoli              683 non-null    int64
 8   Mitoses                      683 non-null    int64
dtypes: int64(9)
memory usage: 48.1 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Class   683 non-null    int64
dtypes: int64(1)
memory usa

In [2]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(x,y,stratify=y,random_state=42)
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(x_train)
x_scaled_train=scaler.transform(x_train)
x_scaled_test=scaler.transform(x_test)

## KNN Load

In [4]:
from sklearn.neighbors import KNeighborsClassifier

### Model 학습

In [5]:
model=KNeighborsClassifier()
model.fit(x_scaled_train,y_train)
pred_train=model.predict(x_scaled_train)
model.score(x_scaled_train,y_train)

0.984375

### 훈련 Data set 학습 결과 (혼동행렬)

In [10]:
from sklearn.metrics import confusion_matrix

In [11]:
confusion_train=confusion_matrix(y_train,pred_train)

In [12]:
print(confusion_train)

[[331   2]
 [  6 173]]


### 분류 결과 report 확인

In [13]:
from sklearn.metrics import classification_report
cfreport_train=classification_report(y_train,pred_train)
print(cfreport_train)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       333
           1       0.99      0.97      0.98       179

    accuracy                           0.98       512
   macro avg       0.99      0.98      0.98       512
weighted avg       0.98      0.98      0.98       512



### test set 결과 확인

In [14]:
pred_test=model.predict(x_scaled_test)

In [15]:
model.score(x_scaled_test,y_test)

0.9532163742690059

In [16]:
confusion_test=confusion_matrix(y_test,pred_test)
print(confusion_test)

[[106   5]
 [  3  57]]


In [17]:
from sklearn.metrics import classification_report
cfreport_test=classification_report(y_test,pred_test)
print(cfreport_test)

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       111
           1       0.92      0.95      0.93        60

    accuracy                           0.95       171
   macro avg       0.95      0.95      0.95       171
weighted avg       0.95      0.95      0.95       171



## Hyper Parameter 찾기 위한 Grid/Random Search

- hyper parameter : n_neighbors (default : 5)

### Grid Search

#### library load

In [20]:
from sklearn.model_selection import GridSearchCV
param_grid={"n_neighbors":[1,3,5,7,9,11]}
grid_search=GridSearchCV(KNeighborsClassifier(),param_grid,cv=5,return_train_score=True)
# 파라미터 찾기
grid_search.fit(x_scaled_train,y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11]},
             return_train_score=True)

#### 최적화 값 찾기 

In [21]:
print("Best Parameter : {}".format(grid_search.best_params_))
print("Best Cross-validity Score : {:.4f}".format(grid_search.best_score_))
print('Test set Score : {:.4f}'.format(grid_search.score(x_scaled_test,y_test)))

Best Parameter : {'n_neighbors': 3}
Best Cross-validity Score : 0.9824
Test set Score : 0.9532


### Random Search

#### library load

In [26]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
param_distribs={"n_neighbors":randint(low=1,high=20)} # 무작위 난수 생성
random_search=RandomizedSearchCV(KNeighborsClassifier(),param_distributions=param_distribs,n_iter=20,cv=5,return_train_score=True)
# 파라미터 찾기

random_search.fit(x_scaled_train,y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_iter=20,
                   param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000026A46DA56D0>},
                   return_train_score=True)

#### 최적화 값 찾기

In [27]:
print("Best Parameter : {}".format(random_search.best_params_))
print("Best Cross-validity Score : {:.4f}".format(random_search.best_score_))
print('Test set Score : {:.4f}'.format(random_search.score(x_scaled_test,y_test)))

Best Parameter : {'n_neighbors': 3}
Best Cross-validity Score : 0.9824
Test set Score : 0.9532


# KNN(회귀)

## data 준비

In [68]:
data2=pd.read_csv("house_price.csv")
data2.describe()

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
count,17689.0,17689.0,17689.0,17689.0,17689.0,17689.0
mean,27.378823,3.671141,0.213278,2.952117,5.244001,189043.439313
std,11.28023,1.525937,0.051167,0.731573,1.184922,95487.122628
min,1.0,0.4999,0.1,0.75,1.64,14999.0
25%,18.0,2.5329,0.177464,2.47027,4.426829,114400.0
50%,28.0,3.4539,0.204104,2.854962,5.190779,171100.0
75%,36.0,4.5918,0.240157,3.316092,5.953728,242700.0
max,51.0,9.9055,0.498127,6.954023,11.901869,500000.0


In [79]:
x=data2[data2.columns[1:5]]
y=data2[["house_value"]]

In [80]:
x

Unnamed: 0,income,bedrooms,households,rooms
0,6.7770,0.141112,2.442244,8.103960
1,6.0199,0.160984,2.726688,5.752412
2,5.1155,0.249061,1.902676,3.888078
3,4.7109,0.231383,1.913669,4.508393
4,4.5625,0.255583,3.092664,4.667954
...,...,...,...,...
17684,2.3013,0.214583,2.748299,4.897959
17685,2.6750,0.246622,3.428571,4.698413
17686,2.3667,0.340771,1.876812,3.572464
17687,2.1000,0.386107,2.987805,3.774390


In [81]:
y

Unnamed: 0,house_value
0,500000
1,500000
2,500000
3,500000
4,500000
...,...
17684,26600
17685,22500
17686,17500
17687,14999


### test / train set 분리

In [84]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test =train_test_split(x,y,stratify=y,random_state=42)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [75]:
from sklearn.model_selection import train_test_split
x2_train,x2_test,y2_train,y2_test= train_test_split(x2,y2,stratify=y,random_state=42)
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(x2_train)
x2_scaled_train=scaler.transform(x2_train)
x2_scaled_test=scaler.transform(x2_test)

ValueError: Found input variables with inconsistent numbers of samples: [17689, 683]