# KNN(회귀)

## data/library 준비

In [5]:
import warnings

# Suppress every warning from here on. This is done before pandas is
# imported, so warnings raised while importing pandas are hidden as well.
warnings.filterwarnings("ignore")

import pandas as pd

# Read the dataset from the working directory.
data = pd.read_csv("house_price.csv")

# Features: the columns at positions 1 through 4 (position 0 is skipped).
x = data[
    data.columns[1:5]
]

# Target: kept as a one-column DataFrame (double brackets), not a Series.
y = data[["house_value"]]

In [6]:
# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in print() additionally emits a "None" line after each summary.
for frame in (x, y):
    print(frame.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17689 entries, 0 to 17688
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   income      17689 non-null  float64
 1   bedrooms    17689 non-null  float64
 2   households  17689 non-null  float64
 3   rooms       17689 non-null  float64
dtypes: float64(4)
memory usage: 552.9 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17689 entries, 0 to 17688
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   house_value  17689 non-null  int64
dtypes: int64(1)
memory usage: 138.3 KB
None


## test/train set 분리

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Split features and target into train/test partitions; the fixed seed makes
# the shuffle repeatable. The split ratio is left at the library default.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Learn the min/max scaling parameters from the training features only, then
# apply that same fitted transform to both partitions.
scaler = MinMaxScaler().fit(x_train)
x_scaled_train, x_scaled_test = (
    scaler.transform(features) for features in (x_train, x_test)
)

## model 적용

In [13]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline k-nearest-neighbours regressor with all-default hyperparameters,
# trained on the scaled training features.
model = KNeighborsRegressor().fit(x_scaled_train, y_train)

# Predictions on the data the model was trained on (reused for the RMSE below).
pred_train = model.predict(x_scaled_train)

# Last expression of the cell, so its value is displayed as the cell output:
# the estimator's score on the training data.
model.score(x_scaled_train, y_train)

0.6804607237174459

In [14]:
# Predictions for the held-out test features (reused for the RMSE below).
pred_test = model.predict(X=x_scaled_test)

# Displayed as the cell output: the estimator's score on the test data.
model.score(X=x_scaled_test, y=y_test)

0.5541889571372401

## RMSE 확인하기

In [15]:
import numpy as np

In [17]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the baseline model on each partition.
MSE_train = mean_squared_error(y_true=y_train, y_pred=pred_train)
MSE_test = mean_squared_error(y_true=y_test, y_pred=pred_test)

# RMSE is the square root of the MSE; print train first, then test,
# one value per line.
print(np.sqrt(MSE_train), np.sqrt(MSE_test), sep="\n")

53952.69804097723
63831.91662964773


## hyperparameter 최적화

### Grid Search

In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbour counts: the odd numbers 1, 3, ..., 11.
param_grid = {"n_neighbors": list(range(1, 12, 2))}

# Exhaustive search over the grid with 5-fold cross-validation; training-fold
# scores are recorded alongside the validation scores.
grid_search = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)

# Find the best parameter (the fitted search object is the cell output).
grid_search.fit(x_scaled_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11]},
             return_train_score=True)

#### 결과 확인하기

In [20]:
# Report the winning parameter, its mean cross-validated score, and the score
# of the search object on the held-out test set -- one line each.
print(
    "Best Parameter : {}".format(grid_search.best_params_),
    "Best Cross-validity Score : {:.4f}".format(grid_search.best_score_),
    'Test set Score : {:.4f}'.format(grid_search.score(x_scaled_test, y_test)),
    sep="\n",
)

Best Parameter : {'n_neighbors': 11}
Best Cross-validity Score : 0.5638
Test set Score : 0.5880


### Random Search

In [22]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Discrete uniform distribution for the neighbour count. scipy's randint
# excludes `high`, so values are drawn from 1 to 19 inclusive.
param_distribs = {"n_neighbors": randint(low=1, high=20)}

# Randomized search: 20 sampled candidates, each evaluated with 5-fold
# cross-validation; training-fold scores are recorded as well.
# random_state is fixed (same seed as the train/test split) so the sampled
# candidates -- and therefore the reported best parameter and scores -- are
# identical on every Restart & Run All. Without it each run draws a different
# set of n_neighbors values and the results are not reproducible.
random_search = RandomizedSearchCV(
    KNeighborsRegressor(),
    param_distributions=param_distribs,
    n_iter=20,
    cv=5,
    return_train_score=True,
    random_state=42,
)

# Find the best parameter (the fitted search object is the cell output).
random_search.fit(x_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsRegressor(), n_iter=20,
                   param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019CFD8394F0>},
                   return_train_score=True)

#### 결과 확인하기

In [23]:
# Build the three report lines (best parameter, its mean cross-validated
# score, score on the held-out test set) and print them together.
report_lines = [
    "Best Parameter : {}".format(random_search.best_params_),
    "Best Cross-validity Score : {:.4f}".format(random_search.best_score_),
    'Test set Score : {:.4f}'.format(random_search.score(x_scaled_test, y_test)),
]
print("\n".join(report_lines))

Best Parameter : {'n_neighbors': 19}
Best Cross-validity Score : 0.5777
Test set Score : 0.6004
