## K-nearest neighbors

Check the performance of the model using KNN.

1. with default setting
2. with optimized hyperparamters using grid search
3. Baseline model with default setting
4. Baseline model with optimized hyperparamters using grid search 

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [3]:
# import data
PARENT = "Predicting_House_Prices"
path = Path(PARENT).parent / "../Data/X_train_model2.csv"
X_train = pd.read_csv(path)

path2 = Path(PARENT).parent / "../Data/y_train_model2.csv"
y_train = pd.read_csv(path2)

path3 = Path(PARENT).parent / "../Data/X_valid_model2.csv"
X_valid = pd.read_csv(path3)

path4 = Path(PARENT).parent / "../Data/y_valid_model2.csv"
y_valid = pd.read_csv(path4)

In [4]:
X_train = X_train.drop(columns="Unnamed: 0")
y_train = y_train.drop(columns="Unnamed: 0")
X_valid = X_valid.drop(columns="Unnamed: 0")
y_valid = y_valid.drop(columns="Unnamed: 0")

In [5]:
X_valid.describe()

Unnamed: 0,yearBuilt,livingArea,bathrooms,bedrooms,parking,garageSpaces,hasGarage,pool,spa,isNewConstruction,...,county_Wayne County,county_Wheeler County,county_White County,county_Whitfield County,county_Wilcox County,county_Wilkes County,county_Wilkinson County,county_Worth County,county_Yolo County,county_Yuba County
count,6537.0,6537.0,6537.0,6537.0,6537.0,6537.0,6537.0,6537.0,6537.0,6537.0,...,6537.0,6537.0,6537.0,6537.0,6537.0,6537.0,6537.0,6537.0,6537.0,6537.0
mean,-0.003228,-0.006875,-0.000329,0.001178,0.769772,0.003339,0.587578,0.134465,0.17791,0.030442,...,0.000765,0.000306,0.000612,0.003365,0.000153,0.000612,0.000306,0.000459,0.003518,0.002754
std,1.006085,0.02532,0.992459,0.986828,0.421011,1.005247,0.492308,0.341178,0.382467,0.171814,...,0.027648,0.01749,0.024731,0.057919,0.012368,0.024731,0.01749,0.021419,0.059216,0.052406
min,-6.093072,-0.041445,-1.262212,-1.990809,0.0,-0.894006,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.668783,-0.020694,-0.5293,-0.344171,1.0,-0.894006,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.091309,-0.012452,-0.5293,-0.344171,1.0,-0.083681,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.782301,-0.000236,0.203613,0.479148,1.0,0.726645,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.576942,0.610073,12.663129,8.712339,1.0,16.933163,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
print(X_train.shape, X_valid.shape)

(15421, 1759) (6609, 1759)


### with default setting

In [7]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

knn_score = knn.score(X_valid, y_valid)
knn_preds = knn.predict(X_valid)

print(
    f" Mean Squared Error : {mean_squared_error(y_valid, knn_preds)} \n"
    f" Score : {knn_score:.5f} \n",
)

 Mean Squared Error : 1552023753043.586 
 Score : 0.54149 



In [8]:
(mean_squared_error(y_valid, knn_preds))**0.5

1245802.4534586477

### with optimized hyperparamters using grid search

In [40]:
from sklearn.model_selection import train_test_split

# 30% of training data is used for hyperparameter tuning for efficient search
X_temp, _, y_temp, _ = train_test_split(X_train, y_train, random_state=0, test_size=0.7)
print(len(X_temp), len(y_temp))

4575 4575


In [41]:
# List Hyperparameters
grid_params = { 'n_neighbors' : np.arange(1, 16),
                'weights' : ['uniform','distance'],
                'metric' : ['minkowski','euclidean','manhattan'],
                'p': [1,2]}

In [42]:
from sklearn.model_selection import GridSearchCV

# Use GridSearch
gs = GridSearchCV(KNeighborsRegressor(), grid_params, verbose = 1, cv = 3, n_jobs = -1)

g_res = gs.fit(X_temp, y_temp)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


In [43]:
# Print The value of best Hyperparameters
g_res.best_params_

{'metric': 'minkowski', 'n_neighbors': 9, 'p': 1, 'weights': 'distance'}

In [9]:
# KNN with optimized parameters
knn2 = KNeighborsRegressor(n_neighbors = 9, weights = 'distance', metric = 'minkowski', p = 1)
knn2.fit(X_train, y_train)

knn2_score = knn2.score(X_valid, y_valid)
knn2_preds = knn2.predict(X_valid)

print(
    f" Mean Squared Error : {mean_squared_error(y_valid, knn2_preds)} \n"
    f" Score : {knn2_score:.5f} \n",
)

 Mean Squared Error : 1317532847769.4202 
 Score : 0.61077 



In [10]:
(mean_squared_error(y_valid, knn2_preds))**0.5

1147838.3369488146

In [46]:
# KNN with optimized parameters explored before changing preprocessing 
knn3 = KNeighborsRegressor(n_neighbors = 40, weights = 'distance', metric = 'minkowski', p = 1, leaf_size = 15)
knn3.fit(X_train, y_train)

knn3_score = knn3.score(X_valid, y_valid)
knn3_preds = knn2.predict(X_valid)

print(
    f" Mean Squared Error : {mean_squared_error(y_valid, knn3_preds)} \n"
    f" Score : {knn3_score:.5f} \n",
)

 Mean Squared Error : 1627688565616.9204 
 Score : 0.53944 



In [11]:
import pickle

# functions to save / load pickle objects
def save_obj(obj, filename):
    try:
        with open(filename, "wb") as f:
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as ex:
        print("Error:", ex)


def load_obj(filename):
    try:
        with open(filename, "rb") as f:
            return pickle.load(f)
    except Exception as ex:
        print("Error:", ex)

In [12]:
# save model
save_obj(knn2, '../Code/KNN.pickle')

### Baseline model with default setting

In [47]:
# subsetting for only necessary vairables
X_train_sub = X_train.filter(["livingArea", "bathrooms", "bedrooms", "garageSpaces", 
                            "pool", "isNewConstruction", "state_CA", "state_GA"])
X_valid_sub = X_valid.filter(["livingArea", "bathrooms", "bedrooms", "garageSpaces", 
                            "pool", "isNewConstruction", "state_CA", "state_GA"])

In [48]:
# KNN with default setting
knn4 = KNeighborsRegressor()
knn4.fit(X_train_sub, y_train)

knn4_score = knn4.score(X_valid_sub, y_valid)
knn4_preds = knn2.predict(X_valid)

print(
    f" Mean Squared Error : {mean_squared_error(y_valid, knn4_preds)} \n"
    f" Score : {knn4_score:.5f} \n",
)

 Mean Squared Error : 1627688565616.9204 
 Score : 0.34260 



### Baseline model with optimized hyperparamters using grid search

In [35]:
# KNN with optimized parameters
knn5 = KNeighborsRegressor(n_neighbors = 14, weights = 'distance', metric = 'minkowski', p = 2)
knn5.fit(X_train_sub, y_train)

knn5_score = knn5.score(X_valid_sub, y_valid)
knn5_preds = knn2.predict(X_valid)

print(
    f" Mean Squared Error : {mean_squared_error(y_valid, knn5_preds)} \n"
    f" Score : {knn5_score:.5f} \n",
)

 Mean Squared Error : 3252744670495.3506 
 Score : 0.34961 

