K Nearest Neighbour Regressor as model for predicting house prices.
Following implementaion example of sklearn KNeighborsRegressor from http://scikit-learn.org/stable/auto_examples/neighbors/plot_regression.html

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

In [2]:
def extend_and_one_hot_encode_categorical_column(df, column_name, unique_values):
    for categorical_value in unique_values:
        df[categorical_value] = 0
        df.loc[df.loc[:, column_name] == categorical_value, categorical_value] = 1
    return df

In [3]:
data = pd.read_csv('../data/train.csv', delimiter=',').drop(['Id'], axis=1)
categorical_columns = data.select_dtypes(exclude = ['number']).columns

for column_name in categorical_columns:
    unique_values = data.loc[:, column_name].unique()
    data = extend_and_one_hot_encode_categorical_column(data, column_name, unique_values)

data = data.drop(categorical_columns, axis = 1)

In [4]:
data = data.fillna(0)

In [5]:
number_of_samples = data.shape[0]
train_data = data.iloc[0:int(number_of_samples*0.6), :]
validation_data = data.iloc[int(number_of_samples*0.6):int(number_of_samples*0.8), :]
test_data = data.iloc[int(number_of_samples*0.8):, :]

X_train = train_data.drop(['SalePrice'], axis=1)
y_train = train_data.loc[:, 'SalePrice']
X_validation = validation_data.drop(['SalePrice'], axis=1)
y_validation = validation_data.loc[:, 'SalePrice']
X_test = test_data.drop(['SalePrice'], axis=1)
y_test = test_data.loc[:, 'SalePrice']

In [6]:
n_neighbors = 4
score_on_train = {}
score_on_validation = {}
cv_scores = {}
for i, weights in enumerate(['uniform', 'distance']):
    knn = KNeighborsRegressor(n_neighbors, weights = weights, p=1)
    model = knn.fit(X_train, y_train)
    
    train_prediction = model.predict(X_train)
    score_on_train[i] = [weights, r2_score(y_train, train_prediction)]
    
    cv_scores[i] = [weights, cross_val_score(knn, X_train, y_train, cv=10).mean()]
    
    validation_prediction = model.predict(X_validation)
    score_on_validation[i] = [weights, r2_score(y_validation, validation_prediction)]    

In [7]:
print('r2 score on training is ', score_on_train)
print('CV score on training is ', cv_scores)
print('r2 score on validation is ', score_on_validation)

r2 score on training is  {0: ['uniform', 0.83897441637815739], 1: ['distance', 1.0]}
CV score on training is  {0: ['uniform', 0.71468081641994063], 1: ['distance', 0.72335948224592628]}
r2 score on validation is  {0: ['uniform', 0.71546307364249739], 1: ['distance', 0.71738100748415201]}
