## Read data

In [123]:
import pandas as pd
import numpy as np
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':int, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':int, 'sqft_lot':int, 'view':int}
sales = pd.read_csv('kc_house_data_small.csv/kc_house_data_small.csv', dtype = dtype_dict)
train = pd.read_csv('kc_house_data_small_train.csv/kc_house_data_small_train.csv', dtype = dtype_dict)
test = pd.read_csv('kc_house_data_small_test.csv/kc_house_data_small_test.csv', dtype = dtype_dict)
validation = pd.read_csv('kc_house_data_small_validation.csv/kc_house_data_validation.csv', dtype = dtype_dict)

## Define useful functions

In [124]:
def get_numpy_data(data, features, output):
    data['constant'] = 1
    features = np.append('constant', features)  ## add new element to an array
    feature_matrix = data[features].to_numpy()
    output_array = data[output].to_numpy()
    return(feature_matrix, output_array)

In [125]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis = 0)
    normalized_feature = features/norms
    return(normalized_feature, norms)

In [126]:
all_feature =  ['bedrooms',  
                'bathrooms',  
                'sqft_living',  
                'sqft_lot',  
                'floors',
                'waterfront',  
                'view',  
                'condition',  
                'grade',  
                'sqft_above',  
                'sqft_basement',
                'yr_built',  
                'yr_renovated',  
                'lat',  
                'long',  
                'sqft_living15',  
                'sqft_lot15']
feature_matrix_train, out_train = get_numpy_data(train, all_feature, 'price')
feature_matrix_test, out_test = get_numpy_data(test, all_feature, 'price')
feature_matrix_validation, out_validation = get_numpy_data(validation, all_feature, 'price')

In [127]:
normalized_train, norms = normalize_features(feature_matrix_train)
normalized_test = feature_matrix_test/norms
normalized_validation = feature_matrix_validation/norms

In [184]:
# the distance between the first house of the test set  and the 10th house of the training set
print (normalized_test[0])
print (normalized_train[9])
np.sqrt(np.sum((normalized_test[0]-normalized_train[9])**2))

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059
  0.          0.05102365  0.0116321   0.01564352  0.01362084  0.02481682
  0.01350306  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


0.05972359371398078

In [186]:
dist0 = np.zeros(10)
for i in range(10):
    dist0[i] = np.sqrt(np.sum((normalized_test[0]-normalized_train[i])**2))
np.where(dist0== dist0.min())    

(array([8]),)

## Perform 1-nearest neighbor regression

In [187]:
diff = normalized_train - normalized_test[0]
diff[-1].sum()

-0.09343399874654643

In [188]:
dist = np.sqrt(np.sum(diff**2, axis = 1))

In [189]:
def compute_distances(features_instances, features_query):
    diff = features_instances - features_query
    dist = np.sqrt(np.sum(diff**2, axis = 1))
    return(dist)

In [191]:
# the closest house to the third house in test data
distances = compute_distances(normalized_train, normalized_test[2])
print(np.where(distances == distances.min()))
out_train[382]

(array([382]),)


249000.0

## Perform k-nearest neighbor regression

In [156]:
def k_nearest_neighbors(k, features_train, features_query):
    diff = features_train - features_query
    dist = np.sqrt(np.sum(diff**2, axis = 1))
    dist_sort = dist.argsort()    # or np.argsort(dist)  sort argument and output the index
    neighbors = dist_sort[:k]
    return(neighbors)

In [157]:
# test
k_nearest_neighbors(4, normalized_train, normalized_test[2])

array([ 382, 1149, 4087, 3142])

In [160]:
def predict_output_query(k, features_train, output_train, features_query):
    neighbors_index = k_nearest_neighbors(k, features_train, features_query)
    prediction = np.mean(output_train[neighbors_index])
    return(prediction)

In [161]:
# test
predict_output_query(4, normalized_train, out_train, normalized_test[2])

413987.5

In [168]:
def predict_output(k, features_train, output_train, features_query):
    n = features_query.shape[0]
    prediction = np.zeros(n)
    for i in range(n):
        prediction[i] = predict_output_query(k, features_train, output_train, features_query[i])
    return(prediction)

In [171]:
# test Make predictions for the first 10 houses in the test set, using k=10. 
prediction10 = predict_output(10, normalized_train, out_train, normalized_test[0:10])
print(np.where(prediction10==prediction10.min()))
print(prediction10[6])

(array([6]),)
350032.0


## Choosing best value of k via a validation set

In [181]:
RSS = np.zeros(15)
for k in range(1, 16):
    prediction = predict_output(k, normalized_train, out_train, normalized_validation)
    RSS[k-1] = np.sum((out_validation - prediction)**2)     

In [182]:
np.where(RSS==RSS.min())

(array([7]),)

In [192]:
## K=8, RSS on test data
prediction_k8 = predict_output(k, normalized_train, out_train, normalized_test)
RSS_k8 = np.sum((out_test - prediction_k8)**2)
print(RSS_k8)

134342939295287.66
