In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
import math

In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 
              'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 
              'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [3]:
sales = pd.read_csv('kc_house_data_small.csv', dtype= dtype_dict)
training = pd.read_csv('kc_house_data_small_train.csv', dtype= dtype_dict)
testing = pd.read_csv('kc_house_data_small_test.csv', dtype= dtype_dict)
validation = pd.read_csv('kc_house_data_validation.csv', dtype = dtype_dict)

In [4]:
def get_numpy_data(train_data, features, output):
    train_data['constant'] = 1
    features = ['constant'] + features
    feature_matrix = train_data[features].values
    
    output_array = np.array(train_data[output])    
    return feature_matrix, output_array

In [5]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis= 0)
    features_normalized = features / norms
    return features_normalized, norms

In [6]:
feature_list = ['bedrooms',
                'bathrooms',
                'sqft_living',
                'sqft_lot',
                'floors',
                'waterfront',
                'view',
                'condition',
                'grade',
                'sqft_above',
                'sqft_basement',
                'yr_built',
                'yr_renovated',
                'lat',
                'long',
                'sqft_living15',
                'sqft_lot15']

In [7]:
features_train, train_output = get_numpy_data(training, feature_list, 'price')
features_valid, valid_output = get_numpy_data(validation, feature_list, 'price')
features_test, test_output = get_numpy_data(testing, feature_list, 'price')

In [8]:
features_train, norms = normalize_features(features_train)
features_test = features_test / norms
features_valid = features_valid / norms

In [12]:
print(features_test[0])
print(features_train[9])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059
  0.          0.05102365  0.0116321   0.01564352  0.01362084  0.02481682
  0.01350306  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


### What is the Euclidean distance between the query house and the 10th house of the training set?

In [13]:
np.sqrt(np.sum((features_test[0] - features_train[9]) ** 2))

0.05972359371398078

### Computing the distance from query house to the first 10 houses of the training set 

In [26]:
dist_set = [np.sqrt(np.sum((features_test[0] - i) ** 2)) for i in features_train[0:10]]
dist_set

[0.06027470916295592,
 0.08546881147643746,
 0.06149946435279315,
 0.05340273979294363,
 0.05844484060170442,
 0.059879215098128345,
 0.05463140496775461,
 0.055431083236146074,
 0.052383627840220305,
 0.05972359371398078]

### Among the first 10 training houses, which house is the closest to the query house?

In [128]:
min_idx = dist_set.index(min(dist_set))
print(min_idx + 1, dist_set[max_idx])

9 0.052383627840220305


### Vectorized (example)

In [30]:
for i in range(3):
    print(features_train[i] - features_test[0])

[ 0.00000000e+00 -3.87821276e-03 -1.20498190e-02 -1.05552733e-02
  2.08673616e-04 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -3.47633726e-03 -5.50336860e-03 -2.48168183e-02
 -1.63756198e-04  0.00000000e+00 -1.70254220e-05  1.29876855e-05
 -5.14364795e-03  6.69281453e-04]
[ 0.00000000e+00 -3.87821276e-03 -4.51868214e-03 -2.26610387e-03
  7.19763456e-04  0.00000000e+00  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -3.47633726e-03  1.30705004e-03 -1.45830788e-02
 -1.91048898e-04  6.65082271e-02  4.23090220e-05  6.16364736e-06
 -2.89330197e-03  1.47606982e-03]
[ 0.00000000e+00 -7.75642553e-03 -1.20498190e-02 -1.30002801e-02
  1.60518166e-03 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -5.21450589e-03 -8.32384500e-03 -2.48168183e-02
 -3.13866046e-04  0.00000000e+00  4.70885840e-05  1.56292487e-05
  3.72914476e-03  1.64764925e-03]


In [36]:
# vectorized
results = features_train[0:3] - features_test[0]

In [35]:
#verify the result
results[0] - (features_train[0] - features_test[0])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

## Perform 1-Nearest Neighbor Regression

In [40]:
diff =  features_train[:] - features_test[0]

In [42]:
# verify
diff[-1].sum()

-0.09343399874654643

### Computing Euclidean  distance

In [48]:
#verify the result
print(np.sum(diff**2, axis=1)[15])
print(np.sum(diff[15]**2))

0.0033070590284564457
0.0033070590284564453


In [47]:
distances = np.sqrt(np.sum(diff**2, axis=1))

In [49]:
distances[100]

0.023708232416678195

In [95]:
def compute_distances(features_instances, features_query):
    diff = features_instances[:] - features_query
    distances = np.sqrt(np.sum(diff ** 2, axis = 1))
    return distances

### Take the query house to be third house of the test set (features_test[2]). What is the index of the house in the training set that is closest to this query house?



In [96]:
distances = compute_distances(features_train, features_test[2])
np.argmin(distances)

382

### What is the predicted value of the query house based on 1-nearest neighbor regression?

In [103]:
train_output[382]

249000.0

## Perform k-nearest neighbor regression

In [104]:
def k_nearest_neighbors(k, feature_train, features_query):
    distance = compute_distances(feature_train, features_query)
    neighbors = np.argsort(distance, axis = 0)[:k]
    return neighbors

### Take the query house to be third house of the test set (features_test[2]). What are the indices of the 4 training houses closest to the query house?

In [105]:
neighbor = k_nearest_neighbors(4, features_train, features_test[2])
neighbor

array([ 382, 1149, 4087, 3142])

In [106]:
def predict_output_of_query(k, features_train, output_train, features_query):
    neighbor = k_nearest_neighbors(k, features_train, features_query)
    prediction = np.mean(output_train[neighbor])
    return prediction

### Taking the query house to be third house of the test set (features_test[2]), predict the value of the query house using k-nearest neighbors with k=4 and the simple averaging method described and implemented above.

In [107]:
predict_output_of_query(4, features_train, train_output, features_test[2])

413987.5

In [108]:
def predict_output(k, features_train, output_train, features_query):
    
    query_len = features_query.shape[0]
    predictions = []
    
    for f in range(query_len):
        prediction = predict_output_of_query(k, features_train, output_train, features_test[f])
        predictions.append(prediction)
    return predictions

### Make predictions for the first 10 houses in the test set, using k=10. What is the index of the house in this query set that has the lowest predicted value? What is the predicted value of this house?

In [110]:
predictions = predict_output(10, features_train, train_output, features_test[:10])
predictions

[881300.0,
 431860.0,
 460595.0,
 430200.0,
 766750.0,
 667420.0,
 350032.0,
 512800.7,
 484000.0,
 457235.0]

In [115]:
print('Index of minimum:', np.argmin(predictions))
print('Minimum values:', min(predictions))

Index of minimum: 6
Minimum values: 350032.0


### Validation

In [119]:
rss_list = []
for k in range(1, 16):
    predictions = predict_output(k, features_train, train_output, features_valid)
    rss = ((valid_output - predictions) ** 2).sum()
    rss_list.append(rss)

In [125]:
rss_list

[355632427476622.0,
 317939124951086.5,
 313153111376087.9,
 301621468995236.0,
 294266734341982.3,
 287781925015337.06,
 287842561046849.25,
 286179146468969.25,
 281718696883431.0,
 280358603702662.8,
 278687700531166.2,
 278744728841426.88,
 275043861135800.7,
 273895810640073.22,
 272162684453609.12]

### Which k produced the lowest RSS on validation data?

In [122]:
np.argmin(rss_list)+ 1

15

### What is the RSS on the TEST data using the value of k found above? To be clear, sum over all houses in the TEST set?

In [123]:
predictions = predict_output(15, features_train, train_output, features_test)
rss = ((test_output - predictions) **2 ).sum()
rss

134342939295287.66