In [6]:
import pandas as pd
import numpy as np
import sys

In [7]:
# Column -> dtype mapping handed to pd.read_csv, grouped by target type.
dtype_dict = {
    # floating-point columns
    'bathrooms': float, 'bedrooms': float, 'floors': float,
    'lat': float, 'long': float, 'price': float,
    'sqft_living': float, 'sqft_living15': float, 'sqft_lot15': float,
    # integer columns
    'condition': int, 'grade': int, 'sqft_above': int,
    'sqft_basement': int, 'sqft_lot': int, 'view': int,
    'waterfront': int, 'yr_built': int, 'yr_renovated': int,
    # columns kept as text
    'date': str, 'id': str, 'zipcode': str,
}

In [8]:
# Read the train / test / validation splits, in that order, with the same
# column dtypes applied to each file.
training_data, test_data, validation_data = [
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('kc_house_data_small_train.csv',
                     'kc_house_data_small_test.csv',
                     'kc_house_data_small_validation.csv')
]

In [9]:
def get_numpy_data(data_frame, features, output):
    """Convert a DataFrame into a feature matrix and an output vector.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Source table; it is not modified.
    features : sequence of str
        Names of the columns to use as features.
    output : str
        Name of the column holding the target values.

    Returns
    -------
    (feature_matrix, output_array) : tuple of numpy.ndarray
        ``feature_matrix`` has one row per row of ``data_frame``; its first
        column is a constant 1 (intercept term) followed by the requested
        feature columns in order. ``output_array`` holds the ``output`` column.
    """
    columns = ['constant'] + list(features)

    # Build the intercept column on a copy (assign returns a new frame) so the
    # caller's DataFrame is left untouched.
    features_frame = data_frame.assign(constant=1)[columns]

    # `.values` is used instead of `.as_matrix()`, which was deprecated and
    # then removed from pandas.
    feature_matrix = features_frame.values
    output_array = data_frame[output].values
    return (feature_matrix, output_array)

In [10]:
def normalize_features(feature_matrix):
    """Scale every column of ``feature_matrix`` by its own 2-norm.

    Returns a ``(normalized_features, norms)`` tuple, where ``norms`` holds
    the per-column norms that were divided out, so the same scaling can be
    applied to other matrices.
    """
    column_norms = np.linalg.norm(feature_matrix, axis=0)
    scaled = np.divide(feature_matrix, column_norms)
    return scaled, column_norms

In [11]:
# Columns used as model inputs; the order here is the column order of the
# feature matrices built from this list.
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

In [12]:
# Turn each split into a numeric feature matrix (with a leading 'constant'
# column added by get_numpy_data) and a vector of 'price' targets.
features_train, train_output = get_numpy_data(training_data, feature_list, 'price')
features_test, test_output = get_numpy_data(test_data, feature_list, 'price')
features_validation, validation_output = get_numpy_data(validation_data, feature_list, 'price')

In [13]:
# Normalize features: each training column is divided by its 2-norm.
# The test and validation matrices are divided by the SAME training norms
# (not their own), so all three splits share one scaling.
features_train, norms = normalize_features(features_train)
features_test = features_test / norms
features_validation = features_validation / norms

In [14]:
# First row of the normalized test matrix; used below as the query house.
print features_test[0]

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]


In [15]:
# 10th row (index 9) of the normalized training matrix.
print features_train[9]

[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


### Quiz Question: What is the Euclidean distance between the query house and the 10th house of the training set?

In [16]:
# Euclidean distance: square root of the summed squared feature differences
# between test row 0 and training row 9.
print np.sqrt(np.sum((features_test[0] - features_train[9]) ** 2))

0.059723593714


### Quiz Question: Among the first 10 training houses, which house is the closest to the query house?

In [17]:
min_so_far = sys.float_info.max
closest_idx = 0
for i in range(0, 10):
    dist = np.sqrt(np.sum((features_test[0] - features_train[i]) ** 2))
    if dist < min_so_far:
        min_so_far = dist
        closest_idx = i
print "closest is", closest_idx

closest is 8


In [18]:
# Subtract the query row from every training row in one broadcasted operation;
# diff[i] is features_train[i] - features_test[0].
diff = features_train - features_test[0]

In [20]:
# Spot check: sum of the feature differences for the last training row.
diff[-1].sum()

-0.093433998746546426

In [21]:
# Sanity check that summing along axis=1 gives per-row sums: entry 15 of the
# row-wise result minus the sum computed directly on row 15 should be ~0
# (up to floating-point rounding).
np.sum(diff**2, axis=1)[15] - np.sum(diff[15]**2)

4.3368086899420177e-19

In [22]:
# Euclidean distance from the query house to every training row at once:
# one value per row of diff.
distances = np.sqrt(np.sum(diff**2, axis=1))

In [23]:
# Distance between the query house and the training row at index 100.
print distances[100]

0.0237082324167


In [24]:
def compute_distances(features_instances, features_query):
    """Euclidean distance from one query point to every row of a matrix.

    Parameters
    ----------
    features_instances : numpy.ndarray, 2-D
        One instance per row.
    features_query : numpy.ndarray, 1-D
        The query point; it is broadcast against every row.

    Returns
    -------
    numpy.ndarray
        1-D array with one distance per row of ``features_instances``.
    """
    # Use the matrix that was passed in; the previous body read the global
    # `features_train` and silently ignored this argument.
    diff = features_instances - features_query
    return np.sqrt(np.sum(diff ** 2, axis=1))

### Quiz Question: Take the query house to be the third house of the test set (features_test[2]). What is the index of the house in the training set that is closest to this query house?

In [25]:
# Distances between the third test house (features_test[2]) and the training houses.
new_query_distances = compute_distances(features_train, features_test[2])

In [45]:
# Position of the smallest distance, i.e. the nearest training house.
min_idx = np.argmin(new_query_distances)
print min_idx

382


### Quiz Question: What is the predicted value of the query house based on 1-nearest neighbor regression?



In [46]:
# 1-nearest-neighbour prediction: the price of the closest training house.
# NOTE(review): min_idx is a row position from np.argmin, while .loc looks up
# by index label; presumably these coincide because the frame keeps the
# default 0..n-1 index from read_csv. Confirm.
training_data.loc[min_idx]['price']

249000.0

In [31]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Return the row indices of the ``k`` instances closest to the query.

    Parameters
    ----------
    k : int
        Number of neighbours to return.
    feature_train : numpy.ndarray, 2-D
        Candidate instances, one per row.
    features_query : numpy.ndarray, 1-D
        The query point.

    Returns
    -------
    numpy.ndarray
        Up to ``k`` row indices into ``feature_train``, ordered from nearest
        to farthest.
    """
    # Search the matrix that was passed in; the previous body read the global
    # `features_train`, so the `feature_train` argument was never used.
    distances = compute_distances(feature_train, features_query)
    return np.argsort(distances)[:k]

### Quiz Question: Take the query house to be the third house of the test set (features_test[2]). What are the indices of the 4 training houses closest to the query house?

In [32]:
# Indices of the 4 training houses nearest to the third test house, closest first.
print k_nearest_neighbors(4, features_train, features_test[2])

[ 382 1149 4087 3142]


In [57]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for a single query as the plain average of the
    outputs of its ``k`` nearest training instances."""
    neighbor_outputs = []
    for neighbor_idx in k_nearest_neighbors(k, features_train, features_query):
        neighbor_outputs.append(output_train[neighbor_idx])
    return np.mean(neighbor_outputs)

### Quiz Question: Again taking the query house to be the third house of the test set (features_test[2]), predict the value of the query house using k-nearest neighbors with k=4 and the simple averaging method described and implemented above.



In [40]:
# k=4 prediction for the third test house: mean price of its 4 nearest training houses.
print predict_output_of_query(4, features_train, train_output, features_test[2])

413987.5


In [62]:
def predict_output(k, features_train, output_train, features_query):
    """Run k-nearest-neighbour prediction for every row of ``features_query``.

    Returns a 1-D float array holding one prediction per query row, in row
    order.
    """
    n_queries = features_query.shape[0]
    per_query = (
        predict_output_of_query(k, features_train, output_train, features_query[row])
        for row in range(n_queries)
    )
    return np.fromiter(per_query, dtype=float, count=n_queries)

### Quiz Question: Make predictions for the first 10 houses in the test set, using k=10. What is the index of the house in this query set that has the lowest predicted value? What is the predicted value of this house?



In [71]:
# k=10 predictions for the first 10 test houses.
# NOTE(review): the saved output of this cell shows a shape tuple, which the
# predict_output defined in this notebook does not print; the saved output is
# presumably stale from an earlier version of that function.
predictions = predict_output(10, features_train, train_output, features_test[0:10])

(10, 18)


In [72]:
# Index (within the 10-house query set) of the lowest predicted value.
# NOTE: this rebinds min_idx, which an earlier cell used for the nearest
# training-house index.
min_idx = np.argmin(predictions)
print min_idx

6


In [73]:
# Predicted value for that lowest-valued house.
predictions[min_idx]

350032.0

In [78]:
lowest_rss = sys.float_info.max
the_k = 1
for k in range(1, 16):
    print "Checking k =", k
    predictions = predict_output(k, features_train, train_output, features_validation)
    rss = ((predictions - validation_output) ** 2).sum()
    print "Rss =", rss
    if rss < lowest_rss:
        lowest_rss = rss
        the_k = k

Checking k = 1
(1435, 18)
Rss = 1.05453830252e+14
Checking k = 2
(1435, 18)
Rss = 8.3445073504e+13
Checking k = 3
(1435, 18)
Rss = 7.26920960192e+13
Checking k = 4
(1435, 18)
Rss = 7.19467216521e+13
Checking k = 5
(1435, 18)
Rss = 6.98465174197e+13
Checking k = 6
(1435, 18)
Rss = 6.88995443532e+13
Checking k = 7
(1435, 18)
Rss = 6.83419734501e+13
Checking k = 8
(1435, 18)
Rss = 6.73616787355e+13
Checking k = 9
(1435, 18)
Rss = 6.8372727959e+13
Checking k = 10
(1435, 18)
Rss = 6.93350486686e+13
Checking k = 11
(1435, 18)
Rss = 6.95238552156e+13
Checking k = 12
(1435, 18)
Rss = 6.90499695872e+13
Checking k = 13
(1435, 18)
Rss = 7.00112545083e+13
Checking k = 14
(1435, 18)
Rss = 7.0908698869e+13
Checking k = 15
(1435, 18)
Rss = 7.11069283859e+13


### Quiz Question: What is the RSS on the TEST data using the value of k found above? To be clear, sum over all houses in the TEST set.

In [79]:
# The k that gave the lowest validation RSS in the search loop.
print the_k

8


In [80]:
# Predict every house in the test set using the selected k.
all_test_predictions = predict_output(the_k, features_train, train_output, features_test)

(1741, 18)


In [81]:
# Residual sum of squares over the whole test set.
print ((all_test_predictions - test_output) ** 2).sum()

1.33118823552e+14
