In [44]:
import pandas as pd
import numpy as np
import sys

In [22]:
# Column name -> Python type, passed as `dtype=` to pd.read_csv when the
# data files are loaded. Entries are grouped by target type.
dtype_dict = {
    # read as strings
    'id': str,
    'date': str,
    'zipcode': str,
    # read as floats
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'floors': float,
    'lat': float,
    'long': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    # read as integers
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

In [23]:
# Load the train / test / validation CSV files, applying the column types
# declared in dtype_dict to each of them.
training_data, test_data, validation_data = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in (
        'kc_house_data_small_train.csv',
        'kc_house_data_small_test.csv',
        'kc_house_data_small_validation.csv',
    )
)

In [24]:
def get_numpy_data(data_frame, features, output):
    """Build a NumPy feature matrix (with an intercept column) and an output array.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table holding the feature columns and the output column. It is not
        modified by this function.
    features : list of str
        Names of the columns to use as features, in the desired order.
    output : str
        Name of the column to use as the output.

    Returns
    -------
    tuple
        ``(feature_matrix, output_array)``. ``feature_matrix`` has a leading
        column of ones (named 'constant') followed by the requested feature
        columns; ``output_array`` holds the values of ``data_frame[output]``.
    """
    columns = ['constant'] + list(features)

    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain a 'constant' column as a side effect of this call.
    features_frame = data_frame.assign(constant=1)[columns]

    # `.values` replaces `as_matrix()`, which was deprecated and then removed
    # from pandas; `.values` is available in old and new releases alike.
    feature_matrix = features_frame.values
    output_array = data_frame[output].values
    return (feature_matrix, output_array)

In [25]:
def normalize_features(feature_matrix):
    """Scale each column of ``feature_matrix`` by its Euclidean (2-) norm.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        Matrix whose columns are to be normalized.

    Returns
    -------
    tuple
        ``(normalized_features, norms)`` where ``norms`` are the divisors
        that were applied to each column. A column whose norm is exactly 0
        is reported with a divisor of 1 and is therefore left as all zeros
        instead of turning into NaN; the same ``norms`` can then be used
        safely to scale other matrices.
    """
    norms = np.linalg.norm(feature_matrix, axis=0)
    # Guard against division by zero for all-zero columns.
    norms = np.where(norms == 0, 1.0, norms)
    normalized_features = feature_matrix / norms
    return (normalized_features, norms)

In [26]:
# Feature columns handed to get_numpy_data, in the order they will appear
# in the feature matrix.
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
    'lat', 'long',
    'sqft_living15', 'sqft_lot15',
]

In [27]:
# Turn each split into a (feature matrix, output array) pair, using the same
# feature columns and 'price' as the output for all three.
features_train, train_output = get_numpy_data(
    data_frame=training_data, features=feature_list, output='price')
features_test, test_output = get_numpy_data(
    data_frame=test_data, features=feature_list, output='price')
features_validation, validation_output = get_numpy_data(
    data_frame=validation_data, features=feature_list, output='price')

In [30]:
# Normalize the training features column by column, then divide the test and
# validation features by those same training-set norms.
# NOTE: re-running this cell recomputes `norms` from the already-normalized
# training matrix and overwrites the original values.
features_train, norms = normalize_features(features_train)
features_test = np.divide(features_test, norms)
features_validation = np.divide(features_validation, norms)

In [31]:
# Show the query house (first row of the normalized test features).
# Parenthesised print works as a Python 2 statement and a Python 3 call.
print(features_test[0])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]


In [36]:
# Show the 10th house (index 9) of the normalized training features.
# Parenthesised print works as a Python 2 statement and a Python 3 call.
print(features_train[9])

[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


### Quiz Question: What is the Euclidean distance between the query house and the 10th house of the training set?

In [47]:
# Euclidean distance between the query house (features_test[0]) and the
# 10th training house (features_train[9]).
# Parenthesised print works as a Python 2 statement and a Python 3 call.
print(np.sqrt(np.sum((features_test[0] - features_train[9]) ** 2)))

0.059723593714


### Quiz Question: Among the first 10 training houses, which house is the closest to the query house?

In [50]:
# Linear scan over the first 10 training rows, keeping the smallest
# Euclidean distance to the query house (features_test[0]) seen so far.
min_so_far = sys.float_info.max
closest_idx = 0
for i in range(10):
    dist = np.sqrt(np.sum((features_test[0] - features_train[i]) ** 2))
    if dist < min_so_far:
        min_so_far = dist
        closest_idx = i
# %-formatting yields the same text under both the Python 2 print statement
# and the Python 3 print() function (a two-argument print(...) would print a
# tuple under Python 2).
print("closest is %d" % closest_idx)

closest is 8


In [62]:
# Subtract the query row from every training row in one vectorized step;
# the single query row is broadcast across all rows of features_train.
diff = np.subtract(features_train, features_test[0])

In [63]:
# Sanity check of the vectorized subtraction: for each of the first three
# rows, diff[row] should equal features_train[row] - features_test[0], so
# every printed line should be all 0's.
# (The array is named `diff`; no `results` variable is defined in this
# notebook, so the check reads from `diff`.)
for row in range(3):
    print(diff[row] - (features_train[row] - features_test[0]))

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [64]:
# Sum of the entries in the last row of `diff`.
np.sum(diff[-1])

-0.09343399874654644

In [65]:
# Compare the row-wise sum of squares at index 15 with the same quantity
# computed from that single row; the difference should be (near) zero.
(diff ** 2).sum(axis=1)[15] - (diff[15] ** 2).sum()

4.3368086899420177e-19

In [66]:
# Euclidean distance from the query house to every training house:
# square each difference, sum along each row, then take the square root.
distances = np.sqrt(np.square(diff).sum(axis=1))

In [67]:
# Distance from the query house to the training house at index 100.
# Parenthesised print works as a Python 2 statement and a Python 3 call.
print(distances[100])

0.0237082324167


In [70]:
def compute_distances(features_instances, features_query):
    """Return the Euclidean distance from a query point to every instance.

    Parameters
    ----------
    features_instances : numpy.ndarray
        2-D array with one instance per row.
    features_query : numpy.ndarray
        1-D feature vector of the query point; it is broadcast against each
        row of ``features_instances``.

    Returns
    -------
    numpy.ndarray
        1-D array whose i-th entry is the distance between
        ``features_instances[i]`` and ``features_query``.
    """
    # Use the argument, not the notebook-level `features_train`, so the
    # function works for whatever set of instances the caller passes in.
    diff = features_instances - features_query
    return np.sqrt(np.sum(diff**2, axis=1))

### 16. Quiz Question: Take the query house to be the third house of the test set (features_test[2]). What is the index of the house in the training set that is closest to this query house?

In [71]:
# Distances from the third test house (features_test[2]) to the training houses.
new_query_distances = compute_distances(
    features_instances=features_train,
    features_query=features_test[2],
)

In [73]:
# Index of the training house with the smallest distance to the query house.
np.argmin(new_query_distances)

382


### Quiz Question: What is the predicted value of the query house based on 1-nearest neighbor regression?



In [79]:
# 1-nearest-neighbor prediction: the price of the training house closest to
# the query house. The row position is derived from the distances instead of
# being copied by hand from an earlier cell's output, and a single positional
# lookup on the 'price' column avoids chained indexing.
training_data['price'].iloc[np.argmin(new_query_distances)]

249000.0