In [54]:
import pandas as pd

import numpy as np

from sklearn import linear_model

from functools import reduce 

In [2]:
# Column name -> dtype mapping passed to pd.read_csv so that every CSV split
# is parsed with the same column types.
dtype_dict = dict(
    bathrooms=float,
    bedrooms=float,
    condition=int,
    date=str,
    floors=float,
    grade=int,
    id=str,          # kept as text, not parsed as a number
    lat=float,
    long=float,
    price=float,
    sqft_above=int,
    sqft_basement=int,
    sqft_living=float,
    sqft_living15=float,
    sqft_lot=int,
    sqft_lot15=float,
    view=int,
    waterfront=int,
    yr_built=int,
    yr_renovated=int,
    zipcode=str,     # kept as text, not parsed as a number
)

In [5]:
# Read the full data set followed by its train / test / validation splits,
# parsing every file with the shared column dtypes.
(kc_house_data,
 kc_house_train_data,
 kc_house_test_data,
 kc_house_validation_data) = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in ('kc_house_data_small.csv',
                     'kc_house_data_small_train.csv',
                     'kc_house_data_small_test.csv',
                     'kc_house_data_validation.csv')
)

In [16]:
def get_numpy_data(data, features, output):
    """Build a NumPy feature matrix and output vector from a DataFrame.

    A column of ones named 'constant' is placed in front of the requested
    feature columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns and the output column.
    features : sequence of str
        Names of the feature columns, in the order they should appear
        after the constant column.
    output : str
        Name of the output column.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The feature matrix (constant column first) and the output values.
    """
    # assign() returns a new DataFrame, so the caller's frame does not
    # silently gain a 'constant' column as a side effect of this call.
    with_constant = data.assign(constant=1)
    columns = ['constant'] + list(features)
    return (np.array(with_constant[columns]), np.array(with_constant[output]))


def normalize_features(features):
    """Scale each column of a feature matrix by its Euclidean (L2) norm.

    Parameters
    ----------
    features : array-like
        Feature matrix; norms are taken along axis 0 (one per column).

    Returns
    -------
    tuple of (normalized_features, norms)
        The features divided column-wise by ``norms``, and the divisors
        themselves so other data can be rescaled the same way.
        A column whose norm is 0 (all zeros) gets a divisor of 1, so it
        stays all zeros instead of turning into NaN.
    """
    norms = np.linalg.norm(features, axis=0)
    # Dividing by a zero norm would yield NaN here and again for any caller
    # that later divides other data by the returned norms.
    norms = np.where(norms == 0, 1.0, norms)
    return (features / norms, norms)

In [20]:
get_numpy_data(kc_house_train_data, ['sqft_living', 'bedrooms'], 'price')

(array([[  1.00000000e+00,   1.18000000e+03,   3.00000000e+00],
        [  1.00000000e+00,   2.57000000e+03,   3.00000000e+00],
        [  1.00000000e+00,   7.70000000e+02,   2.00000000e+00],
        ..., 
        [  1.00000000e+00,   2.27000000e+03,   3.00000000e+00],
        [  1.00000000e+00,   2.52000000e+03,   4.00000000e+00],
        [  1.00000000e+00,   1.53000000e+03,   3.00000000e+00]]),
 array([ 221900.,  538000.,  180000., ...,  507250.,  610685.,  360000.]))

In [19]:
# Feature columns passed to get_numpy_data for every data split; the order
# here is the column order of the resulting matrices (after 'constant').
feature_list = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
    'lat', 'long', 'sqft_living15', 'sqft_lot15',
]

In [21]:
# Extract the raw feature matrices and price vectors for each split.
raw_features_train, output_train = get_numpy_data(kc_house_train_data, feature_list, 'price')
raw_features_test, output_test = get_numpy_data(kc_house_test_data, feature_list, 'price')
raw_features_validation, output_validation = get_numpy_data(kc_house_validation_data, feature_list, 'price')

# Normalize the training columns, then rescale the other two splits by the
# *training* norms so all three matrices share the same scale.
features_train, norms = normalize_features(raw_features_train)
features_test = raw_features_test / norms
features_validation = raw_features_validation / norms

In [22]:
print(features_test[0])
print(features_train[9])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [24]:
def euclidean_distance(f1, f2):
    """Return the Euclidean (L2) distance between two feature vectors.

    ``f1`` and ``f2`` must support element-wise subtraction (e.g. NumPy
    arrays of compatible shape). Every squared difference is summed into a
    single total before the square root is taken.
    """
    offset = f1 - f2
    squared_total = np.sum(np.square(offset))
    return np.sqrt(squared_total)

In [25]:
euclidean_distance(features_test[0], features_train[9])

0.059723593713980783

In [27]:
[euclidean_distance(features_test[0], features_train[i]) for i in range(10)]

[0.060274709162955922,
 0.085468811476437465,
 0.061499464352793153,
 0.053402739792943632,
 0.05844484060170442,
 0.059879215098128345,
 0.054631404967754607,
 0.055431083236146074,
 0.052383627840220305,
 0.059723593713980783]

In [29]:
# Verify that subtracting one query row from a slice of training rows
# (vectorized) matches subtracting it from each row individually.
results = features_train[0:3] - features_test[0]

# Each printed row should be all 0's when
# results[row] == (features_train[row] - features_test[0]).
for row in range(3):
    print(results[row] - (features_train[row] - features_test[0]))

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [30]:
diff = features_train[:] - features_test[0]

In [31]:
diff[-1].sum()

-0.093433998746546426

In [35]:
np.sum(diff[15]**2)

0.0033070590284564453

In [36]:
np.sum(diff**2, axis=1)[15]

0.0033070590284564457

In [47]:
distances = np.sqrt(np.sum(diff ** 2, axis=1))

In [49]:
distances[100]

0.023708232416678195

In [52]:
def compute_distances(features_instances, features_query):
    """Return the Euclidean distance from every instance row to one query.

    ``features_query`` is subtracted from each row of ``features_instances``
    and the squared differences are summed along axis 1, giving one
    distance per row.
    """
    offsets = features_instances - features_query
    squared_per_row = np.sum(np.square(offsets), axis=1)
    return np.sqrt(squared_per_row)

In [55]:
def smallest_index(a):
    """Return ``(index, value)`` for the smallest element of ``a``.

    When several elements share the smallest value, the last of them is
    returned, because an earlier candidate is kept only when it is
    strictly smaller. An empty ``a`` raises ``TypeError`` (from ``reduce``
    being given no elements and no initial value).
    """
    def keep_smaller(best, candidate):
        # Both arguments are (index, value) pairs produced by enumerate().
        if best[1] < candidate[1]:
            return best
        return candidate

    return reduce(keep_smaller, enumerate(a))

In [57]:
smallest_index(compute_distances(features_train, features_test[2]))

(382, 0.0028604955575117085)

In [59]:
output_train[382]

249000.0

In [63]:
def k_nearest_neighbors(k, features_instances, features_query):
    """Return the indices of the ``k`` instances closest to the query.

    Distances come from ``compute_distances``; the returned index array is
    ordered from nearest to farthest.
    """
    distances_to_query = compute_distances(features_instances, features_query)
    ranked_indices = np.argsort(distances_to_query)
    return ranked_indices[:k]

In [64]:
k_nearest_neighbors(4, features_train, features_test[2])

array([ 382, 1149, 4087, 3142])

In [65]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query as the mean of its k nearest neighbors.

    The ``k`` nearest rows of ``features_train`` are located with
    ``k_nearest_neighbors`` and the matching entries of ``output_train``
    are averaged.
    """
    neighbor_indices = k_nearest_neighbors(k, features_train, features_query)
    neighbor_outputs = output_train[neighbor_indices]
    return np.average(neighbor_outputs)

In [66]:
predict_output_of_query(4, features_train, output_train, features_test[2])

413987.5

In [67]:
def predict_output(k, features_train, output_train, features_query):
    """Predict the output for every query row with k-nearest-neighbor averaging.

    Returns a list holding one ``predict_output_of_query`` result per row
    of ``features_query``, in the same order as the rows.
    """
    predictions = []
    for query_row in features_query:
        predictions.append(
            predict_output_of_query(k, features_train, output_train, query_row))
    return predictions

In [70]:
predict_output(10, features_train, output_train, features_test[:10])

[881300.0,
 431860.0,
 460595.0,
 430200.0,
 766750.0,
 667420.0,
 350032.0,
 512800.70000000001,
 484000.0,
 457235.0]

In [71]:
ks = range(1, 16)

In [72]:
def compute_rss(predictions, real):
    """Return the residual sum of squares between predictions and real values.

    Parameters
    ----------
    predictions : array-like
        Predicted values (a plain list is accepted).
    real : array-like
        Observed values, same length as ``predictions``.

    Returns
    -------
    numpy.float64
        Sum of squared errors. For 2-D inputs the sum runs over the first
        axis, giving one value per column.
    """
    # Convert both sides to float arrays: this lets two plain lists be
    # subtracted, and avoids integer overflow when squaring large errors
    # held in a narrow integer dtype.
    errors = np.asarray(real, dtype=float) - np.asarray(predictions, dtype=float)
    # np.sum is vectorized; the built-in sum() walks the array one element
    # at a time in Python.
    return np.sum(errors * errors, axis=0)

In [74]:
# For every candidate k, predict the output of every validation row.
predictions_on_k = [
    (neighbor_count,
     predict_output(neighbor_count, features_train, output_train, features_validation))
    for neighbor_count in ks
]

In [75]:
# Score each k by its residual sum of squares against the validation outputs.
results = [
    (neighbor_count, compute_rss(validation_predictions, output_validation))
    for neighbor_count, validation_predictions in predictions_on_k
]

In [76]:
results

[(1, 105453830251561.0),
 (2, 83445073504025.5),
 (3, 72692096019202.797),
 (4, 71946721652091.688),
 (5, 69846517419718.586),
 (6, 68899544353181.094),
 (7, 68341973450051.055),
 (8, 67361678735491.5),
 (9, 68372727958976.336),
 (10, 69335048668556.703),
 (11, 69523855215598.875),
 (12, 69049969587246.453),
 (13, 70011254508263.625),
 (14, 70908698869034.438),
 (15, 71106928385945.359)]

In [79]:
print("%.e" % compute_rss(predict_output(8, features_train, output_train, features_test), output_test))

1e+14
