In [12]:
import pandas as pd

import numpy as np

from sklearn import linear_model

In [2]:
def polynomial_dataframe(feature, degree):
    """Build a DataFrame of successive integer powers of ``feature``.

    The column ``power_1`` is ``feature`` itself. For every integer ``p``
    from 2 up to and including ``degree`` a column ``power_<p>`` holding
    ``feature ** p`` is added. When ``degree`` is not greater than 1 only
    ``power_1`` is produced.
    """
    powers = pd.DataFrame()
    powers['power_1'] = feature

    # Nothing beyond the first power was requested.
    if not degree > 1:
        return powers

    for exponent in range(2, degree + 1):
        powers['power_' + str(exponent)] = feature ** exponent
    return powers

In [22]:
def print_coefficients(model):
    """Pretty-print the polynomial learned by a fitted linear model.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``coef_`` (one weight per feature column) and may
        expose ``intercept_``; a missing intercept is treated as 0.
        Assumes the feature columns are power_1, power_2, ... in ascending
        order, as produced by ``polynomial_dataframe`` -- confirm at the
        call site.

    Prints the polynomial degree followed by the polynomial itself.
    """
    # coef_ holds one weight per feature and does NOT include the intercept,
    # so the number of weights is the degree of the polynomial.
    weights = list(np.ravel(model.coef_))
    deg = len(weights)

    # intercept_ may be a scalar or a length-1 array depending on the target shape.
    intercept = float(np.ravel(getattr(model, 'intercept_', 0.0))[0])

    # Numpy has a nifty function to print out polynomials in a pretty way.
    # It needs the coefficients from the highest power down to the constant
    # term, so reverse the weights and put the intercept last.
    coef = weights[::-1] + [intercept]
    print('Learned polynomial for degree ' + str(deg) + ':')
    print(np.poly1d(coef))

In [3]:
# Column name -> dtype mapping handed to pd.read_csv for every housing CSV
# loaded in this notebook.
dtype_dict = dict(
    bathrooms=float,
    bedrooms=float,
    condition=int,
    date=str,
    floors=float,
    grade=int,
    id=str,
    lat=float,
    long=float,
    price=float,
    sqft_above=int,
    sqft_basement=int,
    sqft_living=float,
    sqft_living15=float,
    sqft_lot=int,
    sqft_lot15=float,
    view=int,
    waterfront=int,
    yr_built=int,
    yr_renovated=int,
    zipcode=str,
)

In [25]:
# Full King County house sales data.
kc_house_data = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

# Train / test / validation splits.
wk3_kc_house_train_data = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
wk3_kc_house_test_data = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
wk3_kc_house_valid_data = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

# Four subsets, each fitted separately further down.
wk3_kc_house_set_1_data = pd.read_csv('wk3_kc_house_set_1_data.csv', dtype=dtype_dict)
wk3_kc_house_set_2_data = pd.read_csv('wk3_kc_house_set_2_data.csv', dtype=dtype_dict)
wk3_kc_house_set_3_data = pd.read_csv('wk3_kc_house_set_3_data.csv', dtype=dtype_dict)
wk3_kc_house_set_4_data = pd.read_csv('wk3_kc_house_set_4_data.csv', dtype=dtype_dict)

In [6]:
# Order the full data set by living area, breaking ties by price.
sales = kc_house_data.sort_values(by=['sqft_living', 'price'])

In [7]:
# Very small L2 (ridge) penalty; passed as `alpha` to the Ridge model fitted on
# the 15th-degree polynomial features of `sales`.
l2_small_penalty = 1.5e-5

In [13]:
# Expand sqft_living into its powers 1..15 (columns power_1 .. power_15).
poly15_data = polynomial_dataframe(sales['sqft_living'], 15)

# Ridge regression with the small penalty defined above.
model = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)

# Fit price against the polynomial features; as the last expression of the
# cell, the fitted estimator's repr is what gets displayed.
model.fit(poly15_data, sales['price'])

Ridge(alpha=1.5e-05, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [19]:
# Display the learned weights, one per polynomial feature (power_1 first).
model.coef_

array([  1.24873306e+02,  -4.77376011e-02,   3.01446238e-05,
        -2.44419942e-09,  -1.94153675e-13,   8.54085686e-18,
         1.51142121e-21,   8.27979094e-26,   6.52603100e-31,
        -3.27895017e-34,  -3.87962315e-38,  -2.72437650e-42,
        -1.07790800e-46,   3.78242694e-51,   1.39790296e-54])

In [84]:
def fit_polys(feature, target, l2_penalty):
    """Fit and return a ridge regression of ``target`` on ``feature``.

    ``l2_penalty`` is passed to ``linear_model.Ridge`` as ``alpha`` and the
    estimator is created with ``normalize=True``.

    NOTE(review): the ``normalize`` argument was deprecated in scikit-learn
    1.0 and removed in 1.2 -- confirm the scikit-learn version this notebook
    is pinned to.
    """
    # Ridge.fit returns the estimator itself, so construction and fitting chain.
    return linear_model.Ridge(alpha=l2_penalty, normalize=True).fit(feature, target)

In [85]:
def compute_rss(predictions, real):
    """Return the residual sum of squares between ``real`` and ``predictions``.

    Parameters
    ----------
    predictions : array-like of numbers
        Predicted values.
    real : array-like of numbers
        Observed values, same length and order as ``predictions``.

    Returns
    -------
    numpy.float64
        ``sum((real - predictions) ** 2)``; 0.0 for empty inputs.
    """
    # Convert to plain arrays so the subtraction is positional: two pandas
    # Series with different indexes would otherwise be aligned by label and
    # yield NaN. This also lets plain lists/tuples be passed in.
    errors = np.asarray(real, dtype=float) - np.asarray(predictions, dtype=float)
    # Vectorised reduction instead of the builtin sum(), which walks the
    # values one Python object at a time.
    rss = np.sum(errors * errors)
    return rss

In [86]:
# With an almost-zero L2 penalty (1e-9), fit a degree-15 model to each of the
# four subsets and print the learned weight of power_1 (sqft_living).
for data in (
    wk3_kc_house_set_1_data,
    wk3_kc_house_set_2_data,
    wk3_kc_house_set_3_data,
    wk3_kc_house_set_4_data,
):
    target = data['price']
    feature = polynomial_dataframe(data['sqft_living'], 15)
    m = fit_polys(feature, target, 1e-9)
    print(m.coef_[0])

544.669372537
859.362607229
-755.395975432
1119.44566936


In [87]:
# Same experiment with a large L2 penalty (1.23e2): fit a degree-15 model to
# each of the four subsets and print the learned weight of power_1.
for data in (
    wk3_kc_house_set_1_data,
    wk3_kc_house_set_2_data,
    wk3_kc_house_set_3_data,
    wk3_kc_house_set_4_data,
):
    target = data['price']
    feature = polynomial_dataframe(data['sqft_living'], 15)
    m = fit_polys(feature, target, 1.23e2)
    print(m.coef_[0])

2.32806802958
2.09756902778
2.28906258119
2.08596194092


In [88]:
# Combined train+validation data (pre-shuffled file), used for k-fold
# cross-validation below.
wk3_kc_train_valid_shuffled = pd.read_csv('wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict)

# Held-out test data. NOTE(review): the same file is also read earlier in the
# notebook as `wk3_kc_house_test_data`; the two names hold the same content.
wk3_kc_test = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [89]:
# Preview the (start, end) row positions of each of the k cross-validation
# segments of the shuffled train+validation data.
# The frame loaded above is named `wk3_kc_train_valid_shuffled`; the shorter
# name `train_valid_shuffled` is never defined and raised NameError on a
# fresh kernel.
n = len(wk3_kc_train_valid_shuffled)
k = 10

for i in range(k):
    # Integer arithmetic: segment i covers rows start..end inclusive, and the
    # k segments tile 0..n-1 with no gaps or overlaps.
    start = (n * i) // k
    end = (n * (i+1)) // k - 1
    print(i, (start, end))

0 (0, 1938)
1 (1939, 3878)
2 (3879, 5817)
3 (5818, 7757)
4 (7758, 9697)
5 (9698, 11636)
6 (11637, 13576)
7 (13577, 15515)
8 (15516, 17455)
9 (17456, 19395)


In [92]:
def k_fold_cross_validation(k, l2_penalty, data):
    """Return the mean validation RSS of a degree-15 ridge model over k folds.

    For each segment i in 0..k-1:

    * compute the starting and ending row positions of segment i
      (``start`` and ``end``, both inclusive);
    * form the validation set from rows ``start:end+1``;
    * form the training set by joining rows ``0:start`` with rows ``end+1:n``;
    * train a model on the training set with the given ``l2_penalty``;
    * compute the validation error (RSS) on the validation set.

    Parameters
    ----------
    k : int
        Number of folds.
    l2_penalty : float
        Ridge penalty passed to ``fit_polys``.
    data : pandas.DataFrame
        Must contain the columns ``sqft_living`` and ``price``. Rows are
        split in their existing order, so shuffle beforehand if needed.

    Returns
    -------
    float
        Mean of the k per-fold validation RSS values.
    """
    # Size the folds from the data actually passed in rather than from a
    # notebook-level global computed for one particular frame.
    n = len(data)
    rsses = []

    for i in range(k):
        start = (n * i) // k
        end = (n * (i + 1)) // k - 1

        # Positional slicing; `end` is inclusive, hence the +1.
        validation = data.iloc[start:end + 1]
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        training = pd.concat([data.iloc[:start], data.iloc[end + 1:]])

        model = fit_polys(polynomial_dataframe(training['sqft_living'], 15), training['price'], l2_penalty)

        predictions = model.predict(polynomial_dataframe(validation['sqft_living'], 15))
        rsses.append(compute_rss(predictions, validation['price']))

    return np.mean(rsses)

In [95]:
# Try 13 penalties spaced evenly on a log scale from 1e3 to 1e9 and print,
# tab-separated, each penalty with its mean 10-fold cross-validation RSS.
for l2_penalty in np.logspace(3, 9, num=13):
    print('%f\t%f' % (l2_penalty, k_fold_cross_validation(10, l2_penalty, wk3_kc_train_valid_shuffled)))

1000.000000	264977361036975.906250
3162.277660	265692935858318.250000
10000.000000	265924368957215.687500
31622.776602	265998081509342.812500
100000.000000	266021444553980.500000
316227.766017	266028837918724.437500
1000000.000000	266031176438468.156250
3162277.660168	266031915996607.250000
10000000.000000	266032149870752.593750
31622776.601684	266032223828783.750000
100000000.000000	266032247216419.937500
316227766.016838	266032254612245.000000
1000000000.000000	266032256951011.156250


In [99]:
# Fit the final degree-15 model on the training split with l2_penalty=1000.0,
# the value with the lowest mean cross-validation RSS in the grid printed above.
model = fit_polys(polynomial_dataframe(wk3_kc_house_train_data['sqft_living'], 15), wk3_kc_house_train_data['price'], 1000.0)

# Report the RSS of that model on the held-out test data.
print('%.2e' % compute_rss(model.predict(polynomial_dataframe(wk3_kc_test['sqft_living'], 15)), wk3_kc_test['price']))

2.84e+14
