In [23]:
import graphlab

# Point GraphLab Canvas at the 'ipynb' target (presumably so visualizations
# render inline in this notebook rather than in a separate browser tab).
graphlab.canvas.set_target('ipynb')
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [24]:
# Load the house-sales SFrame from the 'kc_house_data.gl/' directory
# (path is relative to the notebook's working directory).
sales = graphlab.SFrame('kc_house_data.gl/')

In [25]:
# Derive four extra columns on `sales`. They are added before the train/test
# split so that both splits carry them.
# Square of the bedroom count.
sales['bedrooms_squared'] = sales['bedrooms']*sales['bedrooms']
# Interaction term: bedrooms times bathrooms.
sales['bed_bath_rooms'] = sales['bedrooms']*sales['bathrooms']
# Natural log of the living area.
sales['log_sqft_living'] = np.log(sales['sqft_living'])
# Plain sum of latitude and longitude; it is one of the inputs to model_3.
sales['lat_plus_long'] = sales['lat']+sales['long']

In [27]:
# Randomly split the rows into train and test with a 0.8 fraction; the fixed
# seed keeps the split reproducible across runs.
train_data, test_data = sales.random_split(0.8,seed=0)

In [28]:
# Preview one row to confirm the engineered columns are present.
sales.head(1)

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650,1,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398

long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
-122.25677536,1340.0,5650.0,9.0,3.0,7.07326971746,-74.74554138


In [29]:
# model_1: linear regression on the five raw features only.
features1 = [
    'sqft_living', 'bedrooms', 'bathrooms',
    'lat', 'long'
]
# validation_set=None: presumably turns off GraphLab's automatic validation
# hold-out -- confirm against the API docs.
model_1 = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=features1,
    validation_set=None
)

In [30]:
# model_2: the model_1 feature list plus the bedrooms*bathrooms interaction.
features2 = [
    'sqft_living', 'bedrooms', 'bathrooms',
    'lat', 'long',
    'bed_bath_rooms'
]
# validation_set=None: presumably turns off GraphLab's automatic validation
# hold-out -- confirm against the API docs.
model_2 = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=features2,
    validation_set=None
)

In [31]:
# model_3: the model_2 feature list plus the remaining engineered columns.
features3 = [
    'sqft_living', 'bedrooms', 'bathrooms',
    'lat', 'long',
    'bed_bath_rooms',
    'bedrooms_squared', 'log_sqft_living', 'lat_plus_long'
]
# validation_set=None: presumably turns off GraphLab's automatic validation
# hold-out -- confirm against the API docs.
model_3 = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=features3,
    validation_set=None
)

In [36]:
# Show the fitted coefficients of model_1 together with their standard errors.
model_1['coefficients']

name,index,value,stderr
(intercept),,-56140675.759,1649985.42057
sqft_living,,310.263325779,3.18882960407
bedrooms,,-59577.1160679,2487.27977321
bathrooms,,13811.840544,3593.54213295
lat,,629865.78954,13120.7100325
long,,-214790.285284,13284.2851627


dtype: float
Rows: 17384
[-23884.018077872694, -301789.2975793332, -135874.4876407571, 112826.64050759748, 74280.96149339154, -362305.2000751719, -59706.951354920864, 128056.20156693459, -220254.06248743832, -25293.209875714034, -374810.0842846595, 48591.956228841096, 66537.72251987457, 64674.19520970434, -157895.83467406034, 53284.33187600225, 14838.988854188472, 63454.2352575399, -86658.09727393463, 1142005.5690988936, -65358.6566111967, 93525.69085392356, -223411.30522929877, 184485.147672493, 270766.0155455284, -49424.73562210426, -11237.673478342593, -193023.94312246144, 2589.384249776602, -86749.175462313, -37470.906856261194, 146611.51386988536, -48729.84117506072, 147446.495014254, -28800.57557208091, -95793.36676715314, 213262.76309002563, 89229.00923293456, -479569.6636722833, 127996.14165293798, 188259.93688360974, 56601.818958736956, 621231.800417494, 18845.42311604321, -387685.23280571774, 138459.28468951955, -121254.4519761242, 198102.57901303098, 65593.34908917174, -2299

In [40]:
def get_residual_sum_of_squares(model, data, target='price'):
    """Return the residual sum of squares (RSS) of `model` on `data`.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(data)``; the predictions must support
        element-wise subtraction from ``data[target]``.
    data : mapping-like
        Dataset indexable by column name (e.g. a GraphLab SFrame) that holds
        the observed values under ``target`` and whatever inputs the model
        needs to predict.
    target : str, optional
        Name of the column holding the observed values. Defaults to
        ``'price'``, the column this notebook models.

    Returns
    -------
    The sum over all rows of ``(observed - predicted) ** 2``.
    """
    residuals = data[target] - model.predict(data)
    # Square by self-multiplication (rather than **) so this works with any
    # column type that only implements element-wise `*` and `.sum()`.
    return (residuals * residuals).sum()

In [48]:
# RSS of model_1 on the training split.
get_residual_sum_of_squares(model_1,train_data)

971328233535829.0

In [49]:
# RSS of model_2 on the training split.
get_residual_sum_of_squares(model_2,train_data)

961592067848420.6

In [50]:
# RSS of model_3 on the training split.
get_residual_sum_of_squares(model_3,train_data)

905276314542889.8

# Test Questions

In [54]:
# Mean of the bedrooms_squared feature over the test split.
test_data['bedrooms_squared'].mean()

12.446677701584298

In [55]:
# Mean of the bed_bath_rooms feature over the test split.
test_data['bed_bath_rooms'].mean()

7.503901631591383

In [56]:
# Mean of the log_sqft_living feature over the test split.
test_data['log_sqft_living'].mean()

7.550274679645924

In [57]:
# Mean of the lat_plus_long feature over the test split.
test_data['lat_plus_long'].mean()

-74.65333497217343

In [60]:
# Coefficients of model_1 (five raw features, no interaction term).
model_1['coefficients']

name,index,value,stderr
(intercept),,-56140675.759,1649985.42057
sqft_living,,310.263325779,3.18882960407
bedrooms,,-59577.1160679,2487.27977321
bathrooms,,13811.840544,3593.54213295
lat,,629865.78954,13120.7100325
long,,-214790.285284,13284.2851627


In [61]:
# Coefficients of model_2, which adds the bed_bath_rooms interaction feature.
model_2['coefficients']

name,index,value,stderr
(intercept),,-54410676.1295,1650405.1657
sqft_living,,304.449298058,3.20217535636
bedrooms,,-116366.043226,4805.54966544
bathrooms,,-77972.3305052,7565.0599109
lat,,625433.835007,13058.3530975
long,,-203958.603056,13268.1283731
bed_bath_rooms,,26961.6249074,1956.36561554


In [63]:
# RSS of model_1 on the held-out test split.
get_residual_sum_of_squares(model_1,test_data)

226568089090648.22

In [64]:
# RSS of model_2 on the held-out test split.
get_residual_sum_of_squares(model_2,test_data)

224368799991589.47

In [67]:
# RSS of model_3 on the held-out test split.
# NOTE(review): in the recorded run model_3 has the lowest training RSS but
# the highest test RSS of the three models, which suggests its extra
# features may not generalize -- re-check after a fresh Restart & Run All.
get_residual_sum_of_squares(model_3,test_data)

251829318965955.53