In [1]:
import numpy as np
import turicreate as tc
from math import log

In [2]:
# Load the house-sales SFrame stored under ./kc_house_data.gl/ (path is relative
# to the notebook's working directory).
sales = tc.SFrame('./kc_house_data.gl/')
# Bare last expression so the notebook renders a preview of the frame.
sales

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242,2,0
5631500400,2015-02-25 00:00:00+00:00,180000.0,2.0,1.0,770.0,10000,1,0
2487200875,2014-12-09 00:00:00+00:00,604000.0,4.0,3.0,1960.0,5000,1,0
1954400510,2015-02-18 00:00:00+00:00,510000.0,3.0,2.0,1680.0,8080,1,0
7237550310,2014-05-12 00:00:00+00:00,1225000.0,4.0,4.5,5420.0,101930,1,0
1321400060,2014-06-27 00:00:00+00:00,257500.0,3.0,2.25,1715.0,6819,2,0
2008000270,2015-01-15 00:00:00+00:00,291850.0,3.0,1.5,1060.0,9711,1,0
2414600126,2015-04-15 00:00:00+00:00,229500.0,3.0,1.0,1780.0,7470,1,0
3793500160,2015-03-12 00:00:00+00:00,323000.0,3.0,2.5,1890.0,6560,2,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228
0,3,11,3890,1530,2001,0,98053,47.65611835
0,3,7,1715,0,1995,0,98003,47.30972002
0,3,7,1060,0,1963,0,98198,47.40949984
0,3,7,1050,730,1960,0,98146,47.51229381
0,3,7,1890,0,2003,0,98038,47.36840673

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.00528655,4760.0,101930.0
-122.32704857,2238.0,6819.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.0308176,2390.0,7570.0


In [3]:
# Randomly partition `sales` into two SFrames with split fraction 0.8; the seed
# is pinned to 0 so a re-run produces the same partition.
train_data,test_data = sales.random_split(.8,seed=0)

In [4]:
# Baseline linear regression of price on three raw columns. validation_set=None
# means no validation set is handed to the trainer.
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
example_model = tc.linear_regression.create(
    train_data,
    target='price',
    features=example_features,
    validation_set=None,
)

In [5]:
# Fitted coefficients of the example model (one row per term, with the
# name/index/value/stderr columns shown in the printed table).
example_weight_summary = example_model.coefficients
print(example_weight_summary)

+-------------+-------+---------------------+--------------------+
|     name    | index |        value        |       stderr       |
+-------------+-------+---------------------+--------------------+
| (intercept) |  None |  87910.07249236747  | 7873.338143368095  |
| sqft_living |  None |  315.40344055209215 | 3.4557003258408105 |
|   bedrooms  |  None | -65080.215552821624 | 2717.4568544075883 |
|  bathrooms  |  None |  6944.020192650054  | 3923.1149314251925 |
+-------------+-------+---------------------+--------------------+
[4 rows x 4 columns]



In [6]:
# Predict prices for every row of the training split with the example model.
example_predictions = example_model.predict(train_data)
# Sanity check on the first prediction against the expected reference value.
print(example_predictions[0]) # should be 271789.505878

271789.50587802136


In [7]:
def get_residual_sum_of_squares(model, data, outcome):
    """Return the residual sum of squares (RSS) of ``model`` on ``data``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(data)``. The returned predictions may be an
        object with a ``to_numpy()`` method or any array-like.
    data : object
        Passed unchanged to ``model.predict``.
    outcome : array-like
        Observed target values, one per prediction. Either an object with a
        ``to_numpy()`` method or anything ``np.asarray`` accepts (list, tuple,
        ndarray).

    Returns
    -------
    numpy scalar
        ``sum((outcome - predictions) ** 2)``.

    Raises
    ------
    ValueError
        If predictions and outcome do not have the same shape. Without this
        check a length-1 or 2-D input would be silently broadcast and yield a
        meaningless RSS.
    """
    def _as_array(values):
        # Prefer the container's own converter when it has one; otherwise let
        # numpy coerce plain sequences and arrays.
        to_numpy = getattr(values, 'to_numpy', None)
        return np.asarray(to_numpy() if callable(to_numpy) else values)

    # First get the predictions
    predictions = _as_array(model.predict(data))
    observed = _as_array(outcome)

    if predictions.shape != observed.shape:
        raise ValueError(
            'predictions and outcome must have the same shape, got %s and %s'
            % (predictions.shape, observed.shape)
        )

    # Then compute the residuals/errors
    residuals = observed - predictions

    # Then square and add them up
    RSS = np.sum(residuals**2)

    return RSS

# RSS of the example model on the TEST split: both the data and the outcome
# passed here come from test_data.
rss_example_test = get_residual_sum_of_squares(example_model, test_data, test_data['price'])
# Backward-compatible alias: the value was originally stored under this name
# even though it is computed on test_data, not train_data.
rss_example_train = rss_example_test
print(rss_example_test) # should be 2.7376153833e+14

273761538330191.75


Next, create the following four new features as columns in both the TEST and TRAIN data:
* bedrooms_squared = bedrooms\*bedrooms
* bed_bath_rooms = bedrooms\*bathrooms
* log_sqft_living = log(sqft_living)
* lat_plus_long = lat + long 
As an example, here's the first one:

In [8]:
# bedrooms_squared = bedrooms * bedrooms, computed with element-wise SArray
# arithmetic instead of a per-row Python lambda (same values, no Python
# callback per row; matches how bed_bath_rooms is built).
train_data['bedrooms_squared'] = train_data['bedrooms'] * train_data['bedrooms']
test_data['bedrooms_squared'] = test_data['bedrooms'] * test_data['bedrooms']

In [9]:
# bed_bath_rooms = bedrooms * bathrooms, added to both splits (train first,
# then test) as an element-wise product of the two columns.
for frame in (train_data, test_data):
    frame['bed_bath_rooms'] = frame['bedrooms'] * frame['bathrooms']

In [10]:
# log_sqft_living = natural log of sqft_living, applied row by row to both
# splits (train first, then test).
for frame in (train_data, test_data):
    frame['log_sqft_living'] = frame['sqft_living'].apply(lambda sqft: log(sqft))

In [11]:
# lat_plus_long = lat + long, added to both splits (train first, then test)
# as an element-wise sum of the two columns.
for frame in (train_data, test_data):
    frame['lat_plus_long'] = frame['lat'] + frame['long']

In [12]:
# Mean of bedrooms_squared over the test split, rounded to 2 decimals.
round(test_data['bedrooms_squared'].to_numpy().mean(), 2)

12.45

In [13]:
# Mean of bed_bath_rooms over the test split, rounded to 2 decimals.
round(test_data['bed_bath_rooms'].to_numpy().mean(), 2)

7.5

In [14]:
# Mean of log_sqft_living over the test split, rounded to 2 decimals.
round(test_data['log_sqft_living'].to_numpy().mean(), 2)

7.55

In [15]:
# Mean of lat_plus_long over the test split, rounded to 2 decimals.
round(test_data['lat_plus_long'].to_numpy().mean(), 2)

-74.65

In [16]:
# Nested feature sets: each list is a fresh copy of the previous one with
# extra columns appended, so the three lists never share storage.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_features = [*model_1_features, 'bed_bath_rooms']
model_3_features = [
    *model_2_features,
    'bedrooms_squared',
    'log_sqft_living',
    'lat_plus_long',
]

In [17]:
# Fit one linear regression per feature set, in order (model_1, model_2,
# model_3). All three use train_data, target 'price', and no validation set.
model_1, model_2, model_3 = (
    tc.linear_regression.create(
        train_data,
        target='price',
        features=feature_set,
        validation_set=None,
    )
    for feature_set in (model_1_features, model_2_features, model_3_features)
)

In [18]:
# Show the fitted coefficient table for model_1.
print(model_1.coefficients)

+-------------+-------+---------------------+--------------------+
|     name    | index |        value        |       stderr       |
+-------------+-------+---------------------+--------------------+
| (intercept) |  None | -56140675.734359965 | 1649985.0503066182 |
| sqft_living |  None |  310.2633257761251  | 3.1888295954448056 |
|   bedrooms  |  None |  -59577.11606763772 | 2487.279772923297  |
|  bathrooms  |  None |  13811.84054063167  | 3593.542128998486  |
|     lat     |  None |  629865.7894470703  | 13120.709684446634 |
|     long    |  None | -214790.28511873118 | 13284.282195942329 |
+-------------+-------+---------------------+--------------------+
[6 rows x 4 columns]



In [19]:
# Show the fitted coefficient table for model_2 (model_1's features plus
# bed_bath_rooms).
print(model_2.coefficients)

+----------------+-------+---------------------+--------------------+
|      name      | index |        value        |       stderr       |
+----------------+-------+---------------------+--------------------+
|  (intercept)   |  None |  -54410676.10518201 | 1650404.7917022314 |
|  sqft_living   |  None |  304.4492980546738  | 3.2021753457961064 |
|    bedrooms    |  None | -116366.04323084738 | 4805.549658410095  |
|   bathrooms    |  None |  -77972.33051617088 | 7565.059886168023  |
|      lat       |  None |  625433.8349156899  | 13058.352750177153 |
|      long      |  None |  -203958.6028924356 | 13268.125381655715 |
| bed_bath_rooms |  None |  26961.62490959477  | 1956.3656111962875 |
+----------------+-------+---------------------+--------------------+
[7 rows x 4 columns]



In [20]:
# Training-set RSS for model_1. Stored under a train-specific name so the
# test-set evaluation further down, which binds rss_model_1, does not
# overwrite this value.
rss_model_1_train = get_residual_sum_of_squares(model_1, train_data, train_data['price'])
print(rss_model_1_train)

971328233549080.8


In [21]:
# Training-set RSS for model_2. Stored under a train-specific name so the
# test-set evaluation further down, which binds rss_model_2, does not
# overwrite this value.
rss_model_2_train = get_residual_sum_of_squares(model_2, train_data, train_data['price'])
print(rss_model_2_train)

961592067860935.1


In [22]:
# Training-set RSS for model_3. Stored under a train-specific name so the
# test-set evaluation further down, which binds rss_model_3, does not
# overwrite this value.
rss_model_3_train = get_residual_sum_of_squares(model_3, train_data, train_data['price'])
print(rss_model_3_train)

905276314549487.5


In [23]:
# RSS of model_1 evaluated on the TEST split (data and outcome both come from
# test_data).
rss_model_1 = get_residual_sum_of_squares(model_1, test_data, test_data['price'])
print(rss_model_1)

226568089094147.94


In [24]:
# RSS of model_2 evaluated on the TEST split (data and outcome both come from
# test_data).
rss_model_2 = get_residual_sum_of_squares(model_2, test_data, test_data['price'])
print(rss_model_2)

224368799994905.12


In [25]:
# RSS of model_3 evaluated on the TEST split (data and outcome both come from
# test_data).
rss_model_3 = get_residual_sum_of_squares(model_3, test_data, test_data['price'])
print(rss_model_3)

251829318966504.2
