In [None]:
pip install turicreate

In [70]:
import turicreate as tc

In [71]:
# Load the house-sales SFrame from the mounted Google Drive folder.
sales = tc.SFrame(
    '/content/drive/My Drive/home_data.sframe'
)
# Bare expression: lets the notebook render a preview of the frame.
sales

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650.0,1.0,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242.0,2.0,0
5631500400,2015-02-25 00:00:00+00:00,180000.0,2.0,1.0,770.0,10000.0,1.0,0
2487200875,2014-12-09 00:00:00+00:00,604000.0,4.0,3.0,1960.0,5000.0,1.0,0
1954400510,2015-02-18 00:00:00+00:00,510000.0,3.0,2.0,1680.0,8080.0,1.0,0
7237550310,2014-05-12 00:00:00+00:00,1225000.0,4.0,4.5,5420.0,101930.0,1.0,0
1321400060,2014-06-27 00:00:00+00:00,257500.0,3.0,2.25,1715.0,6819.0,2.0,0
2008000270,2015-01-15 00:00:00+00:00,291850.0,3.0,1.5,1060.0,9711.0,1.0,0
2414600126,2015-04-15 00:00:00+00:00,229500.0,3.0,1.0,1780.0,7470.0,1.0,0
3793500160,2015-03-12 00:00:00+00:00,323000.0,3.0,2.5,1890.0,6560.0,2.0,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7.0,1180.0,0.0,1955.0,0.0,98178,47.51123398
0,3,7.0,2170.0,400.0,1951.0,1991.0,98125,47.72102274
0,3,6.0,770.0,0.0,1933.0,0.0,98028,47.73792661
0,5,7.0,1050.0,910.0,1965.0,0.0,98136,47.52082
0,3,8.0,1680.0,0.0,1987.0,0.0,98074,47.61681228
0,3,11.0,3890.0,1530.0,2001.0,0.0,98053,47.65611835
0,3,7.0,1715.0,0.0,1995.0,0.0,98003,47.30972002
0,3,7.0,1060.0,0.0,1963.0,0.0,98198,47.40949984
0,3,7.0,1050.0,730.0,1960.0,0.0,98146,47.51229381
0,3,7.0,1890.0,0.0,2003.0,0.0,98038,47.36840673

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.00528655,4760.0,101930.0
-122.32704857,2238.0,6819.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.0308176,2390.0,7570.0


In [72]:
from math import log, sqrt
# Square-root transforms of the two area columns
# (adds 'sqft_living_sqrt' and 'sqft_lot_sqrt', in that order).
for area_column in ('sqft_living', 'sqft_lot'):
  sales[area_column + '_sqrt'] = sales[area_column].apply(sqrt)

# Squared bedroom count.
sales['bedrooms_square'] = sales['bedrooms'] * sales['bedrooms']

# Per the original note, 'floors' is stored as a string in the dataset,
# so it is cast to float in place before the squared feature is derived.
sales['floors'] = sales['floors'].astype(float)
sales['floors_square'] = sales['floors'] * sales['floors']

In [73]:
# Every candidate predictor handed to the lasso fits below:
# the raw columns plus the engineered square / square-root terms.
all_features = [
    # room counts
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    # living area and lot size
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    # floors
    'floors',
    'floors_square',
    # remaining raw columns
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

In [74]:
# Fit a linear regression on the full `sales` frame with a pure L1 penalty
# (l2_penalty is zero, l1_penalty is 1e10) and no validation set.
model_all = tc.linear_regression.create(
    sales,
    target='price',
    features=all_features,
    validation_set=None,
    l2_penalty=0.,
    l1_penalty=1e10,
)

In [90]:
# Print the fitted coefficient table (up to 18 rows, 4 columns) to see
# which weights the L1 penalty set to zero.
(
    model_all
    .coefficients
    .print_rows(num_rows=18, num_columns=4)
)

+------------------+-------+--------------------+--------+
|       name       | index |       value        | stderr |
+------------------+-------+--------------------+--------+
|   (intercept)    |  None | 274873.05595049594 |  None  |
|     bedrooms     |  None |        0.0         |  None  |
| bedrooms_square  |  None |        0.0         |  None  |
|    bathrooms     |  None | 8468.531086910027  |  None  |
|   sqft_living    |  None | 24.420720982446056 |  None  |
| sqft_living_sqrt |  None | 350.0605533860446  |  None  |
|     sqft_lot     |  None |        0.0         |  None  |
|  sqft_lot_sqrt   |  None |        0.0         |  None  |
|      floors      |  None |        0.0         |  None  |
|  floors_square   |  None |        0.0         |  None  |
|    waterfront    |  None |        0.0         |  None  |
|       view       |  None |        0.0         |  None  |
|    condition     |  None |        0.0         |  None  |
|      grade       |  None | 842.0680348975227  |  None 

In [76]:
# Feature names recorded as the subset selected from the coefficient table above.
subset_chosen_features = [
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'grade',
    'sqft_above',
]

In [77]:
# First split: 0.9 fraction for training + validation, the rest held out for testing.
training_and_validation, testing = sales.random_split(.9, seed=1)
# Second split: divide that portion in half into training and validation sets.
training, validation = training_and_validation.random_split(0.5, seed=1)

In [78]:
import numpy as np
# Coarse grid of 13 log-spaced L1 penalties from 10^1 to 10^7.
l1_penalty_set = np.logspace(start=1, stop=7, num=13)
l1_penalty_set

array([1.00000000e+01, 3.16227766e+01, 1.00000000e+02, 3.16227766e+02,
       1.00000000e+03, 3.16227766e+03, 1.00000000e+04, 3.16227766e+04,
       1.00000000e+05, 3.16227766e+05, 1.00000000e+06, 3.16227766e+06,
       1.00000000e+07])

In [79]:
def Rss(model, output, data_frame):
  """Return the residual sum of squares of `model` on `data_frame`.

  Args:
    model: object whose `predict(data_frame)` returns the predictions.
    output: key of the observed target column in `data_frame`.
    data_frame: data passed to `model.predict` and indexed by `output`.

  Returns:
    The sum of the squared differences between predictions and observations.
  """
  residuals = model.predict(data_frame) - data_frame[output]
  return (residuals * residuals).sum()
  

In [80]:
# Coarse search over `l1_penalty_set`: fit on `training`, score each model by
# its RSS on `validation`, and keep the penalty with the lowest RSS.
min_rss = float('inf')  # any real RSS compares lower than this
best_l1_penalty = 0
for candidate_penalty in l1_penalty_set:
  model = tc.linear_regression.create(training, target='price', features=all_features,
                                      validation_set=None, l2_penalty=0.0,
                                      l1_penalty=candidate_penalty, verbose=False)
  current = Rss(model, 'price', validation)
  if current < min_rss:
    min_rss = current
    best_l1_penalty = candidate_penalty
print(best_l1_penalty)
  

10.0


In [81]:
# Refit with the penalty selected by the coarse search above. The value is
# taken from `best_l1_penalty` rather than re-typed so the two cannot drift.
# NOTE(review): this fit uses the full `sales` frame, which includes the
# validation and test rows — confirm that is intended rather than `training`.
model_1 = tc.linear_regression.create(sales, target='price', features=all_features,
                                      validation_set=None,
                                      l2_penalty=0., l1_penalty=best_l1_penalty)

In [82]:
# Number of non-zero entries in the coefficient 'value' column of model_1.
(
    model_1
    .coefficients['value']
    .nnz()
)

18

In [83]:
# Target sparsity: the number of non-zero coefficients wanted in the final model.
max_nonzeros = 7
# Wide grid of 20 log-spaced penalties from 10^8 to 10^10, printed one per line.
l1_penalty_values = np.logspace(start=8, stop=10, num=20)
print(*l1_penalty_values, sep='\n')


100000000.0
127427498.57031322
162377673.91887242
206913808.111479
263665089.87303555
335981828.6283788
428133239.8719396
545559478.1168514
695192796.1775591
885866790.4100832
1128837891.6846883
1438449888.2876658
1832980710.8324375
2335721469.0901213
2976351441.6313133
3792690190.7322536
4832930238.571753
6158482110.6602545
7847599703.514623
10000000000.0


In [84]:
# Bracket the penalty range in which the model has `max_nonzeros` non-zero
# coefficients:
#   l1_penalty_min -> largest penalty whose model is still too dense
#   l1_penalty_max -> smallest penalty whose model is already too sparse
# The previous version compared the non-zero count against `l1_penalty_max`
# (a penalty value). That only worked because the bound started at 0 and the
# grid is ascending. The bounds are now tracked explicitly and do not depend
# on iteration order. A bound stays None if no model on the grid falls on
# that side.
l1_penalty_min = None
l1_penalty_max = None
for l1_penalty in np.logspace(8, 10, num=20):
  model = tc.linear_regression.create(training, target='price', features=all_features,
                                      validation_set=None, l2_penalty=0.0,
                                      l1_penalty=l1_penalty, verbose=False)
  nonzero_count = model.coefficients['value'].nnz()
  if nonzero_count > max_nonzeros:
    if l1_penalty_min is None or l1_penalty > l1_penalty_min:
      l1_penalty_min = l1_penalty
  elif nonzero_count < max_nonzeros:
    if l1_penalty_max is None or l1_penalty < l1_penalty_max:
      l1_penalty_max = l1_penalty
print(l1_penalty_min)
print(l1_penalty_max)
  
  




2976351441.6313133
3792690190.7322536


In [85]:
# Pin the bracket to the two values printed by the search above
# (lower bound first, upper bound second).
l1_penalty_min, l1_penalty_max = 2976351441.6313133, 3792690190.7322536

In [86]:
# Fine grid: 20 evenly spaced penalties across the bracketed range.
l1_penalty_values_1 = np.linspace(l1_penalty_min, l1_penalty_max, num=20)
l1_penalty_values_1

array([2.97635144e+09, 3.01931664e+09, 3.06228184e+09, 3.10524703e+09,
       3.14821223e+09, 3.19117743e+09, 3.23414263e+09, 3.27710782e+09,
       3.32007302e+09, 3.36303822e+09, 3.40600341e+09, 3.44896861e+09,
       3.49193381e+09, 3.53489901e+09, 3.57786420e+09, 3.62082940e+09,
       3.66379460e+09, 3.70675980e+09, 3.74972499e+09, 3.79269019e+09])

In [87]:
# Fine search inside the bracket: among the models with exactly
# `max_nonzeros` non-zero coefficients, keep the penalty with the lowest RSS
# on `validation`. `best_l1_penalty_1` stays 0 if no model has that sparsity.
min_rss = float('inf')  # any real RSS compares lower than this
best_l1_penalty_1 = 0
for l1_penalty_1 in np.linspace(l1_penalty_min, l1_penalty_max, 20):
  model = tc.linear_regression.create(training, target='price', features=all_features,
                                      validation_set=None, l2_penalty=0.0,
                                      l1_penalty=l1_penalty_1, verbose=False)
  value_1 = model.coefficients['value'].nnz()
  if value_1 != max_nonzeros:
    # Wrong sparsity: skip before paying for a prediction pass.
    continue
  current = Rss(model, 'price', validation)
  if current < min_rss:
    min_rss = current
    best_l1_penalty_1 = l1_penalty_1
print(best_l1_penalty_1)

3448968612.163437


In [88]:
# Fit the final model on `training` with the penalty chosen by the fine
# search. `best_l1_penalty_1` is used directly instead of a hand-copied,
# truncated literal, so the fit always matches the search result.
final_model = tc.linear_regression.create(training, target='price', features=all_features,
                                          validation_set=None,
                                          l2_penalty=0., l1_penalty=best_l1_penalty_1)

In [89]:
# Print every coefficient of the final model: one row per feature plus the
# intercept. The row count is derived from `all_features` so it stays in sync
# with the feature list (the previous literal asked for one row too many).
final_model.coefficients.print_rows(num_rows=len(all_features) + 1, num_columns=4)

+------------------+-------+--------------------+--------+
|       name       | index |       value        | stderr |
+------------------+-------+--------------------+--------+
|   (intercept)    |  None | 222253.19254432796 |  None  |
|     bedrooms     |  None | 661.7227177822556  |  None  |
| bedrooms_square  |  None |        0.0         |  None  |
|    bathrooms     |  None | 15873.957259267987 |  None  |
|   sqft_living    |  None | 32.41022145125966  |  None  |
| sqft_living_sqrt |  None | 690.1147733133253  |  None  |
|     sqft_lot     |  None |        0.0         |  None  |
|  sqft_lot_sqrt   |  None |        0.0         |  None  |
|      floors      |  None |        0.0         |  None  |
|  floors_square   |  None |        0.0         |  None  |
|    waterfront    |  None |        0.0         |  None  |
|       view       |  None |        0.0         |  None  |
|    condition     |  None |        0.0         |  None  |
|      grade       |  None | 2899.4202697498704 |  None 