In [1]:
import pandas as pd
import numpy as np
from math import log, sqrt
from sklearn import linear_model

In [2]:
# Explicit column dtypes for the house-sales CSVs, grouped by target type.
# The mapping is handed to `pd.read_csv(..., dtype=dtype_dict)` when loading.
dtype_dict = {
    column: column_type
    for column_type, columns in (
        (float, ('bathrooms', 'sqft_living15', 'price', 'bedrooms', 'long',
                 'sqft_lot15', 'sqft_living', 'floors', 'lat')),
        (int, ('waterfront', 'sqft_above', 'grade', 'yr_renovated', 'condition',
               'sqft_basement', 'yr_built', 'sqft_lot', 'view')),
        (str, ('zipcode', 'date', 'id')),
    )
    for column in columns
}

In [3]:
# Load the complete house-sales table, parsing each column with its declared dtype.
sales = pd.read_csv(
    'kc_house_data.csv',
    dtype=dtype_dict,
)

### Adding new features

In [4]:
# Derived columns: square roots of the two area features, then squares of the
# bedroom and floor counts. Columns are added to `sales` in place.
for derived, source in (('sqft_living_sqrt', 'sqft_living'),
                        ('sqft_lot_sqrt', 'sqft_lot')):
    sales[derived] = sales[source].apply(sqrt)

for derived, source in (('bedrooms_square', 'bedrooms'),
                        ('floors_square', 'floors')):
    sales[derived] = np.square(sales[source])

In [5]:
# Feature columns fed to every Lasso model below. The order matters: fitted
# coefficients are mapped back to names by position in this list.
all_features = (
    ['bedrooms', 'bedrooms_square', 'bathrooms']
    + ['sqft_living', 'sqft_living_sqrt', 'sqft_lot', 'sqft_lot_sqrt']
    + ['floors', 'floors_square']
    + ['waterfront', 'view', 'condition', 'grade']
    + ['sqft_above', 'sqft_basement']
    + ['yr_built', 'yr_renovated']
)

In [6]:
# Lasso (L1-penalised) regression of price on all features, fit on the full
# sales table with penalty strength 500.
# NOTE: the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this call needs an older scikit-learn release to run.
model_all = linear_model.Lasso(alpha=500.0, normalize=True)
model_all = model_all.fit(sales[all_features], sales['price'])

### Which features were assigned nonzero weights?

In [7]:
# Names of the features whose fitted coefficient in `model_all` is nonzero.
[name for name, weight in zip(all_features, model_all.coef_) if weight != 0]

['sqft_living', 'view', 'grade']

## Lasso Regression

In [8]:
# Load the pre-made train / validation / test splits with the same column
# dtypes as the full table.
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [9]:
def add_engineered_features(frame):
    """Add the derived feature columns to ``frame`` in place.

    Creates ``sqft_living_sqrt`` and ``sqft_lot_sqrt`` (element-wise square
    roots) and ``bedrooms_square`` and ``floors_square`` (element-wise
    squares) from the corresponding raw columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing the ``sqft_living``, ``sqft_lot``, ``bedrooms`` and
        ``floors`` columns. It is modified in place.

    Returns
    -------
    None
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']


# Apply the identical transformation to every split so that all of them expose
# the columns listed in `all_features`.
for split in (testing, training, validation):
    add_engineered_features(split)

In [10]:
# Coarse grid of candidate L1 penalties: 13 log-spaced values from 1e1 to 1e7.
l1_penalty = np.logspace(start=1, stop=7, num=13)

### Learning a model on the training data, then computing RSS on the validation data

In [11]:
# For every candidate penalty: fit a Lasso model on the training split and
# store its residual sum of squares on the validation split. `rss_list[k]`
# corresponds to `l1_penalty[k]`.
rss_list = []
for candidate_l1 in l1_penalty:
    lasso = linear_model.Lasso(alpha=candidate_l1, normalize=True)
    lasso = lasso.fit(training[all_features], training['price'])
    residuals = validation['price'] - lasso.predict(validation[all_features])
    rss_list.append(np.sum(np.square(residuals)))

### Which was the best value for the l1_penalty?

In [12]:
# Candidate penalty at the position of the smallest validation RSS
# (first position if the minimum is tied).
l1_penalty[int(np.argmin(rss_list))]

10.0

### What was the lowest RSS on the VALIDATION data?


In [13]:
# Smallest validation RSS recorded across the candidate penalties.
min(rss_list)

398213327300134.94

### Computing RSS on the test data for the model with the best L1 penalty

In [14]:
# Refit on the training split with penalty 10, the value reported above as
# giving the lowest validation RSS.
model_best = linear_model.Lasso(alpha=10, normalize=True)
model_best = model_best.fit(training[all_features], training['price'])

In [15]:
# Residual sum of squares of `model_best` on the held-out test split.
prediction_test = model_best.predict(testing[all_features])
rss_test = testing['price'] - prediction_test  # per-row residuals
rss_sum_test = np.sum(np.square(rss_test))
rss_sum_test

98467402552698.8

###  How many nonzero weights do you have?

In [16]:
# Number of nonzero coefficients in `model_best` (intercept not included).
int((model_best.coef_ != 0).sum())

14

In [17]:
# Nonzero weights of `model_best`, counting the intercept as well as the coefficients.
sum(np.count_nonzero(weights) for weights in (model_best.coef_, model_best.intercept_))

15

### Limiting the number of nonzero features to 7

In [18]:
# Coarse penalty grid for the sparsity search: 20 log-spaced values, 1e1 to 1e4.
l1_penalty = np.logspace(start=1, stop=4, num=20)
# Target number of nonzero weights for the final model.
max_nonzeros = 7

In [19]:
# For each penalty on the coarse grid, fit a Lasso model on the training split
# and record how many weights are nonzero (coefficients plus intercept).
# `nonzero_list[k]` corresponds to `l1_penalty[k]`.
nonzero_list = []
for l1 in l1_penalty:
    model = linear_model.Lasso(alpha=l1, normalize=True).fit(training[all_features], training['price'])
    # np.count_nonzero(model.intercept_) is already 0 for a zero intercept, so
    # the intercept needs no separate branch.
    nonzero_list.append(np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_))
    

### What values did you find for l1_penalty_min and l1_penalty_max?

In [20]:
# Bracket the penalty range in which the sparsity target is reached:
#   * min_idx -> the LARGEST penalty whose model still has MORE than
#     max_nonzeros nonzero weights;
#   * max_idx -> the SMALLEST penalty whose model has FEWER than
#     max_nonzeros nonzero weights.
# The choice is made by penalty value. Looking a nonzero count up with
# list.index() returns its first occurrence, which gives a needlessly wide
# bracket when a count repeats and a wrong one when the counts are not
# monotone in the penalty.
too_dense = [idx for idx, count in enumerate(nonzero_list) if count > max_nonzeros]
too_sparse = [idx for idx, count in enumerate(nonzero_list) if count < max_nonzeros]
if not too_dense or not too_sparse:
    # Same exception type that min()/max() of an empty list raised before,
    # but with an actionable message.
    raise ValueError(
        'The penalty grid does not bracket max_nonzeros=%r; widen the l1_penalty range.'
        % (max_nonzeros,)
    )
min_idx = max(too_dense, key=lambda idx: l1_penalty[idx])
max_idx = min(too_sparse, key=lambda idx: l1_penalty[idx])

In [21]:
# Lower end of the penalty range to refine: the coarse-grid penalty at
# `min_idx`, an index chosen among models with more than `max_nonzeros`
# nonzero weights.
l1_penalty_min = l1_penalty[min_idx]
l1_penalty_min

127.42749857031335

In [22]:
# Upper end of the penalty range to refine: the coarse-grid penalty at
# `max_idx`, an index chosen among models with fewer than `max_nonzeros`
# nonzero weights.
l1_penalty_max = l1_penalty[max_idx]
l1_penalty_max

263.6650898730358

In [43]:
# Refined search: scan 20 evenly spaced penalties between l1_penalty_min and
# l1_penalty_max. For each one, fit on the training split and store
# (validation RSS, number of nonzero weights) in `temp`, keyed by the penalty.
temp = {}
# The loop variable is deliberately not named `l1_penalty`: reusing that name
# would overwrite the penalty-grid array with a scalar and break any re-run of
# the cells that index or iterate over the grid.
for l1 in np.linspace(l1_penalty_min, l1_penalty_max, 20):
    model = linear_model.Lasso(alpha=l1, normalize=True).fit(training[all_features], training['price'])
    prediction = model.predict(validation[all_features])

    # Coefficients plus intercept; count_nonzero of a zero intercept is 0, so
    # no separate branch is required.
    no_nonzero = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    rss = np.sum(np.square(validation['price'] - prediction))

    temp[l1] = rss, no_nonzero

### What value of l1_penalty in our narrow range has the lowest RSS on the VALIDATION set and has sparsity equal to ‘max_nonzeros’?

In [46]:
# Show the refined-search results: penalty -> (validation RSS, nonzero-weight count).
temp

{127.42749857031335: (435374677102680.6, 10),
 134.5978981125619: (437009229124471.25, 10),
 141.76829765481045: (438236128386912.25, 8),
 148.938697197059: (439158937799660.06, 8),
 156.10909673930755: (440037365263316.56, 7),
 163.2794962815561: (440777489641605.25, 7),
 170.44989582380464: (441566698090139.9, 7),
 177.6202953660532: (442406413188666.25, 7),
 184.79069490830176: (443296716874315.06, 7),
 191.96109445055032: (444239780526141.6, 7),
 199.13149399279888: (445230739842614.2, 7),
 206.3018935350474: (446268896864706.3, 6),
 213.47229307729594: (447112919434640.6, 6),
 220.6426926195445: (447998187851564.9, 6),
 227.81309216179307: (448924706673255.06, 6),
 234.98349170404163: (449892475899711.0, 6),
 242.1538912462902: (450901498778123.2, 6),
 249.32429078853872: (451952426654987.06, 6),
 256.49469033078725: (453043924367599.25, 6),
 263.6650898730358: (454176669662635.25, 6)}

In [61]:
# Print every refined-search penalty whose model has exactly `max_nonzeros`
# nonzero weights, together with its validation RSS.
temp_list = []  # not used in this cell
for candidate_l1, (validation_rss, nonzero_count) in temp.items():
    if nonzero_count != max_nonzeros:
        continue
    print(candidate_l1, validation_rss)

156.10909673930755 440037365263316.56
163.2794962815561 440777489641605.25
170.44989582380464 441566698090139.9
177.6202953660532 442406413188666.25
184.79069490830176 443296716874315.06
191.96109445055032 444239780526141.6
199.13149399279888 445230739842614.2


In [66]:
# Choose the final penalty from the refined-search results instead of pasting a
# rounded value by hand: among the penalties whose model has exactly
# `max_nonzeros` nonzero weights, take the one with the lowest validation RSS.
candidates = {
    penalty: validation_rss
    for penalty, (validation_rss, nonzero_count) in temp.items()
    if nonzero_count == max_nonzeros
}
if not candidates:
    raise ValueError(
        'No penalty in the refined range gives exactly max_nonzeros=%r nonzero weights.'
        % (max_nonzeros,)
    )
best_l1 = min(candidates, key=candidates.get)

# Refit on the training split with the selected penalty and show the
# coefficients (ordered as in `all_features`).
model = linear_model.Lasso(alpha=best_l1, normalize=True).fit(training[all_features], training['price'])
model.coef_

array([-0.00000000e+00, -0.00000000e+00,  1.06108903e+04,  1.63380252e+02,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  5.06451687e+05,  4.19600436e+04,  0.00000000e+00,
        1.16253554e+05,  0.00000000e+00,  0.00000000e+00, -2.61223488e+03,
        0.00000000e+00])

In [68]:
# Names of the features that keep a nonzero coefficient in the final model.
[name for name, weight in zip(all_features, model.coef_) if weight != 0]

['bathrooms', 'sqft_living', 'waterfront', 'view', 'grade', 'yr_built']