In [43]:
from math import log, sqrt

import pandas as pd

import numpy as np

from sklearn import linear_model

In [44]:
# Column name -> Python type, passed as `dtype=` to every pd.read_csv call in
# this notebook. Columns are grouped by target type for readability.
dtype_dict = {
    column: column_type
    for column_type, columns in (
        (float, ('bathrooms', 'bedrooms', 'floors', 'lat', 'long', 'price',
                 'sqft_living', 'sqft_living15', 'sqft_lot15')),
        (int, ('condition', 'grade', 'sqft_above', 'sqft_basement',
               'sqft_lot', 'view', 'waterfront', 'yr_built', 'yr_renovated')),
        (str, ('date', 'id', 'zipcode')),
    )
    for column in columns
}

In [45]:
# Load the full house-sales CSV, forcing the column types declared in dtype_dict.
kc_house_data = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [46]:
# `sales` is another name for the same DataFrame object as `kc_house_data`
# (no copy is made), so the derived columns below are added to both.
sales = kc_house_data

# Square-root transforms of the two area columns.
sales['sqft_living_sqrt'] = sales['sqft_living'].map(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].map(sqrt)

# Squared versions of the two count-like columns.
sales['bedrooms_square'] = sales['bedrooms'].mul(sales['bedrooms'])
sales['floors_square'] = sales['floors'].mul(sales['floors'])

In [47]:
sales.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'sqft_living_sqrt',
       'sqft_lot_sqrt', 'bedrooms_square', 'floors_square'],
      dtype='object')

In [48]:
# Regressors used by every model in this notebook: raw columns plus the
# engineered sqrt / squared terms. The order matters because coefficients are
# later paired with these names positionally via zip(all_features, coef_).
all_features = [
    'bedrooms', 'bedrooms_square',
    'bathrooms',
    'sqft_living', 'sqft_living_sqrt',
    'sqft_lot', 'sqft_lot_sqrt',
    'floors', 'floors_square',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
]

# Lasso fitted on the complete `sales` table with an L1 penalty of 500.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 - confirm the scikit-learn version this notebook targets.
model_all = linear_model.Lasso(normalize=True, alpha=5e2)
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [49]:
list(zip(all_features, model_all.coef_))

[('bedrooms', 0.0),
 ('bedrooms_square', 0.0),
 ('bathrooms', 0.0),
 ('sqft_living', 134.43931395541438),
 ('sqft_living_sqrt', 0.0),
 ('sqft_lot', 0.0),
 ('sqft_lot_sqrt', 0.0),
 ('floors', 0.0),
 ('floors_square', 0.0),
 ('waterfront', 0.0),
 ('view', 24750.00458560952),
 ('condition', 0.0),
 ('grade', 61749.103090708129),
 ('sqft_above', 0.0),
 ('sqft_basement', 0.0),
 ('yr_built', -0.0),
 ('yr_renovated', 0.0)]

In [50]:
# Read the three pre-split CSVs (test, train, validation - in that order) with
# the same dtype mapping that was used for the full data set.
testing, training, validation = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
)

In [51]:
def add_engineered_features(frame):
    """Add the four derived feature columns to ``frame`` in place.

    Columns added (in this order):
      * ``sqft_living_sqrt`` - square root of ``sqft_living``
      * ``sqft_lot_sqrt``    - square root of ``sqft_lot``
      * ``bedrooms_square``  - ``bedrooms`` multiplied by itself
      * ``floors_square``    - ``floors`` multiplied by itself

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the ``sqft_living``, ``sqft_lot``, ``bedrooms`` and
        ``floors`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenience.
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# Apply the identical transformation to every split so the three frames cannot
# drift apart through copy-paste edits.
for split_frame in (testing, training, validation):
    add_engineered_features(split_frame)

In [52]:
def compute_rss(predictions, real):
    """Return the residual sum of squares between predictions and targets.

    Parameters
    ----------
    predictions : array-like
        Predicted values, e.g. the array returned by ``model.predict``.
    real : array-like
        Observed values. Compared with ``predictions`` element by element,
        by position (a pandas index, if present, is ignored).

    Returns
    -------
    float
        The sum over all elements of ``(real - predictions) ** 2``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. Without this check a
        (n, 1) vs (n,) mismatch would silently broadcast to an n x n matrix.
    """
    predicted = np.asarray(predictions, dtype=float)
    observed = np.asarray(real, dtype=float)
    if predicted.shape != observed.shape:
        raise ValueError(
            'predictions and real must have the same shape, got %s and %s'
            % (predicted.shape, observed.shape)
        )
    errors = (observed - predicted).ravel()
    # Vectorised reduction; the built-in sum() iterates element by element in Python.
    return float(np.dot(errors, errors))

In [53]:
# Candidate L1 penalties: 13 values evenly spaced on a log scale from 1e1 to 1e7.
l1_penalties = np.logspace(1, 7, num=13)

In [54]:
l1_penalties

array([  1.00000000e+01,   3.16227766e+01,   1.00000000e+02,
         3.16227766e+02,   1.00000000e+03,   3.16227766e+03,
         1.00000000e+04,   3.16227766e+04,   1.00000000e+05,
         3.16227766e+05,   1.00000000e+06,   3.16227766e+06,
         1.00000000e+07])

In [55]:
compute_rss(model_all.predict(sales[all_features]), sales['price'])

1439386143411749.5

In [58]:
def choose_l1_penalty(l1_penalties):
    """Fit one Lasso per penalty and pick the one with the lowest validation RSS.

    For every value in ``l1_penalties`` a Lasso model is fitted on the
    module-level ``training`` frame (columns ``all_features``, target
    ``price``) and scored on ``validation`` with ``compute_rss``. One
    ``penalty<TAB>rss`` line is printed per candidate.

    Parameters
    ----------
    l1_penalties : iterable of float
        Candidate values for the Lasso ``alpha`` parameter.

    Returns
    -------
    float or None
        The candidate with the smallest validation RSS (the first one on
        ties), or ``None`` when ``l1_penalties`` is empty.
    """
    best_penalty, best_rss = None, None
    for p in l1_penalties:
        model = linear_model.Lasso(alpha=p, normalize=True)
        model.fit(training[all_features], training['price'])
        rss = compute_rss(model.predict(validation[all_features]), validation['price'])
        print('%f\t%f' % (p, rss))
        # Strict '<' keeps the earliest candidate when several share the minimum.
        if best_rss is None or rss < best_rss:
            best_penalty, best_rss = p, rss
    return best_penalty

In [59]:
choose_l1_penalty(l1_penalties)

10.000000	398213327300134.250000
31.622777	399041900253348.562500
100.000000	429791604072558.562500
316.227766	463739831045119.500000
1000.000000	645898733633803.250000
3162.277660	1222506859427156.750000
10000.000000	1222506859427156.750000
31622.776602	1222506859427156.750000
100000.000000	1222506859427156.750000
316227.766017	1222506859427156.750000
1000000.000000	1222506859427156.750000
3162277.660168	1222506859427156.750000
10000000.000000	1222506859427156.750000


In [60]:
# Index 0 (alpha = 10) is the penalty with the lowest validation RSS in the
# table printed by choose_l1_penalty(l1_penalties). The index is hard-coded, so
# it must be re-checked whenever the penalty grid or the data changes.
best_l1_penalty = l1_penalties[0]

In [61]:
# Refit on the training split using the penalty selected on the validation split.
model = linear_model.Lasso(normalize=True, alpha=best_l1_penalty)
model.fit(X=training[all_features], y=training['price'])

Lasso(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [67]:
list(zip(all_features, model.coef_))

[('bedrooms', -16144.562757076479),
 ('bedrooms_square', 373.24538434909186),
 ('bathrooms', 50841.243339864224),
 ('sqft_living', 617.85355950427777),
 ('sqft_living_sqrt', -44411.354866711656),
 ('sqft_lot', 0.7856230648316227),
 ('sqft_lot_sqrt', -701.19476536814375),
 ('floors', -0.0),
 ('floors_square', 5014.2004569684996),
 ('waterfront', 619488.75248591264),
 ('view', 38041.855652474056),
 ('condition', 24998.771838159253),
 ('grade', 128716.23462146104),
 ('sqft_above', 0.0),
 ('sqft_basement', 0.0),
 ('yr_built', -3293.8311799453932),
 ('yr_renovated', 10.057320864287972)]

In [68]:
# Number of nonzero weights in `model`; the intercept adds one to the count when it is nonzero.
np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)

15

## Choose the best model with exactly `max_nonzeros` nonzero weights

In [69]:
# Target sparsity: the number of nonzero weights (a nonzero intercept counts
# as one) that select() requires of a candidate model.
max_nonzeros = 7

In [70]:
# Coarse search grid: 20 penalties evenly spaced on a log scale from 1e1 to 1e4.
# This rebinds the `l1_penalties` name used earlier in the notebook.
l1_penalties = np.logspace(1, 4, num=20)

In [71]:
l1_penalties

array([    10.        ,     14.38449888,     20.69138081,     29.76351442,
           42.81332399,     61.58482111,     88.58667904,    127.42749857,
          183.29807108,    263.66508987,    379.26901907,    545.55947812,
          784.75997035,   1128.83789168,   1623.77673919,   2335.72146909,
         3359.81828628,   4832.93023857,   6951.92796178,  10000.        ])

In [80]:
def get_models(l1_penalties):
    """Return one Lasso model per penalty, each fitted on the training split.

    The returned list has the same order as ``l1_penalties``. Every model is
    fitted on ``training[all_features]`` against ``training['price']``.
    """
    # Estimator.fit() returns the estimator itself, so the fitted model can be
    # collected directly from the call.
    return [
        linear_model.Lasso(alpha=penalty, normalize=True).fit(
            training[all_features], training['price']
        )
        for penalty in l1_penalties
    ]

In [86]:
def how_many_nonzeros(m):
    """Count the nonzero weights of a fitted linear model.

    ``m`` must expose ``coef_`` and ``intercept_``. The result is the number of
    nonzero entries in ``coef_`` plus the number of nonzero entries in
    ``intercept_``.
    """
    nonzero_coefficients = np.count_nonzero(m.coef_)
    nonzero_intercept = np.count_nonzero(m.intercept_)
    return nonzero_coefficients + nonzero_intercept

In [81]:
models = get_models(l1_penalties)

In [78]:
# Pair every penalty with the number of nonzero weights of its fitted model,
# reusing how_many_nonzeros() instead of repeating its formula in a lambda.
[(penalty, how_many_nonzeros(fitted)) for penalty, fitted in zip(l1_penalties, models)]

[(10.0, 15),
 (14.384498882876629, 15),
 (20.691380811147901, 15),
 (29.763514416313178, 15),
 (42.813323987193932, 13),
 (61.584821106602639, 12),
 (88.586679041008225, 11),
 (127.42749857031335, 10),
 (183.29807108324357, 7),
 (263.66508987303581, 6),
 (379.26901907322497, 6),
 (545.55947811685144, 6),
 (784.75997035146065, 5),
 (1128.8378916846884, 3),
 (1623.776739188721, 3),
 (2335.7214690901214, 2),
 (3359.8182862837812, 1),
 (4832.9302385717519, 1),
 (6951.9279617756056, 1),
 (10000.0, 1)]

In [101]:
# Last model in `models` that still has more than max_nonzeros nonzero weights.
# `models` follows the ascending penalty grid, so this is the largest such
# penalty and becomes the lower end of the refined search range.
model_min = [m for m in models if how_many_nonzeros(m) > max_nonzeros][-1]

In [102]:
model_min.alpha

127.42749857031335

In [103]:
# First model in `models` with fewer than max_nonzeros nonzero weights.
# `models` follows the ascending penalty grid, so this is the smallest such
# penalty and becomes the upper end of the refined search range.
model_max = [m for m in models if how_many_nonzeros(m) < max_nonzeros][0]

In [104]:
model_max.alpha

263.66508987303581

In [106]:
# Fine search grid: 20 evenly spaced penalties between the two bracketing
# alphas found by the coarse search (both end points included).
l1_penalties = np.linspace(start=model_min.alpha, stop=model_max.alpha, num=20)

In [108]:
def select(l1_penalties):
    """Return the best model that has exactly ``max_nonzeros`` nonzero weights.

    For every penalty a Lasso model is fitted on the ``training`` split. Models
    whose ``how_many_nonzeros`` count differs from the module-level
    ``max_nonzeros`` are skipped. The remaining ones are scored with
    ``compute_rss`` on the ``validation`` split and a ``penalty<TAB>rss`` line
    is printed for each.

    Parameters
    ----------
    l1_penalties : iterable of float
        Candidate values for the Lasso ``alpha`` parameter.

    Returns
    -------
    sklearn.linear_model.Lasso
        The fitted model with the lowest validation RSS (the first one on ties).

    Raises
    ------
    ValueError
        If no candidate penalty yields exactly ``max_nonzeros`` nonzero weights.
    """
    best_model, best_rss = None, None
    for p in l1_penalties:
        model = linear_model.Lasso(alpha=p, normalize=True)
        model.fit(training[all_features], training['price'])
        if how_many_nonzeros(model) != max_nonzeros:
            continue
        rss = compute_rss(model.predict(validation[all_features]), validation['price'])
        print('%f\t%f' % (p, rss))
        # Track the winner directly instead of keying a dict on estimator
        # objects; strict '<' keeps the earliest candidate on ties.
        if best_rss is None or rss < best_rss:
            best_model, best_rss = model, rss
    if best_model is None:
        raise ValueError(
            'no l1 penalty produced a model with exactly %d nonzero weights'
            % max_nonzeros
        )
    return best_model

In [110]:
model_best = select(l1_penalties)

156.109097	440037365263316.437500
163.279496	440777489641605.375000
170.449896	441566698090138.875000
177.620295	442406413188665.562500
184.790695	443296716874312.812500
191.961094	444239780526140.687500
199.131494	445230739842614.312500


In [111]:
model_best.alpha

156.10909673930755

In [112]:
list(zip(all_features, model_best.coef_))

[('bedrooms', -0.0),
 ('bedrooms_square', -0.0),
 ('bathrooms', 10610.890284398287),
 ('sqft_living', 163.38025164762891),
 ('sqft_living_sqrt', 0.0),
 ('sqft_lot', -0.0),
 ('sqft_lot_sqrt', -0.0),
 ('floors', 0.0),
 ('floors_square', 0.0),
 ('waterfront', 506451.68711484916),
 ('view', 41960.043554852862),
 ('condition', 0.0),
 ('grade', 116253.5536997075),
 ('sqft_above', 0.0),
 ('sqft_basement', 0.0),
 ('yr_built', -2612.2348803574882),
 ('yr_renovated', 0.0)]