In [1]:
import pandas as pd

# Explicit per-column dtypes handed to pd.read_csv, so columns are parsed with
# the listed types instead of being inferred. Grouped by target type for
# readability; key order has no effect on parsing.
dtype_dict = {
    # parsed as text
    'id': str,
    'date': str,
    'zipcode': str,
    # parsed as floating point
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'floors': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # parsed as integers
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

# Full sales table used for the first exploratory Lasso fit.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [2]:
# Preview the first five rows to sanity-check the load and the parsed columns.
sales.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
from math import log, sqrt
# Engineered features on the full sales table: element-wise square roots of
# the two area columns, then element-wise squares of bedrooms and floors.
sales['sqft_living_sqrt'] = sales['sqft_living'].map(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].map(sqrt)
sales['bedrooms_square'] = sales['bedrooms'].mul(sales['bedrooms'])
sales['floors_square'] = sales['floors'].mul(sales['floors'])

In [5]:
from sklearn import linear_model

# Predictor columns used by every Lasso fit in this notebook: raw columns plus
# the engineered *_sqrt / *_square features.
all_features = [
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    'floors',
    'floors_square',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]

# Fit a Lasso with a fixed L1 penalty of 500 on the full sales table.
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older release -- confirm the pinned version.
model_all = linear_model.Lasso(normalize=True, alpha=500.0)
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [7]:
# Show each feature next to its fitted coefficient; a zero coefficient means
# the L1 penalty dropped that feature from the model.
for feature_name, weight in zip(all_features, model_all.coef_):
    print(feature_name, weight)

('bedrooms', 0.0)
('bedrooms_square', 0.0)
('bathrooms', 0.0)
('sqft_living', 134.43931395540994)
('sqft_living_sqrt', 0.0)
('sqft_lot', 0.0)
('sqft_lot_sqrt', 0.0)
('floors', 0.0)
('floors_square', 0.0)
('waterfront', 0.0)
('view', 24750.00458561413)
('condition', 0.0)
('grade', 61749.103090711164)
('sqft_above', 0.0)
('sqft_basement', 0.0)
('yr_built', -0.0)
('yr_renovated', 0.0)


In [15]:
def transform(data):
    """Return a copy of ``data`` with the engineered feature columns added.

    Four columns are appended, in this order:

    * ``sqft_living_sqrt`` -- element-wise square root of ``sqft_living``
    * ``sqft_lot_sqrt``    -- element-wise square root of ``sqft_lot``
    * ``bedrooms_square``  -- ``bedrooms`` multiplied by itself
    * ``floors_square``    -- ``floors`` multiplied by itself

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``sqft_living``, ``sqft_lot``, ``bedrooms`` and
        ``floors`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns. The frame passed in is left
        untouched, so calling this twice on the same input is safe.

    Raises
    ------
    KeyError
        If one of the required source columns is missing.
    ValueError
        If an area column holds a negative value (raised by ``math.sqrt``).
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    data = data.copy()
    data['sqft_living_sqrt'] = data['sqft_living'].apply(sqrt)
    data['sqft_lot_sqrt'] = data['sqft_lot'].apply(sqrt)
    data['bedrooms_square'] = data['bedrooms'] * data['bedrooms']
    data['floors_square'] = data['floors'] * data['floors']
    return data

In [16]:
# Load the test / train / validation CSVs with the shared dtypes and run each
# frame through transform() to add the engineered feature columns.
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict).pipe(transform)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict).pipe(transform)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict).pipe(transform)

In [44]:
import numpy as np
# Coarse search over 20 log-spaced L1 penalties between 10 and 10**4.
# For each penalty: fit on the training split, score on the validation split
# (mean squared error) and count the non-zero parameters, intercept included.
minn = 1e100  # running minimum of the validation error; starts at a huge sentinel
for l1_penalty in np.logspace(1, 4, num=20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    print("=====================")
    pred = model.predict(validation[all_features])
    error = sum((validation['price'] - pred) ** 2) / validation.shape[0]
    non_zeros = np.count_nonzero(model.intercept_) + np.count_nonzero(model.coef_)
    if error < minn:
        minn = error
    print(str(l1_penalty) + " error = " + str(error) + " non-zeros " + str(non_zeros ))
print("minn =  " + str(minn))

10.0 error = 41329873098.1 non-zeros 15
14.3844988829 error = 41186490290.0 non-zeros 15
20.6913808111 error = 41122044821.3 non-zeros 15
29.7635144163 error = 41330102187.3 non-zeros 15
42.8133239872 error = 42229087547.5 non-zeros 13
61.5848211066 error = 44073429215.4 non-zeros 12
88.586679041 error = 44411656350.2 non-zeros 11
127.42749857 error = 45186785376.5 non-zeros 10
183.298071083 error = 45989332253.4 non-zeros 7
263.665089873 error = 47138211693.1 non-zeros 6
379.269019073 error = 49624595831.0 non-zeros 6
545.559478117 error = 55152795211.9 non-zeros 6
784.759970351 error = 61654728207.0 non-zeros 5
1128.83789168 error = 69959436428.2 non-zeros 3
1623.77673919 error = 83301443780.3 non-zeros 3
2335.72146909 error = 110145848767.0 non-zeros 2
3359.81828628 error = 126881874357.0 non-zeros 1
4832.93023857 error = 126881874357.0 non-zeros 1
6951.92796178 error = 126881874357.0 non-zeros 1
10000.0 error = 126881874357.0 non-zeros 1
minn =  41122044821.3


In [46]:
# Hand-picked bracket for the fine search. In the coarse-search output these
# values sit next to the grid points reporting 10 and 6 non-zeros, i.e. on
# either side of the 7-non-zero row.
# NOTE(review): later cells reassign both names from a bisection, so the values
# seen by the fine search depend on cell execution order -- confirm which
# bracket is intended.
l1_penalty_min, l1_penalty_max = 127.42, 263.66

In [32]:
# Bisection on the L1 penalty over [0, 1e10], stopping once the bracket is at
# most 0.01 wide. `start` only ever moves to a penalty whose fit had more than
# 7 non-zero parameters (coefficients + intercept); `finish` only to one with
# 7 or fewer. Assumes the non-zero count does not increase as the penalty
# grows -- TODO confirm.
start, finish = 0, 1e10
while finish - start > 0.01:
    middle = (start + finish) / 2
    model = linear_model.Lasso(alpha=middle, normalize=True)
    model.fit(training[all_features], training['price'])
    non_zeros = np.count_nonzero(model.intercept_) + np.count_nonzero(model.coef_)
    # Too many parameters -> raise the lower bound; otherwise lower the upper.
    start, finish = (middle, finish) if non_zeros > 7 else (start, middle)
l1_penalty_min = start

In [33]:
# Bisection on the L1 penalty over [0, 1e10], stopping once the bracket is at
# most 0.01 wide. `finish` only ever moves to a penalty whose fit had fewer
# than 7 non-zero parameters (coefficients + intercept); `start` only to one
# with 7 or more. Assumes the non-zero count does not increase as the penalty
# grows -- TODO confirm.
start, finish = 0, 1e10
while finish - start > 0.01:
    middle = (start + finish) / 2
    model = linear_model.Lasso(alpha=middle, normalize=True)
    model.fit(training[all_features], training['price'])
    non_zeros = np.count_nonzero(model.intercept_) + np.count_nonzero(model.coef_)
    # Too few parameters -> lower the upper bound; otherwise raise the lower.
    start, finish = (start, middle) if non_zeros < 7 else (middle, finish)
l1_penalty_max = start

In [34]:
# Show the penalty bracket currently bound to l1_penalty_min / l1_penalty_max.
print(l1_penalty_min, l1_penalty_max)

(153.57727534137666, 206.19154383894056)


In [47]:
# Fine search: 20 evenly spaced penalties across [l1_penalty_min, l1_penalty_max].
# Same scoring as the coarse search (validation mean squared error, non-zero
# count including the intercept); `minn` keeps accumulating the best error.
for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, num=20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    pred = model.predict(validation[all_features])
    error = sum((validation['price'] - pred) ** 2) / validation.shape[0]
    non_zeros = np.count_nonzero(model.intercept_) + np.count_nonzero(model.coef_)
    if error < minn:
        minn = error
    print(str(l1_penalty) + " error = " + str(error) + " non-zeros " + str(non_zeros ))

127.42 error = 45186613759.9 non-zeros 10
134.590526316 error = 45356252612.3 non-zeros 10
141.761052632 error = 45483677334.3 non-zeros 8
148.931578947 error = 45579448913.9 non-zeros 8
156.102105263 error = 45670645648.1 non-zeros 7
163.272631579 error = 45747458044.2 non-zeros 7
170.443157895 error = 45829365104.7 non-zeros 7
177.613684211 error = 45916514347.9 non-zeros 7
184.784210526 error = 46008914259.8 non-zeros 7
191.954736842 error = 46106790264.3 non-zeros 7
199.125263158 error = 46209637480.6 non-zeros 7
206.295789474 error = 46317404871.0 non-zeros 6
213.466315789 error = 46405002457.4 non-zeros 6
220.636842105 error = 46496881029.9 non-zeros 6
227.807368421 error = 46593041062.0 non-zeros 6
234.977894737 error = 46693482553.8 non-zeros 6
242.148421053 error = 46798205841.1 non-zeros 6
249.318947368 error = 46907278530.9 non-zeros 6
256.489473684 error = 47020562042.9 non-zeros 6
263.66 error = 47138126721.2 non-zeros 6


In [48]:
# Refit with alpha=156.1, which is close to the first fine-search grid point
# (156.102...) that reported 7 non-zeros and the lowest validation error among
# the 7-non-zero rows. Then list every feature with its coefficient.
model = linear_model.Lasso(normalize=True, alpha=156.1)
model.fit(training[all_features], training['price'])
for feature_name, weight in zip(all_features, model.coef_):
    print(feature_name, weight)


('bedrooms', -0.0)
('bedrooms_square', -0.0)
('bathrooms', 10612.804924695865)
('sqft_living', 163.37964773139075)
('sqft_living_sqrt', 0.0)
('sqft_lot', -0.0)
('sqft_lot_sqrt', -0.0)
('floors', 0.0)
('floors_square', 0.0)
('waterfront', 506458.93562342186)
('view', 41960.19841855827)
('condition', 0.0)
('grade', 116254.50464355544)
('sqft_above', 0.0)
('sqft_basement', 0.0)
('yr_built', -2612.3011354856285)
('yr_renovated', 0.0)
