In [1]:
import numpy as np
import pandas as pd

In [18]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [19]:
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [20]:
def get_numpy_data(dataset, features, output_name):
    dataset['constant'] = 1
    return dataset[['constant'] + features].values, dataset[output_name].values

In [21]:
def predict_output(feature_matrix, weights):
    return feature_matrix.dot(weights)

In [34]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    return features/norms, norms

In [39]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    prediction = predict_output(feature_matrix, weights)
    ro_i = sum(feature_matrix[:, i] * (output - prediction + weights[i] * feature_matrix[:,i]))
    new_weight_i = 0
    if i == 0:
        new_weight_i = ro_i
    if ro_i < -l1_penalty/2:
        new_weight_i = ro_i + l1_penalty/2
    if ro_i > l1_penalty/2:
        new_weight_i = ro_i - l1_penalty/2
    else:
        new_weight_i = 0
    return new_weight_i

In [40]:
# should print 0.425558846691
import math
print(lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],
                   [2./math.sqrt(13),3./math.sqrt(10)]]), np.array([1., 1.]), np.array([1., 4.]), 0.1))

0.425558846691


In [41]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    weights = initial_weights
    change_magnitude = np.zeros((len(weights), 1))
    converged = False
    while not converged:
        for i in range(len(initial_weights)):
            weights_i = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            change_magnitude[i] = abs(weights[i] - weights_i)
            weights[i] = weights_i
        if sum(change_magnitude) < tolerance:
            converged = True
    return weights

In [42]:
features_matrix, output = get_numpy_data(sales, ['sqft_living', 'bedrooms'], 'price')
normalized_feature_matrix, norm = normalize_features(features_matrix)
print(norm)
print(normalized_feature_matrix)
l1_penalty = 1e7
tolerance = 1
lasso_cyclical_coordinate_descent(normalized_feature_matrix, output, np.array([0, 0, 0]), l1_penalty, tolerance)

[  1.47013605e+02   3.34257264e+05   5.14075870e+02]
[[ 0.00680209  0.00353021  0.00583571]
 [ 0.00680209  0.00768869  0.00583571]
 [ 0.00680209  0.00230361  0.00389048]
 ..., 
 [ 0.00680209  0.00305154  0.00389048]
 [ 0.00680209  0.00478673  0.00583571]
 [ 0.00680209  0.00305154  0.00389048]]


array([       0, 82939472,        0])

In [43]:
from sklearn import linear_model
model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
model.fit(sales[['sqft_living', 'bedrooms']], sales['price'])
print(model.coef_)
print(model.intercept_)

[ 0.  0.]
540088.141767
