## Read data

In [7]:
import pandas as pd
import numpy as np
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
sales = pd.read_csv('kc_house_data.csv/kc_house_data.csv', dtype= dtype_dict)


In [8]:
def get_numpy_data(data, features, output):
    data['constant'] = 1
    features = ['constant'] + features              ## combine two list
    feature_matrix = data[features].to_numpy()
    output_array = data[output].to_numpy()
    return(feature_matrix, output_array)

In [9]:
def predict_output(feature_matrix, weights):
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

In [10]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    feature_norm = features/norms
    return(norms, feature_norm)        

## Coordinate descent

In [12]:
features = ['sqft_living', 'bedrooms']
feature_matrix, output_array = get_numpy_data(sales, features, 'price')
norms, feature_norm = normalize_features(feature_matrix)
initial_weights = [1, 4, 1]
prediction = predict_output(feature_norm, initial_weights)
ro = np.zeros(3)
for i in range(3):
    ro[i] = sum(feature_norm[:, i]*(sales['price']-prediction + initial_weights[i]*feature_norm[:,i])) #????

In [13]:
ro
print(ro[2]*2,ro[1]*2)

161933397.3324781 175878941.64650303


In [14]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    prediction = predict_output(feature_matrix, weights)
    ro_i = sum(feature_matrix[:, i]*(output-prediction+weights[i]*feature_matrix[:, i]))
    if i==0:
        new_weight_i = ro_i
    elif ro_i < -l1_penalty/2:
        new_weight_i = ro_i + l1_penalty/2
    elif ro_i > l1_penalty/2:
        new_weight_i = ro_i - l1_penalty/2
    else:
        new_weight_i = 0
    return(new_weight_i)

In [15]:
# test function 
import math
lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],
                   [2./math.sqrt(13),3./math.sqrt(10)]]), np.array([1., 1.]), np.array([1., 4.]), 0.1)

0.4255588466910251

In [16]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    n = feature_matrix.shape[1]
    weights = initial_weights
    old_weights = np.zeros(n)
    weights_change = np.zeros(n)
    max_change = tolerance + 1
    while max_change > tolerance:
        for i in range(n):
            old_weights[i] = weights[i]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            weights_change[i] = weights[i] - old_weights[i]
        max_change = max(weights_change)
    return weights

In [42]:
## test function
all_features= ['sqft_living', 'bedrooms']
feature_matrix, output_array = get_numpy_data(sales, all_features, 'price')
norms, norm_feature = normalize_features(feature_matrix)
Initial_weights = np.zeros(3)
L1_penalty = 1e7
Tolerance = 1.0

best_weights = lasso_cyclical_coordinate_descent(norm_feature, sales['price'], 
                                                 initial_weights=np.zeros(3), 
                                                 l1_penalty= 1e7, tolerance= 1.0)
prediction = predict_output(norm_feature, best_weights)
RSS = sum((sales['price'] - prediction)**2)
print(RSS)
print(np.where(best_weights != 0))

1630492484578351.0
(array([0, 1]),)


## Evaluationg Lasso fit with more features

In [18]:
train_data = pd.read_csv('kc_house_train_data.csv/kc_house_train_data.csv', dtype= dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv/kc_house_test_data.csv', dtype= dtype_dict)

In [19]:
all_features= ["bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors", "waterfront",
               "view", "condition", "grade", 
               "sqft_above", "sqft_basement", "yr_built", "yr_renovated"]
feature_matrix, output_array = get_numpy_data(train_data, all_features, 'price')
norms2, normalized_feature = normalize_features(feature_matrix)

In [20]:
n2 = normalized_feature.shape[1] 
initial_weights = np.zeros(n2)
weights1e7 = lasso_cyclical_coordinate_descent(normalized_feature, train_data['price'], 
                                               initial_weights = np.zeros(n2), 
                                               l1_penalty=1e7, tolerance=1)

In [40]:
print(np.where(weights1e7 != 0 ))
print(all_features[2],all_features[5],all_features[6])
all_features[0]

(array([0, 3, 6, 7]),)
sqft_living waterfront view


'bedrooms'

In [37]:
weights1e8 = lasso_cyclical_coordinate_descent(normalized_feature, train_data['price'], 
                                               initial_weights = np.zeros(n2), 
                                               l1_penalty=1e8, tolerance=1)

In [38]:
print(np.where(weights1e8 != 0 ))
print(all_features[4],all_features[7],all_features[8])

(array([0]),)
floors condition grade


In [24]:
weights1e4 = lasso_cyclical_coordinate_descent(normalized_feature, train_data['price'], 
                                               initial_weights = np.zeros(n2), 
                                               l1_penalty=1e4, tolerance=5e5)

In [25]:
print(np.where(weights1e4 != 0 ))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13]),)


In [34]:
weights1e7_normalized = weights1e7/norms2
weights1e8_normalized = weights1e8/norms2
weights1e4_normalized = weights1e4/norms2
print(weights1e7_normalized[3])

161.31745496774982


In [35]:
test_matrix, output_array = get_numpy_data(test_data, all_features, 'price')
prediction1e7 = predict_output(test_matrix, weights1e7_normalized)
RSS1e7 = sum((prediction1e7-output_array)**2)

prediction1e8 = predict_output(test_matrix, weights1e8_normalized)
RSS1e8 = sum((prediction1e8-output_array)**2)

prediction1e4 = predict_output(test_matrix, weights1e4_normalized)
RSS1e4 = sum((prediction1e4-output_array)**2)

In [36]:
print(RSS1e7,RSS1e8,RSS1e4)

275962077477488.8 537166151497322.4 228459958971392.34
