In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import math

In [2]:
# Explicit column dtypes passed to pd.read_csv for the house-sales CSV files,
# grouped by type for readability.
dtype_dict = {
    # kept as strings
    'id': str, 'date': str, 'zipcode': str,
    # floating-point columns
    'price': float, 'bedrooms': float, 'bathrooms': float, 'floors': float,
    'sqft_living': float, 'sqft_living15': float, 'sqft_lot15': float,
    'lat': float, 'long': float,
    # integer columns
    'sqft_lot': int, 'sqft_above': int, 'sqft_basement': int,
    'waterfront': int, 'view': int, 'condition': int, 'grade': int,
    'yr_built': int, 'yr_renovated': int,
}

In [3]:
# Load the full dataset and the train/test splits with the same dtypes.
sales, training, testing = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('kc_house_data.csv',
                     'kc_house_train_data.csv',
                     'kc_house_test_data.csv')
)

In [4]:
def get_numpy_data(train_data, features, output):
    """Build a feature matrix (with a leading constant column) and a target array.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Source table. It is NOT modified: the constant column is added to a
        copy, so calling this function has no side effect on the caller's frame.
    features : sequence of str
        Names of the columns to use as features.
    output : str
        Name of the target column.

    Returns
    -------
    feature_matrix : numpy.ndarray
        2-D array whose first column is all ones, followed by the requested
        feature columns in the given order.
    output_array : numpy.ndarray
        Values of the target column.
    """
    columns = ['constant'] + list(features)
    # assign() returns a new DataFrame, so the caller's frame does not
    # silently gain a 'constant' column.
    feature_matrix = train_data.assign(constant=1)[columns].values

    output_array = np.array(train_data[output])
    return feature_matrix, output_array

In [5]:
def predict_output(feature_matrix, weights):
    """Return the linear-model predictions np.dot(feature_matrix, weights)."""
    return np.dot(feature_matrix, weights)

In [6]:
def normalize_features(features):
    """Scale every column of ``features`` to unit Euclidean (L2) norm.

    Parameters
    ----------
    features : array-like
        Feature matrix; norms are taken along axis 0 (one norm per column).

    Returns
    -------
    features_normalized : numpy.ndarray
        ``features`` divided column-wise by the returned norms.
    norms : numpy.ndarray
        The per-column L2 norms used as divisors. A column whose norm is 0
        (all zeros) gets a divisor of 1 instead, so it stays all-zero rather
        than turning into NaN, and later divisions by ``norms`` remain finite.
    """
    norms = np.linalg.norm(features, axis=0)
    # Guard against division by zero for all-zero columns.
    norms = np.where(norms == 0, 1.0, norms)
    features_normalized = features / norms
    return features_normalized, norms

In [7]:
# Two-feature setup used to inspect a single coordinate-descent step by hand.
initial_weights = [1.0, 4.0, 1.0]  # [constant, sqft_living, bedrooms]
two_feature = ['sqft_living', 'bedrooms']
feature_matrix, output_array = get_numpy_data(train_data=sales,
                                              features=two_feature,
                                              output='price')
features_normalized, norms = normalize_features(features=feature_matrix)

In [8]:
# Predictions from the raw (un-normalized) feature matrix with the initial weights.
predictions = predict_output(feature_matrix=feature_matrix,
                             weights=initial_weights)
predictions

array([ 4724., 10284.,  3083., ...,  4083.,  6404.,  4083.])

In [9]:
# ro[i] = sum( feature_i * (output - prediction + w[i] * feature_i) ),
# the same formula lasso_coordinate_descent_step uses. The prediction term was
# missing here, and it must be computed from the *normalized* features because
# those are the features the weights multiply in this formula.
# NOTE: the previously captured values of 2 * ro[1] and 2 * ro[2] were produced
# without the prediction term; re-run the following cells to refresh them.
normalized_predictions = predict_output(features_normalized, initial_weights)
ro = np.zeros(features_normalized.shape[1])
for i in range(features_normalized.shape[1]):
    ro[i] = np.sum(features_normalized[:, i]
                   * (output_array - normalized_predictions
                      + initial_weights[i] * features_normalized[:, i]))

In [10]:
# Twice ro[1]: lasso_coordinate_descent_step zeroes weight i when
# |ro_i| <= l1_penalty / 2, i.e. when l1_penalty >= 2 * |ro_i|.
ro[1] * 2

175878953.3636557

In [11]:
# Twice ro[2]: the corresponding l1_penalty threshold for the third weight.
ro[2] * 2

161933408.8107698

In [12]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """Return the updated value of weight ``i`` for one LASSO coordinate step.

    ro_i = sum(feature_i * (output - prediction + weights[i] * feature_i)).
    Weight 0 (the constant term) is returned as ro_i with no penalty applied;
    every other weight is ro_i moved towards zero by l1_penalty / 2, and is
    exactly 0 when |ro_i| <= l1_penalty / 2.

    The update does not divide by the squared norm of column ``i``, so it
    presumably expects unit-norm columns (as produced by normalize_features).
    """
    feature_i = feature_matrix[:, i]
    # Residual with feature i's own contribution added back in.
    partial_residual = output - predict_output(feature_matrix, weights) + weights[i] * feature_i
    ro_i = (feature_i * partial_residual).sum()

    if i == 0:
        # The constant term is not regularized.
        return ro_i

    half_penalty = l1_penalty / 2.
    if ro_i < -half_penalty:
        return ro_i + half_penalty
    if ro_i > half_penalty:
        return ro_i - half_penalty
    return 0.

### Test lasso_coordinate_descent_step 

In [13]:
# Sanity check of a single coordinate step on a tiny 2x2 example.
print(lasso_coordinate_descent_step(
    i=1,
    feature_matrix=np.array([[3. / math.sqrt(13), 1. / math.sqrt(10)],
                             [2. / math.sqrt(13), 3. / math.sqrt(10)]]),
    output=np.array([1., 1.]),
    weights=np.array([1., 4.]),
    l1_penalty=0.1,
))


0.4255588466910251


In [14]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    """Fit LASSO weights by cycling coordinate-descent steps until convergence.

    Each sweep updates every weight in index order with
    lasso_coordinate_descent_step. Sweeps repeat until the largest absolute
    change of any weight within one sweep is below ``tolerance``.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        Feature matrix passed through to lasso_coordinate_descent_step.
    output : numpy.ndarray
        Target values.
    initial_weights : array-like
        Starting weights. They are copied, so the caller's object is left
        untouched (previously it was overwritten in place).
    l1_penalty : float
        L1 regularization strength.
    tolerance : float
        Convergence threshold on the per-sweep maximum weight change.

    Returns
    -------
    numpy.ndarray
        The learned weights (a new float array).
    """
    # Float copy: avoids mutating the caller's array and avoids truncating
    # updates if integer weights (or a plain list) were passed in.
    weights = np.array(initial_weights, dtype=float)
    change = np.zeros(weights.shape[0])

    converged = False
    while not converged:

        for i in range(len(weights)):
            new_weight = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            change[i] = np.abs(new_weight - weights[i])

            # Update immediately so later coordinates in this sweep see it.
            weights[i] = new_weight

        converged = change.max() < tolerance

    return weights

In [15]:
# Settings for the two-feature LASSO fit.
output = 'price'
features = ['sqft_living', 'bedrooms']
l1_penalty, tolerance = 1e7, 1.0
# One weight per feature plus one for the constant column.
initial_weights = np.zeros(1 + len(features))

In [16]:
# Build the matrix for this experiment and normalize *that* matrix. The
# original normalized `feature_matrix`, a leftover variable from an earlier
# cell, instead of the matrix created on the line above.
features_matirx, output = get_numpy_data(sales, features, output)
normalized_features_matrix, norms = normalize_features(features_matirx)

In [17]:
# Fit the two-feature model on the normalized features.
weights = lasso_cyclical_coordinate_descent(
    feature_matrix=normalized_features_matrix,
    output=output,
    initial_weights=initial_weights,
    l1_penalty=l1_penalty,
    tolerance=tolerance,
)

In [18]:
# Residual sum of squares of the fitted model on the normalized data.
predictions = predict_output(feature_matrix=normalized_features_matrix, weights=weights)
rss = np.square(output - predictions).sum()

### What is the RSS of the learned model on the normalized dataset?

In [19]:
# Display the residual sum of squares computed in the previous cell.
rss

1630492476715386.5

### Which features had weight zero at convergence?

In [20]:
# weights has one more entry than features: its first entry belongs to the
# constant column.
print(weights, features, sep='\n')

[21624997.95951909 63157247.20788956        0.        ]
['sqft_living', 'bedrooms']


### Lasso regression with multiple features
- l1_penalty = 1e7
- tolerance = 1

In [21]:
# Settings for the multi-feature LASSO fit with l1_penalty = 1e7.
output = 'price'
mul_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated',
]
l1_penalty, tolerance = 1e7, 1.0
# One weight per feature plus one for the constant column.
initial_weights = np.zeros(1 + len(mul_features))

In [22]:
# Training matrix (with constant column) and its column-normalized version.
# Note: `output` is rebound from the column name to the target array here.
mul_features_matirx, output = get_numpy_data(train_data=training,
                                             features=mul_features,
                                             output=output)
normalized_mul_features_matrix, norms = normalize_features(features=mul_features_matirx)

In [23]:
# Fit on the normalized training features with the current l1_penalty (1e7).
weights1e7 = lasso_cyclical_coordinate_descent(
    feature_matrix=normalized_mul_features_matrix,
    output=output,
    initial_weights=initial_weights,
    l1_penalty=l1_penalty,
    tolerance=tolerance,
)

### What features had non-zero weight in this case?

In [24]:
# Names aligned with the weight vector: the constant column comes first.
mul_features_list = ['constant', *mul_features]

In [25]:
# Select features whose weight is non-zero. Comparing with `!= 0` (not `> 0`)
# keeps features with negative weights too, matching the other penalty cases.
[mul_features_list[i] for i in (np.where(weights1e7 != 0)[0]).tolist()]

['constant', 'sqft_living', 'waterfront', 'view']

### Lasso regression with multiple features
- l1_penalty = 1e8
- tolerance = 1

In [26]:
# Settings for the multi-feature LASSO fit with l1_penalty = 1e8.
output = 'price'
mul_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated',
]
l1_penalty, tolerance = 1e8, 1.0
# One weight per feature plus one for the constant column.
initial_weights = np.zeros(1 + len(mul_features))

In [27]:
# Training matrix (with constant column) and its column-normalized version.
# Note: `output` is rebound from the column name to the target array here.
mul_features_matirx, output = get_numpy_data(train_data=training,
                                             features=mul_features,
                                             output=output)
normalized_mul_features_matrix, norms = normalize_features(features=mul_features_matirx)

In [28]:
# Fit on the normalized training features with the current l1_penalty (1e8).
weights1e8 = lasso_cyclical_coordinate_descent(
    feature_matrix=normalized_mul_features_matrix,
    output=output,
    initial_weights=initial_weights,
    l1_penalty=l1_penalty,
    tolerance=tolerance,
)

In [29]:
# Display the weights learned with l1_penalty = 1e8 (first entry is the constant term).
weights1e8

array([71114625.71488702,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ,        0.        ,
              0.        ,        0.        ])

### What features had non-zero weight in this case?

In [51]:
# Names of the features whose learned weight is non-zero.
[name for name, weight in zip(mul_features_list, weights1e8) if weight != 0]

['constant']

### Lasso regression with multiple features
- l1_penalty = 1e4
- tolerance = 5e5

In [42]:
# Settings for the multi-feature LASSO fit with l1_penalty = 1e4.
output = 'price'
mul_features = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated',
]
l1_penalty, tolerance = 1e4, 5e5
# One weight per feature plus one for the constant column.
initial_weights = np.zeros(1 + len(mul_features))

In [43]:
# Training matrix (with constant column) and its column-normalized version.
# Note: `output` is rebound from the column name to the target array here.
mul_features_matirx, output = get_numpy_data(train_data=training,
                                             features=mul_features,
                                             output=output)
normalized_mul_features_matrix, norms = normalize_features(features=mul_features_matirx)

In [44]:
# Fit on the normalized training features with the current l1_penalty (1e4).
weights1e4 = lasso_cyclical_coordinate_descent(
    feature_matrix=normalized_mul_features_matrix,
    output=output,
    initial_weights=initial_weights,
    l1_penalty=l1_penalty,
    tolerance=tolerance,
)

In [47]:
# Display the weights learned with l1_penalty = 1e4 (first entry is the constant term).
weights1e4

array([ 78564738.34156762, -22097398.92430532,  12791071.87278517,
        93808088.09281193,  -2013172.75704954,  -4219184.93265014,
         6482842.81753506,   7127408.53480689,   5001664.8546964 ,
        14327518.43714051, -15770959.15237397,  -5159591.22213147,
       -84495341.7684364 ,   2824439.49703683])

### What features had non-zero weight in this case?

In [52]:
# Names of the features whose learned weight is non-zero.
[name for name, weight in zip(mul_features_list, weights1e4) if weight != 0]

['constant',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated']

### Rescaling learned weights

In [93]:
# Divide the learned weights by the training-feature norms so they can be
# applied to un-normalized features: (X / norms) . w == X . (w / norms).
normalized_weights1e7, normalized_weights1e8, normalized_weights1e4 = (
    learned / norms for learned in (weights1e7, weights1e8, weights1e4)
)

In [94]:
# Index 3 is 'sqft_living' (index 0 is the constant column, then 'bedrooms', 'bathrooms').
normalized_weights1e7[3]

161.31745764611762

### Evaluating the model on Testing data

In [98]:
# Un-normalized test matrix (with constant column) and the test prices.
# After this cell `output` holds the test target array.
test_features_matirx, output = get_numpy_data(train_data=testing,
                                              features=mul_features,
                                              output='price')

### Model with weights1e7

In [101]:
# Test-set residual sum of squares for the l1_penalty = 1e7 model.
predictions = predict_output(feature_matrix=test_features_matirx,
                             weights=normalized_weights1e7)
rss = np.square(output - predictions).sum()
rss

275962075920366.78

### Model with weights1e8

In [102]:
# Test-set residual sum of squares for the l1_penalty = 1e8 model.
predictions = predict_output(feature_matrix=test_features_matirx,
                             weights=normalized_weights1e8)
rss = np.square(output - predictions).sum()
rss

537166151497322.75

### Model with weights1e4

In [103]:
# Test-set residual sum of squares for the l1_penalty = 1e4 model.
predictions = predict_output(feature_matrix=test_features_matirx,
                             weights=normalized_weights1e4)
rss = np.square(output - predictions).sum()
rss

228459958971393.25