# Descriptions and Details:






In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Prepare Data

In [101]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [102]:
sales = pd.read_csv("data_files/kc_house_data.csv", dtype = dtype_dict)

In [103]:
def get_numpy_data(data, features, output):
    data['constant'] = 1 # add a constant column  
    features = ['constant'] + features # combine two lists

    feature_matrix = data[features].to_numpy()
    output_array = data[output].to_numpy()
    
    return(feature_matrix, output_array) # returns a 2D array and 1D array

## Compute predictions

In [82]:
def predict_output(feature_matrix, weights):
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

## Normalize features
In the house dataset, features vary wildly in their relative magnitude: `sqft_living` is very large overall compared to `bedrooms`, for instance. As a result, weight for `sqft_living` would be much smaller than weight for `bedrooms`. This is problematic because "small" weights are dropped first as `l1_penalty` goes up. 

To give equal considerations for all features, we need to **normalize features** as discussed in the lectures: **we divide each feature by its 2-norm** so that the transformed feature has norm 1.

Let's see how we can do this normalization easily with Numpy: let us first consider a small matrix.

In [83]:
X = np.array([[3.,5.,8.],[4.,12.,15.]])
# Numpy provides a shorthand for computing 2-norms of each column:
norms = np.linalg.norm(X, axis=0) # gives [norm(X[:,0]), norm(X[:,1]), norm(X[:,2])]
print (norms)
# apply element-wise division:
print (X / norms)

[ 5. 13. 17.]
[[0.6        0.38461538 0.47058824]
 [0.8        0.92307692 0.88235294]]


Using the shorthand we just covered, write a short function called `normalize_features(feature_matrix)`, which normalizes columns of a given feature matrix. The function should return a pair `(normalized_features, norms)`, where the second item contains the norms of original features. As discussed in the lectures, we will use these norms to normalize the test data in the same way as we normalized the training data. 

In [84]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis = 0)
    normalized_features = features/norms
    return normalized_features, norms

In [85]:
# test the function:
features, norms = normalize_features(np.array([[3.,6.,9.],[4.,8.,12.]]))
print (features)
# should print
# [[ 0.6  0.6  0.6]
#  [ 0.8  0.8  0.8]]
print (norms)
# should print
# [5.  10.  15.]

[[0.6 0.6 0.6]
 [0.8 0.8 0.8]]
[ 5. 10. 15.]


## Single Coordinate Descent Step

In [212]:
def lasso_coordinate_descent_step(j, feature_matrix, output, weights, l1_penalty):
    prediction = predict_output(feature_matrix, weights)
    feature_j = feature_matrix[:,j]
    # compute ro[i] = SUM[ [feature_i]*(output - prediction + weight[i]*[feature_i]) ]
    ro_j = np.dot(feature_j, output - (prediction - weights[j] * feature_j))   # extract feature j

    if j == 0: # intercept -- do not regularize
        new_weight_j = ro_j 
    elif ro_j < -l1_penalty/2.:
        new_weight_j = ro_j + l1_penalty/2
    elif ro_j > l1_penalty/2.:
        new_weight_j = ro_j - l1_penalty/2
    else:
        new_weight_j = 0.
    
    return new_weight_j

In [213]:
# test the function
import math
print (lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],[2./math.sqrt(13),3./math.sqrt(10)]]), 
                                   np.array([1., 1.]), np.array([1., 4.]), 0.1))
# should print 0.425558846691

0.4255588466910251


## Cyclical Coordinate Descent

In [202]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    converged = False # trigger the iteration
    weight_change_list = np.empty(len(initial_weights))
    weights_list = np.array(initial_weights)
    while not converged:
        for j in range(len(weights_list)): # iterate over coefficients
            old_weight = weights_list[j]
            weights_list[j] = lasso_coordinate_descent_step(j, feature_matrix, output, weights_list, l1_penalty)
            weight_change = abs(weights_list[j] - old_weight)
            weight_change_list[j] = weight_change
        max_change = np.max(weight_change_list)
        if max_change < tolerance:
            converged = True
    return weights_list

In [220]:
def lasso_cyclical_coordinate_descent_1(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    converged = False # trigger the iteration
    weight_change_list = np.empty(len(initial_weights))
    weights_list = np.zeros((100,len(initial_weights)))
    weights_list[0,:] = initial_weights
    iteration = 1
    print_frequency = 1
    while not converged:
        
        ### === code section for adjusting frequency of debugging output. ===
        if iteration == 10:
            print_frequency = 10
        if iteration == 100:
            print_frequency = 100
        if iteration%print_frequency==0:
            print('Iteration = ' , str(iteration))
        ### === end code section ===
        
        for j in range(len(weight_change_list)): # iterate over coefficients
            old_weight = weights_list[iteration-1][j] # use weights from previous round
            weights_list[iteration][j] = lasso_coordinate_descent_step(j, feature_matrix, output, weights_list[iteration-1,:], l1_penalty)
            weight_change = abs(weights_list[iteration][j] - old_weight)
            weight_change_list[j] = weight_change
        max_change = np.max(weight_change_list)
        if iteration%print_frequency==0:
            print('in this iteration', iteration,':', weights_list[iteration,:], '\n')
        
        iteration += 1 
        if max_change < tolerance or iteration >= 100:
            converged = True
    return weights_list, weight_change_list

In [205]:
a = np.zeros(3)
a

array([0., 0., 0.])

### Using cyclical coordinate descent 

In [166]:
(feature_matrix, output) = get_numpy_data(sales, ['sqft_living', 'bedrooms'], 'price')
(normalized_feature_matrix, norms) = normalize_features(feature_matrix)

In [167]:
initial_weights = np.zeros(3)
l1_penalty = 1e7
tolerance = 1.0

In [203]:
weights = lasso_cyclical_coordinate_descent(normalized_feature_matrix, output,
                                            initial_weights, l1_penalty, tolerance)
print(weights)

[21624997.9595191  63157247.20788956        0.        ]


In [221]:
weights,list = lasso_cyclical_coordinate_descent_1(normalized_feature_matrix, output,
                                            initial_weights, l1_penalty, tolerance)
list


Iteration =  1
in this iteration 1 : [79400304.63764462 82939472.68182784 75966703.4053849 ] 

Iteration =  2
in this iteration 2 : [-69701976.78367452 -51391433.00037618 -68851242.31261744] 

Iteration =  3
in this iteration 3 : [1.92783738e+08 2.11682985e+08 1.91660885e+08] 

Iteration =  4
in this iteration 4 : [-2.99001660e+08 -2.64304035e+08 -2.99657732e+08] 

Iteration =  5
in this iteration 5 : [6.10046173e+08 6.39276004e+08 6.13646515e+08] 

Iteration =  6
in this iteration 6 : [-1.09694367e+09 -1.04427573e+09 -1.10544941e+09] 

Iteration =  7
in this iteration 7 : [2.10032083e+09 2.12972039e+09 2.11897805e+09] 

Iteration =  8
in this iteration 8 : [-3.91149156e+09 -3.82827512e+09 -3.94871448e+09] 

Iteration =  9
in this iteration 9 : [7.38793658e+09 7.38787482e+09 7.45966347e+09] 

Iteration =  10
in this iteration 10 : [-1.38698942e+10 -1.37057980e+10 -1.40084766e+10] 

Iteration =  20
in this iteration 20 : [-7.72736584e+12 -7.67092671e+12 -7.80555629e+12] 

Iteration =  3

array([5.82525340e+34, 5.78277227e+34, 5.88419391e+34])

In [92]:
# compute RSS on normalized dataset
predictions = predict_output(normalized_feature_matrix, weights)
RSS = np.sum((output - predictions)**2)
RSS

1630492476715386.5

## Evaluating LASSO fit with more features

In [110]:
train_data = pd.read_csv("data_files/kc_house_train_data.csv", dtype = dtype_dict)
test_data = pd.read_csv("data_files/kc_house_test_data.csv", dtype = dtype_dict)

In [111]:
multiple_features = ['bedrooms',
                'bathrooms',
                'sqft_living',
                'sqft_lot',
                'floors',
                'waterfront', 
                'view', 
                'condition', 
                'grade',
                'sqft_above',
                'sqft_basement',
                'yr_built', 
                'yr_renovated']

In [112]:
(multiple_feature_matrix, output) = get_numpy_data(train_data, multiple_features, 'price')
(normalized_multiple_feature_matrix, multiple_norms) = normalize_features(multiple_feature_matrix) # normalize features

In [113]:
# l1_penalty: 1e7
initial_weights = np.zeros(len(multiple_features) + 1)
l1_penalty = 1e7
tolerance = 1.0
weights1e7 = lasso_cyclical_coordinate_descent(normalized_multiple_feature_matrix, output, initial_weights, l1_penalty, tolerance)
print (weights1e7)

[24429600.23440313        0.                0.         48389174.77154896
        0.                0.          3317511.21492165  7329961.81171425
        0.                0.                0.                0.
        0.                0.        ]


In [121]:
# l1_penalty: 1e8
initial_weights = np.zeros(len(multiple_features) + 1)
l1_penalty = 1e8
tolerance = 1.0
weights1e8 = lasso_cyclical_coordinate_descent(normalized_multiple_feature_matrix, output, initial_weights, l1_penalty, tolerance)
print (weights1e8)

[71114625.71488702        0.                0.                0.
        0.                0.                0.                0.
        0.                0.                0.                0.
        0.                0.        ]


In [122]:
# l1_penalty: 1e4, tolerance = 5e5
initial_weights = np.zeros(len(multiple_features) + 1)
l1_penalty = 1e4
tolerance = 1
weights1e4 = lasso_cyclical_coordinate_descent(normalized_multiple_feature_matrix, output, initial_weights, l1_penalty, tolerance)
print (weights1e4)

[ 7.87859491e+08 -1.96611439e+07  1.28893367e+07  5.20015124e+07
 -1.34448346e+06  4.40613056e+06  6.85843473e+06  4.53587116e+06
  8.66423048e+06  1.28374342e+08  0.00000000e+00  3.64776371e+05
 -8.98872855e+08  6.49411472e+05]


## Rescale learned weights

Recall that we normalized our feature matrix, before learning the weights.  To use these weights on a test set, we must normalize the test data in the same way.

**Alternatively, we can rescale the learned weights to include the normalization**, so we never have to worry about normalizing the test data: 

In this case, we must scale the resulting weights so that we can make predictions with *original* features:
 1. Store the norms of the original features to a vector called `norms`:
```
features, norms = normalize_features(features)
```
 2. Run Lasso on the normalized features and obtain a `weights` vector
 3. Compute the weights for the original features by performing element-wise division, i.e.
```
weights_normalized = weights / norms
```
Now, we can apply `weights_normalized` to the test data, without normalizing it!

In [123]:
normalized_weights1e4 = weights1e4 / multiple_norms
normalized_weights1e7 = weights1e7 / multiple_norms
normalized_weights1e8 = weights1e8 / multiple_norms
print (normalized_weights1e7[3])
#should return 161.31745624837794.

161.31745764611762


## Evaluate each of the learned model on the test data

In [124]:
(test_feature_matrix, test_output) = get_numpy_data(test_data, multiple_features, 'price')

In [125]:
prediction = predict_output(test_feature_matrix,normalized_weights1e4)
error = prediction - test_output
RSS = np.dot(error,error)
print (RSS)

194415789314446.62


In [126]:
prediction = predict_output(test_feature_matrix,normalized_weights1e7)
error = prediction - test_output
RSS = np.dot(error,error)
print (RSS)

275962075920366.78


In [127]:
prediction = predict_output(test_feature_matrix,normalized_weights1e8)
error = prediction - test_output
RSS = np.dot(error,error)
print (RSS)

537166151497322.7
