## In the first notebook we explored multiple regression using GraphLab Create. Now we will use SFrames along with numpy to solve for the regression weights with gradient descent.



## 1. Loading the data

In [2]:
import graphlab as gl

In [3]:
sales = gl.SFrame('../../week_1/data/kc_house_data.gl/')

This non-commercial license of GraphLab Create for academic use is assigned to igagandeep.s@gmail.com and will expire on March 25, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1491948244.log


## 2. import numpy

In [4]:
import numpy as np

## 3. Next write a function that takes a data set, a list of features (e.g. [‘sqft_living’, ‘bedrooms’]), to be used as inputs, and a name of the output (e.g. ‘price’). This function should return a features_matrix (2D array) consisting of first a column of ones followed by columns containing the values of the input features in the data set in the same order as the input list. It should also return an output_array which is an array of the values of the output in the data set (e.g. ‘price’)

In [5]:
def get_numpy_data(data_sframe, features, output):
    data_sframe['constant'] = 1 # add a constant column to an SFrame
    features = ['constant'] + features
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    features_sframe = data_sframe[features]
    print features_sframe
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = features_sframe.to_numpy()
    print features_matrix
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’
    output_sarray = data_sframe[output]
    # this will convert the SArray into a numpy array:
    output_array = output_sarray.to_numpy() # GraphLab Create>= 1.7!!
    return(features_matrix, output_array)

In [6]:
(example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price') # the [] around 'sqft_living' makes it a list
print example_features[0,:] # this accesses the first row of the data the ':' indicates 'all columns'
print example_output[0] # and the corresponding output

+----------+-------------+
| constant | sqft_living |
+----------+-------------+
|    1     |    1180.0   |
|    1     |    2570.0   |
|    1     |    770.0    |
|    1     |    1960.0   |
|    1     |    1680.0   |
|    1     |    5420.0   |
|    1     |    1715.0   |
|    1     |    1060.0   |
|    1     |    1780.0   |
|    1     |    1890.0   |
+----------+-------------+
[21613 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
[[  1.00000000e+00   1.18000000e+03]
 [  1.00000000e+00   2.57000000e+03]
 [  1.00000000e+00   7.70000000e+02]
 ..., 
 [  1.00000000e+00   1.02000000e+03]
 [  1.00000000e+00   1.60000000e+03]
 [  1.00000000e+00   1.02000000e+03]]
[  1.00000000e+00   1.18000000e+03]
221900.0


## 4.  Write a function ‘predict_output’ which accepts a 2D array ‘feature_matrix’ and a 1D array ‘weights’ and returns a 1D array ‘predictions’

In [7]:
def predict_outcome(feature_matrix, weights):
    predictions = np.dot(feature_matrix, weights)
    return(predictions)

## 5. Write a function that accepts a ‘feature’ array and ‘error’ array and returns the ‘derivative’ (a single number). 

In [8]:
def feature_derivative(errors, feature):
    derivative = 2 * np.dot(feature, errors)
    return(derivative)

## 6. Gradient descent function

In [12]:
from math import sqrt

def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = np.array(initial_weights)
    while not converged:
        # compute the predictions based on feature_matrix and weights:
        # compute the errors as predictions - output:
        predictions = predict_outcome(feature_matrix, weights)
        error = (predictions - output)
        
        gradient_sum_squares = 0 # initialize the gradient
        # while not converged, update each weight individually:
        for i in range(len(weights)):
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            derivative = feature_derivative(error, feature_matrix[:, i])
            # add the squared derivative to the gradient magnitude
            gradient_sum_squares = gradient_sum_squares + derivative**2
            # update the weight based on step size and derivative:
            weights[i] = weights[i] - step_size*derivative
        gradient_magnitude = sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)

## 7. Now split the sales data into training and test data. 

In [13]:
train_data,test_data = sales.random_split(.8,seed=0)

## 8. Run the regression_gradient_descent on actual data

In [31]:
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size, tolerance)


+----------+-------------+
| constant | sqft_living |
+----------+-------------+
|    1     |    1180.0   |
|    1     |    2570.0   |
|    1     |    770.0    |
|    1     |    1960.0   |
|    1     |    1680.0   |
|    1     |    5420.0   |
|    1     |    1715.0   |
|    1     |    1060.0   |
|    1     |    1780.0   |
|    1     |    1890.0   |
+----------+-------------+
[17384 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
[[  1.00000000e+00   1.18000000e+03]
 [  1.00000000e+00   2.57000000e+03]
 [  1.00000000e+00   7.70000000e+02]
 ..., 
 [  1.00000000e+00   1.53000000e+03]
 [  1.00000000e+00   1.60000000e+03]
 [  1.00000000e+00   1.02000000e+03]]

[-46999.88716555    281.91211912]


## 9. Quiz Question: What is the value of the weight for sqft_living -- the second element of ‘simple_weights’ (rounded to 1 decimal place)?

In [36]:
print "%.1f"%simple_weights[1]

281.9


## 10. On test data

In [39]:
(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)
test_predictions = predict_outcome(test_simple_feature_matrix, simple_weights)

+----------+-------------+
| constant | sqft_living |
+----------+-------------+
|    1     |    1430.0   |
|    1     |    2950.0   |
|    1     |    1710.0   |
|    1     |    2320.0   |
|    1     |    1090.0   |
|    1     |    2620.0   |
|    1     |    4220.0   |
|    1     |    2250.0   |
|    1     |    1260.0   |
|    1     |    2750.0   |
+----------+-------------+
[4229 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
[[  1.00000000e+00   1.43000000e+03]
 [  1.00000000e+00   2.95000000e+03]
 [  1.00000000e+00   1.71000000e+03]
 ..., 
 [  1.00000000e+00   2.52000000e+03]
 [  1.00000000e+00   2.31000000e+03]
 [  1.00000000e+00   1.02000000e+03]]


## 11. Quiz Question: What is the predicted price for the 1st house in the Test data set for model 1 (round to nearest dollar)?

In [40]:
print "%.1f" % test_predictions[0]

356134.4


## 12. Now compute RSS on all test data for this model. Record the value and store it for later.

RSS is the sum of the squared errors (difference between prediction and output).

In [42]:
test_error = test_output - test_predictions
test_RSS = (test_error**2).sum() 
print test_RSS

2.75400047593e+14


## 13. Now we will use the gradient descent to fit a model with more than 1 predictor variable (and an intercept).

In [45]:
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features,my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

model_weights = regression_gradient_descent(feature_matrix, output,initial_weights, step_size, tolerance)

print model_weights

+----------+-------------+---------------+
| constant | sqft_living | sqft_living15 |
+----------+-------------+---------------+
|    1     |    1180.0   |     1340.0    |
|    1     |    2570.0   |     1690.0    |
|    1     |    770.0    |     2720.0    |
|    1     |    1960.0   |     1360.0    |
|    1     |    1680.0   |     1800.0    |
|    1     |    5420.0   |     4760.0    |
|    1     |    1715.0   |     2238.0    |
|    1     |    1060.0   |     1650.0    |
|    1     |    1780.0   |     1780.0    |
|    1     |    1890.0   |     2390.0    |
+----------+-------------+---------------+
[17384 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
[[  1.00000000e+00   1.18000000e+03   1.34000000e+03]
 [  1.00000000e+00   2.57000000e+03   1.69000000e+03]
 [  1.00000000e+00   7.70000000e+02   2.72000000e+03]
 ..., 
 [  1.00000000e+00   1.53000000e+03   1.53000000e+03]
 [  1.00000000e+00   1

## 14. Use the regression weights from this second model (using sqft_living and sqft_living_15) and predict the outcome of all the house prices on the TEST data.

In [60]:
(test_model_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)
test_model_predictions = predict_outcome(test_model_feature_matrix, model_weights)

+----------+-------------+---------------+
| constant | sqft_living | sqft_living15 |
+----------+-------------+---------------+
|    1     |    1430.0   |     1780.0    |
|    1     |    2950.0   |     2140.0    |
|    1     |    1710.0   |     1030.0    |
|    1     |    2320.0   |     2580.0    |
|    1     |    1090.0   |     1570.0    |
|    1     |    2620.0   |     2620.0    |
|    1     |    4220.0   |     2410.0    |
|    1     |    2250.0   |     2250.0    |
|    1     |    1260.0   |     1290.0    |
|    1     |    2750.0   |     1510.0    |
+----------+-------------+---------------+
[4229 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
[[  1.00000000e+00   1.43000000e+03   1.78000000e+03]
 [  1.00000000e+00   2.95000000e+03   2.14000000e+03]
 [  1.00000000e+00   1.71000000e+03   1.03000000e+03]
 ..., 
 [  1.00000000e+00   2.52000000e+03   2.52000000e+03]
 [  1.00000000e+00   2.

## 15. Quiz Question: What is the predicted price for the 1st house in the TEST data set for model 2 (round to nearest dollar)?

In [58]:
print "%.1f" % test_model_predictions[0]

366651.4


## 16. What is the actual price for the 1st house in the Test data set?

In [59]:
test_data['price'][0]

310000.0

## 17. Quiz Question: Which estimate was closer to the true price for the 1st house on the TEST data set, model 1 or model 2?

In [56]:
print "model 1 -> %.1f" % test_predictions[0]
print "model 2 -> %.1f" % test_model_predictions[0]
print "Original - >", test_data['price'][0]

model 1 -> 356134.4
model 2 -> 366651.4
Original - > 310000.0


## 18. Now compute RSS on all test data for the second model. Record the value and store it for later.

In [61]:
test_model_error = test_output - test_model_predictions
test_model_RSS = (test_model_error**2).sum() 
print test_model_RSS

2.70263446465e+14


## 19. Quiz Question: Which model (1 or 2) has lowest RSS on all of the TEST data?

In [65]:
print "1->",test_RSS
print "2->",test_model_RSS

1-> 2.75400047593e+14
2-> 2.70263446465e+14
