In [1]:
import pandas as pd

import numpy as np

from sklearn.linear_model import LinearRegression

In [2]:
# Load the pre-split house-sales data from CSV files in the working directory.
# The training frame is used to fit weights; the test frame is only used for
# predictions and the residual-sum-of-squares comparison further down.
train_data = pd.read_csv('kc_house_train_data.csv')

test_data = pd.read_csv('kc_house_test_data.csv')

In [4]:
def get_numpy_data(data, features, output):
    """Build the design matrix (with an intercept column) and the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It is NOT modified: the intercept column is added to a
        copy, so re-running a cell never changes the caller's frame.
    features : sequence of str
        Names of the feature columns to select (list, tuple, ...).
    output : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(feature_frame, target)`` where ``feature_frame`` holds a leading
        ``'constant'`` column of ones followed by ``features`` in the given
        order, and ``target`` is ``data[output]``.
    """
    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns (the previous version wrote 'constant' into `data`).
    with_intercept = data.assign(constant=1)
    feature_frame = with_intercept[['constant'] + list(features)]
    return (feature_frame, data[output])

In [6]:
def predict_outcome(features, weights):
    """Return the linear-model predictions ``features . weights``.

    features : N x f matrix (one row per observation)
    weights  : f coefficients, one per feature column
    """
    predictions = np.dot(features, weights)
    return predictions

In [8]:
def derivative(features, weights, target):
    """Gradient of the residual sum of squares with respect to the weights.

    Computes ``-2 * features.T . (target - features . weights)``.

    features : N x f matrix (must expose ``.T``)
    weights  : f coefficients
    target   : N observed values
    """
    residuals = target - predict_outcome(features, weights)
    gradient = -2 * np.dot(features.T, residuals)
    return gradient

In [30]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance,
                                max_iterations=None):
    """Fit linear-regression weights by batch gradient descent.

    Starting from ``initial_weights``, repeatedly steps against the gradient
    returned by ``derivative`` until the gradient's Euclidean norm drops
    below ``tolerance``.

    Parameters
    ----------
    feature_matrix : N x f matrix of features.
    output : N observed target values.
    initial_weights : array-like of f starting weights (copied, not modified).
    step_size : float multiplier applied to the gradient at each update.
    tolerance : float; iteration stops once the gradient norm is below it.
    max_iterations : int or None, optional
        Upper bound on the number of weight updates. ``None`` (the default)
        keeps the original behaviour of iterating until convergence.

    Returns
    -------
    numpy.ndarray
        The weights at the point where iteration stopped.

    Raises
    ------
    FloatingPointError
        If the gradient norm becomes NaN or infinite. Without this check a
        diverging run (step size too large) would loop forever, because
        ``nan < tolerance`` is always False.
    """
    weights = np.array(initial_weights)
    iterations = 0

    while max_iterations is None or iterations < max_iterations:
        d_weights = derivative(feature_matrix, weights, output)

        gradient_magnitude = np.sqrt(np.sum(d_weights ** 2))

        if not np.isfinite(gradient_magnitude):
            raise FloatingPointError(
                'gradient descent diverged after %d updates '
                '(gradient magnitude is %r); try a smaller step_size'
                % (iterations, gradient_magnitude)
            )

        if gradient_magnitude < tolerance:
            break

        weights = weights - step_size * d_weights
        iterations += 1

    return weights

In [34]:
def get_model_weights(data, features, output, initial_weights, step_size, tolerance):
    """Extract features/target from ``data`` and fit weights by gradient descent.

    ``features`` and ``output`` are column names handed to ``get_numpy_data``;
    the remaining arguments are forwarded unchanged to
    ``regression_gradient_descent``, whose result is returned.
    """
    feature_matrix, target = get_numpy_data(data, features, output)
    fitted_weights = regression_gradient_descent(
        feature_matrix,
        target,
        initial_weights,
        step_size,
        tolerance,
    )
    return fitted_weights

In [35]:
# Name of the target column; passed as the `output` argument to the helpers.
output = 'price'

In [37]:
# Model 1: intercept plus a single feature.
simple_features = ['sqft_living']

# Hyperparameters are passed by keyword so each number's role is explicit.
weights1 = get_model_weights(
    train_data,
    simple_features,
    output,
    initial_weights=np.array([-47000., 1.]),
    step_size=7e-12,
    tolerance=2.5e7,
)

array([-46999.88716085,    281.91224733])

In [47]:
# Only the feature matrix is needed here. Index the returned tuple instead of
# unpacking the target into `_`: assigning to `_` rebinds IPython's
# last-output variable, which then stops tracking cell outputs.
test_simple_feature_matrix = get_numpy_data(test_data, simple_features, output)[0]

# Predictions of the single-feature model on the test set.
test_predictions1 = predict_outcome(test_simple_feature_matrix, weights1)

In [49]:
# Display the single-feature model's prediction for the first test row.
test_predictions1[0]

356134.62652028602

In [53]:
# Model 2: intercept plus two features.
model_features = ['sqft_living', 'sqft_living15']

# Hyperparameters are passed by keyword so each number's role is explicit.
weights2 = get_model_weights(
    train_data,
    model_features,
    output,
    initial_weights=np.array([-100000., 1., 1.]),
    step_size=4e-12,
    tolerance=1e9,
)

[ -9.99999688e+04   2.45069901e+02   6.52824617e+01]


In [51]:
# Only the feature matrix is needed here. Index the returned tuple instead of
# unpacking the target into `_`: assigning to `_` rebinds IPython's
# last-output variable, which then stops tracking cell outputs.
test_model_feature_matrix = get_numpy_data(test_data, model_features, output)[0]

# Predictions of the two-feature model on the test set.
test_predictions2 = predict_outcome(test_model_feature_matrix, weights2)

In [52]:
# Display the two-feature model's prediction for the first test row.
# NOTE(review): this cell's execution count (In [52]) is lower than that of
# the cell that assigns weights2 (In [53]), so the recorded value may come
# from an earlier weights2 -- confirm with Restart Kernel -> Run All.
test_predictions2[0]

366652.77081578644

In [61]:
# Observed prices for the test set, used as the reference when scoring.
real = test_data['price']

In [62]:
# Display the observed price at label 0 of the test set, to compare with the
# two model predictions shown earlier.
real[0]

310000.0

In [63]:
def get_residual_sum_of_squares(predictions, real):
    """Return the residual sum of squares: sum over (real - predictions)**2.

    predictions : predicted values
    real        : observed values, aligned element-wise with ``predictions``
    """
    residuals = real - predictions
    return np.sum(np.square(residuals))

In [64]:
# Residual sum of squares on the test set for each model, in order:
# first the single-feature model, then the two-feature model (lower is better).
print(get_residual_sum_of_squares(test_predictions1, real))

print(get_residual_sum_of_squares(test_predictions2, real))

2.75400042408e+14
2.70263313689e+14
