In [8]:
import pandas as pd
import numpy as np

# Load train and test dataset

In [339]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [340]:
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

# Define functions for finding optimums

In [341]:
def get_numpy_data(dataset, features, output_name):
    dataset['constant'] = 1
    features = ['constant'] + features
    return (dataset[features], dataset[output_name])

In [342]:
def predict_outcome(feature_matrix, weights):
    return feature_matrix.dot(weights)

In [428]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    print(type(initial_weights))
    converged = False
    weights = np.array(initial_weights)
    steps = 0
    while not converged:
        steps += 1
        gradient_sum_squares = 0
        old_weights = weights
        delta = (output.values - feature_matrix.values.dot(old_weights)).T.dot(feature_matrix.values)
        weights = old_weights + step_size * delta
        gradient_sum_squares = delta.T.dot(delta)
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance or steps > 1000:
            converged = True
    return weights

In [429]:
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7
weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)


<class 'numpy.ndarray'>


In [432]:
weights

array([-46999.88714469,    281.91205132])

# Computing simple feature predictions for test data

In [433]:
(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)
test_weights = regression_gradient_descent(test_simple_feature_matrix, test_output, initial_weights, step_size, tolerance)

<class 'numpy.ndarray'>


In [434]:
type(test_weights)

numpy.ndarray

In [437]:
test_data['prediction'] = test_simple_feature_matrix.dot(test_weights)

In [463]:
model1_prediction = test_data.iloc[[0]]['prediction']
model1_prediction

0    356773.001222
Name: prediction, dtype: float64

# Coputing RSS for test data

In [447]:
RSS = sum((test_data['price'] - test_data['prediction'])**2)

In [448]:
RSS

275395691312818.44

# Build models with more then one predictor variable

In [449]:
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features,my_output)
initial_weights = np.array([-100000., 1., 1.])
step_size = 4e-12
tolerance = 1e9

In [450]:
weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

<class 'numpy.ndarray'>


In [453]:
test_data['prediction_2'] = feature_matrix.dot(weights)

In [462]:
model2_prediction = test_data.iloc[[0]]['prediction_2']
model2_prediction

0    276691.029244
Name: prediction_2, dtype: float64

## Actual price for first house:

In [464]:
actual_price = test_data.iloc[[0]]['price']

In [466]:
actual_price-model1_prediction

0   -46773.001222
dtype: float64

In [468]:
actual_price - model2_prediction

0    33308.970756
dtype: float64

# Computing RSS for second model

In [458]:
RSS_2 = sum((test_data['price'] - test_data['prediction_2'])**2)

In [459]:
RSS_2

836658052291629.0

In [469]:
RSS < RSS_2

True