In [1]:
import pandas as pd
import numpy as np
dtype_dict = {'bathrooms':float, 
              'waterfront':int, 
              'sqft_above':int, 
              'sqft_living15':float,
              'grade':int,
              'yr_renovated':int,
              'price':float, 
              'bedrooms':float, 
              'zipcode':str, 
              'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 
              'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int,
              'id':str, 'sqft_lot':int, 'view':int}
train = pd.read_csv("kc_house_train_data.csv", dtype=dtype_dict)
test = pd.read_csv("kc_house_test_data.csv", dtype=dtype_dict)

In [3]:
train['bedrooms_squared'] = train.bedrooms**2
train['bed_bath_rooms'] = train.bedrooms * train.bathrooms
train['log_sqft_living'] = np.log(train.sqft_living.values)
train['lat_plus_long'] = train.lat + train.long

In [7]:
test['bedrooms_squared'] = test.bedrooms**2
test['bed_bath_rooms'] = test.bedrooms * test.bathrooms
test['log_sqft_living'] = np.log(test.sqft_living.values)
test['lat_plus_long'] = test.lat + test.long

In [11]:
print test.bed_bath_rooms.mean()
print test.log_sqft_living.mean()
print test.lat_plus_long.mean()


7.50390163159
7.55027467965
-74.653333554


In [25]:
from sklearn import linear_model
# Create linear regression object
regr1 = linear_model.LinearRegression()
regr2 = linear_model.LinearRegression()
regr3 = linear_model.LinearRegression()
model1 = regr1.fit(train[["sqft_living", "bedrooms", "bathrooms",
                         "lat", "long"]].as_matrix(), 
                  train.price.values.reshape(len(train.index), 1))
model2 = regr2.fit(train[["sqft_living", "bedrooms", "bathrooms",
                         "lat", "long", "bed_bath_rooms"]].as_matrix(), 
                  train.price.values.reshape(len(train.index), 1))
model3 = regr3.fit(train[["sqft_living", "bedrooms", "bathrooms",
                         "lat", "long", "bed_bath_rooms", "bedrooms_squared", 
                         "log_sqft_living", "lat_plus_long"]].as_matrix(), 
                  train.price.values.reshape(len(train.index), 1))

In [43]:
#pred_train = regr.predict(train.sqft_living.values.reshape(len(train.index),1)).transpose()[0]
pred_test = regr3.predict(test[["sqft_living", "bedrooms", "bathrooms",
                         "lat", "long", "bed_bath_rooms", "bedrooms_squared", 
                         "log_sqft_living", "lat_plus_long"]]).transpose()[0]


In [44]:
# The mean square error
print("Residual sum of squares: %.2f"
      % np.mean((pred_test - test.price.values) ** 2))

Residual sum of squares: 61299673494.25


In [52]:
def get_numpy_data(data, features, output):
    data['constant'] = 1 # add a constant column to an SFrame
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the ‘features’ list into the SFrame ‘features_sframe’
    features_data = data[features]
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = features_data.as_matrix()
    # assign the column of data_sframe associated with the target to the variable ‘output_sarray’

    # this will convert the SArray into a numpy array:
    output_array = data[output].values # GraphLab Create>= 1.7!!
    return(features_matrix, output_array)

In [62]:
def predict_outcome(feature_matrix, weights):
    predictions = feature_matrix.dot(weights)
    return(predictions)

In [63]:
def feature_derivative(errors, feature):
    derivative = np.vdot(feature, errors)
    return(derivative)

In [70]:
def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):
    converged = False
    weights = np.array(initial_weights)
    while not converged:
        # compute the predictions based on feature_matrix and weights:
        predictions = predict_outcome(feature_matrix, weights)
        # compute the errors as predictions - output:
        errors = predictions - output
        gradient_sum_squares = 0 # initialize the gradient
        # while not converged, update each weight individually:
        for i in range(len(weights)):
            # Recall that feature_matrix[:, i] is the feature column associated with weights[i]
            # compute the derivative for weight[i]:
            derivative = feature_derivative(errors, feature_matrix[:,i])
            # add the squared derivative to the gradient magnitude
            gradient_sum_squares += derivative**2
            # update the weight based on step size and derivative:
            weights = weights - step_size*derivative
        gradient_magnitude = np.sqrt(gradient_sum_squares)
        if gradient_magnitude < tolerance:
            converged = True
    return(weights)

In [71]:
simple_features = ['sqft_living']
my_output= 'price'
(simple_feature_matrix, output) = get_numpy_data(train, simple_features, my_output)
initial_weights = np.array([-47000., 1.])
step_size = 7e-12
tolerance = 2.5e7

In [72]:
simple_weights = regression_gradient_descent(simple_feature_matrix,
                                             output,
                                             initial_weights,
                                             step_size,      
                                             tolerance)

In [73]:
simple_weights

array([-46719.20075876,    281.79924124])