In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [17]:
# Explicit dtype for each CSV column, handed to pd.read_csv when the data is loaded.
dtype_dict = {
    # columns read as text
    'id': str,
    'date': str,
    'zipcode': str,
    'floors': str,
    # columns read as integers
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'sqft_above': int,
    'sqft_basement': int,
    'sqft_lot': int,
    'yr_built': int,
    'yr_renovated': int,
    # columns read as floats
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
}

In [18]:
# Read the training file first, then the test file, both with the same column dtypes.
train_data, test_data = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in ("kc_house_train_data.csv", "kc_house_test_data.csv")
)

In [19]:
# Preview the first rows of the training data to check that the columns loaded as expected.
train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [26]:
# Inspect what kind of object a single DataFrame column is.
type(train_data["sqft_lot15"])

pandas.core.series.Series

In [27]:
# Number of elements in that column.
train_data["sqft_lot15"].size

17384

# Build a generic simple linear regression function 

In [28]:
def simple_linear_regression(input_feature, output):
    """Fit ``output ~ intercept + slope * input_feature`` by ordinary least squares.

    The closed-form solution is evaluated from four sums: sum(x), sum(y),
    sum(x*y) and sum(x*x).

    Parameters
    ----------
    input_feature : array-like of numbers
        The single explanatory variable (pandas Series, NumPy array, list, ...).
    output : array-like of numbers
        The target values, paired with ``input_feature`` by position.

    Returns
    -------
    tuple
        ``(intercept, slope)`` of the fitted line.

    Raises
    ------
    ValueError
        If ``input_feature`` and ``output`` do not have the same shape.

    Notes
    -----
    When every input value is identical (or the input is empty) the slope
    denominator is zero and the returned values are not meaningful.
    """
    # Convert to float arrays up front: integer-typed columns would otherwise be
    # floor-divided under Python 2 and could wrap around int64 in the products.
    x = np.asarray(input_feature, dtype=float)
    y = np.asarray(output, dtype=float)
    if x.shape != y.shape:
        raise ValueError("input_feature and output must have the same shape")

    num_data = x.size

    # the four sums the closed-form formulas are built from
    sum_x = x.sum()
    sum_y = y.sum()
    sum_xy = (x * y).sum()
    sum_xx = (x * x).sum()

    # slope = (sum(xy) - sum(x)*sum(y)/N) / (sum(x^2) - sum(x)^2/N)
    slope = (sum_xy - (sum_x * sum_y) / num_data) / (sum_xx - sum_x ** 2 / num_data)

    # intercept = mean(y) - slope * mean(x)
    intercept = (sum_y - slope * sum_x) / num_data

    return (intercept, slope)

In [29]:
# Fit price against the 'sqft_living' column on the training data.
sqft_intercept, sqft_slope = simple_linear_regression(train_data['sqft_living'], train_data['price'])

# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("Intercept: " + str(sqft_intercept))
print("Slope: " + str(sqft_slope))

Intercept: -47116.0790729
Slope: 281.95883963


# Predicting Values

In [31]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line ``intercept + slope * input_feature``.

    ``input_feature`` may be a single number or anything supporting
    element-wise arithmetic (e.g. a pandas Series); the result has the
    same form as the input.
    """
    return input_feature * slope + intercept

Now that we can calculate a prediction given the slope and intercept, let's make a prediction. Use (or alter) the following to find out the estimated price for a house with 2650 square feet according to the square-feet model we estimated above.

**Quiz Question: Using your Slope and Intercept from (4), What is the predicted price for a house with 2650 sqft?**

In [32]:
# Predict the price of a 2650 sqft house with the sqft_living model fitted above.
my_house_sqft = 2650
estimated_price = get_regression_predictions(my_house_sqft, sqft_intercept, sqft_slope)
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("The estimated price for a house with %d squarefeet is $%.2f" % (my_house_sqft, estimated_price))

The estimated price for a house with 2650 squarefeet is $700074.85


# Residual Sum of Squares

In [33]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares of a fitted line.

    The line ``intercept + slope * input_feature`` is evaluated for every
    observation, the observed ``output`` is subtracted, and the squared
    differences are summed.
    """
    # value of the fitted line at each observation
    fitted = input_feature * slope + intercept

    # the sign of the difference is irrelevant because it is squared next
    errors = fitted - output

    return (errors * errors).sum()

In [34]:
# Training-set RSS of the sqft_living model.
rss_prices_on_sqft = get_residual_sum_of_squares(train_data['sqft_living'], train_data['price'], sqft_intercept, sqft_slope)
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print('The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft))

The RSS of predicting Prices based on Square Feet is : 1.20191835418e+15


# Predict the squarefeet given price

In [35]:
def inverse_regression_predictions(output, intercept, slope):
    """Invert the fitted line: return the input that yields ``output``.

    Solves ``output = intercept + slope * input_feature`` for
    ``input_feature``.

    Parameters
    ----------
    output : number or array-like supporting element-wise arithmetic
        Target value(s) to map back to the input feature.
    intercept, slope : numbers
        Parameters of the fitted line; ``slope`` must be a non-zero scalar.

    Returns
    -------
    The estimated input feature value(s), in the same form as ``output``.
    """
    # float() forces true division, so all-integer arguments are not
    # floor-divided when the notebook runs under Python 2.
    return (output - intercept) / float(slope)

In [36]:
# Estimate the square footage of an $800,000 house by inverting the sqft_living model.
my_house_price = 800000
estimated_squarefeet = inverse_regression_predictions(my_house_price, sqft_intercept, sqft_slope)
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("The estimated squarefeet for a house worth $%.2f is %d" % (my_house_price, estimated_squarefeet))

The estimated squarefeet for a house worth $800000.00 is 3004


# New Model: estimate prices from bedrooms

In [38]:
# Estimate the slope and intercept for predicting 'price' based on 'bedrooms'
nRooms_intercept, nRooms_slope = simple_linear_regression(train_data['bedrooms'], train_data['price'])

# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("Intercept: " + str(nRooms_intercept))
print("Slope: " + str(nRooms_slope))


Intercept: 109473.177623
Slope: 127588.952934


# Test your Linear Regression Algorithm

In [39]:
# Compute RSS when using bedrooms on TEST data:
testRSS_bedroom = get_residual_sum_of_squares(test_data['bedrooms'], test_data['price'], nRooms_intercept, nRooms_slope)
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("The RSS on test data using bedrooms as input feature is "+ str(testRSS_bedroom))

The RSS on test data using bedrooms as input feature is 4.9336458596e+14


In [40]:
# Compute RSS when using squarefeet on TEST data:
testRSS_sqft = get_residual_sum_of_squares(test_data['sqft_living'], test_data['price'], sqft_intercept, sqft_slope)
# The message names the feature actually used here (sqft_living), not bedrooms.
# Parenthesised single-argument print runs unchanged on Python 2 and Python 3.
print("The RSS on test data using squarefeet as input feature is "+ str(testRSS_sqft))

The RSS on test data using bedrooms as input feature is 2.75402933618e+14


In [43]:
# Scratch check: build a float32 Series holding the value 1 at index labels 0..3.
# NOTE(review): appears unrelated to the regression steps above - confirm whether this cell is still needed.
pd.Series(1,index=list(range(4)),dtype='float32')

0    1
1    1
2    1
3    1
dtype: float32