# Simple Linear Regression - R&I 

In [1]:
import pandas as pd
import numpy as np

# Use split data - training and testing

In [2]:
train_data = pd.read_csv('kc_house_train_data.csv')
test_data = pd.read_csv('kc_house_test_data.csv')

In [3]:
sales = train_data.append(test_data)

In [4]:
sales.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580302000.0,540088.1,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,7.656873,1788.390691,291.509045,1971.005136,84.402258,98077.939805,47.560053,-122.213896,1986.552492,12768.455652
std,2876566000.0,367127.2,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,1.175459,828.090978,442.575043,29.373411,401.67924,53.505026,0.138564,0.140828,685.391304,27304.179631
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.5718,-122.23,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0


In [5]:
sales.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

# Summary functions - Compute Sum and Arithmetric Average (mean)

In [6]:
# Let's compute the mean of the House Prices in King County in 2 different ways.
kcprices = sales['price'] # extract the price column of the sales 

# recall that the arithmetic average (the mean) 
# is the sum of the prices divided by the total number of houses:
sum_kcprices = kcprices.sum()
num_kchouses = kcprices.size # when prices is an Series .size returns its length
avg_kcprice_1 = sum_kcprices/num_kchouses
avg_kcprice_2 = kcprices.mean() # if you just want the average, the .mean() function
print "average price via method 1: " + str(avg_kcprice_1)
print "average price via method 2: " + str(avg_kcprice_2)

average price via method 1: 540088.1417665294
average price via method 2: 540088.141767


In [7]:
# if we want to multiply every price by 0.5 it's a simple as:
half_kcprices = 0.5*kcprices

# Let's compute the sum of squares of price by multiply with (*)
kcprices_squared = kcprices*kcprices
sum_kcprices_squared = kcprices_squared.sum() # price_squared is the squares 
                                          # and we want to add them up.
print "the sum of price squared is: " + str(sum_kcprices_squared)

the sum of price squared is: 9217325138472070.0


# Simple linear regression function 

We can compute the slope and intercept for a simple linear regression on observations stored: input_feature, output.

Now computing the simple linear regression slope and intercept:

In [8]:
def simple_linear_regression(input_feature, output):
    # compute the sum of input_feature and output
    N = input_feature.size
    sum_x = input_feature.sum()
    sum_y = output.sum()
    # compute the product of the output and the input_feature and its sum
    sum_xy = np.dot(input_feature,output).sum()
    # compute the squared value of the input_feature and its sum
    sum_x2 = np.square(input_feature).sum()
    # use the formula for the slope
    slope = (sum_xy - (sum_y*sum_x)/float(N))/(sum_x2 - (sum_x*sum_x)/float(N))
    # use the formula for the intercept
    intercept = output.mean() - slope * input_feature.mean()
    return (intercept, slope)

If output = 1 + 1\*input_feature then we know both our slope and intercept should be 1

In [9]:
test_feature = pd.Series(range(5))
test_output = pd.Series(1 + 1*test_feature)
(test_intercept, test_slope) =  simple_linear_regression(test_feature, 
                                                         test_output)
print "Intercept: " + str(test_intercept)
print "Slope: " + str(test_slope)

Intercept: 1.0
Slope: 1.0


Regression model for predicting price based on sqft_living

In [10]:
sqft_intercept, sqft_slope = simple_linear_regression(train_data['sqft_living'], 
                                                      train_data['price'])

print "Intercept: " + str(sqft_intercept)
print "Slope: " + str(sqft_slope)

Intercept: -47116.07907289418
Slope: 281.9588396303426


# Predicting Values

Now that we have the model parameters: intercept & slope we can make predictions.

In [11]:
def get_regression_predictions(input_feature, intercept, slope):
    # calculate the predicted values:
    predicted_values = input_feature * slope + intercept
    return predicted_values

We now predicted price for a house with 3500 sqft

In [12]:
my_house_sqft = 3500
estimated_price = get_regression_predictions(my_house_sqft, 
                                             sqft_intercept, sqft_slope)
print "The estimated price for a house with %d squarefeet is $%.2f" % (my_house_sqft, 
                                                                       estimated_price)

The estimated price for a house with 3500 squarefeet is $939739.86


# Residual Sum of Squares (RSS)

We compute the RSS of a simple linear regression model given the input_feature, output, intercept and slope:

In [13]:
def get_residual_sum_of_squares(input_feature, output, 
                                intercept, slope):
    # First get the predictions
    predictions = get_regression_predictions(input_feature,
                                             intercept,slope)
    # then compute the residuals (since we are squaring it doesn't matter which order you subtract)
    residuals = predictions - output
    # square the residuals and add them up
    RSS = np.square(residuals).sum()
    return(RSS)

We will use the get_residual_sum_of_squares function using our test model where the data lie exactly on a line.

In [14]:
print get_residual_sum_of_squares(test_feature, test_output, 
                                  test_intercept, test_slope) # should be 0.0

0.0


We use the simple linear regression using squarefeet to predict prices on TRAINING data.

In [15]:
rss_kcprices_on_sqft = get_residual_sum_of_squares(train_data['sqft_living'], 
                                                   train_data['price'], 
                                                   sqft_intercept, sqft_slope)
print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_kcprices_on_sqft)

The RSS of predicting Prices based on Square Feet is : 1201918354177283.0


# Predict the sqft. given the price (using the inverse regression estimate)

In [16]:
def inverse_regression_predictions(output, intercept, slope):
    # solve output = intercept + slope*input_feature for input_feature. 
    # Use this equation to compute the inverse predictions:
    estimated_feature = (output - intercept)/slope
    return estimated_feature

Now we estimated square-feet for a house costing $5,500,000?

In [17]:
my_house_price = 5500000
estimated_sqft = inverse_regression_predictions(my_house_price, 
                                                sqft_intercept, 
                                                sqft_slope)
print "The estimated squarefeet for a house worth $%.2f is %d" % (my_house_price, 
                                                                  estimated_sqft)

The estimated squarefeet for a house worth $5500000.00 is 19673


# 2nd Model: Estimate prices from bedrooms (using the training data)

In [18]:
# Estimate the slope and intercept for predicting 'price' based on 'bedrooms'
bedrooms_intercept, bedrooms_slope = simple_linear_regression(train_data['bedrooms'], 
                                                              train_data['price'])

print "Intercept: " + str(sqft_intercept)
print "Slope: " + str(sqft_slope)

Intercept: -47116.07907289418
Slope: 281.9588396303426


# LR Algorithm Test for both models (sqft & bedrooms)

We will now use the RSS from predicting prices using bedrooms and from predicting prices using squarefeet.

In [19]:
# RSS when using bedrooms on TEST data:
RSS_prices_on_sqft = get_residual_sum_of_squares(test_data['sqft_living'], 
                                                 test_data['price'], 
                                                 sqft_intercept, sqft_slope)
print 'Predicted Prices based on Square Feet: ' + str(RSS_prices_on_sqft)

Predicted Prices based on Square Feet: 275402933617812.12


In [20]:
# RSS when using squarefeet on TEST data:
RSS_prices_on_bedrooms = get_residual_sum_of_squares(test_data['bedrooms'], 
                                                     test_data['price'], 
                                                     bedrooms_intercept, 
                                                     bedrooms_slope)
print 'Predicted Prices based on Bedrooms: ' + str(RSS_prices_on_bedrooms)

Predicted Prices based on Bedrooms: 493364585960300.9
