In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column name -> dtype mapping handed to pd.read_csv, so every column is parsed
# with an explicit type instead of relying on inference. Grouped by dtype.
dtype_dict = {
    # identifier-like / text columns
    'id': str,
    'date': str,
    'zipcode': str,
    'floors': str,
    # floating-point columns
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # integer columns
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

In [3]:
# Read the training split first, then the test split, with the same explicit
# column dtypes applied to both files.
train_data, test_data = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

In [4]:
# Dimensions of the training set as (rows, columns).
train_data.shape

(17384, 21)

In [5]:
# Dimensions of the test set as (rows, columns).
test_data.shape

(4229, 21)

### Calculate slope and intercept

In [6]:
def simple_linear_regression(input_feature, output):
    """Fit ``output ~ intercept + slope * input_feature`` by ordinary least squares.

    Uses the closed-form solution for a single feature, computed from
    mean-centred values::

        slope     = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x)) ** 2)
        intercept = mean(y) - slope * mean(x)

    Centring first avoids the catastrophic cancellation of the algebraically
    equivalent raw-moment form ``sum(xy) - sum(x) * sum(y) / n`` when the
    values are large relative to their spread.

    Parameters
    ----------
    input_feature : array-like of numbers
        Values of the single predictor (list, tuple, ndarray, pandas Series, ...).
    output : array-like of numbers
        Observed target values; must have as many elements as ``input_feature``.
        Values are paired by position.

    Returns
    -------
    tuple
        ``(slope, intercept)`` of the fitted line.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or ``input_feature`` is
        constant (the slope is undefined without variation in the feature).
    """
    # Coerce to flat float arrays so plain Python sequences work as well as
    # numpy arrays / pandas Series.
    x = np.asarray(input_feature, dtype=float).ravel()
    y = np.asarray(output, dtype=float).ravel()

    if x.size == 0:
        raise ValueError("simple_linear_regression requires at least one observation")
    if x.size != y.size:
        raise ValueError(
            "input_feature and output must have the same length "
            "(got %d and %d)" % (x.size, y.size)
        )
    # Compare min and max rather than testing the centred sum of squares
    # against zero: rounding in the mean can leave a tiny non-zero residue
    # for constant input.
    if x.max() == x.min():
        raise ValueError("input_feature is constant; the slope is undefined")

    mean_x = x.mean()
    mean_y = y.mean()
    centred_x = x - mean_x

    slope = np.sum(centred_x * (y - mean_y)) / np.sum(centred_x * centred_x)
    intercept = mean_y - slope * mean_x

    return slope, intercept

### Define input feature and output

In [7]:
# Predictor is the 'sqft_living' column and target is the 'price' column,
# both taken from the training set.
input_feature = train_data['sqft_living']
output = train_data['price']

In [8]:
# Fit the single-feature model on the training data; `slope` and `intercept`
# are reused by the prediction and RSS cells further down.
slope, intercept = simple_linear_regression(input_feature, output)

### Predict the result

In [9]:
def get_regression_predictions(input_feature, intercept, slope):
    """Evaluate the fitted line ``intercept + slope * input_feature``.

    Parameters
    ----------
    input_feature : number or array-like supporting arithmetic
        Feature value(s) at which to predict.
    intercept : number
        Intercept of the fitted line.
    slope : number
        Slope of the fitted line.

    Returns
    -------
    Same kind of object as ``input_feature`` after the arithmetic is applied
    (a scalar in gives a scalar out, a Series/array gives element-wise results).
    """
    return intercept + slope * input_feature

In [10]:
# Fitted values for every row of the training feature.
# NOTE(review): this displays the whole Series; appending .head() would keep
# the cell output short.
get_regression_predictions(input_feature, intercept, slope)

0        2.855954e+05
1        6.775181e+05
2        1.699922e+05
3        5.055232e+05
4        4.265748e+05
5        1.481101e+06
6        4.364433e+05
7        2.517603e+05
8        4.547707e+05
9        4.857861e+05
10       9.566574e+05
11       2.799562e+05
12       3.391675e+05
13       4.632294e+05
14       4.857861e+05
15       4.040181e+05
16       2.912345e+05
17       3.053325e+05
18       4.096572e+05
19       8.128584e+05
20       5.929305e+05
21       2.545799e+05
22       6.436831e+05
23       6.436831e+05
24       3.476263e+05
25       3.814614e+05
26       6.775181e+05
27       2.884149e+05
28       6.098480e+05
29       5.337191e+05
             ...     
17354    4.265748e+05
17355    1.320384e+06
17356    2.179252e+05
17357    4.406727e+05
17358    7.367295e+05
17359    7.367295e+05
17360    3.081521e+05
17361    4.801470e+05
17362    3.560851e+05
17363    3.814614e+05
17364    2.940541e+05
17365    6.690594e+05
17366    1.337302e+06
17367    7.339099e+05
17368    2

### What is the predicted price for a house with 2650 sqft?

In [11]:
# Predicted price from the sqft_living model at sqft_living = 2650.
get_regression_predictions(2650, intercept, slope)

700074.8459475137

### Calculate Residual Sum of Squares (RSS)

In [13]:
def get_residual_sum_of_squares(input_feature, output, intercept, slope):
    """Return the residual sum of squares of a fitted line on the given data.

    Parameters
    ----------
    input_feature : number or array-like supporting arithmetic
        Feature value(s).
    output : number or array-like supporting arithmetic
        Observed target value(s) paired with ``input_feature``.
    intercept : number
        Intercept of the fitted line.
    slope : number
        Slope of the fitted line.

    Returns
    -------
    Sum over all observations of ``(output - (intercept + slope * input_feature)) ** 2``.
    """
    fitted = intercept + slope * input_feature
    residuals = output - fitted
    return np.sum(np.square(residuals))

### RSS of training data

In [14]:
# RSS of the sqft_living model evaluated on the same training data it was fit on.
get_residual_sum_of_squares(train_data['sqft_living'], train_data['price'], intercept, slope)

1201918354177283.0

### Inverse regression prediction

In [22]:
def inverse_regression_prediction(output, intercept, slope):
    """Solve ``output = intercept + slope * x`` for ``x``.

    Parameters
    ----------
    output : number or array-like supporting arithmetic
        Target value(s) for which the corresponding feature value is wanted.
    intercept : number
        Intercept of the fitted line.
    slope : number
        Slope of the fitted line (used as the divisor).

    Returns
    -------
    The feature value(s) ``(output - intercept) / slope``.
    """
    return (output - intercept) / slope

### What is the estimated square footage for a house costing $800,000?

In [23]:
# Invert the fitted sqft_living line: the sqft_living value at which the model
# predicts a price of 800,000.
inverse_regression_prediction(800000, intercept, slope)

3004.3962451522766

### Using number of bedrooms as input feature to predict the price

In [24]:
# Preview the first rows of the training set to see the available columns.
train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [26]:
# Fit a second single-feature model: price as a function of 'bedrooms', on the training set.
# NOTE: `bedroom_intercetp` is a misspelling of "bedroom_intercept"; the name is
# referenced by a later cell, so any rename must be applied in both places.
bedroom_slope, bedroom_intercetp = simple_linear_regression(train_data['bedrooms'], train_data['price'])

### Model 1 (input feature: sqft_living): RSS

In [36]:
# RSS of the sqft_living model on the test set; `intercept` and `slope` were
# fit on the training set.
model_1 = get_residual_sum_of_squares(test_data['sqft_living'], test_data['price'], intercept, slope)
model_1

275402933617812.12

### Model 2 (input feature: bedrooms): RSS

In [37]:
# RSS of the bedrooms model on the test set, using the coefficients fit on
# train_data['bedrooms'].
model_2 = get_residual_sum_of_squares(test_data['bedrooms'], test_data['price'], bedroom_intercetp, bedroom_slope)
model_2

493364585960301.0

In [39]:
# Report which model has the smaller test-set RSS. The comparison is strict,
# so equal values produce the Model 2 message.
print('Model 1 has lower RSS' if model_1 < model_2 else 'Model 2 has lower RSS')

Model 1 has lower RSS
