In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

# Reading CSV and splitting into training and test data

In [268]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [269]:
print(train_data)

               id             date    price  bedrooms  bathrooms  sqft_living  \
0      7129300520  20141013T000000   221900         3       1.00         1180   
1      6414100192  20141209T000000   538000         3       2.25         2570   
2      5631500400  20150225T000000   180000         2       1.00          770   
3      2487200875  20141209T000000   604000         4       3.00         1960   
4      1954400510  20150218T000000   510000         3       2.00         1680   
5      7237550310  20140512T000000  1225000         4       4.50         5420   
6      1321400060  20140627T000000   257500         3       2.25         1715   
7      2008000270  20150115T000000   291850         3       1.50         1060   
8      2414600126  20150415T000000   229500         3       1.00         1780   
9      3793500160  20150312T000000   323000         3       2.50         1890   
10     1736800520  20150403T000000   662500         3       2.50         3560   
11     9212900260  20140527T

In [270]:
train_data['price'].mean()

539366.6279337321

In [5]:
train_data['sqft_living'].mean()

2080.0295098941556

In [6]:
train_data['sqft_living'].var()

849403.4935202107

# Learning multiple regression model

In [8]:
from sklearn import linear_model

In [53]:
regr = linear_model.LinearRegression(fit_intercept=False)

In [169]:
X = np.ones((len(train_data), 4))
X[:,1:] = train_data[['sqft_living', 'bedrooms', 'bathrooms']].values
y = train_data['price']

In [131]:
X

array([[  1.00000000e+00,   1.18000000e+03,   3.00000000e+00,
          1.00000000e+00],
       [  1.00000000e+00,   2.57000000e+03,   3.00000000e+00,
          2.25000000e+00],
       [  1.00000000e+00,   7.70000000e+02,   2.00000000e+00,
          1.00000000e+00],
       ..., 
       [  1.00000000e+00,   1.53000000e+03,   3.00000000e+00,
          2.50000000e+00],
       [  1.00000000e+00,   1.60000000e+03,   3.00000000e+00,
          2.50000000e+00],
       [  1.00000000e+00,   1.02000000e+03,   2.00000000e+00,
          7.50000000e-01]])

In [170]:
regr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [171]:
regr.coef_

array([ 87912.86581493,    315.40669062, -65081.88711588,   6942.16598637])

# Adding new features t odataset

In [279]:
def add_new_features(dataset):
    dataset['bedrooms_squared'] = dataset['bedrooms'] * dataset['bedrooms']
    dataset['bed_bath_rooms'] = dataset['bedrooms'] * dataset['bathrooms']
    dataset['log_sqft_living'] = np.log(dataset['sqft_living'])
    dataset['lat_plus_long'] = dataset['lat'] + dataset['long']

In [280]:
add_new_features(train_data)
add_new_features(test_data)

In [281]:
train_data.columns.values

array(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
       'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15',
       'bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living',
       'lat_plus_long'], dtype=object)

# Computing mean for newly added features

In [282]:
np.mean(test_data['bedrooms_squared'])

12.4466777015843

In [283]:
np.mean(train_data['bedrooms_squared'])

12.174240681086056

In [284]:
np.mean(test_data['bed_bath_rooms'])

7.5039016315913925

In [285]:
np.mean(test_data['log_sqft_living'])

7.550274679645921

In [286]:
np.mean(test_data['lat_plus_long'])

-74.65333355403185

# Evaluating the models with new features

In [287]:
feature_set1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
feature_set2 = feature_set1 + ['bed_bath_rooms']
feature_set3 = feature_set2 + ['bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

In [288]:
def train_linear_regression(dataset, feature_set):
    y = dataset['price']
    X = np.ones((len(dataset), len(feature_set) + 1))
    X[:,1:] = dataset[feature_set]
    regr = linear_model.LinearRegression(fit_intercept=False)
    regr.fit(X, y)
    return regr

In [289]:
model1 = train_linear_regression(train_data, feature_set1)

In [290]:
model2 = train_linear_regression(train_data, feature_set2)

In [291]:
model3 = train_linear_regression(train_data, feature_set3)

# What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 1?

In [292]:
model1.coef_[3]

15706.742082734683

What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 2?

In [293]:
model2.coef_[3]

-71461.308292758564

# Estimating RSS for Linear Regression Models

In [294]:
def predict(model, dataset, feature_set):
    X = np.ones((len(dataset), len(feature_set) + 1))
    X[:,1:] = dataset[feature_set]
    return model.predict(X)

In [295]:
def compute_rss(model, dataset, feature_set):
    y_hat = predict(model, dataset, feature_set)
    y = dataset['price']
    return (y - y_hat).T.dot(y - y_hat)

## For train data

In [300]:
train_rss = [compute_rss(model1, train_data, feature_set1), compute_rss(model2, train_data, feature_set2), compute_rss(model3, train_data, feature_set3)]
train_rss

[967879963049545.38, 958419635074069.5, 903436455050478.25]

In [304]:
train_rss.index(min(train_rss))

2

## For test data

In [301]:
test_rss = [compute_rss(model1, test_data, feature_set1), compute_rss(model2, test_data, feature_set2), compute_rss(model3, test_data, feature_set3)]
test_rss

[225500469795490.34, 223377462976467.63, 259236319207171.28]

In [299]:
test_rss.index(min(test_rss))

1

In [303]:
np.array(test_rss) - np.array(train_rss)

array([ -7.42379493e+14,  -7.35042172e+14,  -6.44200136e+14])