# Description and Details
1. Load the house sales data.
2. Add 4 new variables to both the training set and the test set.
3. Estimate models 1, 2, and 3 using three different sets of features.
4. Calculate the RSS (residual sum of squares) of the three models on both the training set and the test set.

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model

In [2]:
# Load the three house-sales CSVs (full, train, and test files), one DataFrame each.
sales, train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'kc_house_data.csv',
        'kc_house_train_data.csv',
        'kc_house_test_data.csv',
    )
)

In [3]:
# Show the dtype pandas inferred for every column of the full sales table.
sales.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

# Create some new features

In [4]:
def add_engineered_features(frame):
    """Return a copy of ``frame`` with four derived feature columns appended.

    New columns, in order:
      * ``bedrooms_squared`` -- ``bedrooms`` raised to the power 2.
      * ``bed_bath_rooms``   -- ``bedrooms`` multiplied by ``bathrooms``.
      * ``log_sqft_living``  -- natural logarithm of ``sqft_living``.
      * ``lat_plus_long``    -- ``lat`` plus ``long``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``bedrooms``, ``bathrooms``, ``sqft_living``,
        ``lat`` and ``long``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.

    Raises
    ------
    ValueError
        If any ``sqft_living`` value is zero or negative, for which the
        logarithm is undefined.
    """
    # np.log would silently produce -inf/NaN for non-positive input; fail fast
    # instead so a bad row cannot leak into the regression unnoticed.
    if (frame['sqft_living'] <= 0).any():
        raise ValueError('sqft_living must be strictly positive to take its log')

    # Vectorized column arithmetic replaces the per-element .apply(lambda ...) calls.
    return frame.assign(
        bedrooms_squared=frame['bedrooms'] ** 2,
        bed_bath_rooms=frame['bedrooms'] * frame['bathrooms'],
        log_sqft_living=np.log(frame['sqft_living']),
        lat_plus_long=frame['lat'] + frame['long'],
    )


# Apply the identical transformation to both splits so their columns stay in sync.
# Re-running this cell simply recomputes the same four columns.
train_data = add_engineered_features(train_data)
test_data = add_engineered_features(test_data)

# Learning Multiple Models

In [5]:
# Nested feature sets: each list extends the previous one with extra columns.
feature_1 = [
    'sqft_living',
    'bedrooms',
    'bathrooms',
    'lat',
    'long',
]
feature_2 = [*feature_1, 'bed_bath_rooms']
feature_3 = [*feature_2, 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

In [6]:
# Fit one independent LinearRegression per feature set, all against the
# training-set price. Each model gets its own fresh estimator instance.
model_1, model_2, model_3 = (
    linear_model.LinearRegression().fit(train_data[columns], train_data['price'])
    for columns in (feature_1, feature_2, feature_3)
)
# Keep `lm` bound to the most recently created estimator, as it was before.
lm = model_3

In [7]:
# Print each model's coefficient vector followed by its intercept, one model per line group.
for fitted_model in (model_1, model_2, model_3):
    print(fitted_model.coef_, fitted_model.intercept_)

[ 3.12258646e+02 -5.95865332e+04  1.57067421e+04  6.58619264e+05
 -3.09374351e+05] -69075726.79256983
[ 3.06610053e+02 -1.13446368e+05 -7.14613083e+04  6.54844630e+05
 -2.94298969e+05  2.55796520e+04] -66867968.8710789
[ 5.29422820e+02  3.45142296e+04  6.70607813e+04  5.34085611e+05
 -4.06750711e+05 -8.57050439e+03 -6.78858667e+03 -5.61831484e+05
  1.27334900e+05] -62036084.98609828


# Comparing multiple models

Compute the residual sum of squares (RSS) for each of the three models, first on the TRAINING data and then on the TEST data.

In [8]:
# In-sample predictions: each model scores the training rows using its own feature set.
pred_model_1, pred_model_2, pred_model_3 = (
    fitted.predict(train_data[columns])
    for fitted, columns in (
        (model_1, feature_1),
        (model_2, feature_2),
        (model_3, feature_3),
    )
)

In [9]:
# Out-of-sample predictions: each model scores the test rows using its own feature set.
pred_model_test_1, pred_model_test_2, pred_model_test_3 = (
    fitted.predict(test_data[columns])
    for fitted, columns in (
        (model_1, feature_1),
        (model_2, feature_2),
        (model_3, feature_3),
    )
)

In [10]:
# Training-set RSS per model: sum over rows of (actual price - predicted price)^2.
RSS_train_1 = train_data['price'].sub(pred_model_1).pow(2).sum()
RSS_train_2 = train_data['price'].sub(pred_model_2).pow(2).sum()
RSS_train_3 = train_data['price'].sub(pred_model_3).pow(2).sum()

# The newline lives at the start of the 2nd and 3rd labels, so print's default
# space separator lands just before each line break.
print('RSS_train_1 is:', RSS_train_1,
      '\nRSS_train_2 is:', RSS_train_2,
      '\nRSS_train_3 is:', RSS_train_3)

RSS_train_1 is: 967879963049545.8 
RSS_train_2 is: 958419635074069.8 
RSS_train_3 is: 903436455050479.5


In [11]:
# Test-set RSS per model: sum over rows of (actual price - predicted price)^2,
# using the predictions made on the held-out test data.
RSS_test_1 = test_data['price'].sub(pred_model_test_1).pow(2).sum()
RSS_test_2 = test_data['price'].sub(pred_model_test_2).pow(2).sum()
RSS_test_3 = test_data['price'].sub(pred_model_test_3).pow(2).sum()

# The newline lives at the start of the 2nd and 3rd labels (previously produced
# by implicit string-literal concatenation), so the printed text is unchanged.
print('RSS_test_1 is:', RSS_test_1,
      '\nRSS_test_2 is:', RSS_test_2,
      '\nRSS_test_3 is:', RSS_test_3)

RSS_test_1 is: 225500469795489.97 
RSS_test_2 is: 223377462976467.06 
RSS_test_3 is: 259236319207179.94
