# Multiple Regression - R&I

In [1]:
import pandas as pd
import numpy as np

# Use split data - training and testing 

In [2]:
train_data = pd.read_csv('kc_house_train_data.csv')
test_data = pd.read_csv('kc_house_test_data.csv')

# Learning: Multiple regression model

We prepare a multiple regression model to predict 'price' based on the following features:
ML_features = ['sqft_living', 'bedrooms', 'bathrooms'] on training data:


In [3]:
from sklearn.linear_model import LinearRegression

In [4]:
ML_features = ['sqft_living', 'bedrooms', 'bathrooms']
ML_model = LinearRegression()
ML_model.fit(train_data[ML_features],train_data['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Extract the regression weights (coefficients):

In [5]:
intercept = ML_model.intercept_
ML_coef = ML_model.coef_

In [6]:
pd.DataFrame({'Name':['intercept']+ML_features,'Value':[intercept]+list(ML_coef)})

Unnamed: 0,Name,Value
0,intercept,87912.865815
1,sqft_living,315.406691
2,bedrooms,-65081.887116
3,bathrooms,6942.165986


# Our Predictions

We can use the .predict() function to find the predicted values for data we pass. 

In [7]:
ML_predictions = ML_model.predict(train_data[ML_features])
print ML_predictions[0] 

271789.26537996985


# Compute RSS

Calculate RSS given the model, data, and the outcome.

In [8]:
def get_residual_sum_of_squares(model, data, outcome,features=['sqft_living',
                                                               'bedrooms', 'bathrooms']):
    # First get the predictions
    predictions = model.predict(data[features])
    # Then compute the residuals/errors
    errors = predictions - outcome
    # Then square and add them up
    RSS = np.square(errors).sum()
    return(RSS)    

Computing the RSS on TEST data for the example model:

In [9]:
rss_ML_train = get_residual_sum_of_squares(ML_model, test_data, 
                                           test_data['price'])
print 'rss_ML_train: ' + str(rss_ML_train) # should be 2.7376153833e+14

rss_ML_train: 273761940583133.72


# Create some new features using log

We use logarithm function to create a new feature. 

In [10]:
from math import log

We now create 4 new features for both TEST and TRAIN data:
* bedrooms_squared = bedrooms\*bedrooms
* bed_bath_rooms = bedrooms\*bathrooms
* log_sqft_living = log(sqft_living)
* lat_plus_long = lat + long 


In [11]:
train_data['bedrooms_squared'] = train_data['bedrooms'].apply(lambda x: x**2)
test_data['bedrooms_squared'] = test_data['bedrooms'].apply(lambda x: x**2)

In [12]:
# create the remaining 3 features in both TEST and TRAIN data
train_data['bed_bath_rooms'] = train_data['bedrooms'] * train_data['bathrooms']
train_data['log_sqft_living'] = train_data['sqft_living'].apply(lambda x:log(x))
train_data['lat_plus_long'] = train_data['lat'] + train_data['long']
test_data['bed_bath_rooms'] = test_data['bedrooms'] * test_data['bathrooms']
test_data['log_sqft_living'] = test_data['sqft_living'].apply(lambda x:log(x))
test_data['lat_plus_long'] = test_data['lat'] + train_data['long']

We now calculate the mean value of the new 4 features on test data.

In [13]:
test_data[['bedrooms_squared','bed_bath_rooms',
           'log_sqft_living','lat_plus_long']].mean()

bedrooms_squared    12.446678
bed_bath_rooms       7.503902
log_sqft_living      7.550275
lat_plus_long      -74.653175
dtype: float64

# Multiple Models Learning

* Model 1: squarefeet, # bedrooms, # bathrooms, latitude & longitude
* Model 2: add bedrooms\*bathrooms
* Model 3: Add log squarefeet, bedrooms squared, and the (nonsensical) latitude + longitude

In [14]:
ML_1_features = ['sqft_living', 'bedrooms', 
                 'bathrooms', 'lat', 'long']
ML_2_features = ML_features + ['bed_bath_rooms']
ML_3_features = ML_features + ['bedrooms_squared', 
                               'log_sqft_living', 'lat_plus_long']

Now we study the value of the weights/coefficients of each model:

In [15]:
# Learn the three models: (don't forget to set validation_set = None)
ML_1 = LinearRegression()
ML_1.fit(train_data[ML_1_features],train_data['price'])
ML_2 = LinearRegression()
ML_2.fit(train_data[ML_2_features],train_data['price'])
ML_3 = LinearRegression()
ML_3.fit(train_data[ML_3_features],train_data['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
# Examine/extract each model's coefficients:
for model,features in zip([ML_1,ML_2,ML_3],[ML_1_features,ML_2_features,ML_3_features]):
    intercept = model.intercept_
    coef = model.coef_
    print pd.DataFrame({'Name':['intercepte']+features,'Value':[intercept]+list(coef)})

          Name         Value
0   intercepte -6.907573e+07
1  sqft_living  3.122586e+02
2     bedrooms -5.958653e+04
3    bathrooms  1.570674e+04
4          lat  6.586193e+05
5         long -3.093744e+05
             Name          Value
0      intercepte  326670.642748
1     sqft_living     308.527174
2        bedrooms -134778.680310
3       bathrooms -105451.484408
4  bed_bath_rooms   33095.519095
               Name         Value
0        intercepte  1.871039e+07
1       sqft_living  5.237447e+02
2          bedrooms -1.577311e+04
3         bathrooms  2.761289e+04
4  bedrooms_squared -2.784420e+03
5   log_sqft_living -5.637009e+05
6     lat_plus_long  2.006008e+05


# Multiple Models Comparison

In [17]:
# Find the RSS on TRAINING data for of each models:
for model,features in zip([ML_1,ML_2,ML_3],
                          [ML_1_features, ML_2_features,ML_3_features]):
    print get_residual_sum_of_squares(model,train_data,
                                      train_data['price'],features)

967879963049546.4
1147277494451542.5
1068771600564960.8


In [18]:
# Find the RSS on TESTING data for of each models:
for model,features in zip([ML_1,ML_2,ML_3],
                          [ML_1_features,ML_2_features,ML_3_features]):
    print get_residual_sum_of_squares(model,test_data,
                                      test_data['price'],features)

225500469795490.16
270145596835099.22
255230080596984.7
