# Multiple Linear Regression with scikit-learn

## Import Modules

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from math import log

## Define Column Data Types

In [2]:
# Explicit dtype for every CSV column, handed to pd.read_csv so that pandas
# does not have to infer the column types on its own.
dtype_dict = {
    "bathrooms": float,
    "waterfront": int,
    "sqft_above": int,
    "sqft_living15": float,
    "grade": int,
    "yr_renovated": int,
    "price": float,
    "bedrooms": float,
    "zipcode": str,
    "long": float,
    "sqft_lot15": float,
    "sqft_living": float,
    "floors": str,
    "condition": int,
    "lat": float,
    "date": str,
    "sqft_basement": int,
    "yr_built": int,
    "id": str,
    "sqft_lot": int,
    "view": int,
}

## Load Data from CSV Files

In [3]:
# Read the complete sales table, forcing the column types declared in dtype_dict.
sales = pd.read_csv(
    filepath_or_buffer="kc_house_data.csv",
    dtype=dtype_dict,
)

## Load the Pre-Split Training and Testing Data

In [4]:
# The train/test partition already exists on disk as two CSV files; read both
# (training file first, then test file) with the same explicit column dtypes.
train_data, test_data = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in ("kc_house_train_data.csv", "kc_house_test_data.csv")
)

## Creating New Features

In [5]:
# 4 new features, added to BOTH the training and the test frame:
# * bedrooms_squared = bedrooms * bedrooms
# * bed_bath_rooms   = bedrooms * bathrooms
# * log_sqft_living  = log(sqft_living)
# * lat_plus_long    = lat + long
#
# The columns are computed with vectorized Series arithmetic and np.log rather
# than a per-row Python lambda through .apply: the whole column is processed in
# one call. Looping over the two frames applies exactly the same definitions to
# each split, so train and test features cannot drift apart.
#
# NOTE: unlike math.log, np.log does not raise on non-positive input; it emits
# a RuntimeWarning and yields -inf / NaN for such rows.
for split in (train_data, test_data):
    split['bedrooms_squared'] = split['bedrooms'] ** 2
    split['bed_bath_rooms'] = split['bedrooms'] * split['bathrooms']
    split['log_sqft_living'] = np.log(split['sqft_living'])
    split['lat_plus_long'] = split['lat'] + split['long']

Let's explore the new features in `train_data`:

In [6]:
# Show the first five training rows: the source columns next to the features
# derived from them.
train_data.loc[:, [
    "bedrooms",
    "bathrooms",
    "lat",
    "long",
    "bedrooms_squared",
    "bed_bath_rooms",
    "log_sqft_living",
    "lat_plus_long",
]].head(n=5)

Unnamed: 0,bedrooms,bathrooms,lat,long,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
0,3.0,1.0,47.5112,-122.257,9.0,3.0,7.07327,-74.7458
1,3.0,2.25,47.721,-122.319,9.0,6.75,7.851661,-74.598
2,2.0,1.0,47.7379,-122.233,4.0,2.0,6.646391,-74.4951
3,4.0,3.0,47.5208,-122.393,16.0,12.0,7.5807,-74.8722
4,3.0,2.0,47.6168,-122.045,9.0,6.0,7.426549,-74.4282


## Average Value of New Features on Test Data

In [7]:
# Report the test-set mean of each engineered feature, one line per feature,
# in the order the features were created.
for feature_name in ("bedrooms_squared", "bed_bath_rooms",
                     "log_sqft_living", "lat_plus_long"):
    feature_mean = test_data[feature_name].mean()
    print("Average of '" + feature_name + "': " + str(feature_mean))

Average of 'bedrooms_squared': 12.4466777015843
Average of 'bed_bath_rooms': 7.5039016315913925
Average of 'log_sqft_living': 7.550274679645921
Average of 'lat_plus_long': -74.65333355403185


## Learning Multiple Models

Now we will learn the weights for three (nested) models for predicting house prices. The first model has the fewest features, the second adds one more feature, and the third adds a few more:

* Model 1: [sqft_living, bedrooms, bathrooms, lat, long]
* Model 2: Model 1 features + [bed_bath_rooms]
* Model 3: Model 2 features + [bedrooms_squared, log_sqft_living, lat_plus_long]

### Model 1

In [8]:
print("MODEL 1")
# Ordinary least squares on the five base features.
# The `normalize` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, so supplying it raises TypeError on
# current releases. Its previous value (False) was the default, so the fitted
# model is unchanged.
model_1 = linear_model.LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1)
model_1_x_train = train_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']]
model_1_y_train = train_data['price']
model_1.fit(model_1_x_train, model_1_y_train)
print("Intercepts: ", model_1.intercept_)
# Last expression of the cell: table pairing each feature with its learned weight.
pd.DataFrame(list(zip(model_1_x_train.columns, model_1.coef_)), columns=['features', 'estimated coefficients'])

MODEL 1
Intercepts:  -69075726.7926


Unnamed: 0,features,estimated coefficients
0,sqft_living,312.258646
1,bedrooms,-59586.533154
2,bathrooms,15706.742083
3,lat,658619.263931
4,long,-309374.351268


### Model 2

In [9]:
print("MODEL 2")
# Ordinary least squares on the base features plus the bed_bath_rooms
# interaction term.
# The `normalize` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, so supplying it raises TypeError on
# current releases. Its previous value (False) was the default, so the fitted
# model is unchanged.
model_2 = linear_model.LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1)
model_2_x_train = train_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']]
model_2_y_train = train_data['price']
model_2.fit(model_2_x_train, model_2_y_train)
print("Intercepts: ", model_2.intercept_)
# Last expression of the cell: table pairing each feature with its learned weight.
pd.DataFrame(list(zip(model_2_x_train.columns, model_2.coef_)), columns=['features', 'estimated coefficients'])

MODEL 2
Intercepts:  -66867968.8711


Unnamed: 0,features,estimated coefficients
0,sqft_living,306.610053
1,bedrooms,-113446.36807
2,bathrooms,-71461.308293
3,lat,654844.629503
4,long,-294298.969138
5,bed_bath_rooms,25579.652001


### Model 3

In [10]:
print("MODEL 3")
# Ordinary least squares on the Model 2 features plus bedrooms_squared,
# log_sqft_living and lat_plus_long.
# The `normalize` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, so supplying it raises TypeError on
# current releases. Its previous value (False) was the default, so the fitted
# model is unchanged.
model_3 = linear_model.LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1)
model_3_x_train = train_data[['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']]
model_3_y_train = train_data['price']
model_3.fit(model_3_x_train, model_3_y_train)
print("Intercepts: ", model_3.intercept_)
# Last expression of the cell: table pairing each feature with its learned weight.
pd.DataFrame(list(zip(model_3_x_train.columns, model_3.coef_)), columns=['features', 'estimated coefficients'])

MODEL 3
Intercepts:  -62036084.9861


Unnamed: 0,features,estimated coefficients
0,sqft_living,529.42282
1,bedrooms,34514.229578
2,bathrooms,67060.781319
3,lat,534085.610867
4,long,-406750.710861
5,bed_bath_rooms,-8570.504395
6,bedrooms_squared,-6788.58667
7,log_sqft_living,-561831.484076
8,lat_plus_long,127334.900006


## Comparing Multiple Models

We will prepare test dataset to compare models:

In [11]:
# Test-set feature matrices: each one selects the same columns, in the same
# order, as the training matrix of the corresponding model.
model_1_x_test = test_data.loc[:, [
    "sqft_living", "bedrooms", "bathrooms", "lat", "long",
]]
model_2_x_test = test_data.loc[:, [
    "sqft_living", "bedrooms", "bathrooms", "lat", "long",
    "bed_bath_rooms",
]]
model_3_x_test = test_data.loc[:, [
    "sqft_living", "bedrooms", "bathrooms", "lat", "long",
    "bed_bath_rooms",
    "bedrooms_squared", "log_sqft_living", "lat_plus_long",
]]

# Every model predicts the same target column.
model_1_y_test = test_data["price"]
model_2_y_test = test_data["price"]
model_3_y_test = test_data["price"]

Now that we've learned three models and extracted the model weights, we want to evaluate which model is best.

First, compute the residual sum of squares (RSS) on the TRAINING data for each of the three models.

In [12]:
# Residual sum of squares on the training data:
# RSS = sum over rows of (observed price - predicted price)^2.
model_1_train_residuals = model_1_y_train - model_1.predict(model_1_x_train)
model_1_rss_train = model_1_train_residuals.pow(2).sum()
print("RSS of Model 1 (train data): %.4g" % model_1_rss_train)

model_2_train_residuals = model_2_y_train - model_2.predict(model_2_x_train)
model_2_rss_train = model_2_train_residuals.pow(2).sum()
print("RSS of Model 2 (train data): %.4g" % model_2_rss_train)

model_3_train_residuals = model_3_y_train - model_3.predict(model_3_x_train)
model_3_rss_train = model_3_train_residuals.pow(2).sum()
print("RSS of Model 3 (train data): %.4g" % model_3_rss_train)

RSS of Model 1 (train data): 9.679e+14
RSS of Model 2 (train data): 9.584e+14
RSS of Model 3 (train data): 9.034e+14


Now compute the RSS on the TEST data for each of the three models.

In [13]:
# Residual sum of squares on the held-out test data:
# RSS = sum over rows of (observed price - predicted price)^2.
model_1_test_residuals = model_1_y_test - model_1.predict(model_1_x_test)
model_1_rss_test = model_1_test_residuals.pow(2).sum()
print("RSS of Model 1 (test data): %.4g" % model_1_rss_test)

model_2_test_residuals = model_2_y_test - model_2.predict(model_2_x_test)
model_2_rss_test = model_2_test_residuals.pow(2).sum()
print("RSS of Model 2 (test data): %.4g" % model_2_rss_test)

model_3_test_residuals = model_3_y_test - model_3.predict(model_3_x_test)
model_3_rss_test = model_3_test_residuals.pow(2).sum()
print("RSS of Model 3 (test data): %.4g" % model_3_rss_test)

RSS of Model 1 (test data): 2.255e+14
RSS of Model 2 (test data): 2.234e+14
RSS of Model 3 (test data): 2.592e+14
