In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# get Boston housing data
from ISLP import load_data
df = load_data('Boston')
df.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'lstat', 'medv'],
      dtype='object')

In [9]:
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [10]:
df.shape

(506, 13)

In [11]:
df.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,37.97,50.0


In [12]:
df.isnull().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
lstat      0
medv       0
dtype: int64

## Multiple Linear Regression

In [13]:
X = df.drop('medv', axis=1)
y = df['medv']

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (404, 12) (404,)
Test set: (102, 12) (102,)


In [15]:
model = LinearRegression()# ordinary least Square Regression
model.fit(X_train, y_train)
print ('Coefficients: ', model.coef_)
print ('Intercept: ',model.intercept_)

Coefficients:  [-1.16296632e-01  4.76363696e-02 -1.78967677e-03  3.41260522e+00
 -1.74918641e+01  3.75686585e+00 -8.24293816e-03 -1.56220676e+00
  2.69132207e-01 -1.35469697e-02 -8.76778958e-01 -4.86187894e-01]
Intercept:  40.05841937274651


In [16]:
y_pred_dummy = model.predict(X_train)
y_pred = model.predict(X_test)

In [17]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

# The mean absolute error
print('Mean absolute error: %.2f'
      % mean_absolute_error(y_test, y_pred))


# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination(r2): %.2f'
      % r2_score(y_test, y_pred))

Mean squared error: 26.19
Mean absolute error: 3.49
Coefficient of determination(r2): 0.72


## Polynomial

In [18]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.fit_transform(X_test)
X_train_poly

array([[1.000000e+00, 8.664000e-02, 4.500000e+01, ..., 2.310400e+02,
        4.362400e+01, 8.236900e+00],
       [1.000000e+00, 2.498000e-01, 0.000000e+00, ..., 4.494400e+02,
        4.519840e+02, 4.545424e+02],
       [1.000000e+00, 2.140900e-01, 2.200000e+01, ..., 3.648100e+02,
        6.856900e+01, 1.288810e+01],
       ...,
       [1.000000e+00, 9.390630e+00, 0.000000e+00, ..., 4.080400e+02,
        4.621760e+02, 5.234944e+02],
       [1.000000e+00, 8.447000e-02, 0.000000e+00, ..., 2.755600e+02,
        1.600240e+02, 9.292960e+01],
       [1.000000e+00, 9.299000e-02, 0.000000e+00, ..., 3.648100e+02,
        3.424630e+02, 3.214849e+02]])

In [19]:
poly_LR = LinearRegression()
poly_LR.fit(X_train_poly, y_train)
# The coefficients
print ('Coefficients: ', poly_LR.coef_)
print ('Intercept: ',poly_LR.intercept_)

Coefficients:  [ 3.00658409e+09 -4.04278679e+00 -2.41976712e-01 -2.97613560e+00
  5.56812266e+01  1.36948383e+02  2.14421125e+01  5.07737506e-01
 -3.20131192e+00  3.80179203e+00 -2.55735839e-01  5.56897817e+00
  5.24881118e-01  3.51312194e-03  1.50452200e-01  5.73570645e-01
  2.12186631e+00 -1.59274662e+00  1.16548354e-01  7.61942005e-03
  1.55892196e-01  2.16525932e-01 -2.82336915e-02  2.90112339e-01
  1.78867723e-02  2.80020850e-03  4.55299152e-03  3.87689439e-02
 -3.04181994e-01 -3.31102352e-02  1.21228446e-04 -5.09934707e-02
 -2.43751904e-02  5.96611412e-04  3.24264486e-02 -6.28074760e-03
 -6.50548905e-02 -8.35704322e-02  1.53598808e+00  4.56499501e-01
 -9.68287538e-04 -2.87358504e-01 -2.58546217e-01  3.96762219e-03
  8.50834304e-02 -4.80471362e-03  5.56812268e+01 -7.68558903e+01
 -7.77183621e+00 -1.44835333e-01 -2.67647436e-01 -3.09660438e-01
  8.39405673e-02 -1.90571219e+00 -2.64139930e-01 -8.49789328e+01
 -3.08015120e+00  1.12428007e-01  7.72065994e+00 -2.70793591e+00
  1.765446

In [20]:
y_pred_poly = poly_LR.predict(X_test_poly)

In [21]:
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred_poly))

# The mean absolute error
print('Mean absolute error: %.2f'
      % mean_absolute_error(y_test, y_pred_poly))


# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination(r2): %.2f'
      % r2_score(y_test, y_pred_poly))

Mean squared error: 18.28
Mean absolute error: 3.15
Coefficient of determination(r2): 0.80


In [22]:
def create_polynomial_regression_model(degree):
    "Creates a polynomial regression model for the given degree"
  
    poly_features = PolynomialFeatures(degree=degree)
  
    # transforms the existing features to higher degree features.
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.fit_transform(X_test)
  
    # fit the transformed features to Linear Regression
    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)
  
    # predicting on test data-set
    y_test_predict = poly_model.predict(X_test_poly)
  
    # evaluating the model on test dataset
    rmse_test =  mean_squared_error(y_test, y_pred_poly)
    r2_test = r2_score(y_test, y_pred_poly)
  
  
    print("The model performance for the test set")
    print("-------------------------------------------")
    print("RMSE of test set is {}".format(rmse_test))
    print("R2 score of test set is {}".format(r2_test))

In [23]:
create_polynomial_regression_model(2)

The model performance for the test set
-------------------------------------------
RMSE of test set is 18.27677174837767
R2 score of test set is 0.8032412060831665
