In [1]:
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd
import statsmodels.api as SM
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [2]:
dataset = load_boston()
print(dataset.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [3]:
train_data,test_data,train_label, test_label = train_test_split(dataset.data, dataset.target,test_size = 0.3)
train_data[:5]

array([[1.10874e+01, 0.00000e+00, 1.81000e+01, 0.00000e+00, 7.18000e-01,
        6.41100e+00, 1.00000e+02, 1.85890e+00, 2.40000e+01, 6.66000e+02,
        2.02000e+01, 3.18750e+02, 1.50200e+01],
       [1.12658e+00, 0.00000e+00, 1.95800e+01, 1.00000e+00, 8.71000e-01,
        5.01200e+00, 8.80000e+01, 1.61020e+00, 5.00000e+00, 4.03000e+02,
        1.47000e+01, 3.43280e+02, 1.21200e+01],
       [3.96100e-02, 0.00000e+00, 5.19000e+00, 0.00000e+00, 5.15000e-01,
        6.03700e+00, 3.45000e+01, 5.98530e+00, 5.00000e+00, 2.24000e+02,
        2.02000e+01, 3.96900e+02, 8.01000e+00],
       [4.22239e+00, 0.00000e+00, 1.81000e+01, 1.00000e+00, 7.70000e-01,
        5.80300e+00, 8.90000e+01, 1.90470e+00, 2.40000e+01, 6.66000e+02,
        2.02000e+01, 3.53040e+02, 1.46400e+01],
       [7.85700e-01, 2.00000e+01, 3.97000e+00, 0.00000e+00, 6.47000e-01,
        7.01400e+00, 8.46000e+01, 2.13290e+00, 5.00000e+00, 2.64000e+02,
        1.30000e+01, 3.84070e+02, 1.47900e+01]])

In [4]:
train_data, test_data = SM.add_constant(train_data), SM.add_constant(test_data)
train_data[:5]

array([[1.00000e+00, 1.10874e+01, 0.00000e+00, 1.81000e+01, 0.00000e+00,
        7.18000e-01, 6.41100e+00, 1.00000e+02, 1.85890e+00, 2.40000e+01,
        6.66000e+02, 2.02000e+01, 3.18750e+02, 1.50200e+01],
       [1.00000e+00, 1.12658e+00, 0.00000e+00, 1.95800e+01, 1.00000e+00,
        8.71000e-01, 5.01200e+00, 8.80000e+01, 1.61020e+00, 5.00000e+00,
        4.03000e+02, 1.47000e+01, 3.43280e+02, 1.21200e+01],
       [1.00000e+00, 3.96100e-02, 0.00000e+00, 5.19000e+00, 0.00000e+00,
        5.15000e-01, 6.03700e+00, 3.45000e+01, 5.98530e+00, 5.00000e+00,
        2.24000e+02, 2.02000e+01, 3.96900e+02, 8.01000e+00],
       [1.00000e+00, 4.22239e+00, 0.00000e+00, 1.81000e+01, 1.00000e+00,
        7.70000e-01, 5.80300e+00, 8.90000e+01, 1.90470e+00, 2.40000e+01,
        6.66000e+02, 2.02000e+01, 3.53040e+02, 1.46400e+01],
       [1.00000e+00, 7.85700e-01, 2.00000e+01, 3.97000e+00, 0.00000e+00,
        6.47000e-01, 7.01400e+00, 8.46000e+01, 2.13290e+00, 5.00000e+00,
        2.64000e+02, 1.300

A feature with constant value 1 is added for all rows

In [5]:
ols1 = SM.OLS(train_label,train_data)
model1 = ols1.fit()
model1.params

array([ 3.26622611e+01, -1.23387413e-01,  2.94537183e-02,  3.91261258e-02,
        4.03084404e+00, -1.41818694e+01,  3.93941942e+00, -3.50456087e-03,
       -1.23537966e+00,  2.73267586e-01, -9.84628282e-03, -9.29739461e-01,
        9.02779519e-03, -5.83773115e-01])

In [6]:
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.731
Method:                 Least Squares   F-statistic:                     74.91
Date:                Fri, 02 Jul 2021   Prob (F-statistic):           2.21e-91
Time:                        08:32:58   Log-Likelihood:                -1042.7
No. Observations:                 354   AIC:                             2113.
Df Residuals:                     340   BIC:                             2168.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         32.6623      6.204      5.265      0.0

In [7]:
pred1 = model1.predict(test_data)
mean_squared_error(test_label, pred1)

24.96474018384149

We can see the features x3 & x7 show abnormal behavior for t-test

In [14]:
ols2 = SM.OLS(train_label,np.delete(train_data,[3,7],axis=1))
model2 = ols2.fit()
model2.params

array([ 3.25515323e+01, -1.24179280e-01,  2.88901347e-02,  4.07242000e+00,
       -1.37672421e+01,  3.89105552e+00, -1.24130738e+00,  2.63990629e-01,
       -8.89334785e-03, -9.21352596e-01,  8.94330699e-03, -5.84891905e-01])

In [15]:
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.733
Method:                 Least Squares   F-statistic:                     88.92
Date:                Fri, 02 Jul 2021   Prob (F-statistic):           2.98e-93
Time:                        08:33:32   Log-Likelihood:                -1042.9
No. Observations:                 354   AIC:                             2110.
Df Residuals:                     342   BIC:                             2156.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         32.5515      6.120      5.319      0.0

We get a lower value for both AIC & BIC here  which indicates the model's improved predictive ability

In [16]:
pred2 = model2.predict(np.delete(test_data,[3,7],axis=1))
mean_squared_error(test_label, pred2)

24.889895141067154

In [17]:
model1.aic > model2.aic

True

In [18]:
print("No of times model1 is likely to minimize information loss than model2 : ",np.exp((model2.aic-model1.aic)/2))

No of times model1 is likely to minimize information loss than model2 :  0.16373313277830023


In [19]:
print("No of times model2 is likely to minimize information loss than model1 : ",np.exp((model1.aic-model2.aic)/2))

No of times model2 is likely to minimize information loss than model1 :  6.1074993376815865


Thus the removal features 3 & 7 proved beneficial