In [49]:
# Import the required packages

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.datasets import load_boston
import statsmodels.api as sm

In [2]:
# Load the Boston housing data shipped with scikit-learn and wrap it in a
# DataFrame: one column per feature, plus the target stored as `house_value`.
# NOTE(review): `load_boston` was removed in scikit-learn 1.2, so this cell
# needs an older scikit-learn release -- confirm the pinned version.

boston = load_boston()
dfb = pd.DataFrame(data=boston.data, columns=boston.feature_names).assign(
    house_value=boston.target
)
dfb.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,house_value
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [47]:
# Baseline model: ordinary least squares of house_value on all 13 features
full_formula = 'house_value ~ CRIM +ZN+INDUS+CHAS+NOX+RM+AGE+DIS+RAD+TAX+PTRATIO+B+LSTAT'
model1 = smf.ols(formula=full_formula, data=dfb)
modelfit_1 = model1.fit()
print(modelfit_1.summary())

                            OLS Regression Results                            
Dep. Variable:            house_value   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Mon, 26 Apr 2021   Prob (F-statistic):          6.72e-135
Time:                        19:53:02   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     36.4595      5.103      7.144      0.0

To improve Model1, a forward selection approach is used to filter out unnecessary features, i.e., independent variables that are not significant. The criterion for this forward selection is the lowest Residual Sum of Squares (RSS).


In [36]:
# Residual sum of squares (RSS) between observed and predicted values
def rss(ytrue, ypred):
    """Return the sum of the squared element-wise differences ``ytrue - ypred``."""
    residuals = ytrue - ypred
    squared_residuals = np.power(residuals, 2)
    return np.sum(squared_residuals)
"""""
This function executes forward selection of up to `maxp` predictors,
adding at each step the predictor that gives the lowest calculated RSS value."""

def forward_selection(target, predictors, data, maxp=3):
    """Greedy forward selection of OLS predictors, ranked by RSS.

    Starting from an empty formula, every round fits one OLS model per
    remaining candidate (the predictors chosen so far plus that candidate),
    ranks the candidates by residual sum of squares on ``data`` and appends
    the best one to the formula. Progress is printed after every round.

    Parameters
    ----------
    target : str
        Name of the response column in ``data``.
    predictors : iterable of str
        Candidate predictor column names. The caller's object is not modified.
    data : pandas.DataFrame
        Frame holding the target and all candidate columns. Models are fitted
        and evaluated on this frame.
    maxp : int, default 3
        Maximum number of predictors to select; capped at the number of
        candidates supplied.

    Returns
    -------
    str
        The final formula, e.g. ``'y~A + B'`` (``'y~'`` if nothing was selected).
    """
    remaining = list(predictors)
    selected = []
    f = target + '~'

    # Never run more rounds than there are candidates; otherwise the ranking
    # would be empty and picking its first entry would fail.
    for i in range(min(maxp, len(remaining))):
        print('\n\n ===============')
        model_rss = []
        for p in remaining:
            candidate = target + '~' + ' + '.join(selected + [p])
            # Fit and score on the frame passed in, not on a notebook global.
            res = smf.ols(formula=candidate, data=data).fit()
            model_rss.append(rss(data[target], res.predict(data)))

        # Rank on the unrounded RSS so that near-equal candidates are not
        # turned into artificial ties.
        ranked = pd.Series(model_rss, index=remaining).sort_values().index.to_list()
        print('p =' + str(i + 1))
        print('current f :' + str(f))
        print('Top 5 predictors :')
        print(ranked[0:5])

        selected.append(ranked[0])
        f = target + '~' + ' + '.join(selected)
        print('Formular :' + f)
        # The winner leaves the candidate pool; the rest keep their ranked order.
        remaining = ranked[1:]
    return f


In [37]:
# Pick the four best predictors of house_value by forward selection
forward_selection(target='house_value', predictors=boston.feature_names, data=dfb, maxp=4)



p =1
current f :house_value~
Top 5 predictors :
['LSTAT', 'RM', 'PTRATIO', 'INDUS', 'TAX']
Formular :house_value~LSTAT


p =2
current f :house_value~LSTAT
Top 5 predictors :
['RM', 'PTRATIO', 'CHAS', 'DIS', 'AGE']
Formular :house_value~LSTAT + RM


p =3
current f :house_value~LSTAT + RM
Top 5 predictors :
['PTRATIO', 'CHAS', 'B', 'TAX', 'DIS']
Formular :house_value~LSTAT + RM + PTRATIO


p =4
current f :house_value~LSTAT + RM + PTRATIO
Top 5 predictors :
['DIS', 'B', 'CHAS', 'CRIM', 'AGE']
Formular :house_value~LSTAT + RM + PTRATIO + DIS


'house_value~LSTAT + RM + PTRATIO + DIS'

In [39]:
# Refit OLS using only the four predictors chosen by forward selection
best_formula = 'house_value~LSTAT + RM + PTRATIO + DIS'
model_best_predictors = smf.ols(formula=best_formula, data=dfb)
mfit = model_best_predictors.fit()
print(mfit.summary())

                            OLS Regression Results                            
Dep. Variable:            house_value   R-squared:                       0.690
Model:                            OLS   Adj. R-squared:                  0.688
Method:                 Least Squares   F-statistic:                     279.2
Date:                Mon, 26 Apr 2021   Prob (F-statistic):          5.23e-126
Time:                        19:38:33   Log-Likelihood:                -1543.7
No. Observations:                 506   AIC:                             3097.
Df Residuals:                     501   BIC:                             3118.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     24.4714      4.078      6.001      0.0

From the summary, there is an improvement of the forward selection approach with Lower F.st