# Automatic Backward Elimination

In [11]:
import pandas as pd
import numpy as np

In [12]:
dataset=pd.read_csv('50_Startups.csv')
np.set_printoptions(suppress=True, precision=2)
dataset.iloc[0:10,:]

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [13]:
X=dataset.iloc[:,:-1].values
y=dataset.iloc[:,[-1]].values
print(X[0:10,:])
print(y[0:10,:])

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']]
[[192261.83]
 [191792.06]
 [191050.39]
 [182901.99]
 [166187.94]
 [156991.12]
 [156122.51]
 [155752.6 ]
 [152211.77]
 [149759.96]]


In [14]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
labelEncoder=LabelEncoder()
X[:,3]=labelEncoder.fit_transform(X[:,3])
oneHotEncoder=OneHotEncoder(categorical_features=[3])
X=oneHotEncoder.fit_transform(X).toarray()
X=X[:,1:]
X[0:10,:]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[     0.  ,      1.  , 165349.2 , 136897.8 , 471784.1 ],
       [     0.  ,      0.  , 162597.7 , 151377.59, 443898.53],
       [     1.  ,      0.  , 153441.51, 101145.55, 407934.54],
       [     0.  ,      1.  , 144372.41, 118671.85, 383199.62],
       [     1.  ,      0.  , 142107.34,  91391.77, 366168.42],
       [     0.  ,      1.  , 131876.9 ,  99814.71, 362861.36],
       [     0.  ,      0.  , 134615.46, 147198.87, 127716.82],
       [     1.  ,      0.  , 130298.13, 145530.06, 323876.68],
       [     0.  ,      1.  , 120542.52, 148718.95, 311613.29],
       [     0.  ,      0.  , 123334.88, 108679.17, 304981.62]])

## With p-values only

In [10]:
import statsmodels.formula.api as sm
def backwardElimination(x, y, sl):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
    regressor_OLS.summary()
    return x
 
SL = 0.05
X=np.append(np.ones((50,1)).astype(int),values=X,axis=1)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_Modeled = backwardElimination(X_opt, y, SL)
X_Modeled[0:10,:]


array([[     1.  , 165349.2 ],
       [     1.  , 162597.7 ],
       [     1.  , 153441.51],
       [     1.  , 144372.41],
       [     1.  , 142107.34],
       [     1.  , 131876.9 ],
       [     1.  , 134615.46],
       [     1.  , 130298.13],
       [     1.  , 120542.52],
       [     1.  , 123334.88]])

## Using p-value and R-squared values

In [16]:
import statsmodels.formula.api as sm
def backwardEliminationWithRSquared(x, y, SL):
    numVars = len(x[0])
    temp = np.zeros((50,6)).astype(int)
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        adjR_before = regressor_OLS.rsquared_adj.astype(float)
        if maxVar > SL:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    temp[:,j] = x[:, j]
                    x = np.delete(x, j, 1)
                    tmp_regressor = sm.OLS(y, x).fit()
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)
                    if (adjR_before >= adjR_after):
                        x_rollback = np.hstack((x, temp[:,[0,j]]))
                        x_rollback = np.delete(x_rollback, j, 1)
                        print (regressor_OLS.summary())
                        return x_rollback
                    else:
                        continue
    regressor_OLS.summary()
    return x


    
    
SL = 0.05
X=np.append(np.ones((50,1)).astype(int),values=X,axis=1)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_Modeled = backwardEliminationWithRSquared(X_opt, y, SL)
X_Modeled[0:10,:]

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     450.8
Date:                Sat, 03 Nov 2018   Prob (F-statistic):           2.16e-31
Time:                        11:35:46   Log-Likelihood:                -525.54
No. Observations:                  50   AIC:                             1057.
Df Residuals:                      47   BIC:                             1063.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.698e+04   2689.933     17.464      0.0

array([[     1.  , 165349.2 , 471784.  ],
       [     1.  , 162597.7 , 443898.  ],
       [     1.  , 153441.51, 407934.  ],
       [     1.  , 144372.41, 383199.  ],
       [     1.  , 142107.34, 366168.  ],
       [     1.  , 131876.9 , 362861.  ],
       [     1.  , 134615.46, 127716.  ],
       [     1.  , 130298.13, 323876.  ],
       [     1.  , 120542.52, 311613.  ],
       [     1.  , 123334.88, 304981.  ]])