# Multiple Linear Regression with Backward Elimination




In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns = None

### Importing Data

In [2]:
# Gettind database
df = pd.read_csv("..\\Part 1\\50_Startups.csv")
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


### Data Preprocessing

In [3]:
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray()

In [4]:
X.shape

(50, 6)

In [5]:
# Avoiding the Dummy Variable Trap
X = X[:, 1:]

By including dummy variable in a regression model however, one should be careful of the ***Dummy Variable Trap***. The Dummy Variable trap is a scenario in which the independent variables are multicollinear - a scenario in which two or more variables are highly correlated; in simple terms one variable can be predicted from the others.

The ***solution*** to the dummy variable trap is to drop one of the categorical variables (or alternatively, drop the intercept constant) - if there are m number of categories, use m-1 in the model, the value left out can be thought of as the reference value and the fit values of the remaining categories represent the change from this reference.

In [6]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=0)



### Fitting Multiple Linear Regression to the Training set

In [7]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_test,y_test)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

### Predicting the Test set results

In [8]:
y_pred = lr.predict(X_test)

In [9]:
y_pred

array([ 100494.19441414,  144259.40000006,  141456.63348949,
         81648.2429768 ,  183278.81846194,  106297.95337744,
         76209.33824841,  101213.63837421,  111693.01992799,
        176222.83072951])

### Now train with backward elimination

In [10]:
import statsmodels.formula.api as sm
# adding columan with 1 value for the which is for x0 in MLR equation b0x0 +b1x1 + ..
X = np.append(arr = np.ones((50,1)).astype(int),values = X,axis = 1)

In [11]:
#Run 1
X_opt = X[:,[0,1,2,3,4]]
regressor_OLS = sm.OLS(endog = y,exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,205.0
Date:,"Fri, 11 May 2018",Prob (F-statistic):,2.9e-28
Time:,15:37:55,Log-Likelihood:,-526.75
No. Observations:,50,AIC:,1064.0
Df Residuals:,45,BIC:,1073.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.46e+04,6371.060,8.571,0.000,4.18e+04,6.74e+04
x1,1091.1075,3377.087,0.323,0.748,-5710.695,7892.910
x2,-39.3434,3309.047,-0.012,0.991,-6704.106,6625.420
x3,0.8609,0.031,27.665,0.000,0.798,0.924
x4,-0.0527,0.050,-1.045,0.301,-0.154,0.049

0,1,2,3
Omnibus:,14.275,Durbin-Watson:,1.197
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.26
Skew:,-0.953,Prob(JB):,6.57e-05
Kurtosis:,5.369,Cond. No.,706000.0


We have decided significance leve SL=0.05(5%). x2 have significant hight p value eliminate it 

In [12]:
#Run 2
X_opt = X[:,[0,1,3,4,5]]
regressor_OLS = sm.OLS(endog = y,exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Fri, 11 May 2018",Prob (F-statistic):,8.49e-29
Time:,15:37:55,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [14]:
#Run 3
X_opt = X[:,[0,3,4,5]]
regressor_OLS = sm.OLS(endog = y,exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Fri, 11 May 2018",Prob (F-statistic):,4.53e-30
Time:,15:39:07,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [15]:
#Run 4 
X_opt = X[:,[0,3,5]]
regressor_OLS = sm.OLS(endog = y,exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Fri, 11 May 2018",Prob (F-statistic):,2.1600000000000003e-31
Time:,15:39:33,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [16]:
#Run 5
X_opt = X[:,[0,3]]
regressor_OLS = sm.OLS(endog = y,exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Fri, 11 May 2018",Prob (F-statistic):,3.5000000000000004e-32
Time:,15:40:00,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
