In [1]:
# Multiple Linear Regression with Backward Elimination on the '50_Startups' Dataset
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
# Features are every column except the last; the target is the last column
# (Profit). Both selections are expressed relative to the last column so they
# stay consistent with each other instead of mixing `:-1` with a hard-coded 4.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [3]:
# Preview the first rows of the loaded data to check the column layout
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Encoding the categorical column at index 3 (State) as one-hot dummy variables.
# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22, so ColumnTransformer is used to target
# the single column instead. OneHotEncoder accepts string categories directly,
# which makes the separate LabelEncoder pass unnecessary.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

column_transformer = ColumnTransformer(
    transformers=[('state', OneHotEncoder(), [3])],
    remainder='passthrough',  # keep the other columns, placed after the dummies
    sparse_threshold=0,       # always return a dense array
)
# The passthrough columns arrive as an object array, so cast everything to float
X = np.array(column_transformer.fit_transform(X), dtype=float)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [5]:
# Avoiding the dummy variable trap: drop the first dummy column so the
# remaining dummy columns are not perfectly collinear with an intercept
X = np.delete(X, 0, axis=1)

In [6]:
# Hold out one fifth of the rows as the test set; the fixed random_state
# makes the split reproducible between runs
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 / 5,
    random_state=0,
)

In [7]:
# Fit a linear regression on the training split, then predict the targets
# of the held-out test split
from sklearn.linear_model import LinearRegression
regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [8]:
# For backward elimination - Used for reducing the number of features/independent variables
import statsmodels.api as sm

# sm.OLS fits exactly the columns it is given, so an intercept column of ones
# is prepended as column 0. add_constant sizes that column from X itself
# (no hard-coded row count) and, with its default settings, returns X
# unchanged when a constant column is already present, so re-running this
# cell does not stack a second intercept.
X = sm.add_constant(X)
sig = 0.05 # Significance level

X_optimal = X[:, [0, 1, 2, 3, 4, 5]] # X_optimal initialized with every column: intercept + 5 features
regressor_ols = sm.OLS(endog = y, exog = X_optimal).fit()
regressor_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sun, 08 Dec 2019",Prob (F-statistic):,1.34e-27
Time:,10:46:03,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [9]:
# Backward elimination: refit OLS and drop the predictor with the highest
# p-value, repeating until every remaining predictor has p <= sig.
# The HIGHER the p-value, the LESS significant the feature (a low p-value
# means the feature is significant).
# Column 0 of X is the intercept; it is never a removal candidate.
kept_columns = list(range(X.shape[1]))  # start from the full design matrix
while True:
    X_optimal = X[:, kept_columns]
    regressor_ols = sm.OLS(endog = y, exog = X_optimal).fit()
    predictor_pvalues = regressor_ols.pvalues[1:]  # skip the intercept
    if predictor_pvalues.size == 0 or predictor_pvalues.max() <= sig:
        break
    # +1 re-aligns the position with kept_columns after skipping the intercept
    del kept_columns[int(predictor_pvalues.argmax()) + 1]

# X_optimal now holds the intercept plus only the predictors that passed the
# significance test. The earlier hand-written version of these steps removed
# columns 2, 1, 4 and 5 in turn and ended with columns [0, 3]: the intercept
# and a single feature.
regressor_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sun, 08 Dec 2019",Prob (F-statistic):,3.5000000000000004e-32
Time:,10:46:03,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
