# Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load dataset

In [2]:
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:,:-1]
y = dataset.iloc[:,-1]

# Encoding Categorical data

In [3]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
labelencoder = LabelEncoder()
X.iloc[:,-1] = labelencoder.fit_transform(X.iloc[:,-1])
onehotencoder = OneHotEncoder(categorical_features=[-1],dtype='int32')
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


## Avoiding dummy variable trap

In [4]:
X = X[:,1:]

# Spliting Dataset into train and test

In [5]:
from sklearn.model_selection import train_test_split
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.2,random_state=0)

# Fitting Multiple Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(train_X,train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

# Predicting Result

In [7]:
y_pred = regressor.predict(test_X)

# Building Optimum model using Backward elimination

In [17]:
import statsmodels.formula.api  as sm
X = np.append(arr=np.ones((50,1),dtype=np.int32), values=X, axis=1)
X_opt = X[:,[0,1,2,3,4,5]]
regressor_Ols =  sm.OLS(endog=y,exog=X_opt).fit()

In [19]:
regressor_Ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,205.0
Date:,"Mon, 05 Nov 2018",Prob (F-statistic):,2.9e-28
Time:,00:11:37,Log-Likelihood:,-526.75
No. Observations:,50,AIC:,1064.0
Df Residuals:,45,BIC:,1073.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.73e+04,3185.541,8.571,0.000,2.09e+04,3.37e+04
x1,2.73e+04,3185.541,8.571,0.000,2.09e+04,3.37e+04
x2,1091.0715,3377.106,0.323,0.748,-5710.770,7892.913
x3,-39.4697,3309.066,-0.012,0.991,-6704.271,6625.332
x4,0.8609,0.031,27.665,0.000,0.798,0.924
x5,-0.0527,0.050,-1.045,0.301,-0.154,0.049

0,1,2,3
Omnibus:,14.276,Durbin-Watson:,1.197
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.262
Skew:,-0.953,Prob(JB):,6.57e-05
Kurtosis:,5.369,Cond. No.,2.55e+17


In [20]:
X = np.append(arr=np.ones((50,1),dtype=np.int32), values=X, axis=1)
X_opt = X[:,[0,1,4,5]]
regressor_Ols =  sm.OLS(endog=y,exog=X_opt).fit()

In [22]:
regressor_Ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,416.4
Date:,"Mon, 05 Nov 2018",Prob (F-statistic):,1.2599999999999999e-30
Time:,00:12:37,Log-Likelihood:,-527.42
No. Observations:,50,AIC:,1061.0
Df Residuals:,47,BIC:,1067.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.46e+04,1355.412,18.152,0.000,2.19e+04,2.73e+04
x1,2.46e+04,1355.412,18.152,0.000,2.19e+04,2.73e+04
x2,-560.4092,2841.924,-0.197,0.845,-6277.623,5156.804
x3,0.8545,0.030,28.843,0.000,0.795,0.914

0,1,2,3
Omnibus:,14.03,Durbin-Watson:,1.127
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.473
Skew:,-0.916,Prob(JB):,5.91e-05
Kurtosis:,5.448,Cond. No.,5.73e+16


In [23]:
X = np.append(arr=np.ones((50,1),dtype=np.int32), values=X, axis=1)
X_opt = X[:,[0,1,5]]
regressor_Ols =  sm.OLS(endog=y,exog=X_opt).fit()

In [25]:
regressor_Ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.02
Method:,Least Squares,F-statistic:,0.04727
Date:,"Mon, 05 Nov 2018",Prob (F-statistic):,0.829
Time:,00:13:37,Log-Likelihood:,-600.63
No. Observations:,50,AIC:,1205.0
Df Residuals:,48,BIC:,1209.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.556e+04,3542.814,15.682,0.000,4.84e+04,6.27e+04
x1,5.556e+04,3542.814,15.682,0.000,4.84e+04,6.27e+04
x2,2642.1322,1.22e+04,0.217,0.829,-2.18e+04,2.71e+04

0,1,2,3
Omnibus:,0.011,Durbin-Watson:,0.021
Prob(Omnibus):,0.994,Jarque-Bera (JB):,0.082
Skew:,0.022,Prob(JB):,0.96
Kurtosis:,2.807,Cond. No.,5.1e+16


In [27]:
X = np.append(arr=np.ones((50,1),dtype=np.int32), values=X, axis=1)
X_opt = X[:,[0,1]]
regressor_Ols =  sm.OLS(endog=y,exog=X_opt).fit()
regressor_Ols.summary()

  return self.ess/self.df_model


0,1,2,3
Dep. Variable:,Profit,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,
Date:,"Mon, 05 Nov 2018",Prob (F-statistic):,
Time:,00:19:34,Log-Likelihood:,-600.65
No. Observations:,50,AIC:,1203.0
Df Residuals:,49,BIC:,1205.0
Df Model:,0,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.601e+04,2850.077,19.651,0.000,5.03e+04,6.17e+04
x1,5.601e+04,2850.077,19.651,0.000,5.03e+04,6.17e+04

0,1,2,3
Omnibus:,0.018,Durbin-Watson:,0.02
Prob(Omnibus):,0.991,Jarque-Bera (JB):,0.068
Skew:,0.023,Prob(JB):,0.966
Kurtosis:,2.825,Cond. No.,5.02e+16
