In [169]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [170]:
# Load the startup data; the CSV is resolved relative to the working directory.
dataset = pd.read_csv('50_Startups.csv')

In [171]:
# Preview the first rows to check which columns were read.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [172]:
# Separate the target ('Profit') from the predictors (every other column).
y = dataset.loc[:, 'Profit']
X = dataset.drop('Profit', axis=1)

In [173]:
# One-hot encode the categorical 'State' column.
# The column is read from `dataset` rather than `X`: a later cell rebinds `X`
# without 'State', so sourcing it from `X` makes this cell raise a KeyError
# when it is re-run.
# The prefix 'State_' combined with get_dummies' default '_' separator yields
# the 'State__<name>' column names that the following cells refer to.
dummies = pd.get_dummies(dataset['State'], prefix='State_')
dummies.head()

Unnamed: 0,State__California,State__Florida,State__New York
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [174]:
# Assemble the design matrix: the numeric predictors plus the state dummies.
# It is built from `dataset` instead of the current `X` so the cell is
# idempotent; the self-referential form appended the dummy columns a second
# time whenever the cell was re-run.
X = pd.concat([dataset.drop(['Profit', 'State'], axis=1), dummies], axis=1)

In [175]:
# Inspect the design matrix after the state dummies were appended.
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State__California,State__Florida,State__New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0


In [176]:
# Remove one dummy level (Florida) so the remaining state columns are not
# perfectly collinear with the model intercept; Florida becomes the baseline.
X = X.drop(['State__Florida'], axis=1)
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State__California,State__New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,1,0
2,153441.51,101145.55,407934.54,0,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,0,0


In [177]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [179]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares model with default settings (fitted in a later cell).
regressor = LinearRegression()

In [180]:
# Fit the linear model on the training portion only.
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [181]:
# Predict profit for the held-out rows.
# NOTE(review): y_pred is not compared against y_test anywhere in the visible cells.
y_pred = regressor.predict(X_test)

In [191]:
from sklearn.feature_selection import RFECV

In [192]:
# Recursive feature elimination with 10-fold cross-validation, using a fresh
# linear model as the estimator and fitting on the training split.
lr = LinearRegression()
selector = RFECV(estimator=lr, cv=10)
selector.fit(X_train, y_train)

RFECV(cv=10,
   estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
   n_jobs=1, scoring=None, step=1, verbose=0)

In [193]:
# Names of the training columns that the fitted selector kept.
kept_mask = selector.get_support()
optimized_columns = X_train.columns[kept_mask]
optimized_columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State__California',
       'State__New York'],
      dtype='object')

In [194]:
# Import the array-based API: `statsmodels.api` exposes `OLS`, whereas
# `statsmodels.formula.api` no longer provides the upper-case `OLS` class in
# current statsmodels releases (only the formula-based `ols`).
import statsmodels.api as sm

In [195]:
# Add an explicit intercept column, because sm.OLS does not add one by itself.
# The column length follows the frame instead of a hard-coded 50, so the cell
# keeps working if the number of rows changes.
X['constant'] = np.ones(len(X), dtype=int)
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State__California,State__New York,constant
0,165349.2,136897.8,471784.1,0,1,1
1,162597.7,151377.59,443898.53,1,0,1
2,153441.51,101145.55,407934.54,0,0,1
3,144372.41,118671.85,383199.62,0,1,1
4,142107.34,91391.77,366168.42,0,0,1


In [198]:
# Start backward elimination from all predictors.
# Work on a copy: a plain `X_opt = X` only aliases the frame, so any in-place
# change to `X_opt` would silently modify `X` as well.
X_opt = X.copy()

In [200]:
# Fit OLS of the target on the current candidate predictors
# (all rows, not just the training split).
regressor_OLS = sm.OLS(y, X_opt).fit()

In [201]:
# Show coefficients and p-values; these drive the choice of which term to drop next.
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sun, 25 Mar 2018",Prob (F-statistic):,1.34e-27
Time:,16:58:29,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
R&D Spend,0.8060,0.046,17.369,0.000,0.712,0.900
Administration,-0.0270,0.052,-0.517,0.608,-0.132,0.078
Marketing Spend,0.0270,0.017,1.574,0.123,-0.008,0.062
State__California,-198.7888,3371.007,-0.059,0.953,-6992.607,6595.030
State__New York,-240.6758,3338.857,-0.072,0.943,-6969.701,6488.349
constant,5.032e+04,7251.767,6.940,0.000,3.57e+04,6.49e+04

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1560000.0


In [204]:
# Backward elimination, step 1: remove 'State__California', the term with the
# highest p-value (0.953) in the full-model summary.
# The frame is derived from `X` rather than from `X_opt` itself so that
# re-running the cell does not raise a KeyError on an already-dropped column.
X_opt = X.drop('State__California', axis=1)

In [206]:
# Confirm which predictors remain after the first elimination step.
X_opt.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State__New York,constant
0,165349.2,136897.8,471784.1,1,1
1,162597.7,151377.59,443898.53,0,1
2,153441.51,101145.55,407934.54,0,1
3,144372.41,118671.85,383199.62,1,1
4,142107.34,91391.77,366168.42,0,1


In [207]:
# Refit on the reduced predictor set and display the updated statistics.
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Sun, 25 Mar 2018",Prob (F-statistic):,8.5e-29
Time:,17:03:19,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
R&D Spend,0.8059,0.046,17.571,0.000,0.714,0.898
Administration,-0.0269,0.052,-0.521,0.605,-0.131,0.077
Marketing Spend,0.0271,0.017,1.625,0.111,-0.007,0.061
State__New York,-136.5042,2801.719,-0.049,0.961,-5779.456,5506.447
constant,5.018e+04,6747.623,7.437,0.000,3.66e+04,6.38e+04

0,1,2,3
Omnibus:,14.892,Durbin-Watson:,1.284
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.665
Skew:,-0.949,Prob(JB):,1.97e-05
Kurtosis:,5.608,Cond. No.,1430000.0


In [209]:
# Backward elimination, step 2: 'State__New York' now has the highest p-value
# (0.961), so both state dummies are excluded.
# Listing every eliminated column and deriving from `X` keeps the cell
# idempotent; the self-referential drop raised a KeyError when re-run.
X_opt = X.drop(['State__California', 'State__New York'], axis=1)

In [210]:
# Refit without the state dummies and display the updated statistics.
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Sun, 25 Mar 2018",Prob (F-statistic):,4.53e-30
Time:,17:03:52,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
R&D Spend,0.8057,0.045,17.846,0.000,0.715,0.897
Administration,-0.0268,0.051,-0.526,0.602,-0.130,0.076
Marketing Spend,0.0272,0.016,1.655,0.105,-0.006,0.060
constant,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [211]:
# Backward elimination, step 3: 'Administration' has the highest remaining
# p-value (0.602), so it is excluded together with the earlier eliminations.
# Deriving from `X` with the full exclusion list keeps the cell idempotent;
# the self-referential drop raised a KeyError when re-run.
X_opt = X.drop(['State__California', 'State__New York', 'Administration'], axis=1)
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Sun, 25 Mar 2018",Prob (F-statistic):,2.1600000000000003e-31
Time:,17:43:08,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
R&D Spend,0.7966,0.041,19.266,0.000,0.713,0.880
Marketing Spend,0.0299,0.016,1.927,0.060,-0.001,0.061
constant,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [212]:
# Backward elimination, step 4: 'Marketing Spend' (p = 0.060) is above a 0.05
# cut-off, leaving only 'R&D Spend' and the intercept column.
# NOTE(review): the p-value is borderline and adjusted R-squared falls from
# 0.948 to 0.945 after this step, so keeping the term is also defensible.
# Deriving from `X` with the full exclusion list keeps the cell idempotent;
# the self-referential drop raised a KeyError when re-run.
X_opt = X.drop(
    ['State__California', 'State__New York', 'Administration', 'Marketing Spend'],
    axis=1,
)
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sun, 25 Mar 2018",Prob (F-statistic):,3.5000000000000004e-32
Time:,17:43:31,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
R&D Spend,0.8543,0.029,29.151,0.000,0.795,0.913
constant,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
