In [1]:
#importing neccessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer



In [2]:
#importing data into the IDE
data = pd.read_csv('50_Startups.csv')
data = pd.DataFrame(data)
data.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# converting categorical variable to numerical using Label Encoder
lb = LabelEncoder()
data['State_lb'] = lb.fit_transform(data.iloc[:,3])


In [4]:
# elimating magnitude from categorical variable using OneHotEncoder
oe = OneHotEncoder(handle_unknown = 'ignore')
b = oe.fit_transform(data[['State_lb']]).toarray()
b = pd.DataFrame(b)
data = data.join(b)

In [5]:
data.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,State_lb,0,1,2
0,165349.2,136897.8,471784.1,New York,192261.83,2,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,California,191792.06,0,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,Florida,191050.39,1,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,New York,182901.99,2,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,Florida,166187.94,1,0.0,1.0,0.0


In [6]:
# renaming categorical dummies from OneHotEncoder
data['State_NY'] = data[0]
data['State_C'] = data[1]
data['State_F'] = data[2]
data.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,State_lb,0,1,2,State_NY,State_C,State_F
0,165349.2,136897.8,471784.1,New York,192261.83,2,0.0,0.0,1.0,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,California,191792.06,0,1.0,0.0,0.0,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,Florida,191050.39,1,0.0,1.0,0.0,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,New York,182901.99,2,0.0,0.0,1.0,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,Florida,166187.94,1,0.0,1.0,0.0,0.0,1.0,0.0


In [7]:
# independent variables
x = data.drop(['State', 'Profit', 0, 1 , 2, 'State_lb','State_F' ], axis =1)
#note State Florida is dropped to ensure perfect model 
# target variable or dependent variable
y = data['Profit']

In [8]:
y.head(5)

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [9]:
#diving the dataset into test train
x_train, x_test , y_train , y_test = train_test_split(x, y, test_size = .20, random_state = False )
x_train

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_NY,State_C
33,55493.95,103057.49,214634.81,0.0,1.0
35,46014.02,85047.44,205517.64,0.0,0.0
26,75328.87,144135.98,134050.07,0.0,1.0
34,46426.07,157693.92,210797.67,1.0,0.0
18,91749.16,114175.79,294919.57,0.0,1.0
7,130298.13,145530.06,323876.68,0.0,1.0
14,119943.24,156547.42,256512.92,0.0,1.0
45,1000.23,124153.04,1903.93,0.0,0.0
48,542.05,51743.15,0.0,0.0,0.0
29,65605.48,153032.06,107138.38,0.0,0.0


In [10]:
from sklearn.linear_model import LinearRegression


In [11]:
# fitting the model
Lr = LinearRegression()
Lr.fit(x_train,y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
#predecting for x_test
y_pred = Lr.predict(x_test)
y_pred

array([103015.20159776, 132582.27760831, 132447.73845184,  71976.09851266,
       178537.4822107 , 116161.24230157,  67851.69209689,  98791.73374679,
       113969.43533008, 167921.06569569])

In [13]:
# predicting profit in Florida
x_pred = [200546.2, 123455.22, 54326.1, 0, 0]
Lr.predict([x_pred])

array([204418.09814733])

In [14]:
Lr.score(x_train,y_train)

0.9501847627493607

In [15]:
# building model optimization

In [16]:
import statsmodels.formula.api as sm
x['constant'] = np.ones((50,1), dtype= int)
x.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_NY,State_C,constant
0,165349.2,136897.8,471784.1,0.0,0.0,1
1,162597.7,151377.59,443898.53,1.0,0.0,1
2,153441.51,101145.55,407934.54,0.0,1.0,1
3,144372.41,118671.85,383199.62,0.0,0.0,1
4,142107.34,91391.77,366168.42,0.0,1.0,1


In [17]:
'''
from sklearn.feature_selection import f_regression 
fr  = f_regression(x_train, y_train)
data_PF = pd.DataFrame((fr[0], fr[1]),index= ('F-values', 'P-values'))
data_PF'''
x.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_NY,State_C,constant
0,165349.2,136897.8,471784.1,0.0,0.0,1
1,162597.7,151377.59,443898.53,1.0,0.0,1
2,153441.51,101145.55,407934.54,0.0,1.0,1
3,144372.41,118671.85,383199.62,0.0,0.0,1
4,142107.34,91391.77,366168.42,0.0,1.0,1
5,131876.9,99814.71,362861.36,0.0,0.0,1
6,134615.46,147198.87,127716.82,1.0,0.0,1
7,130298.13,145530.06,323876.68,0.0,1.0,1
8,120542.52,148718.95,311613.29,0.0,0.0,1
9,123334.88,108679.17,304981.62,1.0,0.0,1


In [18]:
# creating new data frame for optimization
x_opt = x.copy()
data_opt = pd.DataFrame(x_opt)
y_opt = pd.DataFrame(y)
data_opt = data_opt.join(y_opt)

In [19]:
data_opt.head()
data_opt['RD_Spend'] = data_opt['R&D Spend']
data_opt['Marketing_Spend'] = data_opt['Marketing Spend']
data_opt.head(3)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_NY,State_C,constant,Profit,RD_Spend,Marketing_Spend
0,165349.2,136897.8,471784.1,0.0,0.0,1,192261.83,165349.2,471784.1
1,162597.7,151377.59,443898.53,1.0,0.0,1,191792.06,162597.7,443898.53
2,153441.51,101145.55,407934.54,0.0,1.0,1,191050.39,153441.51,407934.54


In [20]:
R_ols = sm.ols(formula=" Profit ~ RD_Spend + Administration + Marketing_Spend + State_NY + State_C + constant", data = data_opt)

In [21]:
R_ols.fit().summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Thu, 21 May 2020",Prob (F-statistic):,1.34e-27
Time:,16:55:42,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.504e+04,3476.293,7.204,0.000,1.8e+04,3.2e+04
RD_Spend,0.8060,0.046,17.369,0.000,0.712,0.900
Administration,-0.0270,0.052,-0.517,0.608,-0.132,0.078
Marketing_Spend,0.0270,0.017,1.574,0.123,-0.008,0.062
State_NY,41.8870,3256.039,0.013,0.990,-6520.229,6604.003
State_C,240.6758,3338.857,0.072,0.943,-6488.349,6969.701
constant,2.504e+04,3476.293,7.204,0.000,1.8e+04,3.2e+04

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1.05e+22


In [22]:
from sklearn.feature_selection import f_regression 
fr  = f_regression(x_train, y_train)
data_PF = pd.DataFrame((fr[0], fr[1]),index= ('F-values', 'P-values'))
data_PF

Unnamed: 0,0,1,2,3,4
F-values,652.3942,4.725866,49.07662,1.033255,0.009578
P-values,1.56278e-25,0.03601,2.417749e-08,0.315824,0.922551


In [23]:
# backward elemination removing the highest p-value
R_ols_opt2 = sm.ols(formula=" Profit ~ RD_Spend + Administration + Marketing_Spend  + State_C + constant", data = data_opt)
R_ols_opt2.fit().summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Thu, 21 May 2020",Prob (F-statistic):,8.49e-29
Time:,16:55:43,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.505e+04,3323.935,7.537,0.000,1.84e+04,3.17e+04
RD_Spend,0.8060,0.046,17.606,0.000,0.714,0.898
Administration,-0.0270,0.052,-0.523,0.604,-0.131,0.077
Marketing_Spend,0.0270,0.017,1.592,0.118,-0.007,0.061
State_C,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
constant,2.505e+04,3323.935,7.537,0.000,1.84e+04,3.17e+04

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1.04e+22


In [24]:
R_ols_opt3 = sm.ols(formula=" Profit ~ RD_Spend + Administration + Marketing_Spend + constant", data = data_opt)
R_ols_opt3.fit().summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Thu, 21 May 2020",Prob (F-statistic):,4.53e-30
Time:,16:55:43,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.506e+04,3286.176,7.626,0.000,1.84e+04,3.17e+04
RD_Spend,0.8057,0.045,17.846,0.000,0.715,0.897
Administration,-0.0268,0.051,-0.526,0.602,-0.130,0.076
Marketing_Spend,0.0272,0.016,1.655,0.105,-0.006,0.060
constant,2.506e+04,3286.176,7.626,0.000,1.84e+04,3.17e+04

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1.02e+22


In [25]:
R_ols_opt4 = sm.ols(formula=" Profit ~ RD_Spend  + Marketing_Spend   + constant", data = data_opt)
R_ols_opt4.fit().summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Thu, 21 May 2020",Prob (F-statistic):,2.1600000000000003e-31
Time:,16:55:43,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.349e+04,1344.966,17.464,0.000,2.08e+04,2.62e+04
RD_Spend,0.7966,0.041,19.266,0.000,0.713,0.880
Marketing_Spend,0.0299,0.016,1.927,0.060,-0.001,0.061
constant,2.349e+04,1344.966,17.464,0.000,2.08e+04,2.62e+04

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,9.41e+21


In [26]:
R_ols_opt5 = sm.ols(formula=" Profit ~ RD_Spend + constant", data = data_opt)
R_ols_opt5.fit().summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Thu, 21 May 2020",Prob (F-statistic):,3.5000000000000004e-32
Time:,16:55:43,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.452e+04,1268.948,19.320,0.000,2.2e+04,2.71e+04
RD_Spend,0.8543,0.029,29.151,0.000,0.795,0.913
constant,2.452e+04,1268.948,19.320,0.000,2.2e+04,2.71e+04

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,3.14e+21
