In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# assumptions of linear regression
- linearity - assuming each independent variable has a linear relationship with the dependent variable
- homoscedasticity - the residuals (errors) have the same variance across all values of the independent variables
    - failure would be if lower values are closer to the regression line while higher values are further away
- multivariate normality - assume residuals are normally distributed
- independence of errors - errors do not depend on one another
- lack of multicollinearity - no independent variable can be linearly predicted from the other independent variables

In [23]:
# Load the dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv('50_Startups.csv')

In [24]:
# Preview the first rows to check the column layout (three spend columns, State, Profit).
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


# what is a p-value
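- the probability, assuming the null hypothesis is true, of observing a result at least as extreme as the one in the sample
- larger p-value -> the sample is consistent with the null hypothesis (weak evidence against it, which is not proof that it is correct)
- smaller p-value -> stronger evidence against the null hypothesis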

# 5 methods of building models
- all in 
    - put all variables into model 
    - based on prior/domain knowledge
    - preparing for backward elimination
- backward elimination
    1. select sig level to stay in model i.e SL = .05
    2. fit full model with all indep. vars
    3. consider the predictor with highest p-value if p-value > SL otherwise go to end
    4. remove the predictor 
    5. fit model without this variable, then go back to step 3
    6. finished - all remaining predictors have p-values less than SL
- forward selection
    1. select sig level to enter the model i.e SL = .05
    2. fit all simple reg models y ~ $x_n$ and select the one with the lowest p-value
    3. keep this variable and fit all possible models with one extra predictor added to the ones you already have
    4. consider the predictor with the lowest p-value, if p-value < SL then go to step 3, otherwise go to step 5
    5. finished - keep the previous model (the one without the last, non-significant predictor)
    
- bidirectional elimination 
    1. select a sig level to enter and stay in the model
        - SLENTER = .05 SLSTAY = .05
    2. perform the next step of forward selection (new variable must have P < SLENTER to enter)
    3. perform ALL steps of backward elimination (old variables must have P < SLSTAY), then go back to step 2
    4. no new variables can enter or stay -> finished model
- score comparison
    1. select criterion for success
    2. construct all possible reg model $2^n - 1$ total combos
    3. select one which meets criterion the best
    

In [97]:
# Features are every column except the last; the target is the last column (Profit).
# Both selections use the relative index -1 so they stay in agreement with each other
# instead of mixing a relative index with a hard-coded position.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [98]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [99]:
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical State column (position 3) and pass the numeric
# columns through unchanged. `OneHotEncoder(categorical_features=...)` has been
# removed from scikit-learn; ColumnTransformer is its replacement. OneHotEncoder
# accepts string categories directly, so no LabelEncoder step is needed.
one_hot = ColumnTransformer(
    transformers=[('state', OneHotEncoder(), [3])],
    remainder='passthrough',  # numeric columns are kept and placed after the dummies
    sparse_threshold=0,       # always return a dense array
)
# The input mixes strings and numbers (object dtype), so cast the stacked result to float.
X = np.asarray(one_hot.fit_transform(X)).astype(float)

# avoiding the dummy variable trap: drop the first dummy column
X = X[:, 1:]

In [100]:
# Preview the encoded matrix: the leading columns are the remaining state dummies,
# followed by the numeric spend columns.
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,165349.2,136897.8,471784.1
1,0.0,0.0,162597.7,151377.59,443898.53
2,1.0,0.0,153441.51,101145.55,407934.54
3,0.0,1.0,144372.41,118671.85,383199.62
4,1.0,0.0,142107.34,91391.77,366168.42


In [101]:
# `sklearn.cross_validation` was removed from scikit-learn; `train_test_split`
# now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

In [102]:
from sklearn.linear_model import LinearRegression

# Fit on the training split only. Fitting on the full X, y would let the model
# see the test rows, making any evaluation on X_test overly optimistic.
linear_reg = LinearRegression()
trained_reg = linear_reg.fit(X_train, y_train)  # fit() returns the estimator itself

In [103]:
# Predict the target for the held-out rows in X_test.
y_pred = trained_reg.predict(X_test)

In [104]:
# The array-based `OLS` class is exposed by `statsmodels.api`. Current releases of
# `statsmodels.formula.api` only provide the lowercase formula interface (e.g. `ols`),
# so `sm.OLS` would raise AttributeError with the formula import.
import statsmodels.api as sm

In [105]:
# Prepend a column of ones so that the statsmodels OLS fit includes a constant
# (intercept) term; OLS does not add one on its own.
# The row count is taken from X instead of being hard-coded to 50.
# NOTE: this cell is not idempotent - re-running it prepends another column of ones.
X = np.append(arr=np.ones((X.shape[0], 1), dtype=int), values=X, axis=1)

# doing backward elimination by hand

In [107]:
# Step 1: fit the full model on every column.
# Column layout (per the encoded preview plus the prepended ones column):
# 0 = intercept, 1-2 = state dummies, 3 = R&D Spend, 4 = Administration, 5 = Marketing Spend.
X_opt = X[:, [0,1,2,3,4,5]]
ols_regressor = sm.OLS(endog=y, exog=X_opt).fit()
# Uncomment to inspect the p-values that decide which column to drop next.
#ols_regressor.summary()

In [109]:
# Step 2: refit without column 2 (one of the state dummies) - presumably the
# predictor with the highest p-value in the previous fit.
X_opt = X[:, [0,1,3,4,5]]
ols_regressor = sm.OLS(endog=y, exog=X_opt).fit()
# Uncomment to inspect the p-values that decide which column to drop next.
#ols_regressor.summary()

In [111]:
# Step 3: refit without column 1 (the remaining state dummy), leaving the
# intercept and the three numeric spend columns.
X_opt = X[:, [0,3,4,5]]
ols_regressor = sm.OLS(endog=y, exog=X_opt).fit()
# Uncomment to inspect the p-values that decide which column to drop next.
#ols_regressor.summary()

In [113]:
# Step 4: refit without column 4 (Administration), keeping the intercept,
# R&D Spend (3) and Marketing Spend (5).
X_opt = X[:, [0,3,5]]
ols_regressor = sm.OLS(endog=y, exog=X_opt).fit()
# Uncomment to inspect the p-values that decide which column to drop next.
#ols_regressor.summary()

In [119]:
# Step 5: refit without column 5 (Marketing Spend); only the intercept and
# R&D Spend (3) remain in the model.
X_opt = X[:, [0,3]]
ols_regressor = sm.OLS(endog=y, exog=X_opt).fit()
# Uncomment to inspect the final model summary.
#ols_regressor.summary()

In [138]:
def backwards_elimination(X, indices, sl, endog=None):
    """Select columns of ``X`` by backward elimination on OLS p-values.

    Starting from the columns listed in ``indices``, an OLS model is fitted
    repeatedly; after each fit the column with the highest p-value is dropped,
    until every remaining p-value is at most ``sl``.

    Parameters
    ----------
    X : 2-D array
        Design matrix; columns are selected with ``X[:, indices]``.
    indices : sequence of int
        Column positions to start from. The sequence is copied and is NOT
        modified (the original implementation deleted from it in place).
    sl : float
        Significance level; a column is removed while its p-value is > ``sl``.
    endog : array-like, optional
        Dependent variable. When omitted, the notebook-level variable ``y``
        is used, which preserves the original calling convention.

    Returns
    -------
    list of int
        The column positions that remain. Empty if every column was removed
        (or if ``indices`` was empty).
    """
    target = y if endog is None else endog
    remaining = list(indices)  # work on a copy so the caller's list is untouched

    # Stop when nothing is left to fit; an OLS fit on zero columns would fail.
    while remaining:
        fitted = sm.OLS(endog=target, exog=X[:, remaining]).fit()
        pvals = np.asarray(fitted.pvalues)
        worst = int(np.argmax(pvals))  # first position of the largest p-value
        # `not (p > sl)` also terminates on a NaN p-value, as the original loop did.
        if not pvals[worst] > sl:
            break
        del remaining[worst]
    return remaining

In [139]:
# Run the automated elimination starting from all six columns at a 5% significance level.
backwards_elimination(X, [0, 1, 2, 3, 4, 5], .05)

[0, 1, 3, 4, 5]
[0, 3, 4, 5]
[0, 3, 5]
[0, 3]


[0, 3]