## Data Pre-Processing

In [2]:
# Import modules
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Import dataset
dataset = pd.read_csv('datasets/multiple-linear-regression-data.csv')

# Separate the feature matrix (X) from the dependent variable (y).
# `filter(items=[...])` keeps y as a one-column DataFrame rather than a Series.
X = dataset.drop(columns=['Profit'])
y = dataset.filter(items=['Profit'])

# Encode categorical data: one indicator column per level of each variable.
# dtype=int pins the indicators to a numeric dtype; recent pandas versions
# default to bool, which would leave the design matrix with mixed bool/float
# columns.
dummy_vars = ['State']
X_categorical = pd.get_dummies(X[dummy_vars], dtype=int)
X = pd.concat([X_categorical, X.drop(columns=dummy_vars)], axis=1, sort=False)

# Avoiding the dummy variable trap
# Rather than dropping one indicator, 'State_California' is overwritten with a
# constant 1 so that it serves as the constant term of the multiple linear
# regression. `assign` replaces the column on a new frame instead of writing
# through `.values`, which relies on getting a writable view of pandas'
# internal storage.
X = X.assign(State_California=1)

# Move the constant column to the front of the design matrix.
cols = X.columns.tolist()
cols.insert(0, cols.pop(cols.index('State_California')))
X = X.reindex(columns=cols)

# Splitting Training and Test Set (80/20, fixed seed for reproducibility)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Multiple Linear Regression

### Naive All-In Method

In [3]:
# from sklearn.linear_model import LinearRegression
# lr = LinearRegression()
# lr.fit(X_train, y_train)

# # Predicting the Test set results
# y_pred = pd.DataFrame(lr.predict(X_test), columns=y_train.columns.values)

### Backward Elimination Method

In [6]:

# Building an optimal model using Backward Elimination
import statsmodels.regression.linear_model as lm

def backwardElimination(x, y, sl, verbose=True):
    """Select predictors by backward elimination on OLS p-values.

    Repeatedly fits ``lm.OLS(endog=y, exog=x)``, looks up the predictor with
    the largest p-value and drops it while that p-value exceeds ``sl``. The
    procedure stops as soon as no remaining p-value exceeds ``sl`` or no
    predictor is left.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate predictors, one per column. The caller's frame is not
        modified; columns are dropped on a new frame. No constant column is
        added here, so every column of ``x`` (including a constant one, if
        present) is a candidate for removal.
    y : array-like
        Dependent variable, passed unchanged to ``lm.OLS`` as ``endog``.
    sl : float
        Significance level; a predictor is removed while the largest p-value
        is strictly greater than ``sl``.
    verbose : bool, default True
        When True, print progress messages and the model summary of every
        fit. When False, run silently.

    Returns
    -------
    pandas.DataFrame
        ``x`` restricted to the retained columns (possibly zero columns).
    """
    if verbose:
        print('Started backwardElimination:')
        print('--------------------------')

    ols = None       # most recent fit that still matches the columns of x
    iteration = 0
    while x.shape[1] > 0:
        # 2. Fit the model with all predictors, # 5. Fit the model without the highest P-value predictor
        ols = lm.OLS(endog=y, exog=x).fit()
        if verbose:
            print('After fit iteration: {}'.format(iteration))
            print(ols.summary())
        iteration += 1

        # 3. Consider the predictor with the highest P-value, If P > SL, go to 4, otherwise done.
        # The lookup is by label (idxmax) rather than by integer position, so
        # it does not rely on positional indexing of a label-indexed Series.
        # NaN p-values are never candidates for removal.
        pvalues = ols.pvalues.dropna()
        if pvalues.empty or pvalues.max() <= sl:
            if verbose:
                print('All predictors pvalues < sl({}) ... Done'.format(sl))
            break

        # 4. Remove the predictor
        worst = pvalues.idxmax()
        if verbose:
            print('Removing Predictor: {}'.format(worst))
        x = x.drop(columns=[worst])
        # The fit above still contains the removed predictor; do not report
        # it as the final model if the loop ends without another fit.
        ols = None

    if verbose:
        if ols is not None:
            print(ols.summary())
        else:
            print('No predictors left in the model')
        print('Ended backwardElimination:')
        print('--------------------------')
    return x

# 1. Select a significance level to stay in the model --> SL = 0.05
SL = 0.05
# X_opt is a second name for the full design matrix X (not a copy).
X_opt = X
# Run the elimination on the full design matrix and keep the reduced frame.
X_modeled = backwardElimination(x=X, y=y, sl=SL)

Started backwardElimination:
--------------------------
After iteration: 0
                            OLS Regression Results                            
Dep. Variable:                 Profit   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     169.9
Date:                Thu, 09 Apr 2020   Prob (F-statistic):           1.34e-27
Time:                        17:29:19   Log-Likelihood:                -525.38
No. Observations:                  50   AIC:                             1063.
Df Residuals:                      44   BIC:                             1074.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------