# Multiple Linear Regression - Multicollinearity

<hr>

## Importing packages and reading the data ##

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import sklearn.metrics as metrics
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the advertising-budget dataset; the relative path is resolved against
# the kernel's current working directory.
DATA_CSV = "../datasets/MLR_advertising_budget.csv"
df = pd.read_csv(DATA_CSV)

In [16]:
# Print a structural summary of the frame: column labels, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   TV Ad Budget              200 non-null    float64
 1   Radio Ad Budget           200 non-null    float64
 2   Newspaper Ad Budget       200 non-null    float64
 3   Advertising Budget total  200 non-null    float64
 4   Sales                     200 non-null    float64
dtypes: float64(5)
memory usage: 7.9 KB


In [17]:
# Pairwise correlation between every pair of columns; the sign is discarded
# so that only the strength of each linear relationship is displayed.
pairwise_corr = df.corr()
pairwise_corr.abs()

Unnamed: 0,TV Ad Budget,Radio Ad Budget,Newspaper Ad Budget,Advertising Budget total,Sales
TV Ad Budget,1.0,0.054809,0.056648,0.94533,0.782224
Radio Ad Budget,0.054809,1.0,0.354104,0.293211,0.576223
Newspaper Ad Budget,0.056648,0.354104,1.0,0.343059,0.228299
Advertising Budget total,0.94533,0.293211,0.343059,1.0,0.867712
Sales,0.782224,0.576223,0.228299,0.867712,1.0


In [4]:
# Show the exact column labels (they contain spaces); the modelling cells
# below select features by these labels.
df.columns

Index(['TV Ad Budget', 'Radio Ad Budget', 'Newspaper Ad Budget',
       'Advertising Budget total', 'Sales'],
      dtype='object')

## Split the DataFrame into train and test data

In [5]:
# Partition the rows into a training frame and a hold-out frame.
# A fixed random_state makes the shuffle, and therefore every result below,
# reproducible from run to run.
split_frames = train_test_split(df, random_state=13)
train, test = split_frames

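## Train the model

First attempt: all four budget columns are used as predictors, including `Advertising Budget total`. In the summary below, note that `Df Model` is 3 even though four predictors were supplied, and that the condition number is extremely large (1.33e+16); together these indicate that one predictor is an exact linear combination of the others.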

In [19]:
# Target column and the four candidate predictors, all taken from the training split
predictor_columns = [
    'TV Ad Budget',
    'Radio Ad Budget',
    'Newspaper Ad Budget',
    'Advertising Budget total',
]
dependent_var = train['Sales']
# sm.OLS does not add an intercept on its own, so a constant column is added here
independent_var = sm.add_constant(train[predictor_columns])

# Fit ordinary least squares, then display the regression report
ols_spec = sm.OLS(endog=dependent_var, exog=independent_var)
model = ols_spec.fit()

model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.881
Model:,OLS,Adj. R-squared:,0.879
Method:,Least Squares,F-statistic:,361.1
Date:,"Fri, 14 Apr 2023",Prob (F-statistic):,2.56e-67
Time:,22:18:18,Log-Likelihood:,-295.9
No. Observations:,150,AIC:,599.8
Df Residuals:,146,BIC:,611.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9395,0.394,7.464,0.000,2.161,3.718
TV Ad Budget,-0.0109,0.003,-3.808,0.000,-0.017,-0.005
Radio Ad Budget,0.1270,0.009,14.646,0.000,0.110,0.144
Newspaper Ad Budget,-0.0589,0.007,-8.368,0.000,-0.073,-0.045
Advertising Budget total,0.0571,0.003,21.485,0.000,0.052,0.062

0,1,2,3
Omnibus:,47.653,Durbin-Watson:,2.259
Prob(Omnibus):,0.0,Jarque-Bera (JB):,112.309
Skew:,-1.334,Prob(JB):,4.1e-25
Kurtosis:,6.293,Cond. No.,1.33e+16


In [20]:
# Predict Sales for the hold-out rows, using the same four predictor columns
# the model in the training cell was fitted on.
holdout_columns = ['TV Ad Budget', 'Radio Ad Budget', 'Newspaper Ad Budget',
                   'Advertising Budget total']
# has_constant='add' always inserts the intercept column. The default ('skip')
# silently leaves it out when the frame already contains a constant-valued
# column (e.g. a one-row or degenerate hold-out set), which would misalign the
# design matrix with the fitted coefficients.
holdout_design = sm.add_constant(test[holdout_columns], has_constant='add')
predicted = model.predict(holdout_design)

In [21]:
# Coefficient of determination of the predictions against the observed
# hold-out Sales values (last expression, so the notebook displays it).
metrics.r2_score(y_true=test['Sales'], y_pred=predicted)

0.9329679186406997

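## Train the model - Second attempt

`Advertising Budget total` is dropped and only the three individual budgets are kept. R-squared (0.881) and the test-set score are the same as in the first attempt (up to floating-point rounding), but the coefficients change (the `TV Ad Budget` coefficient switches from negative to positive) and the condition number falls to 462.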

In [22]:
# Target is still Sales; the predictors are now only the three channel budgets
channel_columns = ['TV Ad Budget', 'Radio Ad Budget', 'Newspaper Ad Budget']
dependent_var = train['Sales']
# sm.OLS does not add an intercept on its own, so a constant column is added here
independent_var = sm.add_constant(train[channel_columns])

# Refit ordinary least squares on the reduced design matrix and display the report
ols_spec = sm.OLS(endog=dependent_var, exog=independent_var)
model = ols_spec.fit()

model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.881
Model:,OLS,Adj. R-squared:,0.879
Method:,Least Squares,F-statistic:,361.1
Date:,"Fri, 14 Apr 2023",Prob (F-statistic):,2.56e-67
Time:,22:18:55,Log-Likelihood:,-295.9
No. Observations:,150,AIC:,599.8
Df Residuals:,146,BIC:,611.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9395,0.394,7.464,0.000,2.161,3.718
TV Ad Budget,0.0462,0.002,27.242,0.000,0.043,0.050
Radio Ad Budget,0.1841,0.010,17.718,0.000,0.164,0.205
Newspaper Ad Budget,-0.0018,0.008,-0.237,0.813,-0.017,0.013

0,1,2,3
Omnibus:,47.653,Durbin-Watson:,2.259
Prob(Omnibus):,0.0,Jarque-Bera (JB):,112.309
Skew:,-1.334,Prob(JB):,4.1e-25
Kurtosis:,6.293,Cond. No.,462.0


In [23]:
# Predict Sales for the hold-out rows, using the same three predictor columns
# the model in the training cell was fitted on.
holdout_columns = ['TV Ad Budget', 'Radio Ad Budget', 'Newspaper Ad Budget']
# has_constant='add' always inserts the intercept column. The default ('skip')
# silently leaves it out when the frame already contains a constant-valued
# column (e.g. a one-row or degenerate hold-out set), which would misalign the
# design matrix with the fitted coefficients.
holdout_design = sm.add_constant(test[holdout_columns], has_constant='add')
predicted = model.predict(holdout_design)
# R-squared on the hold-out rows (last expression, so the notebook displays it)
metrics.r2_score(test['Sales'], predicted)

0.9329679186406996

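## Train the model - Third attempt

`Newspaper Ad Budget` is dropped as well; in the second attempt its coefficient is not statistically significant (p = 0.813, with a confidence interval that spans zero). Only the TV and radio budgets remain as predictors.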

In [24]:
# Target is still Sales; only the TV and radio budgets remain as predictors
retained_columns = ['TV Ad Budget', 'Radio Ad Budget']
dependent_var = train['Sales']
# sm.OLS does not add an intercept on its own, so a constant column is added here
independent_var = sm.add_constant(train[retained_columns])

# Refit ordinary least squares on the two-predictor design matrix and display the report
ols_spec = sm.OLS(endog=dependent_var, exog=independent_var)
model = ols_spec.fit()

model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.881
Model:,OLS,Adj. R-squared:,0.88
Method:,Least Squares,F-statistic:,545.2
Date:,"Fri, 14 Apr 2023",Prob (F-statistic):,9.98e-69
Time:,22:21:32,Log-Likelihood:,-295.93
No. Observations:,150,AIC:,597.9
Df Residuals:,147,BIC:,606.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9042,0.363,7.990,0.000,2.186,3.622
TV Ad Budget,0.0462,0.002,27.373,0.000,0.043,0.050
Radio Ad Budget,0.1832,0.010,18.945,0.000,0.164,0.202

0,1,2,3
Omnibus:,47.081,Durbin-Watson:,2.257
Prob(Omnibus):,0.0,Jarque-Bera (JB):,108.623
Skew:,-1.329,Prob(JB):,2.59e-24
Kurtosis:,6.212,Cond. No.,423.0


In [25]:
# Predict Sales for the hold-out rows, using the same two predictor columns
# the model in the training cell was fitted on.
holdout_columns = ['TV Ad Budget', 'Radio Ad Budget']
# has_constant='add' always inserts the intercept column. The default ('skip')
# silently leaves it out when the frame already contains a constant-valued
# column (e.g. a one-row or degenerate hold-out set), which would misalign the
# design matrix with the fitted coefficients.
holdout_design = sm.add_constant(test[holdout_columns], has_constant='add')
predicted = model.predict(holdout_design)
# R-squared on the hold-out rows (last expression, so the notebook displays it)
metrics.r2_score(test['Sales'], predicted)

0.9330669780312202