# Jonathan Halverson
# Monday, February 5, 2018
# Hypothesis testing applied to linear regression coefficients

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('halverson')
import pandas as pd

In [2]:
df = pd.read_csv('../machine_learning/Advertising.csv', index_col=0)
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [3]:
df.corr()

Unnamed: 0,TV,Radio,Newspaper,Sales
TV,1.0,0.054809,0.056648,0.782224
Radio,0.054809,1.0,0.354104,0.576223
Newspaper,0.056648,0.354104,1.0,0.228299
Sales,0.782224,0.576223,0.228299,1.0


In [4]:
df.describe()

Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,14.0225
std,85.854236,14.846809,21.778621,5.217457
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,10.375
50%,149.75,22.9,25.75,12.9
75%,218.825,36.525,45.1,17.4
max,296.4,49.6,114.0,27.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 1 to 200
Data columns (total 4 columns):
TV           200 non-null float64
Radio        200 non-null float64
Newspaper    200 non-null float64
Sales        200 non-null float64
dtypes: float64(4)
memory usage: 7.8 KB


In [6]:
import statsmodels.formula.api as smf

result = smf.ols(formula='Sales ~ TV + Radio + Newspaper', data=df).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     570.3
Date:                Mon, 05 Feb 2018   Prob (F-statistic):           1.58e-96
Time:                        19:01:17   Log-Likelihood:                -386.18
No. Observations:                 200   AIC:                             780.4
Df Residuals:                     196   BIC:                             793.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.9389      0.312      9.422      0.0

### Bootstrapping approach to p-values

In [7]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()

In [8]:
intercept_and_coeffs = []
for _ in xrange(10000):
     x = df.sample(n=df.shape[0], replace=True)
     linreg = linreg.fit(x[['TV', 'Radio', 'Newspaper']], x.Sales)
     intercept_and_coeffs.append(np.append(np.array([linreg.intercept_]), linreg.coef_))
weights = np.array(intercept_and_coeffs)

In [9]:
means = weights.mean(axis=0)
means

array([  2.95195641e+00,   4.56741237e-02,   1.88531098e-01,
        -8.85777340e-04])

In [10]:
SE = weights.std(axis=0)
SE

array([ 0.34011473,  0.00192975,  0.01082248,  0.00644772])

In [11]:
T = means / SE
T

array([  8.67929601,  23.66838823,  17.42031678,  -0.13737836])

In [12]:
from scipy.stats import t

In [13]:
p_values = [2 * t.cdf(-abs(t_stat), 200 - 3) for t_stat in T]
p_values

[1.498566965602257e-15,
 1.6669509204948239e-59,
 9.5013802818167565e-42,
 0.89087204026074063]

We see that that intercept and two of the coefficients are statistically significant from zero. The coefficient for the Newspaper feature can not be distinguished from zero so that feature should be removed.