# 4. Multiple Regression Analysis: Inference

In [1]:
%pip install numpy statsmodels wooldridge scipy -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import statsmodels.formula.api as smf
import wooldridge as wool
from scipy import stats

## 4.1 The $t$ Test

### 4.1.1 General Setup

$$H_0: \beta_j = a_j$$

$$H_1: \beta_j \neq a_j \quad \text{or} \quad H_1:\beta_j > a_j \quad \text{or} \quad H_1:\beta_j < a_j$$

$$t = \frac{\hat{\beta}_j - a_j}{se(\hat{\beta}_j)}$$



### 4.1.2 Standard Case

$$H_0: \beta_j = 0, \qquad H_1: \beta_j \neq 0$$

$$t_{\hat{\beta}_j} = \frac{\hat{\beta}_j}{se(\hat{\beta}_j)}$$

$$\text{reject } H_0 \text{ if } |t_{\hat{\beta}_j}| > c$$

$$p_{\hat{\beta}_j} = 2 \cdot F_{t_{n-k-1}}(-|t_{\hat{\beta}_j}|)$$

$$\text{reject } H_0 \text{ if } p_{\hat{\beta}_j} < \alpha$$

### Example 4.3: Determinants of College GPA

$$\text{colGPA} = \beta_0 + \beta_1 \text{hsGPA} + \beta_2 \text{ACT} + \beta_3 \text{skipped} + u$$

In [3]:
# Two-sided critical values at the 5% and 1% levels from the t distribution
# with 137 degrees of freedom (each tail receives alpha / 2):
alpha = np.array([0.05, 0.01])
cv_t = stats.t(df=137).ppf(1 - alpha / 2)
print(f"cv_t: {cv_t}\n")

cv_t: [1.97743121 2.61219198]



In [4]:
# The same two-sided critical values, this time taken from the standard
# normal distribution as an approximation to the t distribution:
cv_n = stats.norm().ppf(1 - alpha / 2)
print(f"cv_n: {cv_n}\n")

cv_n: [1.95996398 2.5758293 ]



In [5]:
# Load the gpa1 data set from the wooldridge package:
gpa1 = wool.data("gpa1")

# Specify the college GPA equation, estimate it by OLS, and print the full
# regression table:
reg = smf.ols("colGPA ~ hsGPA + ACT + skipped", data=gpa1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:                 colGPA   R-squared:                       0.234
Model:                            OLS   Adj. R-squared:                  0.217
Method:                 Least Squares   F-statistic:                     13.92
Date:                Tue, 24 Sep 2024   Prob (F-statistic):           5.65e-08
Time:                        14:59:27   Log-Likelihood:                -41.501
No. Observations:                 141   AIC:                             91.00
Df Residuals:                     137   BIC:                             102.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.3896      0.332

In [6]:
# Manually confirm the formulas behind the regression table.
# Extract the coefficient estimates and their standard errors:
b = results.params
se = results.bse

# reproduce t statistic (H0: beta_j = 0, so t = b / se):
tstat = b / se
print(f"tstat: \n{tstat}\n")

# reproduce the two-sided p value, p = 2 * F_t(-|t|). The residual degrees of
# freedom (n - k - 1) are read from the fitted model instead of being
# hard-coded, so the cell stays correct if the model or the sample changes:
pval = 2 * stats.t.cdf(-abs(tstat), results.df_resid)
print(f"pval: \n{pval}\n")

tstat: 
Intercept    4.191039
hsGPA        4.396260
ACT          1.393319
skipped     -3.196840
dtype: float64

pval: 
[4.95026897e-05 2.19205015e-05 1.65779902e-01 1.72543113e-03]



### 4.1.3 Other Hypotheses

### Example 4.1: Hourly Wage Equation

$$\log(\text{wage}) = \beta_0 + \beta_1 \text{educ} + \beta_2 \text{exper} + \beta_3 \text{tenure} + u$$

In [7]:
# One-sided critical values at the 5% and 1% levels from the t distribution
# with 522 degrees of freedom (the whole alpha sits in a single tail):
alpha = np.array([0.05, 0.01])
cv_t = stats.t(df=522).ppf(1 - alpha)
print(f"cv_t: {cv_t}\n")

cv_t: [1.64777794 2.33351273]



In [8]:
# The same one-sided critical values, this time taken from the standard
# normal distribution as an approximation to the t distribution:
cv_n = stats.norm().ppf(1 - alpha)
print(f"cv_n: {cv_n}\n")

cv_n: [1.64485363 2.32634787]



In [9]:
# Load the wage1 data set from the wooldridge package:
wage1 = wool.data("wage1")

# Estimate the log-wage equation by OLS and print the regression table:
reg = smf.ols("np.log(wage) ~ educ + exper + tenure", data=wage1)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:           np.log(wage)   R-squared:                       0.316
Model:                            OLS   Adj. R-squared:                  0.312
Method:                 Least Squares   F-statistic:                     80.39
Date:                Tue, 24 Sep 2024   Prob (F-statistic):           9.13e-43
Time:                        14:59:28   Log-Likelihood:                -313.55
No. Observations:                 526   AIC:                             635.1
Df Residuals:                     522   BIC:                             652.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.2844      0.104

## 4.2 Confidence Intervals

$$\hat{\beta}_j \pm c \cdot se(\hat{\beta}_j)$$

### Example 4.8: Model of R&D Expenditures

$$\log(\text{rd}) = \beta_0 + \beta_1 \log(\text{sales}) + \beta_2 \text{profmarg} + u$$

In [10]:
# Load the rdchem data set from the wooldridge package:
rdchem = wool.data("rdchem")

# Estimate the log R&D equation by OLS and print the regression table:
reg = smf.ols("np.log(rd) ~ np.log(sales) + profmarg", data=rdchem)
results = reg.fit()
print(f"results.summary(): \n{results.summary()}\n")

results.summary(): 
                            OLS Regression Results                            
Dep. Variable:             np.log(rd)   R-squared:                       0.918
Model:                            OLS   Adj. R-squared:                  0.912
Method:                 Least Squares   F-statistic:                     162.2
Date:                Tue, 24 Sep 2024   Prob (F-statistic):           1.79e-16
Time:                        14:59:28   Log-Likelihood:                -22.511
No. Observations:                  32   AIC:                             51.02
Df Residuals:                      29   BIC:                             55.42
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -4.3783  

In [11]:
# 95% confidence intervals for every coefficient (significance level 5%):
CI95 = results.conf_int(alpha=0.05)
print(f"CI95: \n{CI95}\n")

CI95: 
                      0         1
Intercept     -5.335478 -3.421068
np.log(sales)  0.961107  1.207332
profmarg      -0.004488  0.047799



In [12]:
# 99% confidence intervals for every coefficient (significance level 1%):
CI99 = results.conf_int(alpha=0.01)
print(f"CI99: \n{CI99}\n")

CI99: 
                      0         1
Intercept     -5.668313 -3.088234
np.log(sales)  0.918299  1.250141
profmarg      -0.013578  0.056890



## 4.3 Linear Restrictions: $F$ Tests

$$\log(\text{salary}) = \beta_0 + \beta_1 \text{years} + \beta_2 \text{gamesyr} + \beta_3 \text{bavg} + \beta_4 \text{hrunsyr} + \beta_5 \text{rbisyr} + u$$

$$F = \frac{SSR_r - SSR_{ur}}{SSR_{ur}} \cdot \frac{n - k - 1}{q} = \frac{R^2_{ur} - R^2_r}{1 - R^2_{ur}} \cdot \frac{n - k - 1}{q}$$

In [13]:
# Load the mlb1 data set and record the number of rows:
mlb1 = wool.data("mlb1")
n = len(mlb1)

# Unrestricted model: all five regressors enter the log-salary equation.
reg_ur = smf.ols(
    "np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr", data=mlb1
)
fit_ur = reg_ur.fit()
r2_ur = fit_ur.rsquared
print(f"r2_ur: {r2_ur}\n")

r2_ur: 0.6278028485187442



In [14]:
# Restricted model: only years and gamesyr remain as regressors in the
# log-salary equation.
reg_r = smf.ols("np.log(salary) ~ years + gamesyr", data=mlb1)
fit_r = reg_r.fit()
r2_r = fit_r.rsquared
print(f"r2_r: {r2_r}\n")

r2_r: 0.5970716339066895



In [15]:
# F statistic in its R-squared form:
#   F = (R2_ur - R2_r) / (1 - R2_ur) * (n - k - 1) / q
# Both degrees of freedom are taken from the fitted models rather than from
# hard-coded numbers: the residual d.f. of the unrestricted model is
# n - k - 1, and the number of restrictions q is the difference between the
# residual d.f. of the restricted and the unrestricted model.
df_ur = fit_ur.df_resid
q = fit_r.df_resid - fit_ur.df_resid
fstat = (r2_ur - r2_r) / (1 - r2_ur) * df_ur / q
print(f"fstat: {fstat}\n")

fstat: 9.550253521951914



In [16]:
# Critical value at the 1% level from the F distribution with 3 numerator
# and 347 denominator degrees of freedom:
cv = stats.f.ppf(1 - 0.01, dfn=3, dfd=347)
print(f"cv: {cv}\n")

cv: 3.838520048496057



In [17]:
# p value = upper-tail probability P(F > fstat) of the F distribution with
# 3 and 347 d.f., i.e. 1 - cdf. The survival function evaluates this tail
# directly, which avoids the loss of precision that comes from subtracting a
# cdf value very close to 1:
fpval = stats.f.sf(fstat, 3, 347)
print(f"fpval: {fpval}\n")

fpval: 4.473708139829391e-06



In [18]:
# Load the mlb1 data set from the wooldridge package:
mlb1 = wool.data("mlb1")

# Fit the log-salary equation with all five regressors:
reg = smf.ols(
    "np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr", data=mlb1
)
results = reg.fit()

# Let statsmodels run the F test of the joint null hypothesis that the
# coefficients on bavg, hrunsyr and rbisyr are all zero:
hypotheses = ["bavg = 0", "hrunsyr = 0", "rbisyr = 0"]
ftest = results.f_test(hypotheses)
fstat, fpval = ftest.statistic, ftest.pvalue

print(f"fstat: {fstat}\n")
print(f"fpval: {fpval}\n")

fstat: 9.550253521951873

fpval: 4.4737081398389455e-06



In [19]:
# Load the mlb1 data set from the wooldridge package:
mlb1 = wool.data("mlb1")

# Fit the log-salary equation with all five regressors:
reg = smf.ols(
    "np.log(salary) ~ years + gamesyr + bavg + hrunsyr + rbisyr", data=mlb1
)
results = reg.fit()

# Let statsmodels run the F test of two joint linear restrictions: the
# coefficient on bavg is zero, and the coefficient on hrunsyr equals twice
# the coefficient on rbisyr:
hypotheses = ["bavg = 0", "hrunsyr = 2*rbisyr"]
ftest = results.f_test(hypotheses)
fstat, fpval = ftest.statistic, ftest.pvalue

print(f"fstat: {fstat}\n")
print(f"fpval: {fpval}\n")

fstat: 0.5117822576247315

fpval: 0.5998780329146685

