# 7. Multiple Regression Analysis with Qualitative Regressors

## 7.1. Linear Regression with Dummy Variables as Regressors

### Example 7.1: Hourly Wage Equation

In [1]:
from supplementaryFunctions import *
import wooldridge as woo
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the wage1 dataset through the wooldridge package.
wage1 = woo.dataWoo("wage1")

# Hourly wage in levels regressed on the female dummy plus education,
# experience and tenure.
reg = smf.ols(
    formula="wage ~ female + educ + exper + tenure",
    data=wage1,
)
results = reg.fit()

# Show the coefficient table for the fitted model.
OLS_summary(results)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.3635$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,-1.5679,-2.164,0.0309,0.7246
female,-1.8109,-6.8379,0.0,0.2648
educ,0.5715,11.5836,0.0,0.0493
exper,0.0254,2.1951,0.0286,0.0116
tenure,0.141,6.6632,0.0,0.0212


In [2]:
# List the variables available in wage1 (for a DataFrame, keys() is the
# column index).
wage1.columns

Index(['wage', 'educ', 'exper', 'tenure', 'nonwhite', 'female', 'married',
       'numdep', 'smsa', 'northcen', 'south', 'west', 'construc', 'ndurman',
       'trcommpu', 'trade', 'services', 'profserv', 'profocc', 'clerocc',
       'servocc', 'lwage', 'expersq', 'tenursq'],
      dtype='object')

In [3]:
# Log-wage equation: married*female expands to both dummies plus their
# interaction; experience and tenure enter with quadratic terms.
reg = smf.ols(
    formula=(
        "np.log(wage) ~ married*female + educ + exper"
        " + I(exper**2) + tenure + I(tenure**2)"
    ),
    data=wage1,
)
results = reg.fit()
OLS_summary(results)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.4609$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.3214,3.2135,0.0014,0.1
married,0.2127,3.8419,0.0001,0.0554
female,-0.1104,-1.9797,0.0483,0.0557
married:female,-0.3006,-4.1885,0.0,0.0718
educ,0.0789,11.7873,0.0,0.0067
exper,0.0268,5.1118,0.0,0.0052
I(exper ** 2),-0.0005,-4.8471,0.0,0.0001
tenure,0.0291,4.3016,0.0,0.0068
I(tenure ** 2),-0.0005,-2.3056,0.0215,0.0002


In [4]:
# Save the data used for a regression in a single, concise DataFrame.
def regression_df(reg):
    """Combine a model's regressors and dependent variable into one DataFrame.

    Parameters
    ----------
    reg : object
        Model object exposing ``reg.data.orig_exog`` (regressor DataFrame)
        and ``reg.data.orig_endog`` (dependent-variable data whose ``keys()``
        are its column labels).

    Returns
    -------
    pandas.DataFrame
        A new DataFrame holding the regressor columns followed by the
        dependent-variable column(s).
    """
    # Copy first: assigning into ``orig_exog`` directly would add the
    # dependent variable to the DataFrame that the model object itself holds.
    reg_df = reg.data.orig_exog.copy()
    reg_df[list(reg.data.orig_endog.keys())] = reg.data.orig_endog
    return reg_df

# Build the combined frame for the most recently specified model (`reg`)
# and display it.
reg_df = regression_df(reg)
reg_df

Unnamed: 0,Intercept,married,female,married:female,educ,exper,I(exper ** 2),tenure,I(tenure ** 2),np.log(wage)
0,1.0,0.0,1.0,0.0,11.0,2.0,4.0,0.0,0.0,1.131402
1,1.0,1.0,1.0,1.0,12.0,22.0,484.0,2.0,4.0,1.175573
2,1.0,0.0,0.0,0.0,11.0,2.0,4.0,0.0,0.0,1.098612
3,1.0,1.0,0.0,0.0,8.0,44.0,1936.0,28.0,784.0,1.791759
4,1.0,1.0,0.0,0.0,12.0,7.0,49.0,2.0,4.0,1.667707
...,...,...,...,...,...,...,...,...,...,...
521,1.0,1.0,1.0,1.0,16.0,14.0,196.0,2.0,4.0,2.708050
522,1.0,0.0,1.0,0.0,10.0,2.0,4.0,0.0,0.0,0.819780
523,1.0,1.0,0.0,0.0,15.0,13.0,169.0,18.0,324.0,1.541159
524,1.0,1.0,0.0,0.0,16.0,5.0,25.0,1.0,1.0,2.447551


## 7.2 Boolean Variables

Let's check whether being a parent affects the expected wage. If _"numdep"_ is positive for an observation, the new variable _"parent"_ is `True`; otherwise it is `False`.

In [5]:
# Boolean flag: True when the observation reports at least one dependent.
wage1["parent"] = wage1["numdep"].gt(0)

In [6]:
# Same log-wage specification as before, with the boolean `parent`
# regressor added.
reg = smf.ols(
    formula=(
        "np.log(wage) ~ married*female + educ + exper + parent"
        " + I(exper**2) + tenure + I(tenure**2)"
    ),
    data=wage1,
)
results = reg.fit()
OLS_summary(results)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.4654$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.3808,3.6745,0.0003,0.1036
parent[T.True],-0.0808,-2.0968,0.0365,0.0386
married,0.2424,4.2551,0.0,0.057
female,-0.0893,-1.5814,0.1144,0.0565
married:female,-0.3227,-4.4628,0.0,0.0723
educ,0.0754,10.9653,0.0,0.0069
exper,0.0289,5.4313,0.0,0.0053
I(exper ** 2),-0.0006,-5.2513,0.0,0.0001
tenure,0.029,4.3032,0.0,0.0067
I(tenure ** 2),-0.0005,-2.3233,0.0205,0.0002


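Adding _"parent"_ does not greatly improve the explanation of variance ($r^2$ rises only from 0.4609 to 0.4654), though it does affect the statistical significance of _"female"_, whose p-value rises from 0.0483 to 0.1144. Let's try adding an interaction term between _"parent"_ and _"female"_.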

In [7]:
# Let the parent effect differ by gender: parent*female adds the
# parent:female interaction alongside the main effects.
reg = smf.ols(
    formula=(
        "np.log(wage) ~ married*female + educ + exper + parent*female"
        " + I(exper**2) + tenure + I(tenure**2)"
    ),
    data=wage1,
)
results = reg.fit()
OLS_summary(results)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.4657$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.3794,3.6571,0.0003,0.1037
parent[T.True],-0.0637,-1.2037,0.2293,0.0529
married,0.2377,4.1071,0.0,0.0579
female,-0.076,-1.205,0.2288,0.0631
parent[T.True]:female,-0.0344,-0.4719,0.6372,0.0728
married:female,-0.3158,-4.2769,0.0,0.0738
educ,0.0753,10.9302,0.0,0.0069
exper,0.0289,5.421,0.0,0.0053
I(exper ** 2),-0.0006,-5.2442,0.0,0.0001
tenure,0.0283,4.1116,0.0,0.0069


Again, we have not greatly improved the explanation of variance ($r^2$ moves only from 0.4654 to 0.4657), and the new interaction term _"parent[T.True]:female"_ is not statistically significant. The estimate for the interaction term _"married:female"_, however, remains stable across specifications, which should increase our confidence in that interaction term.

## 7.3 Categorical Variables

In [8]:
# Read the CPS1985 data from a CSV file; the path is relative to the
# notebook's working directory. The bare name on the last line displays it.
CPS1985 = pd.read_csv("data/CPS1985.csv")
CPS1985

Unnamed: 0,wage,education,experience,age,ethnicity,region,gender,occupation,sector,union,married
0,5.10,8,21,35,hispanic,other,female,worker,manufacturing,no,yes
1,4.95,9,42,57,cauc,other,female,worker,manufacturing,no,yes
2,6.67,12,1,19,cauc,other,male,worker,manufacturing,no,no
3,4.00,12,4,22,cauc,other,male,worker,other,no,no
4,7.50,12,17,35,cauc,other,male,worker,other,no,yes
...,...,...,...,...,...,...,...,...,...,...,...
529,11.36,18,5,29,cauc,other,male,technical,other,no,no
530,6.10,12,33,51,other,other,female,technical,other,no,yes
531,23.25,17,25,48,other,other,female,technical,other,yes,yes
532,19.88,12,13,31,cauc,south,male,technical,other,yes,yes


In [9]:
# Count how many observations fall in each gender category.
freq_gender = CPS1985["gender"].value_counts()
freq_gender

male      289
female    245
Name: gender, dtype: int64

In [10]:
# Count how many observations fall in each occupation category.
freq_occupation = CPS1985["occupation"].value_counts()
freq_occupation

worker        156
technical     105
office         97
services       83
management     55
sales          38
Name: occupation, dtype: int64

In [11]:
# C(...) marks gender and occupation as categorical regressors, so each
# is expanded into dummy variables against a reference category.
reg = smf.ols(
    formula=(
        "np.log(wage) ~ education + experience"
        " + C(gender) + C(occupation)"
    ),
    data=CPS1985,
)
results = reg.fit()
OLS_summary(results)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.3178$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.905,5.2718,0.0,0.1717
C(gender)[T.male],0.2238,5.2979,0.0,0.0423
C(occupation)[T.office],-0.2073,-2.6699,0.0078,0.0776
C(occupation)[T.sales],-0.3601,-3.8455,0.0001,0.0936
C(occupation)[T.services],-0.3626,-4.4305,0.0,0.0818
C(occupation)[T.technical],-0.0101,-0.1363,0.8916,0.074
C(occupation)[T.worker],-0.1525,-1.9981,0.0462,0.0763
education,0.0759,7.5449,0.0,0.0101
experience,0.0119,7.0895,0.0,0.0017


In [12]:
# Same model, but with explicitly chosen reference categories:
# 'male' for gender and 'technical' for occupation.
reg_newref = smf.ols(
    formula=(
        "np.log(wage) ~ education + experience"
        " + C(gender, Treatment('male'))"
        " + C(occupation, Treatment('technical'))"
    ),
    data=CPS1985,
)
results_newref = reg_newref.fit()
OLS_summary(results_newref)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.3178$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,1.1187,6.3393,0.0,0.1765
"C(gender, Treatment('male'))[T.female]",-0.2238,-5.2979,0.0,0.0423
"C(occupation, Treatment('technical'))[T.management]",0.0101,0.1363,0.8916,0.074
"C(occupation, Treatment('technical'))[T.office]",-0.1972,-2.9082,0.0038,0.0678
"C(occupation, Treatment('technical'))[T.sales]",-0.35,-4.0541,0.0001,0.0863
"C(occupation, Treatment('technical'))[T.services]",-0.3525,-4.703,0.0,0.075
"C(occupation, Treatment('technical'))[T.worker]",-0.1425,-2.0218,0.0437,0.0705
education,0.0759,7.5449,0.0,0.0101
experience,0.0119,7.0895,0.0,0.0017


### 7.3.1. ANOVA Tables

In [13]:
# Refit the wage equation with gender and occupation written as plain
# column names; this fit feeds the ANOVA table computed next.
reg = smf.ols(
    data=CPS1985,
    formula="np.log(wage) ~ education + experience + gender + occupation",
)
results = reg.fit()
OLS_summary(results)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.3178$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,0.905,5.2718,0.0,0.1717
gender[T.male],0.2238,5.2979,0.0,0.0423
occupation[T.office],-0.2073,-2.6699,0.0078,0.0776
occupation[T.sales],-0.3601,-3.8455,0.0001,0.0936
occupation[T.services],-0.3626,-4.4305,0.0,0.0818
occupation[T.technical],-0.0101,-0.1363,0.8916,0.074
occupation[T.worker],-0.1525,-1.9981,0.0462,0.0763
education,0.0759,7.5449,0.0,0.0101
experience,0.0119,7.0895,0.0,0.0017


In [14]:
# ANOVA table for the fitted model held in `results`; typ=2 requests the
# Type II variant (per the statsmodels anova_lm documentation).
table_anova = sm.stats.anova_lm(results, typ=2)
table_anova

Unnamed: 0,sum_sq,df,F,PR(>F)
gender,5.414018,1.0,28.067296,1.727015e-07
occupation,7.152529,5.0,7.416013,9.805485e-07
education,10.980589,1.0,56.92545,2.010374e-13
experience,9.695055,1.0,50.261001,4.365391e-12
Residual,101.269451,525.0,,


## 7.4. Breaking a Numeric Variable Into Categories

### Example 7.8: Effects of Law School Rankings on Starting Salaries

In [15]:
lawsch85 = woo.dataWoo("lawsch85")

# Bin edges for the school rank, and one "(lo, hi]" label per adjacent
# pair of edges.
cutpts = [0, 10, 25, 40, 60, 100, 175]
labels = ["({}, {}]".format(lo, hi) for lo, hi in zip(cutpts[:-1], cutpts[1:])]

# Store the binned rank as a new categorical column and tabulate it.
lawsch85["rc"] = pd.cut(lawsch85["rank"], bins=cutpts, labels=labels)
lawsch85["rc"].value_counts()

(100, 175]    62
(60, 100]     37
(40, 60]      18
(10, 25]      16
(25, 40]      13
(0, 10]       10
Name: rc, dtype: int64

In [16]:
# Log starting salary on the rank categories, with '(100, 175]' as the
# reference group, plus LSAT, GPA, log(libvol) and log(cost).
reg = smf.ols(
    formula=(
        "np.log(salary) ~ C(rc, Treatment('(100, 175]'))"
        " + LSAT + GPA + np.log(libvol) + np.log(cost)"
    ),
    data=lawsch85,
)
results = reg.fit()
OLS_summary(results)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.9109$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,9.1653,22.277,0.0,0.4114
"C(rc, Treatment('(100, 175]'))[T.(0, 10]]",0.6996,13.078,0.0,0.0535
"C(rc, Treatment('(100, 175]'))[T.(10, 25]]",0.5935,15.0493,0.0,0.0394
"C(rc, Treatment('(100, 175]'))[T.(25, 40]]",0.3751,11.0054,0.0,0.0341
"C(rc, Treatment('(100, 175]'))[T.(40, 60]]",0.2628,9.3991,0.0,0.028
"C(rc, Treatment('(100, 175]'))[T.(60, 100]]",0.1316,6.254,0.0,0.021
LSAT,0.0057,1.8579,0.0655,0.0031
GPA,0.0137,0.185,0.8535,0.0742
np.log(libvol),0.0364,1.3976,0.1647,0.026
np.log(cost),0.0008,0.0335,0.9734,0.0251


In [17]:
# ANOVA table for the law-school regression in `results`; typ=2 requests
# the Type II variant (per the statsmodels anova_lm documentation).
table_anova = sm.stats.anova_lm(results, typ=2)
table_anova

Unnamed: 0,sum_sq,df,F,PR(>F)
"C(rc, Treatment('(100, 175]'))",1.868867,5.0,50.962988,1.174406e-28
LSAT,0.025317,1.0,3.4519,0.0655132
GPA,0.000251,1.0,0.034225,0.8535262
np.log(libvol),0.014327,1.0,1.953419,0.1646748
np.log(cost),8e-06,1.0,0.00112,0.9733564
Residual,0.924111,126.0,,


## 7.5. Interactions and Differences in Regression Functions Across Groups

In [18]:
gpa3 = woo.dataWoo("gpa3")

# Fully interacted model: female * (...) gives the female dummy, the three
# regressors, and female interacted with each of them.
reg = smf.ols(
    formula="cumgpa ~ female * (sat + hsperc + tothrs)",
    data=gpa3,
    # estimate only for spring data
    subset=gpa3["spring"].eq(1),
)
results = reg.fit()
OLS_summary(results)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.4059$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,1.4808,7.1422,0.0,0.2073
female,-0.3535,-0.861,0.3898,0.4105
sat,0.0011,5.8073,0.0,0.0002
hsperc,-0.0085,-6.1674,0.0,0.0014
tothrs,0.0023,2.7182,0.0069,0.0009
female:sat,0.0008,1.9488,0.0521,0.0004
female:hsperc,-0.0005,-0.1739,0.8621,0.0032
female:tothrs,-0.0001,-0.0712,0.9433,0.0016


In [19]:
# Joint F-test that the female dummy and all three female interactions
# are zero.
hypotheses = [
    "female = 0",
    "female:sat = 0",
    "female:hsperc = 0",
    "female:tothrs = 0",
]
ftest = results.f_test(hypotheses)

# Column label: the quoted restrictions separated by single spaces.
key = " ".join(repr(restriction) for restriction in hypotheses)
f_results = {key: {"fstat": ftest.statistic, "fpval": ftest.pvalue}}
pd.DataFrame(f_results)

Unnamed: 0,'female = 0' 'female:sat = 0' 'female:hsperc = 0' 'female:tothrs = 0'
fpval,3e-06
fstat,8.179112


In [20]:
# Restricted regression on one group: spring-term rows with female == 0.
reg = smf.ols(
    formula="cumgpa ~ sat + hsperc + tothrs",
    data=gpa3,
    # estimate only for spring data
    subset=gpa3["spring"].eq(1) & gpa3["female"].eq(0),
)
results = reg.fit()
OLS_summary(results)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.3169$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,1.4808,7.1894,0.0,0.206
sat,0.0011,5.8458,0.0,0.0002
hsperc,-0.0085,-6.2082,0.0,0.0014
tothrs,0.0023,2.7362,0.0066,0.0009


In [21]:
# Restricted regression on the other group: spring-term rows with
# female == 1. `gpa3` is the frame loaded earlier in this section and has
# not been modified since, so it is reused here rather than reloaded
# (matching the female == 0 cell).
reg = smf.ols(
    formula = "cumgpa ~ sat + hsperc + tothrs",
    data = gpa3,
    # estimate only for spring data
    subset = (gpa3["spring"]==1) & (gpa3["female"] == 1))
results = reg.fit()
OLS_summary(results)

Unnamed: 0_level_0,$\beta $,$t$,$$P>|t|$$,$SE$
$$r^2: 0.4014$$,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Intercept,1.1273,3.1176,0.0025,0.3616
sat,0.0018,5.195,0.0,0.0003
hsperc,-0.009,-3.0956,0.0027,0.0029
tothrs,0.0022,1.5817,0.1174,0.0014
