In [36]:
import pandas as pd
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer

### Code from Visualization notebook to create wages data frame

In [37]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
    
# Zip archive of Stata files; WAGE1.DTA is the member used in this notebook.
WOOLDRIDGE_ZIP_URL = "https://www.cengage.com/aise/economics/wooldridge_3e_datasets/statafiles.zip"

# Pull the whole archive into memory. The context manager closes the HTTP
# response as soon as its bytes have been read.
with urlopen(WOOLDRIDGE_ZIP_URL) as url:
    archive = BytesIO(url.read())

# Parse the .DTA member while both the archive and the member are still open,
# so nothing is read from a handle whose owner has already been closed and
# every handle is released when the block exits.
with ZipFile(archive) as zipped, zipped.open("WAGE1.DTA") as file:
    wages = pd.read_stata(file)

# Fold the three region dummies into one integer code (1, 2, 3 in list order;
# 0 when none of them is set), then replace the code with a readable label.
area_labels = {0: 'northeast', 1: 'northcen', 2: 'south', 3: 'west'}
area_code = sum(
    weight * wages[dummy]
    for weight, dummy in enumerate(['northcen', 'south', 'west'], start=1)
)
wages['area'] = area_code
wages['area'] = wages.area.map(area_labels)

# Positional slice of the raw column list: these six columns are used as the
# dummies behind `occup`.
# NOTE(review): presumably these are the industry/occupation indicators of
# WAGE1 -- confirm against wages.columns if the file layout ever changes.
occupations = wages.columns.to_list()[12:18]

# Encode the dummies as a single integer: the 1-based position (within
# `occupations`) of whichever dummy is set, or 0 when none is.
wages['occup'] = sum(
    code * wages[col] for code, col in enumerate(occupations, start=1)
)

# Build the label table from `occupations` itself so it always has exactly one
# entry per dummy, rather than relying on a separately hard-coded count.
dct = {0: 'other'}
dct.update({code: col for code, col in enumerate(occupations, start=1)})
wages['occup'] = wages.occup.map(dct)

# Columns kept for the regressions below, in display order. The region dummies
# that were folded into `area` are not in the list, so they are dropped here.
analysis_columns = [
    'wage', 'educ', 'exper', 'tenure',
    'nonwhite', 'female', 'married',
    'numdep', 'smsa',
    'area', 'occup',
]
wages = wages.loc[:, analysis_columns]

# Preview the first rows of the trimmed frame.
wages.head()

Unnamed: 0,wage,educ,exper,tenure,nonwhite,female,married,numdep,smsa,area,occup
0,3.1,11,2,0,0,1,0,2,1,west,other
1,3.24,12,22,2,0,1,1,3,1,west,services
2,3.0,11,2,0,0,0,0,2,0,west,trade
3,6.0,8,44,28,0,0,1,0,1,west,other
4,5.3,12,7,2,0,0,1,1,0,west,other


### Basic regression

In [38]:
# Bivariate OLS of wage on educ; fit() is called without arguments, so the
# library's default covariance estimator is used.
model = smf.ols(formula="wage ~ educ", data=wages)
result = model.fit()

# Uncomment to display the full regression table:
# result.summary()

### Heteroskedasticity-consistent standard errors

In [39]:
# Same specification as the basic regression, but the fit requests the "HC3"
# covariance type for the standard errors.
model = smf.ols(formula="wage ~ educ", data=wages)
result = model.fit(cov_type="HC3")

# Uncomment to display the full regression table:
# result.summary()

### Saving output to Excel

In [40]:
# Take the second table of the fitted model's summary and write it to a
# spreadsheet, suppressing pandas' own header row and index column.
excel = result.summary().tables[1]
excel_frame = pd.DataFrame(excel)
excel_frame.to_excel('table.xlsx', index=False, header=False)

### Saving output to LaTeX

In [41]:
# Render the current fit as a LaTeX regression table and save it to disk,
# replacing any existing table.tex.
stargazer = Stargazer([result])
tex = stargazer.render_latex()

with open("table.tex", mode="w") as file:
    file.write(tex)

# Uncomment to inspect the generated LaTeX:
# print(tex)

### Multivariate

In [42]:
# Multivariate OLS: wage on educ, exper, tenure and numdep, all entering as
# plain terms, fitted with the "HC3" covariance type.
model = smf.ols(
    formula="wage ~ educ+exper+tenure+numdep",
    data=wages,
)
result = model.fit(cov_type="HC3")

# Uncomment to display the full regression table:
# result.summary()


### Dummy and Categorical Variables

We could write `C(area)` and `C(occup)`, but this is unnecessary because text-valued columns are already treated as categorical. We might want to treat `numdep` as numerical (the default for a numeric column), but wrapping it as `C(numdep)` causes it to be treated as categorical instead (generating dummy variables).

In [43]:
# Full specification: numdep is wrapped in C(...) so it enters as a categorical
# term, while area and occup are passed as they are.
model = smf.ols(
    formula="wage ~ educ+exper+tenure+C(numdep)+female+nonwhite+married+smsa+area+occup",
    data=wages,
)
result = model.fit(cov_type="HC3")

# Uncomment to display the full regression table:
# result.summary()

In [44]:
# Covariates passed to covariate_order(), in the order given here.
reported_covariates = ['educ', 'exper', 'tenure', 'female', 'nonwhite', 'married']

stargazer = Stargazer([result])
stargazer.covariate_order(reported_covariates)
tex = stargazer.render_latex()

# Save the rendered table, replacing any existing table.tex.
with open("table.tex", mode="w") as file:
    file.write(tex)

# Uncomment to inspect the generated LaTeX:
# print(tex)

### Multiple models

In [45]:
# Three nested specifications that share the same set of controls
# (C(numdep), smsa, area, occup) and add regressors step by step.
mod1 = smf.ols(formula="wage ~ educ+C(numdep)+smsa+area+occup", data=wages)
mod2 = smf.ols(formula="wage ~ educ+exper+tenure+C(numdep)+smsa+area+occup", data=wages)
mod3 = smf.ols(
    formula="wage ~ educ+exper+tenure+female+nonwhite+married+C(numdep)+smsa+area+occup",
    data=wages,
)

# Fit every specification with the "HC3" covariance type, keeping column order.
results = [mod.fit(cov_type="HC3") for mod in (mod1, mod2, mod3)]
# `result` is left pointing at the fit of the last (largest) specification.
result = results[-1]

# One table with a column per model, passing this list to covariate_order().
reported_covariates = ['educ', 'exper', 'tenure', 'female', 'nonwhite', 'married']
stargazer = Stargazer(results)
stargazer.covariate_order(reported_covariates)
tex = stargazer.render_latex()

# Save the rendered table, replacing any existing table.tex.
with open("table.tex", mode="w") as file:
    file.write(tex)

# Uncomment to inspect the generated LaTeX:
# print(tex)

### Logit

In [48]:
# Logit of the binary `married` indicator on wage and female; fit() prints the
# optimizer's convergence message.
model = smf.logit(formula="married ~ wage+female", data=wages)
result = model.fit()

# Render the fitted logit as a LaTeX table and save it, replacing any existing
# table.tex.
stargazer = Stargazer([result])
tex = stargazer.render_latex()
with open("table.tex", mode="w") as file:
    file.write(tex)

# Uncomment to display the full estimation table:
# result.summary()

Optimization terminated successfully.
         Current function value: 0.635312
         Iterations 6


0,1,2,3
Dep. Variable:,married,No. Observations:,526.0
Model:,Logit,Df Residuals:,523.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 22 Aug 2022",Pseudo R-squ.:,0.05103
Time:,08:37:13,Log-Likelihood:,-334.17
converged:,True,LL-Null:,-352.14
Covariance Type:,nonrobust,LLR p-value:,1.571e-08

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.1671,0.250,-0.667,0.505,-0.658,0.324
wage,0.1436,0.034,4.172,0.000,0.076,0.211
female,-0.3854,0.194,-1.984,0.047,-0.766,-0.005
