<a href="https://colab.research.google.com/github/kerryback/2022-BUSI520/blob/main/Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Uncomment and run the following line if stargazer is not already installed.

# %pip install stargazer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stargazer
  Downloading stargazer-0.0.5-py3-none-any.whl (9.7 kB)
Installing collected packages: stargazer
Successfully installed stargazer-0.0.5


In [4]:
import pandas as pd
from stargazer.stargazer import Stargazer
import statsmodels.formula.api as smf

### Example data

We'll use the wage data set used in Wooldridge's Introductory Econometrics.  We could just go to the Cengage website and download and extract the zipfile in the usual way and then read the Stata file.  

The data set comes with its categorical information already encoded as dummy variables.  That is not normally the way we encounter our data, so the code below converts the dummy variables back into categorical (text) columns.

In [5]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
    
def dummies_to_category(df, dummy_cols, base="other"):
    """Collapse a set of 0/1 dummy columns into a single column of labels.

    Column number k (counting from 1) contributes k times its value to a
    per-row code; the code is then mapped to the column's name, and code 0
    (every dummy equal to 0) is mapped to `base`.

    The codes are plain sums, so this assumes at most one dummy equals 1 in
    any row; overlapping dummies would add up to the code of a different
    (or nonexistent) category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the dummy columns.
    dummy_cols : list of str
        Names of the dummy columns, in coding order.
    base : str
        Label for rows in which every dummy is 0.

    Returns
    -------
    pandas.Series
        One label per row of `df`.
    """
    codes = 0
    for k, col in enumerate(dummy_cols, start=1):
        codes = codes + k * df[col]
    labels = {0: base}
    labels.update({k: col for k, col in enumerate(dummy_cols, start=1)})
    return codes.map(labels)


# Download the zipped Stata files.  Using the response as a context manager
# closes the HTTP connection as soon as the bytes are in memory.
with urlopen("https://www.cengage.com/aise/economics/wooldridge_3e_datasets/statafiles.zip") as url:
    archive = BytesIO(url.read())

# Read WAGE1.DTA while the archive and the member are both still open; the
# context managers close them afterwards.
with ZipFile(archive) as zipped, zipped.open("WAGE1.DTA") as file:
    wages = pd.read_stata(file)

# Region: all three region dummies equal to 0 is labelled 'northeast'.
wages['area'] = dummies_to_category(
    wages, ['northcen', 'south', 'west'], base='northeast'
)

# Columns 12-17 are taken by position -- presumably the six sector dummies
# that follow the region dummies in the raw file; verify if the source
# file's column layout ever changes.
occupations = wages.columns.to_list()[12:18]
wages['occup'] = dummies_to_category(wages, occupations)

occup_cats = ['profocc', 'servocc', 'clerocc']
wages['occup_cat'] = dummies_to_category(wages, occup_cats)

# Keep only the variables used in the regressions below.
wages = wages[
    [
        "wage",
        "educ",
        "exper",
        "tenure",
        "nonwhite",
        "female",
        "married",
        "numdep",
        "smsa",
        "area",
        "occup",
        "occup_cat",
    ]
]
wages.head()

Unnamed: 0,wage,educ,exper,tenure,nonwhite,female,married,numdep,smsa,area,occup,occup_cat
0,3.1,11,2,0,0,1,0,2,1,west,other,other
1,3.24,12,22,2,0,1,1,3,1,west,services,servocc
2,3.0,11,2,0,0,0,0,2,0,west,trade,other
3,6.0,8,44,28,0,0,1,0,1,west,other,clerocc
4,5.3,12,7,2,0,0,1,1,0,west,other,other


### Basic regression

In [6]:
# Simple OLS regression of wage on years of education, using the default
# covariance estimate.
model = smf.ols(formula="wage ~ educ", data=wages)
result = model.fit()
# result.summary()  # uncomment to display the full regression table

### Heteroskedasticity consistent standard errors

In [7]:
# Regress wage on education and request HC3 heteroskedasticity-consistent
# standard errors when fitting.
model = smf.ols(formula="wage ~ educ", data=wages)
result = model.fit(cov_type="HC3")
# result.summary()  # uncomment to display the full regression table

### Saving output to Excel

In [8]:
# Pull the second table out of the regression summary (the coefficient
# estimates), wrap it in a DataFrame, and write it to an Excel workbook
# without pandas' own header row or index column.
coef_table = result.summary().tables[1]
table = pd.DataFrame(coef_table)
table.to_excel("table.xlsx", header=False, index=False)

### Saving output to LaTeX

In [9]:
# Render the fitted model as a LaTeX regression table and save it to disk.
stargazer = Stargazer([result])
table = stargazer.render_latex()
with open("table.tex", mode="w") as file:
    file.write(table)

# print(table)  # uncomment to inspect the LaTeX source

### Multivariate

In [10]:
# Wage on education, experience and tenure, with HC3 robust standard errors.
model = smf.ols(formula="wage ~ educ + exper + tenure", data=wages)
result = model.fit(cov_type="HC3")
# result.summary()  # uncomment to display the full regression table

### Dummy and Categorical Variables

We could write `C(area)` and `C(occup)`, but this is unnecessary because text variables are automatically treated as categorical.  We might want to treat `numdep` as numerical, but wrapping it as `C(numdep)` causes it to be treated as categorical (generating dummy variables).

In [11]:
# The text columns (area, occup) enter the formula directly, while the numeric
# numdep is wrapped in C() so that it is treated as categorical.
model = smf.ols(
    formula="wage ~ educ+exper+tenure+C(numdep)+female+nonwhite+married+smsa+area+occup",
    data=wages,
)
result = model.fit(cov_type="HC3")
# result.summary()  # uncomment to display the full regression table

In [12]:
# Build the LaTeX table, passing Stargazer the covariates to list and the
# order in which to list them.
stargazer = Stargazer([result])
stargazer.covariate_order(
    ["educ", "exper", "tenure", "female", "nonwhite", "married"]
)
tex = stargazer.render_latex()
with open("table.tex", mode="w") as file:
    file.write(tex)

# print(tex)  # uncomment to inspect the LaTeX source

### Interactions

In [13]:
# In the formula language, a*b stands for both main effects plus their
# interaction, so each product term below also brings in its components.
model = smf.ols(
    formula="wage ~ educ + exper + educ*exper + tenure + female + nonwhite + female*educ + nonwhite*educ + female*area",
    data=wages,
)
result = model.fit(cov_type="HC3")
# result.summary()  # uncomment to display the full regression table

### Multiple models

In [14]:
# Three nested specifications: each adds regressors to the previous one while
# keeping the same categorical controls.
mod1 = smf.ols(formula="wage ~ educ+C(numdep)+smsa+area+occup", data=wages)
mod2 = smf.ols(
    formula="wage ~ educ+exper+tenure+C(numdep)+smsa+area+occup",
    data=wages,
)
mod3 = smf.ols(
    formula="wage ~ educ+exper+tenure+female+nonwhite+married+C(numdep)+smsa+area+occup",
    data=wages,
)

# Fit every specification with HC3 robust standard errors.
results = list(map(lambda mod: mod.fit(cov_type="HC3"), (mod1, mod2, mod3)))

# One table with a column per model, listing only the named covariates.
stargazer = Stargazer(results)
stargazer.covariate_order(
    ["educ", "exper", "tenure", "female", "nonwhite", "married"]
)
tex = stargazer.render_latex()
with open("table.tex", mode="w") as file:
    file.write(tex)

# print(tex)  # uncomment to inspect the LaTeX source

### Logit

In [15]:
# Logistic regression of the married indicator on wage and the female dummy.
model = smf.logit(formula="married ~ wage + female", data=wages)
result = model.fit()

# NOTE(review): confirm that the installed stargazer version accepts Logit
# results; support for non-OLS models may depend on the release.
stargazer = Stargazer([result])
tex = stargazer.render_latex()
with open("table.tex", mode="w") as file:
    file.write(tex)

# result.summary()  # uncomment to display the full regression table

Optimization terminated successfully.
         Current function value: 0.635312
         Iterations 6
