# <span style="color:lightblue"> Lecture 11 (Optional): Regression Output </span>

<font size = "5">

This is an optional lecture file

- This is only recommended if you've taken statistics courses 
- This lecture will not be formally evaluated
- Keep this material in mind for future courses


# <span style="color:lightblue"> I. Import Libraries </span>


In [None]:
# The "pandas" library is used for processing datasets
# The "numpy" library is for numeric operations and random numbers
# The "matplotlib.pyplot" library is for creating graphs

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<font size = "5">

Install the "statsmodels" library
- Run "pip3 install statsmodels" in the terminal
- Automatically included in Anaconda

In [None]:
# We will "alias" two sublibraries in "statsmodels"
# "statsmodels.formula.api" contains functions to estimate models
# "statsmodels.api" contains general-use statistical options

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col


# <span style="color:lightblue"> II. Generate Simulated Data </span>

<font size = "5">

Create an empty dataset

In [None]:
# Start from a data frame with no rows and no columns;
# the columns are added one at a time in the cells below
dataset = pd.DataFrame(data = [])

<font size = "5">

Create three random variables of size ($n = 100$)

In [None]:
# Sample size shared by every simulated column
n = 100

# Draw three columns from a normal distribution with mean 0 ("loc")
# and standard deviation 1 ("scale"), filled in the order "x", "z", "e"
for column_name in ["x", "z", "e"]:
    dataset[column_name] = np.random.normal(loc = 0, scale = 1, size = n)


<font size = "5">

Create a discrete random variable ($n = 100$)

In [None]:
# Draw n values from the categories 1, 2 and 3
# "p" lists the probability of each category, in the same order
dataset["d"] = np.random.choice([1, 2, 3], n, p = [0.2, 0.2, 0.6])

<font size = "5">

Create data from the linear model

$y = 2 + 5x + e$

In [None]:
# Simulate the outcome from the linear model y = 2 + 5x + e
# Formulas over dataset columns are computed element by element
intercept = 2
slope = 5
dataset["y"] = intercept + slope * dataset["x"] + dataset["e"]

# <span style="color:lightblue"> III. Regression Tables </span>


<font size = "5">

Summaries for univariate regression

In [None]:
# Run a regression of "y" on the single regressor "x"
# Robust (heteroskedasticity-consistent) standard errors are requested
# with the "cov_type" option; the keyword must be spelled "cov_type"
results_univariate = smf.ols(formula = 'y ~ x',
                             data = dataset).fit(cov_type = "HC1")

# The "summary_col" function produces nice outputs
# We can add notation for significance by setting "stars" to True
print(summary_col(results_univariate,
                  stars = True))

<font size = "5">

Summaries for multivariate regression

In [None]:
# Run the model with multiple variables by using "+"
# Robust (heteroskedasticity-consistent) standard errors are requested
# with the "cov_type" option; the keyword must be spelled "cov_type"
results_multivariate = smf.ols(formula = 'y ~ x + z',
                               data = dataset).fit(cov_type = "HC1")

# Print the coefficient table with significance stars
print(summary_col(results_multivariate,
                  stars = True))

<font size = "5">

Summaries for multivariate regression + categories

In [None]:
# Run the model with multiple variables by using "+"
# Wrapping a variable in "C()" treats it as categorical, which
# creates a set of distinct indicator variables for each category
# Robust standard errors are requested with the "cov_type" option
results_multivariate_category = smf.ols(formula = 'y ~ x + C(d)',
                                        data = dataset).fit(cov_type = "HC1")

# The results are reported relative to a base category (d = 1),
# so the table shows coefficients labelled T.2 and T.3
print(summary_col(results_multivariate_category,
                  stars = True))

<font size = "5">

Summaries for multivariate regression + interaction

In [None]:
# Run the model with multiple variables by using "+"
# The term "z:x" adds the interaction between "z" and "x"
# Robust standard errors are requested with the "cov_type" option
results_multivariate_interaction = smf.ols(formula = 'y ~ x + z + z:x',
                                           data = dataset).fit(cov_type = "HC1")

# The interaction coefficient is reported in the row labelled "z:x"
print(summary_col(results_multivariate_interaction,
                  stars = True))

# <span style="color:lightblue"> IV. Professional Tables </span>


<font size = "5">

Summaries for multiple columns

In [None]:
# Collect the fitted models, one per column of the table
list_results = [
    results_univariate,
    results_multivariate,
    results_multivariate_category,
    results_multivariate_interaction,
]

# Passing a list to "summary_col" reports all the models in one table
print(summary_col(results = list_results, stars = True))


<font size = "5">

Summaries for multiple columns (sorted + titled + stats)

In [None]:
# This list inputs the headings of the table (one per model)
list_headings   = ["Univariate",
                   "Multivariate",
                   "Categorical",
                   "Interaction"]

# This is the list of regressor names (if you want a particular order)
list_regressors = ["x",
                   "z",
                   "z:x",
                   "C(d)[T.2]",
                   "C(d)[T.3]"]

# This is a function that extracts the sample size
# Can use with other summary statistics
# "nobs" is the number of observations
compute_summary = {'N': lambda model: format(int(model.nobs))}

# Use the objects defined above instead of retyping them in the call,
# so the table always matches the lists shown in this cell
print(summary_col(list_results,
                  stars = True,
                  model_names = list_headings,
                  info_dict = compute_summary,
                  regressor_order = list_regressors))

<font size = "5">

Detailed table

In [None]:
# Detailed summary of the univariate regression
detailed_table = results_univariate.summary()
print(detailed_table)