# core

> Main functions of `estout` package

Main functions are:

- `collect_stats`: extracts a given set of attributes from a results object generated by a stats package
- `to_df`: takes a list of `collect_stats` outputs and merges them as separate columns in a `pd.DataFrame`
- `to_tex`: takes one or more outputs from `to_df` and exports them to a LaTeX file

In [None]:
#|default_exp core

In [None]:
#|hide 
from nbdev.showdoc import *
from nbdev.test import *

In [None]:
#|exports
from __future__ import annotations
from pathlib import Path 
from typing import List, Dict, Literal
import importlib

import numpy as np
import pandas as pd
import statsmodels.api as sm
from linearmodels import PanelOLS

from estout.utils import *

Set up an example dataset and run a few regressions to showcase the functions in this module.

In [None]:
# Fix the seed so the example output shown in this notebook is reproducible
np.random.seed(123)
# 9 random rows of (y, x, z), indexed as a 3-firm x 3-period panel, plus a constant column 'cons'
df = pd.DataFrame(np.random.rand(9,3), 
                  columns=['y','x','z'],
                  index = pd.MultiIndex.from_product([[1,2,3],[1,2,3]], names=['firmid','time'])
                  ).assign(cons = 1)
# statsmodels OLS of y on a constant and x
sm1 = sm.OLS(df['y'], df[['cons','x']]).fit()
# Same, adding z, with the covariance replaced by a HAC estimate (maxlags=2)
sm2 = sm.OLS(df['y'], df[['cons','x','z']]).fit().get_robustcov_results(cov_type='HAC', maxlags=2)
# linearmodels panel regression with entity effects and entity-clustered covariance
lmres = PanelOLS(df['y'],  df[['cons','x','z']], entity_effects=True
                 ).fit(cov_type='clustered', cluster_entity=True)

In [None]:
#|export
def collect_stats(res, # Results object to extract stats from
                  get_default_stats = True, # If True, returns all stats implemented by the f'{package}_results' module
                  add_stats: dict=None, # Keys are stats to extract in addition to the default ones; values are attributes of 'res'
                  add_literals: dict=None, # Additional info to be added to output dict; values must be scalars
                  ) -> dict:
    """Collects stats from the 'res' object into a dict.

    The output always contains the key 'package': the top-level package of the
    module in which the class of 'res' is defined. Entries are then added in this
    order, so later ones override earlier ones that share a key:

    1. default stats: every name listed in `__all__` of `estout.{package}_results`,
       called with 'res' (only if 'get_default_stats' is True);
    2. 'add_stats': attributes of 'res', looked up with `rgetattr`;
    3. 'add_literals': copied into the output as-is.

    Raises whatever `importlib.import_module` raises (e.g. `ModuleNotFoundError`)
    if default stats are requested for a package without a results module.
    """

    package = res.__module__.split('.')[0]
    out = {'package': package}

    if get_default_stats:
        # Import lazily: the results module is only needed for the default stats, so
        # results from packages without one can still be used with 'add_stats' and
        # 'add_literals' alone.
        results_module = importlib.import_module(f"estout.{package}_results")
        for stat in results_module.__all__:
            out[stat] = rgetattr(results_module, stat)(res)

    if add_stats is not None:
        for stat, attr in add_stats.items():
            out[stat] = rgetattr(res, attr)

    if add_literals is not None:
        out.update(add_literals)

    return out

In [None]:
# Default stats only
stats1 = collect_stats(sm1)
# Default stats plus a literal entry describing the covariance estimator
stats2 = collect_stats(sm2, add_literals={'Cov Type': 'Newey West'})
# Default stats plus two extra attributes read from the results object
stats3 = collect_stats(lmres, add_stats={'r2b': 'rsquared_between', 'FE':'included_effects'})

In [None]:
stats1

{'package': 'statsmodels',
 'ynames': ['y'],
 'xnames': ['cons', 'x'],
 'params': cons    0.507852
 x       0.345003
 dtype: float64,
 'tstats': cons    3.905440
 x       1.292246
 dtype: float64,
 'pvalues': cons    0.005858
 x       0.237293
 dtype: float64,
 'covmat':           cons         x
 cons  0.016910 -0.030531
 x    -0.030531  0.071278,
 'se': cons    0.130037
 x       0.266979
 dtype: float64,
 'nobs': 9,
 'r2': 0.19260886185799475}

In [None]:
assert stats2['Cov Type'] == 'Newey West'

In [None]:
stats3['r2b']

0.7954933715233719

In [None]:
stats3['FE']

['Entity']

In [None]:
#|export
def to_df(res_list: List[dict], # List of outputs from `collect_stats()`
          which_xvars: list=None, # If None, report all xvars
          stats_body: list=['params', 'tstats'], # Each element of 'res_list' needs to have these stats as keys; values must be pd.Series
          stats_bottom: list=['r2', 'nobs'], # Each element of 'res_list' needs to have these stats as keys; values must be scalars
          labels: dict=None, # Maps variable names (first index level) to the labels shown in the table
          add_formats: dict=None  # Format strings (e.g. '{:.2f}') keyed by stat name; override/extend `default_formats()`
          ) -> pd.DataFrame: 
    """Combines results from multiple `collect_stats()` outputs into a single pd.DataFrame.

    Each element of 'res_list' becomes one column. Rows are indexed by
    (variable, stat) for the stats in 'stats_body', followed by one row per stat
    in 'stats_bottom'. All cells are returned as strings, with missing entries
    replaced by ''.
    """  
    
    # Start from the package defaults and let the caller override/extend them
    formats = default_formats()
    if add_formats is not None: formats.update(add_formats)
    
    columns = []
    for i,res in enumerate(res_list):
        # One row per regressor, one column per body stat
        newcol = pd.concat([res[x] for x in stats_body], axis=1, ignore_index=True).set_axis(stats_body, axis=1)
        for x in stats_body:
            # NOTE(review): unlike 'stats_bottom' below, there is no fallback here when
            # 'x' has no entry in 'formats' (presumably a KeyError) -- confirm this is intended
            newcol[x] = newcol[x].map(formats[x].format)
            if x == 'params':
                # Append whatever get_stars() returns for the p-values (the '*' markers in the example output)
                newcol[x] += get_stars(res['pvalues'])
            else:
                # Every other body stat is shown in parentheses
                newcol[x] = '(' + newcol[x] + ')'
        # Pivot the stat columns into the index, giving a Series indexed by (variable, stat)
        newcol = newcol.stack(level=0)
        columns.append(newcol)

    # Align all models on the (variable, stat) index; columns end up numbered 0..n-1 as in the example output
    out = pd.concat(columns, axis = 1)
    if which_xvars is not None: out = out.loc[which_xvars].copy()
    
    # Add the bottom rows by enlargement: formatted if a format exists for the stat, raw value otherwise
    for i,res in enumerate(res_list):
        for x in stats_bottom:
            out.loc[x,i] = formats[x].format(res[x]) if x in formats else res[x]

    # Relabel variables (first index level) that have an entry in 'labels'
    if labels is not None:
        for var in set(out.droplevel(1).index):
            if var in labels: out = out.rename(index={var:labels[var]}, level=0) 

    # Clean up row names: keep the variable name only on its 'params' row (and on rows
    # whose second level is ''), blank it on the other stat rows, then drop the index names
    out = out.reset_index()
    out.loc[~out['level_1'].isin(['params','']), 'level_0'] = ''
    out = out.set_index(['level_0','level_1'])
    out.index.names = (None,None)

    return out.astype('string').fillna('')

In [None]:
# Combine the three sets of stats: keep three regressors, show r2 with two decimals,
# and display 'cons' as 'Intercept'
d = to_df(res_list=[stats1, stats2, stats3], 
          which_xvars=['cons','x','z'], 
          add_formats={'r2':'{:.2f}'},
          labels={'cons':'Intercept'})
d

Unnamed: 0,Unnamed: 1,0,1,2
Intercept,params,0.51***,0.70***,0.73***
,tstats,(3.91),(21.48),(167.36)
x,params,0.35,0.57**,0.64*
,tstats,(1.29),(2.85),(2.26)
z,params,,-0.64**,-0.77**
,tstats,,(-3.55),(-2.91)
r2,,0.19,0.49,0.35
nobs,,9,9,9


TODO: put the tabular environment inside `df_to_tex`, not in `combine_panels`. `combine_panels` could then probably be removed, since it would only concatenate the strings created by `df_to_tex`.

In [None]:
#|export
def df_to_tex(df: pd.DataFrame, # Output from estout.to_df()
                column_group_names: Dict[str, List[int]]=None, # Keys are group names; values are lists of consecutive indices of columns in the group
                column_names: List[str]|bool=True, # If False, none; if True, use df column names; if list, gives custom column names
                hlines: List[int]=[] # Row indices under which to place hline
                ) -> str: 
    """Creates the rows of a LaTeX table from a DataFrame.

    Only the table rows are produced (no tabular environment). The second index
    level of 'df' is dropped, and the remaining index value starts each body row.
    Lines are emitted in this order: the column-group line (if
    'column_group_names' is given), the column-name line (unless 'column_names'
    is False or empty), then one line per row of 'df'.

    'hlines' refers to positions in that sequence: 0 puts an hline above the
    first line, k puts one under the k-th emitted line. Cells of 'df' must
    already be strings; custom column names are converted with `str`.
    """

    hline = ' \\hline \n'
    row_end = ' \\\\ \n'
    # Only membership is tested, so the (shared) default list is never mutated
    hline_after = set(hlines or ())

    df = df.droplevel(1)

    line_counter = 0
    parts = []
    if 0 in hline_after: parts.append(hline)

    if column_group_names:
        parts.append(model_groups(column_group_names))
        line_counter += 1
        if line_counter in hline_after: parts.append(hline)

    if column_names is True:
        column_names = list(df.columns)

    if column_names:
        # The leading '' leaves the cell above the row labels empty
        parts.append(' & '.join([''] + [str(name) for name in column_names]) + row_end)
        line_counter += 1
        if line_counter in hline_after: parts.append(hline)

    for label, cells in zip(df.index, df.itertuples(index=False, name=None)):
        parts.append(str(label) + ' & ' + ' & '.join(cells) + row_end)
        line_counter += 1
        if line_counter in hline_after: parts.append(hline)

    return ''.join(parts)

Column group names (if any) appear first, then column names (if any), then the body of the table.


In [None]:
print(df_to_tex(d))

 & 0 & 1 & 2 \\ 
Intercept & 0.51*** & 0.70*** & 0.73*** \\ 
 & (3.91) & (21.48) & (167.36) \\ 
x & 0.35 & 0.57** & 0.64* \\ 
 & (1.29) & (2.85) & (2.26) \\ 
z &  & -0.64** & -0.77** \\ 
 &  & (-3.55) & (-2.91) \\ 
r2 & 0.19 & 0.49 & 0.35 \\ 
nobs & 9 & 9 & 9 \\ 



In [None]:
#|export
def combine_panels(panels: Dict[str,str], # Keys are panel titles, values are outputs of df_to_tex()
                    nr_columns: List[int], # Number of columns in each panel. 
                    ptitles_over_columns: bool=True,
                    panel_alignment: str='c',
                    space_bw_panels: str='\\rule{0pt}{3ex}',
                    hlines_under_ptitles: bool=True,
                    tex_tabular_env: str='tabularx',
                    ) -> str:
    """Stacks panels into one LaTeX tabular environment.

    Keys of 'panels' are panel titles, values are outputs from `df_to_tex()`.
    The header and footer come from `tex_table_env`, called with the largest
    entry of 'nr_columns' and 'tex_tabular_env'. A panel whose title is '' gets
    no title row. Other titles are placed in a `\\multicolumn` that spans either
    the data columns only ('ptitles_over_columns' True) or the row-label column
    as well.
    """

    hline = ' \\hline \n'
    row_end = ' \\\\ \n'

    max_columns = max(nr_columns)
    header, footer = tex_table_env(max_columns, tex_tabular_env)
    parts = [header, ' \n  \\hline \n']

    for ptitle, ptext in panels.items():
        if ptitle != '':
            if ptitles_over_columns:
                # Leave the row-label column empty and center the title over the data columns
                title_cell = f' & \\multicolumn{{{max_columns}}}{{{panel_alignment}}}{{{ptitle}}}'
            else:
                # Span the row-label column too; '@{}' drops the padding before the title
                title_cell = f'\\multicolumn{{{max_columns+1}}}{{@{{}} {panel_alignment}}}{{{ptitle}}}'
            parts.append(space_bw_panels + title_cell + row_end)

            if hlines_under_ptitles: parts.append(hline)
        parts.append(ptext + '  \n')

    parts.append(footer + '\n')
    return ''.join(parts)

In [None]:
#panels = {'Panel A': df_to_tex(d), 'Panel B': df_to_tex(d)}
#combine_panels(panels, nr_columns=[])

In [None]:
#|export
def to_tex(dfs: pd.DataFrame|List[pd.DataFrame], # (List of) outputs from estout.to_df()
            outfile: Path|str=None, # Where to save resulting tex output
            title: str='Table title', # Table title
            notes: str='Table description', # Some call this the table caption
            notes_on_top: bool=True, # Set to False if you want table description (caption) to be at the bottom
            label: str='', # Table label (for referencing within LaTex document)
            table_type: Literal['table','sidewaystable']='table',
            font_size: str='\\scriptsize', # Gets applied to the table contents as well as its caption
            addtocounter: int=0, # Set to -1 for tables that are just a continuation of a table on a new page
            
            panel_titles: List[str] = None, # If 'dfs' is list, this param must have the same size as 'dfs' 
            ptitles_over_columns: bool=True,
            panel_alignment: str='c',
            space_bw_panels: str='\\rule{0pt}{3ex}',
            hlines_under_ptitles: bool=True,
            tex_tabular_env: str='tabularx',

            column_group_names: Dict[str, List[int]]=None, # see df_to_tex(); if 'dfs' is list, this param must be a list of dicts
            column_names: List[str]|bool=True, # see df_to_tex(); if 'dfs' is list, this param must be a list of lists
            hlines: List[int]=[] # see df_to_tex(); if 'dfs' is list, this param must be a list of lists
            ) -> str:
    """Builds a complete LaTeX table from one or more `to_df()` outputs.

    Returns the LaTeX source and, if 'outfile' is given, also writes it there
    (UTF-8). A single DataFrame yields one untitled panel. A list yields one
    panel per DataFrame, titled by 'panel_titles'.

    For a list, 'column_group_names', 'column_names' and 'hlines' may be lists
    with one entry per panel. A single dict/None, a bool, or a flat list of
    ints is applied to every panel, so the defaults also work for lists.

    'notes' are omitted when 'addtocounter' is not 0 (continuation tables).

    Raises `TypeError` if 'dfs' is neither a DataFrame nor a list. Raises
    `ValueError` if 'dfs' is an empty list, if a per-panel argument has fewer
    entries than there are panels, or if panel titles are not unique (panels
    are keyed by title, so duplicates would overwrite each other).
    """

    if hlines is None: hlines = []

    if isinstance(dfs, pd.DataFrame): 
        panels = {'': df_to_tex(dfs, column_group_names, column_names, hlines)} 
        nr_columns = [len(dfs.columns)]
    elif isinstance(dfs, list):
        nr_panels = len(dfs)
        if nr_panels == 0:
            raise ValueError("'dfs' must contain at least one DataFrame")

        # Apply scalar-style values to every panel; per-panel lists are used as given
        if column_group_names is None or isinstance(column_group_names, dict):
            group_names = [column_group_names] * nr_panels
        else:
            group_names = list(column_group_names)
        if isinstance(column_names, bool):
            col_names = [column_names] * nr_panels
        else:
            col_names = list(column_names)
        if all(isinstance(h, int) for h in hlines):
            panel_hlines = [hlines] * nr_panels
        else:
            panel_hlines = list(hlines)
        titles = [] if panel_titles is None else list(panel_titles)

        per_panel_args = {'panel_titles': titles, 'column_group_names': group_names,
                          'column_names': col_names, 'hlines': panel_hlines}
        for arg_name, values in per_panel_args.items():
            if len(values) < nr_panels:
                raise ValueError(f"'{arg_name}' needs one entry per panel: "
                                 f"expected {nr_panels}, got {len(values)}")

        titles = titles[:nr_panels]
        if len(set(titles)) != nr_panels:
            raise ValueError("'panel_titles' must be unique; panels with the same title would overwrite each other")

        panels = {ptitle: df_to_tex(panel_df, groups, names, lines)
                  for ptitle, panel_df, groups, names, lines
                  in zip(titles, dfs, group_names, col_names, panel_hlines)}
        nr_columns = [len(panel_df.columns) for panel_df in dfs]
    else:
        raise TypeError(f"'dfs' must be a pd.DataFrame or a list of them, not {type(dfs).__name__}")
    
    body = combine_panels(panels, nr_columns, ptitles_over_columns, panel_alignment, space_bw_panels, hlines_under_ptitles, tex_tabular_env)

    pre = "\\newpage \n \\clearpage \n "
    pre += f"\\begin{{{table_type}}}[!h] {font_size} \n"
    pre += f"\\addtocounter{{table}}{{{addtocounter}}} \n"
    pre += f"\\caption{{\\textbf{{{title}}}}} \n"

    post = f"\\label{{{label}}} \n \\end{{{table_type}}} \n"

    # Continuation tables (addtocounter != 0) do not repeat the notes
    notes_tex = ''
    if addtocounter==0: notes_tex = f"\\par {{{notes}}}"

    spacer = " \n\n \\vspace{1mm} \n\n "
    if notes_on_top:
        mid = notes_tex + spacer + f"{body} \n"
    else:
        mid = f"{body}" + spacer + notes_tex + ' \n'

    content = pre + mid + post

    if outfile:
        with open(outfile, "w", encoding="utf-8") as f:
            f.write(content) 
    return content

In [None]:
#|hide
import nbdev; nbdev.nbdev_export()