# core

> Python package that helps collect outputs of statistical analyses into tables and export them to LaTeX and PDF (similar to the ``estout`` and ``esttab`` commands in Stata).

In [None]:
#| default_exp core

In [None]:
#| hide 
from nbdev.showdoc import *
from nbdev.test import *

In [None]:
#| export
from typing import List, Dict, Tuple
import importlib
import numpy as np
import pandas as pd

import statsmodels.api as sm
from linearmodels import PanelOLS

from estout.utils import *

In [None]:
# Example data: 3 firms observed over 3 periods, with seeded uniform draws for y, x, z
# and a constant column, indexed by (firmid, time).
np.random.seed(123)
df = (
    pd.DataFrame(np.random.rand(9, 3), columns=['y', 'x', 'z'])
    .assign(
        firmid=[firm for firm in (1, 2, 3) for _ in range(3)],
        time=[1, 2, 3] * 3,
        cons=1,
    )
    .set_index(['firmid', 'time'])
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,y,x,z,cons
firmid,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,0.696469,0.286139,0.226851,1
1,2,0.551315,0.719469,0.423106,1
1,3,0.980764,0.68483,0.480932,1
2,1,0.392118,0.343178,0.72905,1
2,2,0.438572,0.059678,0.398044,1
2,3,0.737995,0.182492,0.175452,1
3,1,0.531551,0.531828,0.634401,1
3,2,0.849432,0.724455,0.611024,1
3,3,0.722443,0.322959,0.361789,1


In [None]:
# Example fits reused by the cells below:
#   sm1   - statsmodels OLS of y on a constant and x
#   sm2   - statsmodels OLS of y on a constant, x and z, with a HAC covariance (maxlags=2)
#   lmres - linearmodels PanelOLS with entity effects and a covariance clustered by entity
sm1 = sm.OLS(endog=df['y'], exog=df[['cons', 'x']]).fit()
sm2 = (
    sm.OLS(endog=df['y'], exog=df[['cons', 'x', 'z']])
    .fit()
    .get_robustcov_results(cov_type='HAC', maxlags=2)
)
lmres = PanelOLS(
    dependent=df['y'],
    exog=df[['cons', 'x', 'z']],
    entity_effects=True,
).fit(cov_type='clustered', cluster_entity=True)

In [None]:
#| export
def collect_stats(res, # fitted results object whose statistics are harvested
                  package: str, # package that produced 'res'; selects the f'estout.{package}_results' module
                  get_default_stats = True, # when True, extract every stat listed by that module's `default_stats()`
                  add_stats: dict=None, # extra stats: output key -> attribute of 'res' (looked up with `rgetattr`)
                  add_literals: dict=None, # extra key/value pairs copied as-is into the output dict
                  ) -> dict:
    """Gathers stats from 'res' into a dict. Later sources win: defaults, then 'add_stats', then 'add_literals'."""

    # The helper module is imported even when defaults are skipped, so an
    # unsupported 'package' fails right here.
    extractors = importlib.import_module(f"estout.{package}_results")

    collected = {'package': package}

    if get_default_stats:
        # Each default stat name is also the name of a function in the helper module.
        collected.update({name: rgetattr(extractors, name)(res)
                          for name in extractors.default_stats()})

    requested_attrs = {} if add_stats is None else add_stats
    for key, attr_name in requested_attrs.items():
        collected[key] = rgetattr(res, attr_name)

    if add_literals is not None:
        collected.update(add_literals)

    return collected

In [None]:
# One stats dict per fit: defaults only, defaults plus a literal label,
# and defaults plus two extra attributes read from the results object.
stats1 = collect_stats(sm1, package='statsmodels')
stats2 = collect_stats(
    sm2,
    package='statsmodels',
    add_literals={'Cov Type': 'Newey West'},
)
stats3 = collect_stats(
    lmres,
    package='linearmodels',
    add_stats={'r2b': 'rsquared_between', 'FE': 'included_effects'},
)

In [None]:
# Entries passed through `add_literals` must appear unchanged in the output dict.
assert stats2['Cov Type'] == 'Newey West'

In [None]:
# Show everything collected for the PanelOLS fit (defaults plus the 'r2b' and 'FE' extras).
stats3

{'package': 'linearmodels',
 'ynames': ['y'],
 'xnames': ['cons', 'x', 'z'],
 'params': cons    0.728016
 x       0.643274
 z      -0.774956
 Name: parameter, dtype: float64,
 'tstats': cons    167.358372
 x         2.262089
 z        -2.909770
 Name: tstat, dtype: float64,
 'pvalues': cons    7.646419e-09
 x       8.648041e-02
 z       4.368821e-02
 Name: pvalue, dtype: float64,
 'covmat':           cons         x         z
 cons  0.000019 -0.000652  0.000580
 x    -0.000652  0.080867 -0.075700
 z     0.000580 -0.075700  0.070931,
 'se': cons    0.004350
 x       0.284371
 z       0.266329
 dtype: float64,
 'r2': 0.35189790336774396,
 'nobs': 9,
 'r2b': 0.7954933715233714,
 'FE': ['Entity']}

In [None]:
# 'r2b' was requested via `add_stats` and is read from the results object's `rsquared_between` attribute.
stats3['r2b']

0.7954933715233714

In [None]:
# 'FE' was requested via `add_stats` and is read from the results object's `included_effects` attribute.
stats3['FE']

['Entity']

In [None]:
#| export
def to_df(res_list: List[dict], # list of outputs from `collect_stats()`
          which_xvars: list=None, # xvars (level-0 row labels) to keep; if None, report all xvars
          stats_body: list=['params', 'tstats'], # each element of 'res_list' needs to have these stats as keys; values must be pd.Series
          stats_bottom: list=['r2', 'nobs'], # each element of 'res_list' needs to have these stats as keys; values must be scalars
          labels: dict=None, # maps xvar names to the labels that replace them in the row index
          add_formats: dict=None # stat name -> format string; added to (and overriding) `default_formats()`
          ) -> pd.DataFrame: 
    """Combines results from multiple `collect_stats()` outputs into a single pd.DataFrame

    Each element of 'res_list' becomes one column (labelled 0, 1, ...). Rows are
    indexed by (xvar, stat) for the stats in 'stats_body', followed by one row
    per stat in 'stats_bottom'. Every 'stats_body' stat needs an entry in the
    formats dict. 'params' entries get the output of `get_stars(res['pvalues'])`
    appended; all other body stats are wrapped in parentheses. Missing cells are
    returned as empty strings and every column has 'string' dtype.
    """  
    
    formats = default_formats()
    if add_formats is not None: formats.update(add_formats)
    
    columns = []
    for res in res_list:
        # One column per body stat, rows indexed by xvar name.
        newcol = pd.concat([res[x] for x in stats_body], axis=1, ignore_index=True).set_axis(stats_body, axis=1)
        for x in stats_body:
            newcol[x] = newcol[x].map(formats[x].format)
            if x == 'params':
                newcol[x] += get_stars(res['pvalues'])
            else:
                newcol[x] = '(' + newcol[x] + ')'
        # Reshape to a single Series indexed by (xvar, stat).
        columns.append(newcol.stack(level=0))

    out = pd.concat(columns, axis = 1)
    # Only subset rows when a selection was requested: `.loc[None]` is not a
    # valid "select everything" lookup, so None must bypass the selection.
    if which_xvars is not None:
        out = out.loc[which_xvars]
    out = out.copy()

    # Append the scalar stats as extra rows; stats without a format are stored as-is.
    for i,res in enumerate(res_list):
        for x in stats_bottom:
            out.loc[x,i] = formats[x].format(res[x]) if x in formats else res[x]

    if labels is not None:
        for var in set(out.droplevel(1).index):
            if var in labels: out = out.rename(index={var:labels[var]}, level=0)            

    return out.astype('string').fillna('')

In [None]:
# Put the three sets of results side by side: keep cons/x/z, show r2 with
# two decimals, and relabel the constant as 'Intercept'.
d = to_df(
    [stats1, stats2, stats3],
    which_xvars=['cons', 'x', 'z'],
    labels={'cons': 'Intercept'},
    add_formats={'r2': '{:.2f}'},
)
d

Unnamed: 0,Unnamed: 1,0,1,2
Intercept,params,0.51***,0.70***,0.73***
Intercept,tstats,(3.91),(21.48),(167.36)
x,params,0.35,0.57**,0.64*
x,tstats,(1.29),(2.85),(2.26)
z,params,,-0.64**,-0.77**
z,tstats,,(-3.55),(-2.91)
r2,,0.19,0.49,0.35
nobs,,9,9,9


In [None]:
#| export
def to_tex(get_pdf=True, # currently unused; presumably toggles producing a pdf - TODO confirm once implemented
           open_pdf=False, # currently unused; presumably toggles opening that pdf - TODO confirm once implemented
           ):
    """Placeholder: not implemented yet. Ignores its arguments and returns None."""
    return None

In [None]:
#| hide
# Run nbdev's export step for this notebook.
import nbdev
nbdev.nbdev_export()