# wrds_compa

> Retrieve and process data from WRDS Compustat Annual

Because this is a proprietary dataset, the documentation below cannot show any of the data that is retrieved or generated; only column names are shown.

In [None]:
#| default_exp wrds.compa

In [None]:
#|exports
from __future__ import annotations
from pathlib import Path
from typing import List
import os

import pandas as pd
import numpy as np

import pandasmore as pdm
from finsets.wrds import wrds_api
from finsets import RESOURCES

In [None]:
#| export 
def variable_labels(rawfile: str|Path=RESOURCES/'compa_variable_descriptions.csv', # location of the raw variable labels file
             ) -> pd.DataFrame:
    """Read the raw variable-descriptions CSV and return it as a tidy `name`/`label`/`type` table.

    The label is the `Description` text with the `NAME -- ` prefix and any
    `(NAME)` marker removed, where NAME is the stripped `Variable Name`.
    Names are returned stripped and lower-cased.
    """

    def _label_from_row(row):
        # Use the stripped (but not yet lower-cased) name, as it appears inside the description text
        raw_name = row['Variable Name'].strip()
        without_prefix = row['Description'].replace(raw_name + ' -- ', '')
        return without_prefix.replace('(' + raw_name + ')', '')

    raw = pd.read_csv(rawfile)
    raw['Variable Label'] = raw.apply(_label_from_row, axis=1)
    raw['Variable Name'] = raw['Variable Name'].str.strip().str.lower()

    tidy = raw[['Variable Name', 'Variable Label', 'Type']]
    return tidy.rename(columns={'Variable Name': 'name',
                                'Variable Label': 'label',
                                'Type': 'type'})

In [None]:
variable_labels()

Unnamed: 0,name,label,type
0,gvkey,Global Company Key,string
1,conm,Company Name,string
2,tic,Ticker Symbol,string
3,cusip,CUSIP,string
4,cik,CIK Number,string
...,...,...,...
969,prch_c,Price High - Annual - Calendar,double
970,prch_f,Price High - Annual - Fiscal,double
971,prcl_c,Price Low - Annual - Calendar,double
972,prcl_f,Price Low - Annual - Fiscal,double


In [None]:
#| export
def metadata(wrds_username: str=None
             ) -> pd.DataFrame:
    """Collect column metadata for the WRDS `comp.funda` table and attach labels from `variable_labels`.

    The username is taken from the argument, then from the `WRDS_USERNAME`
    environment variable, and finally from an interactive prompt.
    Every column of the returned frame has pandas `string` dtype.
    """

    # Resolve the username: explicit argument -> environment -> prompt
    if wrds_username is None:
        wrds_username = os.getenv('WRDS_USERNAME')
    if wrds_username is None:
        wrds_username = input("Enter your WRDS username: ") 

    with wrds_api.Connection(wrds_username = wrds_username) as db:
        table_info = db.describe_table('comp','funda')
        row_count = db.get_row_count('comp','funda')

    labels = variable_labels()[['name','label']]
    out = (table_info[['name','type']]
           .assign(nr_rows=row_count, wrds_library='comp', wrds_table='funda')
           .merge(labels, how='left', on='name'))   # left merge: columns without a label are kept

    out['output_of'] = 'wrds.compa.download()'
    out = pdm.order_columns(out, these_first=['name','label','output_of'])

    return out.astype('string')

In [None]:
#| eval: false
metadata()

Loading library list...
Done
Approximately 879854 rows in comp.funda.


Unnamed: 0,name,label,output_of,type,nr_rows,wrds_library,wrds_table
0,gvkey,Global Company Key,wrds.compa.download(),VARCHAR(6),879854,comp,funda
1,datadate,,wrds.compa.download(),DATE,879854,comp,funda
2,fyear,Data Year - Fiscal,wrds.compa.download(),DOUBLE_PRECISION,879854,comp,funda
3,indfmt,,wrds.compa.download(),VARCHAR(12),879854,comp,funda
4,consol,,wrds.compa.download(),VARCHAR(2),879854,comp,funda
...,...,...,...,...,...,...,...
943,au,Auditor,wrds.compa.download(),VARCHAR(8),879854,comp,funda
944,auop,Auditor Opinion,wrds.compa.download(),VARCHAR(8),879854,comp,funda
945,auopic,Auditor Opinion - Internal Control,wrds.compa.download(),VARCHAR(1),879854,comp,funda
946,ceoso,Chief Executive Officer SOX Certification,wrds.compa.download(),VARCHAR(1),879854,comp,funda


In [None]:
#| export
def default_raw_vars():
    """Default variables used in `download` if none are specified.

    Returns a fresh list on every call, so callers may modify it freely.
    """

    # Whitespace-separated so the set of names is easy to scan and edit
    names = """
        datadate gvkey cusip cik tic fyear fyr naicsh sich exchg
        lt at txditc pstkl pstkrv pstk csho ajex rdip
        act dvc xad seq che lct dlc ib dvp txdi dp
        txp oancf ivncf fincf dltt mib ceq invt cogs revt
        sale capx xrd txdb prcc_f sstk prstkc dltis dltr emp
        dd1 ppegt ppent xint txt sppe gdwl xrent re dvpsx_f
        tstk wcap rect xsga aqc oibdp dpact fic ni ivao ivst
        dv intan pi txfo pifo xpp drc drlt ap xacc itcb
    """
    return names.split()

In [None]:
print(default_raw_vars())

['datadate', 'gvkey', 'cusip', 'cik', 'tic', 'fyear', 'fyr', 'naicsh', 'sich', 'exchg', 'lt', 'at', 'txditc', 'pstkl', 'pstkrv', 'pstk', 'csho', 'ajex', 'rdip', 'act', 'dvc', 'xad', 'seq', 'che', 'lct', 'dlc', 'ib', 'dvp', 'txdi', 'dp', 'txp', 'oancf', 'ivncf', 'fincf', 'dltt', 'mib', 'ceq', 'invt', 'cogs', 'revt', 'sale', 'capx', 'xrd', 'txdb', 'prcc_f', 'sstk', 'prstkc', 'dltis', 'dltr', 'emp', 'dd1', 'ppegt', 'ppent', 'xint', 'txt', 'sppe', 'gdwl', 'xrent', 're', 'dvpsx_f', 'tstk', 'wcap', 'rect', 'xsga', 'aqc', 'oibdp', 'dpact', 'fic', 'ni', 'ivao', 'ivst', 'dv', 'intan', 'pi', 'txfo', 'pifo', 'xpp', 'drc', 'drlt', 'ap', 'xacc', 'itcb']


In [None]:
#| export
def download(vars: List[str]=None, # If None, downloads `default_raw_vars`; `gvkey`, `datadate`, `permno`, `permco`, and `iid` are always included
             wrds_username: str=None, #If None, looks for WRDS_USERNAME with `os.getenv`, then prompts you if needed
             start_date: str="01/01/1900", # Start date in MM/DD/YYYY format
             end_date: str=None #End date in MM/DD/YYYY format; if None, defaults to current date
             ) -> pd.DataFrame:
    """Downloads `vars` from `start_date` to `end_date` from WRDS `comp.funda` library and adds PERMNO and PERMCO as in CCM

    `comp.funda` is inner-joined to `crsp.ccmxpf_lnkhist` on `gvkey`, keeping rows whose
    `datadate` falls inside the link window, with link types `LU`/`LC` and link primacy `P`/`C`.
    Only records with `indfmt='INDL'`, `datafmt='STD'`, `popsrc='D'`, `consol='C'` are returned.
    """

    if vars is None: vars = default_raw_vars()
    # gvkey and datadate always come first; skip them in `vars` to avoid duplicate columns
    columns = ','.join(['a.gvkey', 'a.datadate'] + 
                       [f'a.{x}' for x in vars if x not in ['datadate', 'gvkey']])

    # Column names cannot be bound as parameters, so they are interpolated;
    # both date bounds are passed as bound parameters rather than spliced into the SQL text.
    sql_string=f"""SELECT b.lpermno as permno, b.lpermco as permco, b.liid as iid, {columns}
                    FROM comp.funda AS a
                    INNER JOIN crsp.ccmxpf_lnkhist AS b ON a.gvkey = b.gvkey
                    WHERE datadate BETWEEN b.linkdt AND COALESCE(b.linkenddt, CURRENT_DATE)
                            AND b.linktype IN ('LU','LC') AND b.linkprim IN ('P','C')
                            AND indfmt='INDL' AND datafmt='STD' AND popsrc='D' AND consol='C'
                            AND datadate BETWEEN %(start)s AND COALESCE(%(end)s, CURRENT_DATE)
                """
    return wrds_api.download(sql_string, wrds_username=wrds_username,
                             params={'start': start_date, 'end': end_date})

In [None]:
#| eval: false
raw = download(start_date='01/01/2022')

Loading library list...
Done


In [None]:
#| eval: false
raw.head(0)

Unnamed: 0,permno,permco,iid,gvkey,datadate,cusip,cik,tic,fyear,fyr,...,intan,pi,txfo,pifo,xpp,drc,drlt,ap,xacc,itcb


In [None]:
#| export
def clean(df: pd.DataFrame=None, # If None, downloads `vars` using `download` function; else, must contain `permno` and `datadate` columns
          vars: List[str]=None, # If None, downloads `default_raw_vars`
          wrds_username: str=None, #If None, looks for WRDS_USERNAME with `os.getenv`, then prompts you if needed
          start_date: str="01/01/1900", # Start date in MM/DD/YYYY format
          end_date: str=None, # End date. Default is current date          
          clean_kwargs: dict=None, # Params to pass to `pdm.setup_panel` other than `panel_ids`, `time_var`, and `freq`; None means no extra params
          ) -> pd.DataFrame:
    """Applies `pandasmore.setup_panel` to `df`. If `df` is None, downloads `vars` using `download` function.

    The panel is set up with `panel_ids='permno'`, `time_var='datadate'` and `freq='Y'`.
    `vars`, `wrds_username`, `start_date` and `end_date` are only used when `df` is None.
    """

    if df is None: df = download(vars=vars, wrds_username=wrds_username, start_date=start_date, end_date=end_date)
    # None sentinel instead of a shared mutable `{}` default
    extra_kwargs = {} if clean_kwargs is None else clean_kwargs
    return pdm.setup_panel(df, panel_ids='permno', time_var='datadate', freq='Y', **extra_kwargs)

In [None]:
#| eval: false
df = clean(raw)

In [None]:
#| eval: false
df.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,dtdate,permco,iid,gvkey,cusip,cik,tic,fyear,fyr,...,intan,pi,txfo,pifo,xpp,drc,drlt,ap,xacc,itcb
permno,Ydate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1


In [None]:
#| export
def book_equity(df: pd.DataFrame=None, # If None, downloads (and cleans) only required vars
                add_itcb=False, # If True, adds `itcb` (missing treated as 0) to `bookeq`
                return_metadata: bool=False, # If True, just returns the metadata dict (inputs, outputs, labels)
                list_reqs: bool=False # If True, just returns a list of the required variables
                ) -> pd.DataFrame:
    """Computes book equity (`bookeq`), shareholder equity (`shreq`) and preferred stock (`pref_stock`).

    - `pref_stock`: `pstkrv`, else `pstkl`, else `pstk` (missing `pstk` treated as 0).
    - `shreq`: `seq`, else `ceq + pstk`, else `at - lt`.
    - `bookeq`: `shreq + txditc - pref_stock` (missing `txditc` treated as 0), plus `itcb` if `add_itcb`.

    Returns a DataFrame with those three columns on the index of `df`; the input is not modified.
    With `return_metadata=True` a dict is returned instead, and with `list_reqs=True` a list of
    required input column names (same convention as the other `*_vars` functions in this module).
    """

    metadata = {'inputs': {'wrds.compa.clean()': ['at', 'lt', 'seq', 'ceq', 'txditc', 'pstk', 'pstkrv', 'pstkl', 'itcb']},
                'outputs': {'wrds.compa.book_equity()': ['bookeq','shreq','pref_stock']},
                'labels': {'bookeq': 'Book equity', 'shreq': 'Shareholder equity', 'pref_stock': 'Preferred stock'}
    }      
    if return_metadata: return metadata

    reqs = metadata['inputs']['wrds.compa.clean()']
    if list_reqs: return list(reqs)
    if df is None: df = clean(vars=reqs)
    df = df[reqs].copy()    # work on a copy so the caller's frame is untouched

    # Preferred stock: first available of pstkrv, pstkl, pstk (pstk defaults to 0)
    df['pstk'] = df['pstk'].fillna(0)
    df['pref_stock'] = np.where(df['pstkrv'].isnull(), df['pstkl'], df['pstkrv'])
    df['pref_stock'] = np.where(df['pref_stock'].isnull(),df['pstk'], df['pref_stock'])

    # Shareholder equity: first available of seq, ceq + pstk, at - lt
    df['shreq'] = np.where(df['seq'].isnull(), df['ceq'] + df['pstk'], df['seq'])
    df['shreq'] = np.where(df['shreq'].isnull(), df['at'] - df['lt'], df['shreq'])

    df['bookeq'] = df['shreq'] + df['txditc'].fillna(0) - df['pref_stock']
    if add_itcb: df['bookeq'] = df['bookeq'] + df['itcb'].fillna(0)
    
    return df[metadata['outputs']['wrds.compa.book_equity()']].copy()

In [None]:
#| eval: false
book_equity(return_metadata=True)

{'inputs': {'wrds.compa.clean()': ['at',
   'lt',
   'seq',
   'ceq',
   'txditc',
   'pstk',
   'pstkrv',
   'pstkl',
   'itcb']},
 'outputs': {'wrds.compa.book_equity()': ['bookeq', 'shreq', 'pref_stock']},
 'labels': {'bookeq': 'Book equity',
  'shreq': 'Shareholder equity',
  'pref_stock': 'Preferred stock'}}

In [None]:
#| eval: false
beq = book_equity(df)

NameError: name 'df' is not defined

In [None]:
#| eval: false
beq.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,bookeq,shreq,pref_stock
permno,Ydate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [None]:
#| eval: false
beq_from_scratch = book_equity()

Loading library list...
Done


In [None]:
#| eval: false
beq_from_scratch.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,bookeq,shreq,pref_stock
permno,Ydate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [None]:
#| export 
def investment_vars(df: pd.DataFrame=None, # If None, downloads (and cleans) only required vars 
                    list_reqs: bool=False # If true, just returns a list of the required variables
                    ) -> pd.DataFrame:
    """Computes investment variables `ppentpch` and `capx2la` from `ppent`, `capx` and `at`.

    - `ppentpch`: `pdm.rpct_change` applied to `ppent`.
    - `capx2la`: `capx` divided by `pdm.lag` of `at`.

    Returns a DataFrame with those two columns; the input is not modified.
    """
    
    reqs = ['ppent','capx','at']
    if list_reqs: return reqs
    # Honor the documented default: fetch only the required variables when no frame is given
    if df is None: df = clean(vars=reqs)
    df = df[reqs].copy()

    df['ppentpch'] = pdm.rpct_change(df['ppent'])
    df['capx2la'] = df['capx'] / pdm.lag(df['at'])

    return df[['ppentpch','capx2la']].copy()

In [None]:
#| eval: false
inv = investment_vars(df)

In [None]:
#| eval: false
inv.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,ppentpch,capx2la
permno,Ydate,Unnamed: 2_level_1,Unnamed: 3_level_1


In [None]:
#| export 
def profitability_vars(df: pd.DataFrame, 
                    list_reqs: bool=False # If true, just returns a list of the required variables
                    ) -> pd.DataFrame:
    """Computes `roa` as `ib` divided by `at`.

    Returns a one-column DataFrame on the index of `df`; the input is not modified.
    """

    needed = ['ib','at']
    if list_reqs:
        return needed

    inputs = df[needed]     # fails early if a required column is missing
    ratio = inputs['ib'] / inputs['at']

    return pd.DataFrame({'roa': ratio})

In [None]:
#| eval: false
prof = profitability_vars(df)

In [None]:
#| eval: false
prof.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,roa
permno,Ydate,Unnamed: 2_level_1


In [None]:
#| export 
def cashflow_vars(df: pd.DataFrame, 
                    list_reqs: bool=False # If true, just returns a list of the required variables
                    ) -> pd.DataFrame:
    """Computes cash-flow measures scaled by `pdm.lag` of `at`.

    - `cflow2la_is`: `(ib + dp)` over lagged `at`.
    - `cflow2la_cfs`: `oancf` over lagged `at`.
    - `cflow2la_full`: `cflow2la_is` where the year of `dtdate` is before 1987, else `cflow2la_cfs`.

    Returns a DataFrame with those three columns; the input is not modified.
    """

    needed = ['dtdate','oancf','ib','dp','at']
    if list_reqs:
        return needed

    data = df[needed].copy()
    lagged_assets = pdm.lag(data['at'])     # shared denominator for both measures

    data['cflow2la_is'] = (data['ib'] + data['dp']) / lagged_assets
    data['cflow2la_cfs'] = data['oancf'] / lagged_assets

    before_1987 = data['dtdate'].dt.year < 1987
    data['cflow2la_full'] = np.where(before_1987, data['cflow2la_is'], data['cflow2la_cfs'])

    return data[['cflow2la_is', 'cflow2la_cfs', 'cflow2la_full']].copy()

In [None]:
#| eval: false
cflow = cashflow_vars(df)

In [None]:
#| eval: false
cflow.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,cflow2la_is,cflow2la_cfs,cflow2la_full
permno,Ydate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [None]:
#| export 
def liquidity_vars(df: pd.DataFrame, 
                    list_reqs: bool=False # If true, just returns a list of the required variables
                    ) -> pd.DataFrame:
    """Computes `cash2a` as `che` divided by `at`.

    Returns a one-column DataFrame on the index of `df`; the input is not modified.
    """

    needed = ['che','at']
    if list_reqs:
        return needed

    inputs = df[needed]     # fails early if a required column is missing
    cash_ratio = inputs['che'] / inputs['at']

    return cash_ratio.rename('cash2a').to_frame()

In [None]:
#| eval: false
liq = liquidity_vars(df)

In [None]:
#| eval: false
liq.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,cash2a
permno,Ydate,Unnamed: 2_level_1


In [None]:
#| export 
def leverage_vars(df: pd.DataFrame, 
                    list_reqs: bool=False # If true, just returns a list of the required variables
                    ) -> pd.DataFrame:
    """Computes `booklev` as `(dltt + dlc) / at`, limited to the interval [0, 1].

    Values below 0 are set to 0 and values above 1 are set to 1; missing values stay missing.
    Returns a one-column DataFrame on the index of `df`; the input is not modified.
    """

    needed = ['dltt','dlc','at']
    if list_reqs:
        return needed

    inputs = df[needed]     # fails early if a required column is missing
    raw_leverage = (inputs['dltt'] + inputs['dlc']) / inputs['at']
    bounded = raw_leverage.clip(lower=0, upper=1)

    return bounded.to_frame('booklev')

In [None]:
#| eval: false
lev = leverage_vars(df)

In [None]:
#| eval: false
lev.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,booklev
permno,Ydate,Unnamed: 2_level_1


In [None]:
#| export 
def payout_vars(df: pd.DataFrame, 
                    list_reqs: bool=False # If true, just returns a list of the required variables
                    ) -> pd.DataFrame:
    """Computes payout measures scaled by `pdm.lag` of `at`.

    - `div2la`: `dvc` (missing treated as 0) over lagged `at`.
    - `rep2la`: `prstkc` (missing treated as 0) over lagged `at`.

    Returns a DataFrame with those two columns; the input is not modified.
    """

    needed = ['dvc','prstkc','at']
    if list_reqs:
        return needed

    data = df[needed].copy()
    lagged_assets = pdm.lag(data['at'])     # shared denominator for both measures

    data['div2la'] = data['dvc'].fillna(0) / lagged_assets
    data['rep2la'] = data['prstkc'].fillna(0) / lagged_assets

    return data[['div2la','rep2la']].copy()

In [None]:
#| eval: false
payout = payout_vars(df)

In [None]:
#| eval: false
payout.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,div2la,rep2la
permno,Ydate,Unnamed: 2_level_1,Unnamed: 3_level_1


In [None]:
#| export 
def value_vars(df: pd.DataFrame, 
                list_reqs: bool=False # If true, just returns a list of the required variables
                ) -> pd.DataFrame:
    """Computes `tobinq` as `(at - bookeq + prcc_f * csho) / at`, with `bookeq` from `book_equity`.

    Returns a one-column DataFrame on the index of `df`; the input is not modified.
    """

    own_reqs = ['at','prcc_f','csho']
    # `book_equity` exposes its required inputs through its metadata dict (it has no `list_reqs` flag)
    beq_reqs = book_equity(return_metadata=True)['inputs']['wrds.compa.clean()']
    reqs = own_reqs + [x for x in beq_reqs if x not in own_reqs]
    if list_reqs: return reqs
    df = df[reqs].copy()

    beq = book_equity(df)[['bookeq']].copy()
    df = df.join(beq)

    df['tobinq'] = (df['at'] - df['bookeq'] + df['prcc_f'] * df['csho']) / df['at']

    return  df[['tobinq']].copy()

In [None]:
#| eval: false
tobinq = value_vars(df)

In [None]:
#| eval: false
tobinq.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,tobinq
permno,Ydate,Unnamed: 2_level_1


In [None]:
#| export
def issuance_vars(df: pd.DataFrame, 
                list_reqs: bool=False # If true, just returns a list of the required variables
                ) -> pd.DataFrame:
    """Computes equity and debt issuance measures, all scaled by `pdm.lag` of `at`.

    - `equityiss_cfs`: `sstk - prstkc` (missing treated as 0).
    - `debtiss_cfs`: `dltis - dltr` (missing treated as 0).
    - `debtiss_bs`: `pdm.rdiff` of `dltt` plus `pdm.rdiff` of `dlc` (missing `dlc` treated as 0).
    - `equityiss_tot`: `pdm.rdiff` of `bookeq` minus `pdm.rdiff` of `re`.
    - `debtiss_tot`: `pdm.rdiff` of `at` minus `pdm.rdiff` of `bookeq`.

    `bookeq` comes from `book_equity`. Returns a DataFrame with those five columns;
    the input is not modified.
    """

    reqs_subset = ['at','sstk','prstkc','dltis','dltr', 're', 'dlc','dltt']
    # `book_equity` exposes its required inputs through its metadata dict (it has no `list_reqs` flag)
    beq_reqs = book_equity(return_metadata=True)['inputs']['wrds.compa.clean()']
    reqs = reqs_subset + [x for x in beq_reqs if x not in reqs_subset]
    if list_reqs: return reqs
    df = df[reqs].copy()

    beq = book_equity(df)[['bookeq']].copy()
    df = df.join(beq)
    
    df['lag_at'] = pdm.lag(df['at'])
    bookeq_change = pdm.rdiff(df['bookeq'])     # used by both *_tot measures

    df['equityiss_cfs'] = (df['sstk'].fillna(0) - df['prstkc'].fillna(0)) / df['lag_at']
    df['debtiss_cfs'] = (df['dltis'].fillna(0) - df['dltr'].fillna(0)) / df['lag_at']

    df['debtiss_bs'] = (pdm.rdiff(df['dltt']) + pdm.rdiff(df['dlc'].fillna(0))) / df['lag_at']

    df['equityiss_tot'] = (bookeq_change - pdm.rdiff(df['re'])) / df['lag_at']
    df['debtiss_tot'] = (pdm.rdiff(df['at']) - bookeq_change) / df['lag_at']

    return df[['equityiss_tot','equityiss_cfs', 'debtiss_tot', 'debtiss_cfs', 'debtiss_bs']].copy()

In [None]:
#| eval: false
iss = issuance_vars(df)

In [None]:
#| eval: false
iss.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,equityiss_tot,equityiss_cfs,debtiss_tot,debtiss_cfs,debtiss_bs
permno,Ydate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()