# peters_taylor_2016

> Total Q and intangible capital measures from Peters and Taylor (2016)

In [None]:
#| default_exp papers.peters_taylor_2016

In [None]:
#|exports
from __future__ import annotations
from typing import List

import pandas as pd
import numpy as np

import pandasmore as pdm
from finsets.wrds import wrds_api

In [None]:
#| exports
# Descriptive metadata about where the dataset comes from.
PROVIDER = 'Wharton Research Data Services (WRDS)'
URL = 'https://wrds-www.wharton.upenn.edu/pages/get-data/peters-and-taylor-total-q/peters-and-taylor-total-q/'
# WRDS library/table holding the total-q data (used by `list_all_vars` and `get_raw_data`).
LIBRARY = 'totalq'
TABLE = 'total_q'
# WRDS library/table used by `get_raw_data` to join `gvkey` to `permno`/`permco`.
LINK_LIBRARY = 'crsp'
LINK_TABLE = 'ccmxpf_lnkhist'
# Frequency code passed to `pdm.setup_panel` in `process_raw_data`.
FREQ = 'A'
# NOTE(review): MIN_YEAR / MAX_YEAR are not referenced in the code visible here;
# presumably sample-period bounds consumed elsewhere -- confirm.
MIN_YEAR = 1950
MAX_YEAR = None
# Entity identifier column: `ENTITY_ID_IN_RAW_DSET` is passed as `panel_ids` in `process_raw_data`.
# NOTE(review): `ENTITY_ID_IN_CLEAN_DSET` is not referenced in the code visible here.
ENTITY_ID_IN_RAW_DSET = 'permno'
ENTITY_ID_IN_CLEAN_DSET = 'permno'
# Time column: the raw name is passed as `time_var` in `process_raw_data`;
# the clean name evaluates to 'Adate' given FREQ above.
TIME_VAR_IN_RAW_DSET = 'datadate'
TIME_VAR_IN_CLEAN_DSET = f'{FREQ}date'

In [None]:
#| export
def list_all_vars() -> pd.DataFrame:
    """Collects names and types of all variables available in the WRDS `LIBRARY.TABLE` table.

    Returns a DataFrame with columns `name`, `type`, `wrds_library` and `wrds_table`.
    """

    # Open the connection *before* entering `try`: if the connection attempt itself
    # fails, its own exception propagates instead of being masked by an
    # UnboundLocalError raised from `db.close()` in the `finally` clause.
    db = wrds_api.Connection()
    try:
        table_info = db.describe_table(LIBRARY,TABLE).assign(wrds_library=LIBRARY, wrds_table=TABLE)
    finally:
        # Always release the connection, even if describing the table fails.
        db.close()

    return table_info[['name','type','wrds_library','wrds_table']].copy()

In [None]:
#| eval: false
# Example: fetch the variable listing for the total-q table (opens a WRDS connection).
all_vars = list_all_vars()

In [None]:
#| eval: false
all_vars

Unnamed: 0,name,type,wrds_library,wrds_table
0,gvkey,VARCHAR(6),totalq,total_q
1,datadate,DATE,totalq,total_q
2,fyear,DOUBLE_PRECISION,totalq,total_q
3,k_int_know,DOUBLE_PRECISION,totalq,total_q
4,k_int_org,DOUBLE_PRECISION,totalq,total_q
5,k_int_offbs,DOUBLE_PRECISION,totalq,total_q
6,k_int,DOUBLE_PRECISION,totalq,total_q
7,q_tot,DOUBLE_PRECISION,totalq,total_q


In [None]:
#| export
def get_raw_data(
        vars: List[str]='*', # Default is to get all available variables
        required_vars: List[str]=['gvkey','datadate'], #list of variables that will get downloaded, even if not in `vars`
        nrows: int=None, #Number of rows to download. If None, full dataset will be downloaded
        start_date: str=None, # Start date in MM/DD/YYYY format
        end_date: str=None #End date in MM/DD/YYYY format
) -> pd.DataFrame:
    """Downloads the WRDS total-q table (`LIBRARY.TABLE`) from `start_date` to `end_date` and adds PERMNO and PERMCO as in CCM.

    The query returns every column of the total-q table, the CCM link fields
    (`permno`, `permco`, `liid`, `linkprim`) and a fixed set of Compustat `comp.funda`
    variables (`xrd`, `xsga`, `cogs`, `rdip`, `at`, `capx`, `ppegt`, `ppent`, `dp`).

    NOTE: `vars` and `required_vars` are accepted for interface compatibility but are
    not used by the query below; all columns listed above are always downloaded.
    """
 
    wrds_api.validate_dates([start_date, end_date])

    # The Compustat format filters (indfmt/datafmt/popsrc/consol) are placed in the ON
    # clause of the LEFT JOIN so that (a) each total-q row matches at most one funda
    # record per gvkey-datadate instead of being duplicated once per reporting format,
    # and (b) total-q rows without a matching funda record are still kept.
    sql_string=f"""SELECT c.lpermno as permno, c.lpermco as permco, c.liid, c.linkprim as linkprim, 
                          a.*, 
                          b.xrd, b.xsga, b.cogs, b.rdip, b.at, b.capx, b.ppegt, b.ppent, b.dp
                    FROM {LIBRARY}.{TABLE} AS a
                    LEFT JOIN comp.funda AS b ON a.gvkey = b.gvkey AND a.datadate = b.datadate
                            AND b.indfmt = 'INDL' AND b.datafmt = 'STD' AND b.popsrc = 'D' AND b.consol = 'C'
                    INNER JOIN {LINK_LIBRARY}.{LINK_TABLE} AS c ON a.gvkey = c.gvkey 
                    WHERE a.datadate BETWEEN c.linkdt AND COALESCE(c.linkenddt, CURRENT_DATE)
                            AND c.linktype IN ('LU','LC') AND c.linkprim IN ('P','C')
                """
    # Optional filters are appended as bound parameters (never string-interpolated).
    if start_date is not None: sql_string += r" AND a.datadate >= %(start_date)s"
    if end_date is not None: sql_string += r" AND a.datadate <= %(end_date)s"
    if nrows is not None: sql_string += r" LIMIT %(nrows)s"
    
    return wrds_api.download(sql_string,
                             params={'start_date':start_date, 'end_date':end_date, 'nrows':nrows})

In [None]:
#| eval: false
# Example: download a small sample (the query is capped with LIMIT 1000) and display it.
raw = get_raw_data(vars='*', nrows=1000)
raw

Unnamed: 0,permno,permco,liid,linkprim,gvkey,datadate,fyear,k_int_know,k_int_org,k_int_offbs,...,q_tot,xrd,xsga,cogs,rdip,at,capx,ppegt,ppent,dp
0,25881.0,23369.0,01,P,001000,1970-12-31,1970.0,0.0,5.328974,5.328974,...,0.817259,,9.420,30.529,,33.450,2.767,14.517,8.876,1.352
1,25881.0,23369.0,01,P,001000,1971-12-31,1971.0,0.0,7.607690,7.607690,...,0.508109,,10.548,33.973,,29.330,1.771,13.269,7.639,1.399
2,25881.0,23369.0,01,P,001000,1971-12-31,1971.0,0.0,7.607690,7.607690,...,0.508109,,,23.380,,,,,,0.868
3,25881.0,23369.0,01,P,001000,1972-12-31,1972.0,0.0,8.550650,8.550650,...,0.516247,,7.551,22.702,,19.907,1.254,11.709,7.013,1.200
4,25881.0,23369.0,01,P,001000,1972-12-31,1972.0,0.0,8.550650,8.550650,...,0.516247,,,21.444,,,,,,0.933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,58958.0,25619.0,01,P,001059,1977-07-31,1977.0,0.0,86.632510,86.632510,...,-0.217882,,75.810,203.647,,170.852,4.974,33.030,21.142,2.091
996,58958.0,25619.0,01,P,001059,1977-07-31,1977.0,0.0,86.632510,86.632510,...,-0.217882,,75.907,203.647,,170.852,,,21.142,1.994
997,58958.0,25619.0,01,P,001059,1978-07-31,1978.0,0.0,99.602570,99.602570,...,-0.202014,,83.362,236.856,,202.794,10.214,44.028,28.967,2.522
998,58958.0,25619.0,01,P,001059,1978-07-31,1978.0,0.0,99.602570,99.602570,...,-0.202014,,83.463,236.856,,202.794,,,28.967,2.421


In [None]:
#| export
def process_raw_data(
        df: pd.DataFrame=None,  # Must contain `permno` and `datadate` columns   
        clean_kwargs: dict={},  # Params to pass to `pdm.setup_panel` other than `panel_ids`, `time_var`, and `freq`
) -> pd.DataFrame:
    """Converts identifier columns to categoricals, then applies `pandasmore.setup_panel` to `df`.

    Identifier columns present in `df` are converted in place (the caller's DataFrame is modified).
    `panel_ids_toint=False` is also passed to `pdm.setup_panel`, so it must not appear in `clean_kwargs`.
    """

    # Identifier column -> intermediate dtype applied before the categorical conversion.
    intermediate_dtypes = {'permno': 'Int64', 'permco': 'Int64', 'gvkey': 'string'}
    for id_col, dtype in intermediate_dtypes.items():
        if id_col not in df.columns:
            continue
        df[id_col] = df[id_col].astype(dtype).astype('category')

    # Set up panel structure
    return pdm.setup_panel(
        df,
        panel_ids=ENTITY_ID_IN_RAW_DSET,
        time_var=TIME_VAR_IN_RAW_DSET,
        freq=FREQ,
        panel_ids_toint=False,
        **clean_kwargs,
    )

In [None]:
#| eval: false
# Example: build the panel from the sample above (note: also converts id columns of `raw` in place).
df_clean = process_raw_data(raw)

In [None]:
#| eval: false
df_clean

Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,dtdate,permco,liid,linkprim,gvkey,fyear,k_int_know,k_int_org,k_int_offbs,...,q_tot,xrd,xsga,cogs,rdip,at,capx,ppegt,ppent,dp
permno,Adate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10006,1951,1951-04-30,1951-04-30,22156,00X,C,001010,1950.0,0.417389,0.000000,0.417389,...,,,,,,145.100,3.500,118.300,60.400,3.970
10006,1952,1952-04-30,1952-04-30,22156,00X,C,001010,1951.0,1.028123,0.000000,1.028123,...,,,,,,149.500,4.600,114.200,57.200,3.930
10006,1953,1953-04-30,1953-04-30,22156,00X,C,001010,1952.0,1.838980,3.843000,5.681980,...,,,12.810,219.770,,165.200,3.200,108.100,51.500,4.110
10006,1954,1954-04-30,1954-04-30,22156,00X,C,001010,1953.0,2.938649,7.366661,10.305310,...,,,14.230,208.390,,153.100,8.700,109.500,54.200,3.800
10006,1955,1955-04-30,1955-04-30,22156,00X,C,001010,1954.0,4.466903,10.267680,14.734590,...,,1.960,16.470,155.620,,172.800,14.600,121.100,63.100,4.520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85041,1992,1992-06-30,1992-06-30,7357,01,P,001055,1992.0,62.660400,144.561500,207.221900,...,-0.289738,,195.734,639.026,,580.613,17.811,,63.787,11.793
85041,1993,1993-06-30,1993-06-30,7357,01,P,001055,1993.0,70.681720,177.075700,247.757400,...,-0.347495,,225.276,1113.230,,886.159,20.894,,94.922,13.222
85041,1994,1994-06-30,1994-06-30,7357,01,P,001055,1994.0,82.367810,225.553300,307.921100,...,-0.280901,,306.244,1993.680,,1005.620,30.045,,103.441,25.861
85041,1995,1995-06-30,1995-06-30,7357,01,P,001055,1995.0,87.190140,279.999400,367.189500,...,0.068307,36.383,351.365,2196.797,,1021.501,26.080,165.261,101.255,25.311


In [None]:
#| export
def features(df: pd.DataFrame=None
             ) -> pd.DataFrame:
    """Computes capital stock, investment and investment-rate variables on a copy of `df`.

    Reads columns `xrd`, `xsga`, `at`, `cogs`, `rdip`, `ppegt`, `capx` and `k_int`, and adds
    `xrd0`, `xsga0`, `sga`, `k_phy`, `k_tot`, `i_phy`, `i_int`, `i_tot`, `i2k_int`,
    `i2k_phy` and `i2k_tot`. Infinite values anywhere in the result are replaced with NaN.
    """

    out = df.copy()

    assets_missing = out['at'].isna()
    for raw_col in ('xrd', 'xsga'):
        filled_col = f'{raw_col}0'
        # Missing expense with non-missing total assets is treated as zero.
        set_to_zero = out[raw_col].isna() & ~assets_missing
        out[filled_col] = np.where(set_to_zero, 0, out[raw_col])
        # Where both the expense and total assets are missing, use interpolated values.
        # NOTE(review): `.interpolate()` runs over the whole column, not per firm, so
        # values may be interpolated across different `permno`s -- confirm this is intended.
        both_missing = out[filled_col].isna() & assets_missing
        interpolated = out[filled_col].interpolate()
        out[filled_col] = np.where(both_missing, interpolated, out[filled_col])

    # Use `xsga0` unadjusted when `xsga` is missing or xsga0 <= xrd0 <= cogs;
    # otherwise net out R&D (`xrd0`) and in-process R&D (`rdip`, missing treated as 0).
    use_unadjusted = out['xsga'].isna() | out['xrd0'].between(out['xsga0'], out['cogs'])
    unadjusted_sga = out['xsga0'].fillna(0)
    adjusted_sga = out['xsga0'] - out['xrd0'] - out['rdip'].fillna(0)
    out['sga'] = np.where(use_unadjusted, unadjusted_sga, adjusted_sga)

    # Capital stocks: physical, and total = physical + intangible.
    out['k_phy'] = out['ppegt']
    out['k_tot'] = out['k_phy'] + out['k_int']

    # Investment flows: physical, intangible (R&D plus 30% of SG&A), and total.
    out['i_phy'] = out['capx']
    out['i_int'] = out['xrd0'] + 0.3*out['sga']
    out['i_tot'] = out['i_phy'] + out['i_int']

    # Investment rates scaled by `pdm.lag` of total capital
    # (presumably the previous period's value within each firm -- confirm against pandasmore).
    lagged_k_tot = pdm.lag(out['k_tot'])
    out['i2k_int'] = out['i_int'] / lagged_k_tot
    out['i2k_phy'] = out['i_phy'] / lagged_k_tot
    out['i2k_tot'] = out['i2k_int'] + out['i2k_phy']

    # Division by a zero capital stock yields +/-inf; report those as missing instead.
    return out.replace([np.inf, -np.inf], np.nan)


In [None]:
#| eval: false
# Example: compute the derived variables; `features` works on a copy, so `df_clean` is left unchanged.
ftrs = features(df_clean)

In [None]:
#| eval: false
ftrs

Unnamed: 0_level_0,Unnamed: 1_level_0,datadate,dtdate,permco,liid,linkprim,gvkey,fyear,k_int_know,k_int_org,k_int_offbs,...,xsga0,sga,k_phy,k_tot,i_phy,i_int,i_tot,i2k_int,i2k_phy,i2k_tot
permno,Adate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10006,1951,1951-04-30,1951-04-30,22156,00X,C,001010,1950.0,0.417389,0.000000,0.417389,...,0.000,0.000,118.300,118.736389,3.500,0.0000,3.5000,,,
10006,1952,1952-04-30,1952-04-30,22156,00X,C,001010,1951.0,1.028123,0.000000,1.028123,...,0.000,0.000,114.200,115.248123,4.600,0.0000,4.6000,0.000000,0.038741,0.038741
10006,1953,1953-04-30,1953-04-30,22156,00X,C,001010,1952.0,1.838980,3.843000,5.681980,...,12.810,12.810,108.100,113.804980,3.200,3.8430,7.0430,0.033345,0.027766,0.061112
10006,1954,1954-04-30,1954-04-30,22156,00X,C,001010,1953.0,2.938649,7.366661,10.305310,...,14.230,14.230,109.500,119.805310,8.700,4.2690,12.9690,0.037512,0.076447,0.113958
10006,1955,1955-04-30,1955-04-30,22156,00X,C,001010,1954.0,4.466903,10.267680,14.734590,...,16.470,14.510,121.100,135.834590,14.600,6.3130,20.9130,0.052694,0.121864,0.174558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85041,1992,1992-06-30,1992-06-30,7357,01,P,001055,1992.0,62.660400,144.561500,207.221900,...,195.734,195.734,,,17.811,58.7202,76.5312,,,
85041,1993,1993-06-30,1993-06-30,7357,01,P,001055,1993.0,70.681720,177.075700,247.757400,...,225.276,225.276,,,20.894,67.5828,88.4768,,,
85041,1994,1994-06-30,1994-06-30,7357,01,P,001055,1994.0,82.367810,225.553300,307.921100,...,306.244,306.244,,,30.045,91.8732,121.9182,,,
85041,1995,1995-06-30,1995-06-30,7357,01,P,001055,1995.0,87.190140,279.999400,367.189500,...,351.365,314.982,165.261,569.685500,26.080,130.8776,156.9576,,,


In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev; nbdev.nbdev_export()