# wrds_crspm

> Retrieve and process data from WRDS CRSP Monthly Stock File

Because this is a proprietary dataset, the documentation below cannot display any of the data that is retrieved or generated; only column names are shown.

In [None]:
#| default_exp wrds.crspm

In [None]:
#|exports
from __future__ import annotations
from pathlib import Path
from typing import List
import os

import pandas as pd
import numpy as np

import pandasmore as pdm
from finsets.wrds import wrds_api

In [None]:
#| export 
def variable_labels(rawfile: str|Path='../../resources/crspm_variable_descriptions.csv', # location of the raw metadata file
             outfile: str|Path='../../metadata/crspm_variable_labels_types.pkl', # where to save the cleaned metadata file 
             ) -> pd.DataFrame:
    """Read the raw variable descriptions from `rawfile`, reduce them to `name`, `label` and `type` columns, pickle the result to `outfile` and return it."""

    def _label(row: pd.Series) -> str:
        # Remove every "NAME -- " and "(NAME)" occurrence of the (whitespace-stripped)
        # variable name from the description; what is left is used as the label.
        raw_name = row['Variable Name'].strip()
        without_prefix = row['Description'].replace(raw_name + ' -- ', '')
        return without_prefix.replace('(' + raw_name + ')', '')

    raw = pd.read_csv(rawfile)
    raw['Variable Label'] = raw.apply(_label, axis=1)
    # Names are normalized only after the labels are built, because the label
    # clean-up matches against the name in its original case.
    raw['Variable Name'] = raw['Variable Name'].str.strip().str.lower()

    labels = raw[['Variable Name', 'Variable Label', 'Type']].copy()
    labels.columns = ['name', 'label', 'type']
    labels.to_pickle(outfile)
    return labels

In [None]:
variable_labels()

Unnamed: 0,name,label,type
0,cusip,Cusip,string
1,ncusip,Ncusip,string
2,comnam,Company Name,string
3,ticker,Ticker,string
4,permco,CRSP Permanent Company Number,double
...,...,...,...
57,vwretd,Value-Weighted Return (includes distributions),double
58,vwretx,Value-Weighted Return (excluding dividends),double
59,ewretd,Equal-Weighted Return (includes distributions),double
60,ewretx,Equal-Weighted Return (excluding dividends),double


In [None]:
def metadata():
    """Collect column names, types and row counts for `crsp.msf` and `crsp.msenames`, attach variable labels, save the table as pickle and CSV, and return it."""
    library = 'crsp'
    tables = ['msf', 'msenames']

    # Query WRDS once per table: first the column description, then the row count.
    described = {}
    with wrds_api.Connection(wrds_username = os.getenv('WRDS_USERNAME')) as db:
        for table in tables:
            described[table] = (db.describe_table(library, table),
                                db.get_row_count(library, table))

    # One metadata frame per table, tagged with where the columns came from.
    pieces = []
    for table in tables:
        columns, row_count = described[table]
        piece = columns[['name','type']].copy()
        piece['nr_rows'] = row_count
        piece['wrds_library'] = library
        piece['wrds_table'] = table
        pieces.append(piece)

    stacked = pd.concat(pieces, axis=0, ignore_index=True)
    labels = variable_labels()[['name','label']]
    # Left merge: columns without a known label are kept, with a missing label.
    crsp_meta = stacked.merge(labels, how='left', on='name')
    crsp_meta['fetched_by_module'] = 'wrds.crspm'

    # Store every column with the pandas 'string' dtype.
    for column in crsp_meta.columns:
        crsp_meta[column] = crsp_meta[column].astype('string')

    crsp_meta.to_pickle('../../metadata/crspm_metadata.pkl')
    crsp_meta.to_csv('../../metadata/crspm_metadata.csv', index=False)

    return crsp_meta

In [None]:
metadata()

Loading library list...
Done
Approximately 4922867 rows in crsp.msf.
Approximately 111623 rows in crsp.msenames.


Unnamed: 0,name,type,nr_rows,wrds_library,wrds_table,label,fetched_by_module
0,cusip,VARCHAR(8),4922867,crsp,msf,Cusip,wrds.crspm
1,permno,DOUBLE_PRECISION,4922867,crsp,msf,,wrds.crspm
2,permco,DOUBLE_PRECISION,4922867,crsp,msf,CRSP Permanent Company Number,wrds.crspm
3,issuno,DOUBLE_PRECISION,4922867,crsp,msf,Nasdaq Issue Number,wrds.crspm
4,hexcd,DOUBLE_PRECISION,4922867,crsp,msf,Header Exchange Code,wrds.crspm
5,hsiccd,DOUBLE_PRECISION,4922867,crsp,msf,Header SIC Code,wrds.crspm
6,date,DATE,4922867,crsp,msf,,wrds.crspm
7,bidlo,DOUBLE_PRECISION,4922867,crsp,msf,Bid or Low,wrds.crspm
8,askhi,DOUBLE_PRECISION,4922867,crsp,msf,Ask or High,wrds.crspm
9,prc,DOUBLE_PRECISION,4922867,crsp,msf,Price,wrds.crspm


In [None]:
#| export
def default_raw_vars():
    """Variables that `download` retrieves when the caller does not specify any. Downloading them takes about 2 min."""

    keys = ['permno', 'permco', 'date']
    price_vars = ['ret', 'retx', 'shrout', 'prc']
    descriptor_vars = ['shrcd', 'exchcd', 'siccd', 'ticker', 'cusip', 'ncusip']
    return keys + price_vars + descriptor_vars

In [None]:
print(default_raw_vars())

['permno', 'permco', 'date', 'ret', 'retx', 'shrout', 'prc', 'shrcd', 'exchcd', 'siccd', 'ticker', 'cusip', 'ncusip']


In [None]:
def parse_varlist(vars: List[str]=None,
                  wrds_username: str=None
                  ) -> str:
    """Build the SQL column list for `vars`, prefixing `crsp.msf` columns with `a.` and `crsp.msenames` columns with `b.`.

    `permno`, `permco`, `date` and `exchcd` are always requested first, followed by
    the remaining `vars` in the order given (`default_raw_vars()` if `vars` is None).
    A name that exists in both tables is taken from `crsp.msf` only; a name that
    exists in neither table is left out of the result.

    If `wrds_username` is None, the `WRDS_USERNAME` environment variable is used,
    and the user is prompted when that is not set either.

    Returns the prefixed column names joined by commas, `crsp.msf` columns first.
    """

    if wrds_username is None:
        wrds_username = os.getenv('WRDS_USERNAME')
        if wrds_username is None: wrds_username = input("Enter your WRDS username: ") 

    required = ['permno','permco','date','exchcd']
    if vars is None: vars = default_raw_vars()
    vars = required + [x for x in vars if x not in required]

    # The `with` block releases the connection on exit, so no explicit close is
    # needed; only the two table lookups have to happen while it is open.
    with wrds_api.Connection(wrds_username = wrds_username) as db:
        all_msf_vars = set(db.describe_table('crsp','msf').name)
        all_mse_vars = set(db.describe_table('crsp','msenames').name)

    my_msf_vars = [f'a.{x}' for x in vars if x in all_msf_vars]
    my_mse_vars = [f'b.{x}' for x in vars if (x in all_mse_vars) and (x not in all_msf_vars)]
    return ','.join(my_msf_vars + my_mse_vars)

In [None]:
def delist_adj_ret(df: pd.DataFrame, # Requires `ret`, `exchcd`, `dlret`, and `dlstcd` variables
                       adj_ret_var: str
                       ) -> pd.DataFrame:
    """Adjust for delisting returns using Shumway and Warther (1999) and Johnson and Zhao (2007)

    Works on a copy of `df`, so the caller's frame is left untouched. In the
    returned frame:

    - a missing `dlret` is replaced by -0.35 (`exchcd` 1 or 2) or -0.55 (`exchcd` 3)
      when `dlstcd` is 500 or lies in 520-584, and by 0 otherwise;
    - `dlret` is floored at -1;
    - `adj_ret_var` holds `(1 + ret) * (1 + dlret) - 1`, or `dlret` alone when
      `ret` is missing and `dlret` is non-zero.
    """

    df = df.copy()

    # Delisting codes for which a missing delisting return gets a replacement value.
    npdelist = (df['dlstcd'] == 500) | df['dlstcd'].between(520, 584)
    needs_fill = df['dlret'].isna() & npdelist

    dlret = df['dlret'].mask(needs_fill & df['exchcd'].isin([1, 2]), -0.35)
    dlret = dlret.mask(needs_fill & df['exchcd'].isin([3]), -0.55)
    # A return cannot be below -100%. `clip` leaves NaN untouched, so no separate
    # not-null guard is needed before the remaining gaps are set to 0.
    dlret = dlret.clip(lower=-1).fillna(0)

    adj_ret = (1 + df['ret']) * (1 + dlret) - 1
    # When the regular return is missing but a delisting return exists, use the latter.
    adj_ret = adj_ret.mask(adj_ret.isna() & (dlret != 0), dlret)

    df['dlret'] = dlret
    df[adj_ret_var] = adj_ret
    return df

In [None]:
#| export
def download(vars: List[str]=None, # If None, downloads `default_raw_vars`; else `permno`, `permco`, `date`, and 'exchcd' are added by default
             wrds_username: str=None, #If None, looks for WRDS_USERNAME with `os.getenv`, then prompts you if needed
             start_date: str="01/01/1900", # Start date in MM/DD/YYYY format
             end_date: str=None, # End date in MM/DD/YYYY format; if None, defaults to current date  
             add_delist_adj_ret: bool=True, # Whether to calculate delisting-adjusted returns 
             adj_ret_var: str='ret_adj' # What to call the returns adjusted for delisting bias
             ) -> pd.DataFrame:
    """Downloads `vars` from `start_date` to `end_date` from the WRDS `crsp.msf` and `crsp.msenames` tables.

    Each `crsp.msf` row is matched to the `crsp.msenames` record whose
    `namedt`-`nameendt` range contains its date, and to the `crsp.msedelist`
    record of the same `permno` and calendar month, which supplies `dlstcd` and `dlret`.

    If `add_delist_adj_ret` is True, `adj_ret_var` is created with `delist_adj_ret`
    (Shumway and Warther (1999) and Johnson and Zhao (2007)); otherwise `dlret`
    and `dlstcd` are dropped from the result.
    """

    varlist_string = parse_varlist(vars, wrds_username)
    # Column names cannot be bound as query parameters, so the variable list is
    # interpolated into the statement. Both date bounds are user-supplied values
    # and are passed as bound parameters instead of being pasted into the SQL text.
    sql_string = f"""SELECT {varlist_string},  c.dlstcd, c.dlret 
                        FROM crsp.msf AS a 
                        LEFT JOIN crsp.msenames AS b
                            ON a.permno=b.permno AND b.namedt<=a.date AND a.date<=b.nameendt                     
                        LEFT JOIN crsp.msedelist as c
                            ON a.permno=c.permno AND date_trunc('month', a.date) = date_trunc('month', c.dlstdt)                            
                            WHERE a.date BETWEEN %(start)s AND COALESCE(%(end)s, CURRENT_DATE) 
                """
    df = wrds_api.download(sql_string, wrds_username=wrds_username,
                           params={'start': start_date, 'end': end_date})
    if add_delist_adj_ret: df = delist_adj_ret(df, adj_ret_var)
    else: df = df.drop(['dlret','dlstcd'], axis=1)
    return df 

In [None]:
#| eval: false
raw = download(start_date='01/01/2021')

Loading library list...
Done
Approximately 4922867 rows in crsp.msf.
Approximately 111623 rows in crsp.msenames.
Loading library list...
Done


In [None]:
#| eval: false
raw.head(0)

Unnamed: 0,permno,permco,date,ret,retx,shrout,prc,cusip,exchcd,shrcd,siccd,ticker,ncusip,dlstcd,dlret,ret_adj


In [None]:
#| export
def clean(df: pd.DataFrame=None, # If None, downloads `vars` using `download` function; else, must contain `permno` and `date` columns
          vars: List[str]=None, # If None, downloads `default_raw_vars`
          wrds_username: str=None, #If None, looks for WRDS_USERNAME with `os.getenv`, then prompts you if needed
          start_date: str="01/01/1900", # Start date in MM/DD/YYYY format
          end_date: str=None, # End date. Default is current date          
          clean_kwargs: dict|None=None, # Params to pass to `pdm.setup_panel` other than `panel_ids`, `time_var`, and `freq`; None means no extra params
          ) -> pd.DataFrame:
    """Applies `pandasmore.setup_panel` to `df`. If `df` is None, downloads `vars` using `download` function.

    The panel is set up with `permno` as the panel identifier, `date` as the time
    variable and monthly (`'M'`) frequency. `vars`, `wrds_username`, `start_date`
    and `end_date` are only used when `df` is None.
    """

    if df is None: df = download(vars=vars, wrds_username=wrds_username, start_date=start_date, end_date=end_date)
    # A None sentinel replaces a shared mutable `{}` default.
    panel_kwargs = {} if clean_kwargs is None else clean_kwargs
    df = pdm.setup_panel(df, panel_ids='permno', time_var='date', freq='M', **panel_kwargs)
    return df 

In [None]:
#| eval: false
df = clean(df=raw)

In [None]:
#| eval: false
df.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,date,dtdate,permco,ret,retx,shrout,prc,cusip,exchcd,shrcd,siccd,ticker,ncusip,dlstcd,dlret,ret_adj
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1


In [None]:
#| eval: false
df = clean(vars=['ret','shrcd','exchcd'], start_date='01/01/2020', end_date='12/31/2020')

Loading library list...
Done
Approximately 4922867 rows in crsp.msf.
Approximately 111623 rows in crsp.msenames.
Loading library list...
Done


In [None]:
#| eval: false
df.head(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,date,dtdate,permco,ret,exchcd,shrcd,dlstcd,dlret,ret_adj
permno,Mdate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()