In [72]:
import pandas as pd
import numpy as np
import wrds
import time

db = wrds.Connection()

Enter your WRDS username [DRNPRO]:nikoldy
Enter your password:········
WRDS recommends setting up a .pgpass file.
You can find more info here:
https://www.postgresql.org/docs/9.5/static/libpq-pgpass.html.
Loading library list...
Done


In [85]:
db.list_libraries()
#db.list_tables(library='crsp')
#db.describe_table(library="comp", table="fundq").name.tolist()
#db.describe_table(library="crsp", table="msf").name.tolist()

['aha',
 'aha_sample',
 'ahasamp',
 'audit',
 'audit_audit_comp',
 'audit_corp_legal',
 'bank',
 'blab',
 'block',
 'boardex',
 'boardex_na',
 'boardex_trial',
 'boardsmp',
 'bvd',
 'bvd_ama_large',
 'bvd_ama_medium',
 'bvd_ama_small',
 'bvd_ama_verylarge',
 'bvdsamp',
 'calcbench_trial',
 'calcbnch',
 'cboe',
 'centris',
 'ciq',
 'ciq_capstrct',
 'ciq_capstrct_new',
 'ciq_common',
 'ciq_keydev',
 'ciq_pplintel',
 'ciqsamp',
 'ciqsamp_common',
 'ciqsamp_transcripts',
 'cisdm',
 'cisdmsmp',
 'clrvt',
 'clrvtsmp',
 'comp',
 'comp_bank',
 'comp_bank_daily',
 'comp_execucomp',
 'comp_global',
 'comp_global_daily',
 'comp_na_daily_all_new',
 'comp_segments_hist',
 'comp_segments_hist_daily',
 'compa',
 'compb',
 'compbd',
 'compdcur',
 'compg',
 'compgd',
 'comph',
 'compm',
 'compmcur',
 'compnad',
 'compsamp',
 'compsamp_snapshot',
 'compseg',
 'compsegd',
 'compsnap',
 'comscore',
 'contrib',
 'contrib_char_returns',
 'contrib_char_returns_new',
 'contrib_general',
 'contrib_general_new'

# QUERIES

####  QUERY + EXPORT COMPUSTAT QUARTERLY FUNDAMENTALS

In [73]:
start = time.time()

comp_fundq = db.raw_sql("""select gvkey, iid, datadate, datacqtr, fyearq, fqtr, fyr, tic, conm, 
                        exchg, atq, ltq, niq, ceqq, cheq
                        from comp.fundq
                        where datadate >='01/01/2019'
                        and exchg between 11 and 20
                        """, date_cols=['datadate','datacqtr'])

print('Shape:',comp_fundq.shape)
print('Elapsed time (minutes):',((time.time() - start)/60))
comp_fundq.to_csv('data/comp_fundq.csv')

Shape: (70501, 15)
Elapsed time (minutes): 0.0841943860054016


####  QUERY + EXPORT CRSP MONTHLY STOCK FILE

In [74]:
start = time.time()

crsp_msf = db.raw_sql("""select permno, hsiccd, date, prc, ret, shrout, cfacpr, cfacshr, retx
                        from crsp.msf
                        where date>='01/01/2019'
                        and hsiccd not between 6000 and 6999
                        and ret > -50
                        """, date_cols=['date'])

print('Shape:',crsp_msf.shape)
crsp_msf.to_csv('data/crsp_msf.csv')
print('Elapsed time (minutes):',((time.time() - start)/60))

Shape: (47896, 9)
Elapsed time (minutes): 0.062420364220937094


####  QUERY + EXPORT COMPHIST (CRSP/COMPUSTAT Merged - Company Header History)


In [None]:
#get comphist deletion data
start = time.time()

comphist = db.raw_sql("""select gvkey, HCHGDT, HCHGENDDT, HDLDTE, HSIC, HGIND, HDLRSN
                    from crsp.comphist
                    where hsic not between 6000 and 7000
                    and
                 """, date_cols=['hdldte','hchgenddt'])

print('Shape:',comphist.shape)
comphist.to_csv('data/comphist.csv')
print('Elapsed time (minutes):',((time.time() - start)/60))
# http://www.crsp.org/products/documentation/master-header-and-header-history
#HCHGDT	Comphist description effective date
#HCHGENDDT	Comphist description last effective date
#HDLDTE	Historical research company – deletion date
#HFYRC	Historical fiscal year end month / current
#HSIC	Historical SIC Code
#HGIND	Historical GICS industries

####  QUERY LINKING TABLE

In [40]:
start = time.time()
link_table = db.raw_sql("""select *
                        from crsp_a_ccm.ccmxpf_lnkhist 
                        """, date_cols=['linkenddt','linkdt'])

link_table.gvkey=link_table.gvkey.astype(float)

link_table.to_csv('data/link_table.csv')
print('Shape:',link_table.shape)
print('Elapsed time (seconds):',(time.time() - start))

Shape: (103457, 8)
Elapsed time (seconds): 7.956339120864868


# READ IN DATA

#### READ IN COMP_FUNDQ

In [82]:
start = time.time()

dtypes = {
          'gvkey':float,  'iid':'str',  'datadate':'str',  'datacqtr': 'str',  'fyearq':'str',   'fqtr':'str',
          'fyr':'str',    'tic':'str',  'conm':'str',      'atq':float,        'ltq':float,      'niq':float, 
          'ceqq':float,   'cheq':float
          }

parse_dates = ['datadate', 'datacqtr']

cols = list(pd.read_csv("data/comp_fundq.csv", nrows =1))

comp_fundq = pd.read_csv('data/comp_fundq.csv',dtype=dtypes,parse_dates=parse_dates,
                         usecols =[i for i in cols if i != 'Unnamed: 0'])

print('Shape:',comp_fundq.shape)
print('Elapsed time (minutes):',((time.time() - start)/60))

Shape: (70501, 15)
Elapsed time (minutes): 0.008519419034322103


#### READ IN CRSP_MSF

In [42]:
start = time.time()

dtypes = {
          'permno':float,  'hsiccd':float,    'date': 'str',     'prc':float,   'ret':float,
          'shrout':float,  'cfacpr':float,    'cfacshr':float,   'retx':float
          }

parse_dates = ['date']

cols = list(pd.read_csv("data/crsp_msf.csv", nrows =1))

crsp_msf = pd.read_csv('data/crsp_msf.csv',dtype=dtypes,parse_dates=parse_dates,
                         usecols =[i for i in cols if i != 'Unnamed: 0'])

print('Shape:',crsp_msf.shape)
print('Elapsed time (minutes):',((time.time() - start)/60))

Shape: (47896, 9)
Elapsed time (minutes): 0.003252116839090983


####  READ IN COMPHIST

In [43]:
start = time.time()

dtypes = {'gvkey':float, 'hchgdt':float, 'hchgenddt': 'str', 'hdldte':float, 'hgind':float,}

parse_dates = ['hchgdt','hchgenddt','hdldte']

cols = list(pd.read_csv("data/comphist.csv", nrows =1))

comphist = pd.read_csv('data/comphist.csv',dtype=dtypes,parse_dates=parse_dates,
                         usecols =[i for i in cols if i != 'Unnamed: 0'])

print('Shape:',comphist.shape)
print('Elapsed time (minutes):',((time.time() - start)/60))

Shape: (260397, 6)
Elapsed time (minutes): 0.009194529056549073


#### READ IN LINK_TABLE

In [44]:
start = time.time()

cols = list(pd.read_csv("data/link_table.csv", nrows =1))

link_table = pd.read_csv('data/link_table.csv',usecols =[i for i in cols if i != 'Unnamed: 0'])

print('Shape:',link_table.shape)
print('Elapsed time (minutes):',((time.time() - start)/60))

Shape: (103457, 8)
Elapsed time (minutes): 0.011220868428548176


# DATA

In [79]:
comphist.dtypes

gvkey               float64
hchgdt       datetime64[ns]
hchgenddt    datetime64[ns]
hdldte       datetime64[ns]
hsic                float64
hgind               float64
dtype: object

In [80]:
link_table.dtypes

gvkey        float64
linkprim      object
liid          object
linktype      object
lpermno      float64
lpermco      float64
linkdt        object
linkenddt     object
dtype: object

In [81]:
comp_fundq.dtypes

gvkey               object
iid                 object
datadate    datetime64[ns]
datacqtr    datetime64[ns]
fyearq             float64
fqtr               float64
fyr                float64
tic                 object
conm                object
exchg              float64
atq                float64
ltq                float64
niq                float64
ceqq               float64
cheq               float64
dtype: object

In [77]:
crsp_msf.shape

(47896, 9)

# MERGE DATA

#### MERGE + EXPORT DATA

In [83]:
start = time.time()

df1 = pd.merge(comp_fundq, link_table, how='inner', on = "gvkey")
df2 = pd.merge(df1, comphist, how='inner', on = "gvkey")
df3 = pd.merge(df2, crsp_msf, how='inner', left_on = "lpermno", right_on='permno')

print('Shape:',df3.shape)
print('Elapsed time (minutes):',(time.time() - start)/60)

Shape: (4421869, 36)
Elapsed time (minutes): 0.1329185684521993


In [84]:
df3

Unnamed: 0,gvkey,iid,datadate,datacqtr,fyearq,fqtr,fyr,tic,conm,exchg,...,hgind,permno,hsiccd,date,prc,ret,shrout,cfacpr,cfacshr,retx
0,1410.0,01,2019-01-31,2018-10-01,2019.0,1.0,10.0,ABM,ABMINDUSTRIESINC,11.0,...,202010.0,47730.0,7349.0,2019-01-31,34.189999,0.070383,66029.0,1.0,1.0,0.064777
1,1410.0,01,2019-01-31,2018-10-01,2019.0,1.0,10.0,ABM,ABMINDUSTRIESINC,11.0,...,202010.0,47730.0,7349.0,2019-02-28,35.660000,0.042995,66210.0,1.0,1.0,0.042995
2,1410.0,01,2019-01-31,2018-10-01,2019.0,1.0,10.0,ABM,ABMINDUSTRIESINC,11.0,...,202010.0,47730.0,7349.0,2019-03-29,36.349998,0.019349,66239.0,1.0,1.0,0.019349
3,1410.0,01,2019-01-31,2018-10-01,2019.0,1.0,10.0,ABM,ABMINDUSTRIESINC,11.0,...,202010.0,47730.0,7349.0,2019-04-30,37.970001,0.049519,66239.0,1.0,1.0,0.044567
4,1410.0,01,2019-01-31,2018-10-01,2019.0,1.0,10.0,ABM,ABMINDUSTRIESINC,11.0,...,202010.0,47730.0,7349.0,2019-05-31,36.250000,-0.045299,66239.0,1.0,1.0,-0.045299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4421864,331856.0,01,2020-09-30,2020-07-01,2020.0,3.0,12.0,IMUX,IMMUNICINC,14.0,...,352010.0,14615.0,9999.0,2019-08-30,17.010000,0.245242,10093.0,1.0,1.0,0.245242
4421865,331856.0,01,2020-09-30,2020-07-01,2020.0,3.0,12.0,IMUX,IMMUNICINC,14.0,...,352010.0,14615.0,9999.0,2019-09-30,10.000000,-0.412111,10071.0,1.0,1.0,-0.412111
4421866,331856.0,01,2020-09-30,2020-07-01,2020.0,3.0,12.0,IMUX,IMMUNICINC,14.0,...,352010.0,14615.0,9999.0,2019-10-31,8.460000,-0.154000,10071.0,1.0,1.0,-0.154000
4421867,331856.0,01,2020-09-30,2020-07-01,2020.0,3.0,12.0,IMUX,IMMUNICINC,14.0,...,352010.0,14615.0,9999.0,2019-11-29,7.030000,-0.169031,10117.0,1.0,1.0,-0.169031


In [49]:
# caution takes a while
start = time.time()

df3.to_csv('data/merged_data.csv')

print('Elapsed time (minutes):',(time.time() - start)/60)

Elapsed time (minutes): 4.1635857661565145


#### READ IN MERGED_DATA

In [50]:
start = time.time()

cols = list(pd.read_csv("data/merged_data.csv", nrows =1))
merged_data = pd.read_csv('data/merged_data.csv',usecols =[i for i in cols if i != 'Unnamed: 0'])
merged_data.dtypes

print('Elapsed time (minutes):',(time.time() - start)/60)

KeyboardInterrupt: 