In [1]:
import pandas as pd
import numpy as np
import wrds
import time

db = wrds.Connection()

Enter your WRDS username [DRNPRO]:nikoldy
Enter your password:········
WRDS recommends setting up a .pgpass file.
You can find more info here:
https://www.postgresql.org/docs/9.5/static/libpq-pgpass.html.
Loading library list...
Done


In [2]:
#db.list_libraries()
#db.list_tables(library='crsp')
#db.describe_table(library="comp", table="fundq").name.tolist()
#db.describe_table(library="crsp", table="comphist").name.tolist()

# QUERIES

####  QUERY + EXPORT COMPUSTAT QUARTERLY FUNDAMENTALS

In [3]:
start = time.time()

comp_fundq = db.raw_sql("""select gvkey, iid, datadate, datacqtr, fyearq, fqtr, fyr, tic, conm, 
                        atq, ltq, niq, ceqq, cheq
                        from comp.fundq
                        where datadate >='01/01/2019'
                        """, date_cols=['datadate','datacqtr'])

print('Shape:',comp_fundq.shape)
print('Elapsed time (minutes):',((time.time() - start)/60))
comp_fundq.to_csv('data/comp_fundq.csv')

Shape: (80089, 14)
Elapsed time (minutes): 0.033609445889790854


####  QUERY + EXPORT CRSP MONTHLY STOCK FILE

In [4]:
start = time.time()

crsp_msf = db.raw_sql("""select permno, hsiccd, date, prc, ret, shrout, cfacpr, cfacshr, retx
                        from crsp.dsf
                        where date>='12/01/2019'
                        and hsiccd not between 6000 and 7000
                        """, date_cols=['date'])

print('Shape:',crsp_msf.shape)
crsp_msf.to_csv('data/crsp_msf.csv')
print('Elapsed time (minutes):',((time.time() - start)/60))

Shape: (84437, 9)
Elapsed time (minutes): 0.05665660301844279


####  QUERY + EXPORT COMPHIST (CRSP/COMPUSTAT Merged - Company Header History)


In [5]:
#get comphist deletion data
start = time.time()

comphist = db.raw_sql("""select gvkey, HCHGDT, HCHGENDDT, HDLDTE, HSIC, HGIND
                    from crsp.comphist
                    where hsic not between 6000 and 7000
                 """, date_cols=['hdldte','hchgenddt'])

print('Shape:',comphist.shape)
comphist.to_csv('data/comphist.csv')
print('Elapsed time (minutes):',((time.time() - start)/60))
# http://www.crsp.org/products/documentation/master-header-and-header-history
#HCHGDT	Comphist description effective date
#HCHGENDDT	Comphist description last effective date
#HDLDTE	Historical research company – deletion date
#HFYRC	Historical fiscal year end month / current
#HSIC	Historical SIC Code
#HGIND	Historical GICS industries

Shape: (260397, 6)
Elapsed time (minutes): 0.12371431589126587


####  QUERY LINKING TABLE

In [6]:
start = time.time()
link_table = db.raw_sql("""select *
                        from crsp_a_ccm.ccmxpf_lnkhist 
                        """, date_cols=['linkenddt','linkdt'])

link_table.gvkey=link_table.gvkey.astype(float)

link_table.to_csv('data/link_table.csv')
print('Shape:',link_table.shape)
print('Elapsed time (seconds):',(time.time() - start))

Shape: (103457, 8)
Elapsed time (seconds): 3.1941397190093994


# READ IN DATA

#### READ IN COMP_FUNDQ

In [7]:
start = time.time()

dtypes = {
          'gvkey':float,  'iid':'str',  'datadate':'str',  'datacqtr': 'str',  'fyearq':'str',   'fqtr':'str',
          'fyr':'str',    'tic':'str',  'conm':'str',      'atq':float,        'ltq':float,      'niq':float, 
          'ceqq':float,   'cheq':float
          }

parse_dates = ['datadate', 'datacqtr']

cols = list(pd.read_csv("data/comp_fundq.csv", nrows =1))

comp_fundq = pd.read_csv('data/comp_fundq.csv',dtype=dtypes,parse_dates=parse_dates,
                         usecols =[i for i in cols if i != 'Unnamed: 0'])

print('Shape:',comp_fundq.shape)
print('Elapsed time (minutes):',((time.time() - start)/60))

Shape: (80089, 14)
Elapsed time (minutes): 0.003978848457336426


#### READ IN CRSP_MSF

In [8]:
start = time.time()

dtypes = {
          'permno':float,  'hsiccd':float,    'date': 'str',     'prc':float,   'ret':float,
          'shrout':float,  'cfacpr':float,    'cfacshr':float,   'retx':float
          }

parse_dates = ['date']

cols = list(pd.read_csv("data/crsp_msf.csv", nrows =1))

crsp_msf = pd.read_csv('data/crsp_msf.csv',dtype=dtypes,parse_dates=parse_dates,
                         usecols =[i for i in cols if i != 'Unnamed: 0'])

print('Shape:',crsp_msf.shape)
print('Elapsed time (minutes):',((time.time() - start)/60))

Shape: (84437, 9)
Elapsed time (minutes): 0.00234830379486084


####  READ IN COMPHIST

In [9]:
start = time.time()

dtypes = {'gvkey':float, 'hchgdt':float, 'hchgenddt': 'str', 'hdldte':float, 'hgind':float,}

parse_dates = ['hchgdt','hchgenddt','hdldte']

cols = list(pd.read_csv("data/comphist.csv", nrows =1))

comphist = pd.read_csv('data/comphist.csv',dtype=dtypes,parse_dates=parse_dates,
                         usecols =[i for i in cols if i != 'Unnamed: 0'])

print('Shape:',comphist.shape)
print('Elapsed time (minutes):',((time.time() - start)/60))

Shape: (260397, 6)
Elapsed time (minutes): 0.005789665381113688


#### READ IN LINK_TABLE

In [10]:
start = time.time()

cols = list(pd.read_csv("data/link_table.csv", nrows =1))

link_table = pd.read_csv('data/link_table.csv',usecols =[i for i in cols if i != 'Unnamed: 0'])

print('Shape:',link_table.shape)
print('Elapsed time (minutes):',((time.time() - start)/60))

Shape: (103457, 8)
Elapsed time (minutes): 0.002365883191426595


# DATA

In [11]:
comphist.head()

Unnamed: 0,gvkey,hchgdt,hchgenddt,hdldte,hsic,hgind
0,1000.0,2007-04-14,NaT,1978-06-30,3089.0,
1,1001.0,2007-04-14,2011-02-11,1986-07-31,5812.0,253010.0
2,1001.0,2011-02-12,2012-03-09,1986-07-31,5812.0,253010.0
3,1001.0,2012-03-10,NaT,1986-07-31,5812.0,253010.0
4,1002.0,2007-04-14,2010-01-08,1977-03-31,3825.0,


In [12]:
comp_fundq.head()

Unnamed: 0,gvkey,iid,datadate,datacqtr,fyearq,fqtr,fyr,tic,conm,atq,ltq,niq,ceqq,cheq
0,1410.0,1,2019-01-31,2018-10-01,2019.0,1.0,10.0,ABM,ABMINDUSTRIESINC,3686.4,2225.3,13.0,1461.1,30.6
1,1562.0,1,2019-01-31,2018-10-01,2018.0,3.0,4.0,AMSWA,AMERICANSOFTWARE-CLA,160.821,46.465,2.301,114.356,83.164
2,1618.0,1,2019-01-31,2018-10-01,2018.0,3.0,4.0,AXR,AMREPCORP,104.51,16.539,-0.032,87.971,14.233
3,1632.0,1,2019-01-31,2018-10-01,2019.0,1.0,10.0,ADI,ANALOGDEVICES,21828.278,10242.841,355.006,11585.437,605.864
4,1704.0,1,2019-01-31,2018-10-01,2019.0,1.0,10.0,AMAT,APPLIEDMATERIALSINC,18922.0,10713.0,771.0,8209.0,3712.0


In [13]:
crsp_msf.head()

Unnamed: 0,permno,hsiccd,date,prc,ret,shrout,cfacpr,cfacshr,retx
0,10026.0,2052.0,2019-12-02,184.25,-0.004054,18899.0,1.0,1.0,-0.004054
1,10028.0,5094.0,2019-12-02,1.38,-0.014286,26924.0,1.0,1.0,-0.014286
2,10032.0,3670.0,2019-12-02,74.669998,-0.016206,29179.0,1.0,1.0,-0.016206
3,10044.0,2060.0,2019-12-02,8.8,0.005714,5995.0,1.0,1.0,0.005714
4,10051.0,8093.0,2019-12-02,26.52,0.012214,37338.0,1.0,1.0,0.012214


# MERGE DATA

#### MERGE + EXPORT DATA

In [14]:
start = time.time()

df1 = pd.merge(comp_fundq, link_table, how='inner', on = "gvkey")
df2 = pd.merge(df1, comphist, how='inner', on = "gvkey")
df3 = pd.merge(df2, crsp_msf, how='inner', left_on = "lpermno", right_on='permno')

print('Shape:',df3.shape)
print('Elapsed time (minutes):',(time.time() - start)/60)

Shape: (7760323, 35)
Elapsed time (minutes): 0.0694479505221049


In [15]:
# caution takes a while
start = time.time()

df3.to_csv('data/merged_data.csv')

print('Elapsed time (minutes):',(time.time() - start)/60)

KeyboardInterrupt: 

#### READ IN MERGED_DATA

In [None]:
start = time.time()

cols = list(pd.read_csv("data/merged_data.csv", nrows =1))
merged_data = pd.read_csv('data/merged_data.csv',usecols =[i for i in cols if i != 'Unnamed: 0'])
merged_data.dtypes

print('Elapsed time (minutes):',(time.time() - start)/60)