# Install WRDS

In [5]:
pip install wrds

Collecting wrds
  Using cached wrds-3.1.6-py3-none-any.whl (12 kB)
Collecting sqlalchemy<2
  Downloading SQLAlchemy-1.4.48-cp310-cp310-win_amd64.whl (1.6 MB)
     ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
     ------------- -------------------------- 0.5/1.6 MB 16.8 MB/s eta 0:00:01
     --------------------------- ------------ 1.1/1.6 MB 13.7 MB/s eta 0:00:01
     ---------------------------------------  1.6/1.6 MB 14.2 MB/s eta 0:00:01
     ---------------------------------------- 1.6/1.6 MB 11.2 MB/s eta 0:00:00
Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.6-cp310-cp310-win_amd64.whl (1.2 MB)
     ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
     ---------------- ----------------------- 0.5/1.2 MB 15.5 MB/s eta 0:00:01
     ----------------------------- ---------- 0.9/1.2 MB 9.2 MB/s eta 0:00:01
     ---------------------------------------  1.2/1.2 MB 9.2 MB/s eta 0:00:01
     ---------------------------------------- 1.2/1

In [20]:
import pandas as pd
import numpy as np

# Extract S&P 500 CIK codes

In [21]:
# Download the S&P 500 constituents table (the first table on the Wikipedia page).
sp_500 = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")[0]
# Left-pad every CIK with zeros to a width of 10 characters.
# str.zfill is vectorised and leaves values that are already 10+ characters
# untouched, whereas a `while len(...) != 10` loop would never terminate on them.
sp_500["CIK"] = sp_500["CIK"].astype(str).str.zfill(10)
# Plain-list copy of the padded codes, kept under the name this cell has always defined.
cik = sp_500["CIK"].tolist()

In [22]:
# Display the constituents table; the CIK column now holds 10-character, zero-padded strings.
sp_500

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,0000066740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,0000091142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,0000001800,1888
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,0001551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,0001467373,1989
...,...,...,...,...,...,...,...,...
498,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,0001041061,1997
499,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,0000877212,1969
500,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,0001136869,1927
501,ZION,Zions Bancorporation,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,0000109380,1873


# Import WRDS and connect to the WRDS database (enter your credentials when prompted)

In [23]:
# Open the WRDS connection; `db` is the handle used by every query cell below.
# Per this cell's recorded output, connecting may also create a .pgpass file.
import wrds
db = wrds.Connection()

WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


# List all the tables available in the ciq library

In [24]:
# List the table names in the "ciq" library (the output is long).
db.list_tables("ciq")

['chars',
 'ciqaddress',
 'ciqaddresstype',
 'ciqadvisortype',
 'ciqbusinessdescription',
 'ciqbusinessdescriptionlong',
 'ciqcapstdtasrptdclasstype',
 'ciqcapstdtclasstype',
 'ciqcapstdtcompntasrptddata',
 'ciqcapstdtcomponent',
 'ciqcapstdtconvertibletype',
 'ciqcapstdtcumulativetype',
 'ciqcapstdtdescription',
 'ciqcapstdtintbenchmarktype',
 'ciqcapstdtinterestrate',
 'ciqcapstdtintratetype',
 'ciqcapstdtleveltype',
 'ciqcapstdtnonrecoursetype',
 'ciqcapstdtparticipatingtype',
 'ciqcapstdtredeemabletype',
 'ciqcapstdtsecuredtype',
 'ciqcapstdtsubtype',
 'ciqcapstdttype',
 'ciqcapsteqauthrzdsharestype',
 'ciqcapsteqcomponent',
 'ciqcapsteqcomponentdata',
 'ciqcapsteqconvertibletype',
 'ciqcapsteqsubtype',
 'ciqcapsteqtype',
 'ciqcapsteqvotingrightstype',
 'ciqchartype',
 'ciqchartypetosubtype',
 'ciqcommittee',
 'ciqcompany',
 'ciqcompanyindustrytree',
 'ciqcompanyrel',
 'ciqcompanyreltype',
 'ciqcompanystatustype',
 'ciqcompanytype',
 'ciqcompensation',
 'ciqcompensationadjustment',

# Access the table that links CIK to CIQ IDs

In [25]:
# Show the column names and types of ciq.wrds_cik.
db.describe_table(library="ciq", table="wrds_cik")

Approximately 372840 rows in ciq.wrds_cik.


Unnamed: 0,name,nullable,type,comment
0,companyid,True,DOUBLE_PRECISION,
1,cik,True,VARCHAR(10),
2,startdate,True,DATE,
3,enddate,True,DATE,
4,companyname,True,VARCHAR(400),


In [26]:
# Tuple of the zero-padded S&P 500 CIK strings, used to filter ciq.wrds_cik.
CIK = tuple(sp_500["CIK"].tolist())

In [27]:
# Fetch the ciq.wrds_cik rows whose cik is one of the S&P 500 CIKs.
# The CIKs were scraped from a web page, so they are passed as a bound parameter
# instead of being spliced into the SQL text. The database driver renders the
# tuple as a quoted IN (...) list, which also works for a one-element tuple
# (whose Python repr ends in a trailing comma and is not valid SQL).
sql_query = 'SELECT * FROM ciq.wrds_cik WHERE cik IN %(ciks)s'
# Run query, result in Pandas dataframe format
data = db.raw_sql(sql_query, params={'ciks': tuple(CIK)})
data

Unnamed: 0,companyid,cik,startdate,enddate,companyname
0,2.474830e+05,0000001800,,,Abbott Laboratories
1,1.688640e+05,0000002488,,,"Advanced Micro Devices, Inc."
2,2.483560e+05,0000002969,,,"Air Products and Chemicals, Inc."
3,2.494890e+05,0000004127,,2002-06-25,"Alpha Industries, Inc."
4,1.462309e+06,0000004127,2002-06-26,,"Skyworks Solutions, Inc."
...,...,...,...,...,...
520,6.312364e+08,0001792044,,2020-11-10,Upjohn Inc.
521,6.537486e+08,0001821825,,,Organon & Co.
522,1.698380e+05,0001841666,,,APA Corporation
523,3.136719e+06,0001868275,,,Constellation Energy Corporation


## Remove duplicated, legacy CIK entries

In [28]:
# Keep only the rows without an end date; rows that carry an end date are
# treated as legacy mappings and dropped.
CIQ = data.loc[data["enddate"].isnull()]
CIQ

Unnamed: 0,companyid,cik,startdate,enddate,companyname
0,2.474830e+05,0000001800,,,Abbott Laboratories
1,1.688640e+05,0000002488,,,"Advanced Micro Devices, Inc."
2,2.483560e+05,0000002969,,,"Air Products and Chemicals, Inc."
4,1.462309e+06,0000004127,2002-06-26,,"Skyworks Solutions, Inc."
6,4.162645e+08,0000004281,2016-10-28,,Howmet Aerospace Inc.
...,...,...,...,...,...
519,2.902030e+05,0001792044,2020-11-11,,Viatris Inc.
521,6.537486e+08,0001821825,,,Organon & Co.
522,1.698380e+05,0001841666,,,APA Corporation
523,3.136719e+06,0001868275,,,Constellation Energy Corporation


In [29]:
# Analysis for duplicate S&P entries
# a = data[data["enddate"].isna()]
# b = sp_500["CIK"].to_list()
# empty = []
# for i in b:
#     if b.count(i) == 2:
#         empty.append(i)
# empty

# sp_500[sp_500["CIK"].isin(a["cik"])]

In [30]:
# Display the S&P 500 constituents table again for reference.
sp_500

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,0000066740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,0000091142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,0000001800,1888
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,0001551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,0001467373,1989
...,...,...,...,...,...,...,...,...
498,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,0001041061,1997
499,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,0000877212,1969
500,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,0001136869,1927
501,ZION,Zions Bancorporation,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,0000109380,1873


# Inspect the tables that contain earnings call transcripts for the given CIQ company IDs

In [31]:
# Show the column names and types of ciq.wrds_transcript_detail.
db.describe_table(library="ciq", table="wrds_transcript_detail")

Approximately 1454740 rows in ciq.wrds_transcript_detail.


Unnamed: 0,name,nullable,type,comment
0,companyid,True,DOUBLE_PRECISION,
1,keydevid,True,DOUBLE_PRECISION,
2,transcriptid,True,DOUBLE_PRECISION,
3,headline,True,VARCHAR(381),
4,mostimportantdateutc,True,DATE,
5,mostimportanttimeutc,True,DOUBLE_PRECISION,
6,keydeveventtypeid,True,DOUBLE_PRECISION,
7,keydeveventtypename,True,VARCHAR(400),
8,companyname,True,VARCHAR(400),
9,transcriptcollectiontypeid,True,DOUBLE_PRECISION,


In [32]:
# Show the column names and types of ciq.wrds_transcript_person.
db.describe_table(library="ciq", table="wrds_transcript_person")

Approximately 75881768 rows in ciq.wrds_transcript_person.


Unnamed: 0,name,nullable,type,comment
0,transcriptid,True,DOUBLE_PRECISION,
1,transcriptcomponentid,True,DOUBLE_PRECISION,
2,componentorder,True,DOUBLE_PRECISION,
3,transcriptcomponenttypeid,True,DOUBLE_PRECISION,
4,transcriptcomponenttypename,True,VARCHAR(400),
5,transcriptpersonid,True,DOUBLE_PRECISION,
6,transcriptpersonname,True,VARCHAR(800),
7,proid,True,DOUBLE_PRECISION,
8,companyofperson,True,VARCHAR(800),
9,speakertypeid,True,DOUBLE_PRECISION,


In [33]:
# Show the column names and types of ciq.ciqtranscriptcomponent.
db.describe_table(library="ciq", table="ciqtranscriptcomponent")

Approximately 75972808 rows in ciq.ciqtranscriptcomponent.


Unnamed: 0,name,nullable,type,comment
0,transcriptcomponentid,True,INTEGER,
1,transcriptid,True,INTEGER,
2,componentorder,True,SMALLINT,
3,transcriptcomponenttypeid,True,SMALLINT,
4,transcriptpersonid,True,INTEGER,
5,componenttext,True,VARCHAR,


In [34]:
# Tuple of the company ids of the retained rows; it becomes the default
# company filter of WRDSpuller.
CIQ_list = tuple(CIQ["companyid"].tolist())

In [35]:
def WRDSpuller(year, CIQ = CIQ_list):
    """Download one year of earnings-call transcript components and pickle them.

    Joins ``ciq.wrds_transcript_detail`` (restricted to the given companies,
    to rows whose ``mostimportantdateutc`` falls in ``year`` and to the
    'Earnings Calls' event type) with ``ciq.wrds_transcript_person`` and
    ``ciq.ciqtranscriptcomponent``. The result has one row per transcript
    component, ordered by transcript id and component order.

    Parameters
    ----------
    year : int or str
        Calendar year matched against ``mostimportantdateutc``; converted
        with ``int()`` before being sent to the database.
    CIQ : iterable, optional
        ``companyid`` values to include. Defaults to the module-level
        ``CIQ_list`` as it was when this function was defined.

    Returns
    -------
    pandas.DataFrame
        Columns: companyid, transcriptid, headline, mostimportantdateutc,
        companyname, word_count, componenttext.

    Raises
    ------
    ValueError
        If ``CIQ`` is empty (an empty ``IN ()`` list is not valid SQL).

    Side effects
    ------------
    Writes the result to ``S&P{year}.pkl`` in the current working directory.
    """
    # Materialise once: the driver renders a tuple as an IN (...) list, and this
    # lets callers pass any iterable (list, Series, ...).
    company_ids = tuple(CIQ)
    if not company_ids:
        raise ValueError("CIQ must contain at least one company id")

    # Values are bound as parameters rather than formatted into the SQL text,
    # so a one-element tuple (repr with trailing comma) or an unexpected `year`
    # value cannot produce malformed SQL.
    sql_query = '''
        SELECT a.companyid, a.transcriptid, a.headline, a.mostimportantdateutc,
               a.companyname, b.word_count, c.componenttext
        FROM ciq.wrds_transcript_detail AS a
        JOIN ciq.wrds_transcript_person AS b
          ON a.transcriptid = b.transcriptid
        JOIN ciq.ciqtranscriptcomponent AS c
          ON b.transcriptcomponentid = c.transcriptcomponentid
        WHERE a.companyid IN %(company_ids)s
          AND date_part('year', a.mostimportantdateutc) = %(year)s
          AND a.keydeveventtypename = 'Earnings Calls'
        ORDER BY a.transcriptid, b.componentorder
    '''
    # Run query, result in Pandas dataframe format
    transcripts = db.raw_sql(
        sql_query,
        params={'company_ids': company_ids, 'year': int(year)},
    )
    # Cache the year's result on disk next to the notebook.
    transcripts.to_pickle(f'S&P{year}.pkl')
    return transcripts

In [17]:
# Pull the 2013 earnings-call transcripts; WRDSpuller also writes them to 'S&P2013.pkl'.
transcripts_2013 = WRDSpuller(2013)
transcripts_2013 # preview of the result only; this line can be deleted

Unnamed: 0,companyid,transcriptid,headline,mostimportantdateutc,companyname,word_count,componenttext
0,890498.0,400010.0,"The Mosaic Company, Q2 2013 Earnings Call, Jan...",2013-01-04,The Mosaic Company,36.0,"Good morning, ladies and gentlemen, and welcom..."
1,890498.0,400010.0,"The Mosaic Company, Q2 2013 Earnings Call, Jan...",2013-01-04,The Mosaic Company,196.0,"Thank you, and welcome to our second quarter f..."
2,890498.0,400010.0,"The Mosaic Company, Q2 2013 Earnings Call, Jan...",2013-01-04,The Mosaic Company,1739.0,"Good morning. And we at Mosaic wish you, all, ..."
3,890498.0,400010.0,"The Mosaic Company, Q2 2013 Earnings Call, Jan...",2013-01-04,The Mosaic Company,586.0,"Thank you, Jim, and good morning, everyone. As..."
4,890498.0,400010.0,"The Mosaic Company, Q2 2013 Earnings Call, Jan...",2013-01-04,The Mosaic Company,410.0,"Thank you, Larry. Global agriculture experienc..."
...,...,...,...,...,...,...,...
15611,317627.0,2520890.0,"Boston Scientific Corporation, Q3 2013 Earning...",2013-10-24,Boston Scientific Corporation,178.0,Yes. We thought it was important to support th...
15612,317627.0,2520890.0,"Boston Scientific Corporation, Q3 2013 Earning...",2013-10-24,Boston Scientific Corporation,41.0,And just one quick question on Vessix. Can you...
15613,317627.0,2520890.0,"Boston Scientific Corporation, Q3 2013 Earning...",2013-10-24,Boston Scientific Corporation,36.0,We are not publicizing the details of the IDE ...
15614,317627.0,2520890.0,"Boston Scientific Corporation, Q3 2013 Earning...",2013-10-24,Boston Scientific Corporation,40.0,"Okay. With that, we would like to conclude the..."


In [18]:
# Pull the 2014 earnings-call transcripts; WRDSpuller also writes them to 'S&P2014.pkl'.
transcripts_2014 = WRDSpuller(2014)
transcripts_2014 # preview of the result only; this line can be deleted

Unnamed: 0,companyid,transcriptid,headline,mostimportantdateutc,companyname,word_count,componenttext
0,289030.0,565393.0,"Micron Technology Inc., Q1 2014 Earnings Call,...",2014-01-07,"Micron Technology, Inc.",57.0,"Good afternoon. My name is Saeed, and I'll be ..."
1,289030.0,565393.0,"Micron Technology Inc., Q1 2014 Earnings Call,...",2014-01-07,"Micron Technology, Inc.",211.0,"Thank you very much, and welcome to Micron Tec..."
2,289030.0,565393.0,"Micron Technology Inc., Q1 2014 Earnings Call,...",2014-01-07,"Micron Technology, Inc.",175.0,"During the course of this meeting, we may make..."
3,289030.0,565393.0,"Micron Technology Inc., Q1 2014 Earnings Call,...",2014-01-07,"Micron Technology, Inc.",14.0,And now I'd like to turn the call over to Mark...
4,289030.0,565393.0,"Micron Technology Inc., Q1 2014 Earnings Call,...",2014-01-07,"Micron Technology, Inc.",690.0,"Thanks, Kipp. I'd like to start today with an ..."
...,...,...,...,...,...,...,...
11398,319404.0,2511489.0,"Old Dominion Freight Line Inc., Q3 2014 Earnin...",2014-10-30,"Old Dominion Freight Line, Inc.",23.0,So the conclusion is the acceleration in LTL v...
11399,319404.0,2511489.0,"Old Dominion Freight Line Inc., Q3 2014 Earnin...",2014-10-30,"Old Dominion Freight Line, Inc.",73.0,"No, not at all. We're -- we focus on all of th..."
11400,319404.0,2511489.0,"Old Dominion Freight Line Inc., Q3 2014 Earnin...",2014-10-30,"Old Dominion Freight Line, Inc.",21.0,"And with no further questions, I'd like to tur..."
11401,319404.0,2511489.0,"Old Dominion Freight Line Inc., Q3 2014 Earnin...",2014-10-30,"Old Dominion Freight Line, Inc.",46.0,"Ladies and gentlemen, as we thank you for your..."


In [19]:
# Pull every year from 2012 through 2023 and bind each result to a
# module-level name of the form WRDSpuller_<year>.
for i in range(2012, 2024):
    year_variable = f"WRDSpuller_{i}"
    # Direct namespace assignment; binds the same name an exec() of
    # "<name> = WRDSpuller(i)" would, without building code from strings.
    globals()[year_variable] = WRDSpuller(i)
    print("Successfully pulled out for year", i)

Successfully pulled out for year 2016
Successfully pulled out for year 2017
Successfully pulled out for year 2018
Successfully pulled out for year 2019
Successfully pulled out for year 2020
Successfully pulled out for year 2021
Successfully pulled out for year 2022
Successfully pulled out for year 2023


In [36]:
# Pull every year from 2003 through 2012 and bind each result to a
# module-level name of the form WRDSpuller_<year>.
for i in range(2003, 2013):
    year_variable = f"WRDSpuller_{i}"
    # Direct namespace assignment; binds the same name an exec() of
    # "<name> = WRDSpuller(i)" would, without building code from strings.
    globals()[year_variable] = WRDSpuller(i)
    print("Successfully pulled out for year", i)

Successfully pulled out for year 2003
Successfully pulled out for year 2004
Successfully pulled out for year 2005
Successfully pulled out for year 2006
Successfully pulled out for year 2007
Successfully pulled out for year 2008
Successfully pulled out for year 2009
Successfully pulled out for year 2010
Successfully pulled out for year 2011


In [None]:
# NOTE(review): this repeats the earlier 2013 pull; running it queries WRDS again
# and rewrites 'S&P2013.pkl'. It looks redundant; confirm before keeping it.
transcripts_2013 = WRDSpuller(2013)
transcripts_2013 # preview of the result only; this line can be deleted