In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from pathlib import Path

In [2]:
inputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\03.1_paragraphs_containing_keywords_20210101-20220617\20210101-20220617_paragraphs_containing_keywords_combined.gzip")
paragraphs_combined_sorted_outputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\03.1_paragraphs_containing_keywords_20210101-20220617\20210101-20220617_paragraphs_containing_keywords_combined_sorted.gzip")
entryfiles_combined_outputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\03.2_entryfiles_20210101-20220617\entryfilescombined_20210101-20220617.xlsx")
keywords_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\code\03_identify_paragraphs_containing_keywords\reference_files\keywords.txt")

# Import df and keywords

In [3]:
# Import df
df = pd.read_parquet(inputfilepath)
df["Date"] = df["Date"].apply(pd.to_datetime)
print("df.shape:", df.shape)
df.head(3)

df.shape: (6899, 18)


Unnamed: 0,Keyword,Paragraph,Report,PPV,TOC,Title,Subtitle,Date,Pages,Price,Contributor,Analyst,Language,Collection,Call,file,hasnumber,filestem
0,cost of capital,brian alexitch - greenwich investment manageme...,71114841,N,Y,GEG.OQ - EVENT TRANSCRIPT OF FOREST INVESTMENT...,GEG.OQ - Event Transcript of Forest Investment...,2020-12-29,7,Subscription,THOMSON REUTERS STREETEVENTS,"RESEARCH DEPARTMENT, ET AL",English,INV,\n CORPORATE PARTICIPANTS\nPeter Andrew Reed ...,20201229-20210101_1.csv,1.0,20201229-20210101_1
0,internal rate of return,"as you'll see on the slide, we've made a numbe...",71155272,N,Y,SVB FINANCIAL GROUP,SIVB.OQ - Event Transcript of SVB Financial Gr...,2021-01-04,14,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDaniel J. Beck SVB...,20210102-20210105_1.csv,1.0,20210102-20210105_1
1,internal rate of return,a. highlights: 1. boston private will...,71158711,N,Y,SVB FINANCIAL GROUP,SIVB.OQ - Event Brief of SVB Financial Group c...,2021-01-04,15,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDaniel J. Beck SVB...,20210102-20210105_1.csv,1.0,20210102-20210105_1


In [4]:
# Import keywords
keywords = pd.read_csv(keywords_filepath, sep = "\t", header = None)
keywords = keywords.rename(columns = {0: "Keyword"})
keywords.head(3)

Unnamed: 0,Keyword
0,ROIC
1,return on invested capital
2,hurdle premium


# Create dictionary for keyword priorities

In [5]:
# Initialize non-zero values
keyword_priority_dict = defaultdict(int)
keyword_priority_dict["hurdle rate"] = 5
keyword_priority_dict["cost of equity"] = 4
keyword_priority_dict["cost of capital"] = 3
keyword_priority_dict["IRR"] = 2
keyword_priority_dict["internal rate of return"] = 2
keyword_priority_dict["ROIC"] = 1
keyword_priority_dict["return on invested capital"] = 1

# Match keywords to priorities - keywords not in dict will automatically have priority of 0
keywords["priority"] = keywords['Keyword'].apply(lambda x: keyword_priority_dict[x])

# Recreate dict, including keywords with priority of 0 
priority_dict = dict(keywords.set_index('Keyword')["priority"])
priority_dict

{'ROIC': 1,
 'return on invested capital': 1,
 'hurdle premium': 0,
 'discount rate': 0,
 'opportunity cost of capital': 0,
 'OCC': 0,
 'fudge factor': 0,
 'required return': 0,
 'required rate of return': 0,
 'require a return': 0,
 'expected return': 0,
 'expected rate of return': 0,
 'expect a return': 0,
 'CAPM': 0,
 'capital asset pricing model': 0,
 'internal rate of return': 2,
 'IRR': 2,
 'weighted cost of capital': 0,
 'weighted average cost of capital': 0,
 'WACC': 0,
 'hurdle rate': 5,
 'cost of capital': 3,
 'cost of equity': 4,
 'cost of debt': 0,
 'return on assets': 0,
 'return on net assets': 0}

# Main

In [6]:
# Match keyword to priority for each row
df['priority'] = df['Keyword'].apply(lambda x: keyword_priority_dict[x])

# Sort df: highest priority keyword first, then title/firm name (A first), then date (earliest first).
df = df.sort_values(["priority", "Title", "Date"], ascending = [False, True, True])
df

Unnamed: 0,Keyword,Paragraph,Report,PPV,TOC,Title,Subtitle,Date,Pages,Price,Contributor,Analyst,Language,Collection,Call,file,hasnumber,filestem,priority
0,hurdle rate,please turn to slide 21 and 22. we continue to...,72106173,N,Y,180 DEGREE CAPITAL CORP,TURN.OQ - Event Transcript of 180 Degree Capit...,2021-05-12,9,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDaniel B. Wolfe 18...,20210510-20210513_11.csv,1.0,20210510-20210513_11,5
3,hurdle rate,please turn to slide 22 and 23. we continue to...,73444001,N,Y,180 DEGREE CAPITAL CORP,TURN.OQ - Event Transcript of 180 Degree Capit...,2021-11-10,13,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDaniel B. Wolfe 18...,20211110-20211113_20.csv,1.0,20211110-20211113_20,5
0,hurdle rate,please turn to slide 21 and 22. we continue to...,74201961,N,Y,180 DEGREE CAPITAL CORP,TURN.OQ - Event Transcript of 180 Degree Capit...,2022-02-25,10,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDaniel B. Wolfe 18...,20220222-20220225_3.csv,1.0,20220222-20220225_3,5
0,hurdle rate,please turn to slide 21 and 22. we provide the...,74937637,N,Y,180 DEGREE CAPITAL CORP,TURN.OQ - Event Transcript of 180 Degree Capit...,2022-05-12,16,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDaniel B. Wolfe 18...,20220509-20220512_11.csv,1.0,20220509-20220512_11,5
7,hurdle rate,michael preston - advansix inc. - cfo & senior...,73050152,N,Y,ADVANSIX INC,ASIX.N - Event Transcript of AdvanSix Inc conf...,2021-09-28,27,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nAdam Kressel Advan...,20210927-20210930_4.csv,1.0,20210927-20210930_4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,irr,"raffaele sadun - selectquote, inc. - cfo yes. ...",72838963,N,Y,ZEBU,SLQT.N - Event Transcript of SelectQuote Inc c...,2021-08-25,18,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nMatthew Scott Gunt...,20210822-20210825_2.csv,1.0,20210822-20210825_2,0
1,return on assets,our cost-to-income ratio within the period was...,72879640,N,Y,ZENITH BANK,ZENITHB.LG - Event Transcript of Zenith Bank P...,2021-09-01,18,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDennis Olisa Zenit...,20210830-20210902_3.csv,1.0,20210830-20210902_3,0
2,return on assets,"so our capital base is a strength for us, alth...",72879640,N,Y,ZENITH BANK,ZENITHB.LG - Event Transcript of Zenith Bank P...,2021-09-01,18,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDennis Olisa Zenit...,20210830-20210902_3.csv,1.0,20210830-20210902_3,0
0,return on assets,return on asset and return on equity we realiz...,74255694,N,Y,ZENITH BANK,ZENITHB.LG - Event Transcript of Zenith Bank P...,2022-03-03,17,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDennis Olisa Zenit...,20220302-20220305_5.csv,1.0,20220302-20220305_5,0


In [7]:
# Save paragraphs_combined_sorted
df.to_parquet(paragraphs_combined_sorted_outputfilepath, index = False, compression = "gzip")
print("Saved paragraphs_combined_sorted to:", paragraphs_combined_sorted_outputfilepath)

# Save entryfiles_combined
df_entryfiles = df[['Keyword', 'Paragraph', 'Date', 'Title', 'Subtitle', 'Report']]
df_entryfiles.to_excel(entryfiles_combined_outputfilepath, index=False)
print("Saved entryfiles_combined to:", entryfiles_combined_outputfilepath)

Saved paragraphs_combined_sorted to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\03.1_paragraphs_containing_keywords_20210101-20220617\20210101-20220617_paragraphs_containing_keywords_combined_sorted.gzip
Saved entryfiles_combined to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\03.2_convert_paragraphs_to_entryfiles_20210101-20220617\entryfilescombined_20210101-20220617.xlsx
