In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from pathlib import Path

In [2]:
# Absolute, machine-specific locations of this step's input and output files.
# The directory prefixes shared by several paths are spelled once and joined by
# plain string concatenation, so each Path equals the full literal path.
_project_dir = r"C:\Users\jasonjia\Dropbox\Projects\conference_call"
_step_output_dir = _project_dir + r"\output\03_identify_paragraphs_containing_keywords"
_paragraphs_dir = _step_output_dir + r"\03.1_paragraphs_containing_keywords_20210101-20220617"
_entryfiles_dir = _step_output_dir + r"\03.2_entryfiles_20210101-20220617"

# Input table (loaded with pd.read_parquet further down, despite the .gzip suffix)
inputfilepath = Path(_paragraphs_dir + r"\20210101-20220617_paragraphs_containing_keywords_combined.gzip")

# Outputs: the sorted table (parquet) and the reduced-column Excel workbook
paragraphs_combined_sorted_outputfilepath = Path(_paragraphs_dir + r"\20210101-20220617_paragraphs_containing_keywords_combined_sorted.gzip")
entryfiles_combined_outputfilepath = Path(_entryfiles_dir + r"\entryfilescombined_20210101-20220617.xlsx")

# Reference file holding the keyword list (tab-separated text, no header row)
keywords_filepath = Path(_project_dir + r"\code\03_identify_paragraphs_containing_keywords\reference_files\keywords.txt")

# Import df and keywords

In [3]:
# Load the combined keyword-paragraph table from its parquet file
df = pd.read_parquet(inputfilepath)

# Report the (rows, columns) shape, then preview the first three rows
print("df.shape:", df.shape)
df.iloc[:3]

df.shape: (6899, 16)


Unnamed: 0,Keyword,Paragraph,Report,PPV,TOC,Title,Subtitle,Date,Pages,Price,Contributor,Analyst,Language,Collection,Call,filestem
0,cost of capital,Brian Alexitch - Greenwich Investment Manageme...,71114841,N,Y,GEG.OQ - EVENT TRANSCRIPT OF FOREST INVESTMENT...,GEG.OQ - Event Transcript of Forest Investment...,2020-12-29,7,Subscription,THOMSON REUTERS STREETEVENTS,"RESEARCH DEPARTMENT, ET AL",English,INV,\n CORPORATE PARTICIPANTS\nPeter Andrew Reed ...,20201229-20210101_1
0,internal rate of return,"As you'll see on the slide, we've made a numbe...",71155272,N,Y,SVB FINANCIAL GROUP,SIVB.OQ - Event Transcript of SVB Financial Gr...,2021-01-04,14,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDaniel J. Beck SVB...,20210102-20210105_1
1,internal rate of return,A. Highlights: 1. Boston Private will...,71158711,N,Y,SVB FINANCIAL GROUP,SIVB.OQ - Event Brief of SVB Financial Group c...,2021-01-04,15,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDaniel J. Beck SVB...,20210102-20210105_1


In [4]:
# Read the keyword list: tab-separated with no header row, so pandas labels
# the first column 0; it is renamed to "Keyword" in the same chain.
keywords = (
    pd.read_csv(keywords_filepath, sep="\t", header=None)
    .rename(columns={0: "Keyword"})
)

# Preview the first three keywords
keywords.iloc[:3]

Unnamed: 0,Keyword
0,ROIC
1,return on invested capital
2,hurdle premium


# Create dictionary for keyword priorities

In [5]:
# Keywords that carry an explicit, non-zero priority. The sort further down
# orders priority in descending order, so larger numbers come first.
keyword_priority_dict = defaultdict(
    int,
    {
        "hurdle rate": 5,
        "cost of equity": 4,
        "cost of capital": 3,
        "IRR": 2,
        "internal rate of return": 2,
        "ROIC": 1,
        "return on invested capital": 1,
    },
)

# Look up the priority of every listed keyword. Indexing the defaultdict yields
# 0 for a keyword without an explicit entry (and stores that keyword in it).
keywords["priority"] = keywords["Keyword"].apply(lambda kw: keyword_priority_dict[kw])

# Plain dict of keyword -> priority for every listed keyword, zeros included
priority_by_keyword = keywords.set_index("Keyword")["priority"]
priority_dict = dict(priority_by_keyword)
priority_dict

{'ROIC': 1,
 'return on invested capital': 1,
 'hurdle premium': 0,
 'discount rate': 0,
 'opportunity cost of capital': 0,
 'OCC': 0,
 'fudge factor': 0,
 'required return': 0,
 'required rate of return': 0,
 'require a return': 0,
 'expected return': 0,
 'expected rate of return': 0,
 'expect a return': 0,
 'CAPM': 0,
 'capital asset pricing model': 0,
 'internal rate of return': 2,
 'IRR': 2,
 'weighted cost of capital': 0,
 'weighted average cost of capital': 0,
 'WACC': 0,
 'hurdle rate': 5,
 'cost of capital': 3,
 'cost of equity': 4,
 'cost of debt': 0,
 'return on assets': 0,
 'return on net assets': 0}

# Main: assign keyword priorities, sort the paragraphs, and save the outputs

In [6]:
# Attach a priority to every row from its keyword; keywords without an explicit
# entry in keyword_priority_dict get 0 via the defaultdict.
def _lookup_priority(keyword):
    return keyword_priority_dict[keyword]

df["priority"] = df["Keyword"].apply(_lookup_priority)

# Sort order, column -> ascending?: Title A-Z, then Date oldest first, then
# priority highest first. Date is sorted as stored, without conversion to
# datetime; the sample rows show YYYY-MM-DD values, which also order
# chronologically when compared as text.
_sort_order = {"Title": True, "Date": True, "priority": False}
df = df.sort_values(by=list(_sort_order), ascending=list(_sort_order.values()))
df

Unnamed: 0,Keyword,Paragraph,Report,PPV,TOC,Title,Subtitle,Date,Pages,Price,Contributor,Analyst,Language,Collection,Call,filestem,priority
1,IRR,"Ultimately, with that, I'd probably go to a sl...",73024325,N,Y,- EVENT TRANSCRIPT OF ADELAIDE CAPITAL MARKETS...,- Event Transcript of Adelaide Capital Markets...,2021-09-22,15,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDaniel W. Dickson ...,20210919-20210922_2,2
4,discount rate,"And finally, on Slide 59, we have the changes ...",71457819,N,Y,- EVENT TRANSCRIPT OF CAIXA GERAL DE DEPOSITOS...,- Event Transcript of Caixa Geral de Depositos...,2021-02-12,13,Subscription,THOMSON REUTERS STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nJose Antonio da Si...,20210211-20210214_2,0
1,discount rate,"On Slide 46, we have the distributable items a...",72669541,N,Y,- EVENT TRANSCRIPT OF CAIXA GERAL DE DEPOSITOS...,- Event Transcript of Caixa Geral de Depositos...,2021-08-02,9,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nJose Antonio da Si...,20210802-20210805_37,0
3,IRR,"Important to note, that Mantos Blancos' proces...",74937592,N,Y,- EVENT TRANSCRIPT OF CAPSTONE MINING CORP CON...,- Event Transcript of Capstone Mining Corp con...,2022-05-13,12,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nCashel Aran Meaghe...,20220513-20220516_4,2
8,cost of debt,Herbie Goldstein - Howard Energy Partners - VP...,74453973,N,Y,- EVENT TRANSCRIPT OF HOWARD MIDSTREAM ENERGY ...,- Event Transcript of Howard Midstream Energy ...,2022-03-29,10,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nMatt Lawrence Howa...,20220326-20220329_3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,IRR,"Raffaele Sadun - SelectQuote, Inc. - CFO Yes. ...",72838963,N,Y,ZEBU,SLQT.N - Event Transcript of SelectQuote Inc c...,2021-08-25,18,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nMatthew Scott Gunt...,20210822-20210825_2,2
1,return on assets,Our cost-to-income ratio within the period was...,72879640,N,Y,ZENITH BANK,ZENITHB.LG - Event Transcript of Zenith Bank P...,2021-09-01,18,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDennis Olisa Zenit...,20210830-20210902_3,0
2,return on assets,"So our capital base is a strength for us, alth...",72879640,N,Y,ZENITH BANK,ZENITHB.LG - Event Transcript of Zenith Bank P...,2021-09-01,18,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDennis Olisa Zenit...,20210830-20210902_3,0
0,return on assets,Return on asset and return on equity we realiz...,74255694,N,Y,ZENITH BANK,ZENITHB.LG - Event Transcript of Zenith Bank P...,2022-03-03,17,Subscription,REFINITIV STREETEVENTS,ANON,English,INV,\n CORPORATE PARTICIPANTS\nDennis Olisa Zenit...,20220302-20220305_5,0


In [7]:
# Create the output folders if they are missing. The Excel file goes to its own
# folder, which no earlier cell creates, so on a fresh run the save would
# otherwise fail when that folder does not exist yet.
for output_path in (paragraphs_combined_sorted_outputfilepath, entryfiles_combined_outputfilepath):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# Save paragraphs_combined_sorted: every column, gzip-compressed parquet, index not written
df.to_parquet(paragraphs_combined_sorted_outputfilepath, index = False, compression = "gzip")
print("Saved paragraphs_combined_sorted to:", paragraphs_combined_sorted_outputfilepath)

# Save entryfiles_combined: only the six columns selected below, written to
# an Excel workbook without the index
df_entryfiles = df[['Keyword', 'Paragraph', 'Date', 'Title', 'Subtitle', 'Report']]
df_entryfiles.to_excel(entryfiles_combined_outputfilepath, index=False)
print("Saved entryfiles_combined to:", entryfiles_combined_outputfilepath)

Saved paragraphs_combined_sorted to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\03.1_paragraphs_containing_keywords_20210101-20220617\20210101-20220617_paragraphs_containing_keywords_combined_sorted.gzip
Saved entryfiles_combined to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\03.2_entryfiles_20210101-20220617\entryfilescombined_20210101-20220617.xlsx
