In [1]:
# Import Packages
import pandas as pd, numpy as np
import os, sys, glob, re
from pathlib import Path
from itertools import compress
import string

In [2]:
inputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\03_identify_paragraphs_containing_keywords\03.2_entryfiles_20210101-20220617\entryfilescombined_20210101-20220617.xlsx")
entryfilescombined_withcleanfirmnames_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.1_firm_names_to_match_20210101-20220617\entryfilescombined_with_cleanfirmnames_20210101-20220617.xlsx")
cleanfirmnamestomatch_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.1_firm_names_to_match_20210101-20220617\cleanfirmnames_to_match_20210101-20220617.xlsx")

In [3]:
df = pd.read_excel(inputfilepath)
df.head(3)

Unnamed: 0,Keyword,Paragraph,Date,Title,Subtitle,Report
0,IRR,"Ultimately, with that, I'd probably go to a sl...",2021-09-22,- EVENT TRANSCRIPT OF ADELAIDE CAPITAL MARKETS...,- Event Transcript of Adelaide Capital Markets...,73024325
1,discount rate,"And finally, on Slide 59, we have the changes ...",2021-02-12,- EVENT TRANSCRIPT OF CAIXA GERAL DE DEPOSITOS...,- Event Transcript of Caixa Geral de Depositos...,71457819
2,discount rate,"On Slide 46, we have the distributable items a...",2021-08-02,- EVENT TRANSCRIPT OF CAIXA GERAL DE DEPOSITOS...,- Event Transcript of Caixa Geral de Depositos...,72669541


In [4]:
def get_clean_firm_name(title):
    # Convert to upper case
    clean_firm_name = title.upper()
    
    # Use the same abbreviated firm suffix (e.g. incorporated to inc, corporation to corp, limited to ltd)
    clean_firm_name = re.sub('INCORPORATED', 'INC', clean_firm_name)
    clean_firm_name = re.sub('CORPORATION', 'CORP', clean_firm_name)
    clean_firm_name = re.sub('LIMITED', 'LTD', clean_firm_name)
    clean_firm_name = re.sub('AKTIENGESELLSCHAFT', 'AG', clean_firm_name)
    
    
    # We want to convert titles like:
    # "- EVENT TRANSCRIPT OF ADELAIDE CAPITAL MARKETS CONFERENCE CALL" to "ADELAIDE CAPITAL MARKETS", and
    # "- EVENT TRANSCRIPT OF CAIXA GERAL DE DEPOSITOS SA CONFERENCE C" to "CAIXA GERAL DE DEPOSITOS SA"
    
    # Remove everything before "EVENT TRANSCRIPT OF" 
    clean_firm_name = re.sub('^.+EVENT TRANSCRIPT OF ', '', clean_firm_name)
    
    # Remove everything after "CONFERENCE"
    clean_firm_name = re.sub(' CONFERENCE.+$', '', clean_firm_name)
    
    # Remove punctuation
    clean_firm_name = clean_firm_name.translate(str.maketrans('', '', string.punctuation))

    # Remove extra spaces
    clean_firm_name = clean_firm_name.strip()
    clean_firm_name = re.sub(' +', ' ', clean_firm_name)
    
    return clean_firm_name

In [5]:
# Clean titles to get clean firm names
df['clean_firm_name'] = df['Title'].apply(get_clean_firm_name)
df

Unnamed: 0,Keyword,Paragraph,Date,Title,Subtitle,Report,clean_firm_name
0,IRR,"Ultimately, with that, I'd probably go to a sl...",2021-09-22,- EVENT TRANSCRIPT OF ADELAIDE CAPITAL MARKETS...,- Event Transcript of Adelaide Capital Markets...,73024325,ADELAIDE CAPITAL MARKETS
1,discount rate,"And finally, on Slide 59, we have the changes ...",2021-02-12,- EVENT TRANSCRIPT OF CAIXA GERAL DE DEPOSITOS...,- Event Transcript of Caixa Geral de Depositos...,71457819,CAIXA GERAL DE DEPOSITOS SA
2,discount rate,"On Slide 46, we have the distributable items a...",2021-08-02,- EVENT TRANSCRIPT OF CAIXA GERAL DE DEPOSITOS...,- Event Transcript of Caixa Geral de Depositos...,72669541,CAIXA GERAL DE DEPOSITOS SA
3,IRR,"Important to note, that Mantos Blancos' proces...",2022-05-13,- EVENT TRANSCRIPT OF CAPSTONE MINING CORP CON...,- Event Transcript of Capstone Mining Corp con...,74937592,CAPSTONE MINING CORP
4,cost of debt,Herbie Goldstein - Howard Energy Partners - VP...,2022-03-29,- EVENT TRANSCRIPT OF HOWARD MIDSTREAM ENERGY ...,- Event Transcript of Howard Midstream Energy ...,74453973,HOWARD MIDSTREAM ENERGY PARTNERS LLC CON
...,...,...,...,...,...,...,...
6894,IRR,"Raffaele Sadun - SelectQuote, Inc. - CFO Yes. ...",2021-08-25,ZEBU,SLQT.N - Event Transcript of SelectQuote Inc c...,72838963,ZEBU
6895,return on assets,Our cost-to-income ratio within the period was...,2021-09-01,ZENITH BANK,ZENITHB.LG - Event Transcript of Zenith Bank P...,72879640,ZENITH BANK
6896,return on assets,"So our capital base is a strength for us, alth...",2021-09-01,ZENITH BANK,ZENITHB.LG - Event Transcript of Zenith Bank P...,72879640,ZENITH BANK
6897,return on assets,Return on asset and return on equity we realiz...,2022-03-03,ZENITH BANK,ZENITHB.LG - Event Transcript of Zenith Bank P...,74255694,ZENITH BANK


In [6]:
# Get unique clean firm names. These will be the firms we want to match against compustat/hassan.
clean_firm_names = pd.DataFrame(df['clean_firm_name'].value_counts())
clean_firm_names.reset_index(inplace=True)
clean_firm_names = clean_firm_names.rename(columns = {'clean_firm_name': 'count', 'index':'clean_firm_name'})
clean_firm_names

Unnamed: 0,clean_firm_name,count
0,CAMDEN PROPERTY TRUST,41
1,BUNGE LTD,34
2,ROYALTY PHARMA PLC,34
3,SUNRUN INC,32
4,SUNNOVA ENERGY INTERNATIONAL INC,32
...,...,...
1971,IMMOFINANZ AG,1
1972,IMPACT HEALTHCARE REIT PLC,1
1973,IMPERIAL METALS CORP,1
1974,INDEPENDENT BANK CORPMI,1


In [7]:
# Save output as .xlsx files
df.to_excel(entryfilescombined_withcleanfirmnames_filepath, index=False)
print("Saved entryfiles combined with clean firm names to:", entryfilescombined_withcleanfirmnames_filepath)

clean_firm_names.to_excel(cleanfirmnamestomatch_filepath, index=False)
print("\nSaved clean firm names to match to:", cleanfirmnamestomatch_filepath)

Saved entryfiles combined with clean firm names to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.1_firm_names_to_match_20210101-20220617\entryfilescombined_with_cleanfirmnames_20210101-20220617.xlsx

Saved clean firm names to match to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.1_firm_names_to_match_20210101-20220617\cleanfirmnames_to_match_20210101-20220617.xlsx
