In [1]:
# Import Packages
import pandas as pd, numpy as np
import os, sys, glob, re
from pathlib import Path
from itertools import compress
import string

In [10]:
# Input workbook (paragraphs with gvkeys) and the two CSV outputs written by this notebook.
# NOTE(review): these are absolute, machine-specific locations; consider deriving them
# from a single configurable project directory.
_output_root = r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output"
_names_to_match_dir = (
    _output_root
    + r"\04_match_firm_names_to_gvkeys\04.2_firm_names_to_match"
    + r"\multiple_data_pulls_combined\20200101-20210909"
)

inputfilepath = Path(
    _output_root
    + r"\03_identify_paragraphs_containing_keywords\03.2_entryfiles_20200101-20210909"
    + r"\entryfiles_combined_v5_withparagraphs_andgvkey.xlsx"
)
entryfilescombined_withcleanfirmnames_filepath = Path(
    _names_to_match_dir + r"\xlscombined_with_cleanfirmnames_20200101-20210909.csv"
)
cleanfirmnamestomatch_filepath = Path(
    _names_to_match_dir + r"\cleanfirmnames_to_match_20200101-20210909.csv"
)

In [3]:
# Read the combined entry-file workbook into a DataFrame and preview its first rows.
df = pd.read_excel(io=inputfilepath)
df.head(n=3)

Unnamed: 0,Keywords,Paragraph,Date,gvkey,Title,Subtitle,Report
0,interest rate,MS. LYNNE PARSHALL: We ended the first quarter...,2002-07-03,24040.0,ISIS PHARMACEUTICALS (ISIS) - ISIS PHARMACEUTI...,Â,8008277
1,interest rate,"35%, we anticipate margins in the fourth quar...",2002-07-24,65570.0,AMERICAN ITALIAN PASTA (PLB) - Q3 2002 FINANCI...,Â,8009744
2,interest rate,MR. DINNI JAIN: Sure. As of June 30th we had 1...,2002-07-24,122096.0,INSIGHT COMMUNICATIONS (ICCI) - Q2 2002 FINANC...,Â,8009801


In [4]:
# Long-form company words and the abbreviation each one is normalised to
# (e.g. incorporated -> inc). Applied in this order as plain substring replacements.
_FIRM_WORD_ABBREVIATIONS = (
    ('INCORPORATED', 'INC'),
    ('CORPORATION', 'CORP'),
    ('LIMITED', 'LTD'),
    ('COMPANY', 'CO'),
    ('AKTIENGESELLSCHAFT', 'AG'),
    ('INTERNATIONAL', 'INTL'),
)

# Regexes whose matches are deleted from the upper-cased title. Order matters:
# each pattern runs on the result of the previous one. Raw strings are used so
# that "\(", "\w" and "\d" reach the regex engine without relying on Python's
# deprecated handling of invalid string escapes.
_TITLE_NOISE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    # Everything up to and including "EVENT TRANSCRIPT OF " / "EVENT BRIEF OF "
    r'^.*EVENT TRANSCRIPT OF ',
    r'^.*EVENT BRIEF OF ',
    # "(<firm ticker>) - <rest of title>"
    r'\(\w+\) - .+$',
    # "- PRELIMINARY TRANSCRIPT ...", "- FINAL TRANSCRIPT ...", and similar tails
    r'- *PRELIMINARY TRANSCRIPT.*$',
    r'- *FINAL TRANSCRIPT.*$',
    r'- *EDITED BRIEF.*$',
    r'- *EDITED TRANSCRIPT.*$',
    r'- *PRELIMINARY.*$',
    # Earnings / conference / financial-release tails
    r'- .+ EARNINGS .*$',
    r'EARNINGS CONFERENCE.*$',
    r' CONFERENCE.*$',
    r'FINANCIAL RELEASE C.*$',
    # Fiscal-period tails such as " Q1 2001..." or " 1Q 2001...". ".*" (not ".+")
    # because the title may end right after the year.
    r' Q\d \d{4}.*$',
    r' \dQ \d{4}.*$',
))

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def get_clean_firm_name(title):
    """Reduce a conference-call title to a standardised, upper-case firm name.

    Steps, in order: upper-case the title; abbreviate long-form company words
    (INCORPORATED -> INC, CORPORATION -> CORP, ...); delete transcript
    boilerplate such as "EVENT TRANSCRIPT OF", "(<ticker>) - ...",
    "... CONFERENCE CALL" and fiscal-period tails like "Q1 2001"; remove all
    ASCII punctuation; trim and collapse repeated spaces.

    Examples:
        "- EVENT TRANSCRIPT OF ADELAIDE CAPITAL MARKETS CONFERENCE CALL"
            -> "ADELAIDE CAPITAL MARKETS"
        "KEYSPAN CORP. (KSE) - Q2 2002 FINANCIAL RELEASE CONFERENCE CALL"
            -> "KEYSPAN CORP"

    Args:
        title: The raw title string. Non-string values (e.g. the float NaN
            pandas uses for a missing cell, or None) are accepted.

    Returns:
        The cleaned firm name as a string. A non-string ``title`` is returned
        unchanged so that missing values stay missing instead of raising
        AttributeError on ``.upper()``.
    """
    # Missing titles arrive as NaN/None when the column has empty cells.
    if not isinstance(title, str):
        return title

    clean_firm_name = title.upper()

    # Use the same abbreviated firm suffix everywhere.
    for long_form, abbreviation in _FIRM_WORD_ABBREVIATIONS:
        clean_firm_name = clean_firm_name.replace(long_form, abbreviation)

    # Strip transcript boilerplate so only the firm name remains.
    for noise_pattern in _TITLE_NOISE_PATTERNS:
        clean_firm_name = noise_pattern.sub('', clean_firm_name)

    # Remove punctuation (this also joins hyphenated words, e.g. "A-B" -> "AB").
    clean_firm_name = clean_firm_name.translate(_PUNCTUATION_TABLE)

    # Remove leading/trailing whitespace, then collapse runs of spaces.
    return re.sub(r' +', ' ', clean_firm_name.strip())

In [5]:
# Derive a standardised firm name for each row from its transcript title.
df['clean_firm_name'] = df['Title'].map(get_clean_firm_name)
df

Unnamed: 0,Keywords,Paragraph,Date,gvkey,Title,Subtitle,Report,clean_firm_name
0,interest rate,MS. LYNNE PARSHALL: We ended the first quarter...,2002-07-03,24040.0,ISIS PHARMACEUTICALS (ISIS) - ISIS PHARMACEUTI...,Â,8008277,ISIS PHARMACEUTICALS
1,interest rate,"35%, we anticipate margins in the fourth quar...",2002-07-24,65570.0,AMERICAN ITALIAN PASTA (PLB) - Q3 2002 FINANCI...,Â,8009744,AMERICAN ITALIAN PASTA
2,interest rate,MR. DINNI JAIN: Sure. As of June 30th we had 1...,2002-07-24,122096.0,INSIGHT COMMUNICATIONS (ICCI) - Q2 2002 FINANC...,Â,8009801,INSIGHT COMMUNICATIONS
3,discount rate,MR. GERRY LUTERMAN: This is Gerry Luterman. Le...,2002-07-25,6799.0,KEYSPAN CORP. (KSE) - Q2 2002 FINANCIAL RELEAS...,Â,8009965,KEYSPAN CORP
4,irr,people per year. It has 300 retail stores inc...,2002-07-31,,SIMON PROPERTY GROUP INCORPORATED (SPG) - Q2 2...,,8710501,SIMON PROPERTY GROUP INC
...,...,...,...,...,...,...,...,...
80637,cost of equity,We're confident Stage 3 is well positioned for...,2021-09-07,29453.0,CHENIERE ENERGY INC,LNG.A - Event Transcript of Cheniere Energy In...,72918196,CHENIERE ENERGY INC
80638,cost of debt,Michael Webber - Webber Research & Advisory LL...,2021-09-07,29453.0,CHENIERE ENERGY INC,LNG.A - Event Brief of Cheniere Energy Inc con...,72918224,CHENIERE ENERGY INC
80639,irr,"On ESG, we just discussed. On performance, oka...",2021-09-07,296577.0,APERAM SA,APAM.AS - Event Transcript of Aperam SA confer...,72931747,APERAM SA
80640,interest rate,Looking at Global Broking revenue by asset cla...,2021-09-07,241016.0,TP ICAP GROUP PLC,TCAPI.L - Event Transcript of TP ICAP Group PL...,72939535,TP ICAP GROUP PLC


In [6]:
# Get unique clean firm names together with the number of rows each appears in.
# These will be the firms we want to match against compustat/hassan.
# The index and value column are named explicitly so the result always has the
# columns ['clean_firm_name', 'count'], regardless of how the installed pandas
# version names the Series returned by value_counts().
clean_firm_names = (
    df['clean_firm_name']
    .value_counts()
    .rename_axis('clean_firm_name')
    .reset_index(name='count')
)
clean_firm_names

Unnamed: 0,clean_firm_name,count
0,PLEXUS CORP,210
1,ARCHERDANIELSMIDLAND CO,209
2,AUTOZONE INC,197
3,ILLINOIS TOOL WORKS INC,180
4,ALASKA AIR GROUP INC,171
...,...,...
9149,SHANDA GAMES LTD,1
9150,ORCHARD SUPPLY HARDWARE STORES CORP,1
9151,LOCAL COM CORP,1
9152,OCULUS INNOVATIVE SCIENCES INC,1


In [11]:
# Write both tables to disk as CSV files (index omitted) and report each destination.
for table, destination, message in (
    (df, entryfilescombined_withcleanfirmnames_filepath,
     "Saved entryfiles combined with clean firm names to:"),
    (clean_firm_names, cleanfirmnamestomatch_filepath,
     "\nSaved clean firm names to match to:"),
):
    table.to_csv(destination, index=False)
    print(message, destination)

Saved entryfiles combined with clean firm names to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.2_firm_names_to_match\multiple_data_pulls_combined\20200101-20210909\xlscombined_with_cleanfirmnames_20200101-20210909.csv

Saved clean firm names to match to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.2_firm_names_to_match\multiple_data_pulls_combined\20200101-20210909\cleanfirmnames_to_match_20200101-20210909.csv
