In [1]:
from pathlib import Path
import pandas as pd
import re, sys
import string

In [2]:
hassan_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.1_process_compustat_and_hassan_files\hassan_processed\20220331\hassan_processed.csv")
compustat_filepath= Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.1_process_compustat_and_hassan_files\compustat_processed\20220705\ciqcompany_mergedwithgvkeyandcountry.csv")
outputfilepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.1_process_compustat_and_hassan_files\compustat_and_hassan_firm_names_withgvkeyandcountry\20220705\compustat_and_hassan_firm_names_withgvkeyandcountry.csv")

In [3]:
# Import df
hassan = pd.read_csv(hassan_filepath)
compustat = pd.read_csv(compustat_filepath)

In [4]:
# Combine dfs
df = pd.concat([hassan, compustat])

print("Size of processed Hassan df:", hassan.shape)
print("Size of processed Compustat df:", compustat.shape)
print("Size of combined df:", df.shape)

Size of processed Hassan df: (13710, 3)
Size of processed Compustat df: (119662, 3)
Size of combined df: (133372, 3)


In [5]:
# Check if there are rows with duplicate company names (last run: about 1000 rows)
print("Counting rows of df:")
print("Number of rows:", df.shape[0])
print("Number of unique company names:", df['company_name'].nunique())

# Remove rows with duplicate company names
df = df.drop_duplicates('company_name')
print("\nRemoved rows with duplicate company names, recounting rows of df:")
print("Number of rows:", df.shape[0])
print("Number of unique company names:", df['company_name'].nunique())

Counting rows of df:
Number of rows: 133372
Number of unique company names: 132228

Removed rows with duplicate company names, recounting rows of df:
Number of rows: 132228
Number of unique company names: 132228


In [6]:
def get_clean_firm_name(title):
    # Convert to upper case
    clean_firm_name = title.upper()
    
    # Use the same abbreviated firm suffix (e.g. incorporated to inc, corporation to corp, limited to ltd)
    clean_firm_name = re.sub('INCORPORATED', 'INC', clean_firm_name)
    clean_firm_name = re.sub('CORPORATION', 'CORP', clean_firm_name)
    clean_firm_name = re.sub('LIMITED', 'LTD', clean_firm_name)
    clean_firm_name = re.sub('COMPANY', 'CO', clean_firm_name)
    clean_firm_name = re.sub('AKTIENGESELLSCHAFT', 'AG', clean_firm_name)
    clean_firm_name = re.sub('INTERNATIONAL', 'INTL', clean_firm_name)
    
    # We want to convert titles like:
    # "- EVENT TRANSCRIPT OF ADELAIDE CAPITAL MARKETS CONFERENCE CALL" to "ADELAIDE CAPITAL MARKETS", and
    # "- EVENT TRANSCRIPT OF CAIXA GERAL DE DEPOSITOS SA CONFERENCE C" to "CAIXA GERAL DE DEPOSITOS SA"
    
    # Remove everything before "EVENT TRANSCRIPT OF" 
    clean_firm_name = re.sub('^.+EVENT TRANSCRIPT OF ', '', clean_firm_name)
    
    # Remove everything after "CONFERENCE"
    clean_firm_name = re.sub(' CONFERENCE.+$', '', clean_firm_name)
    
    # Remove punctuation
    clean_firm_name = clean_firm_name.translate(str.maketrans('', '', string.punctuation))

    # Remove extra spaces
    clean_firm_name = clean_firm_name.strip()
    clean_firm_name = re.sub(' +', ' ', clean_firm_name)
    
    return clean_firm_name

In [7]:
# Clean firm names with the same function used to clean firm names for conference calls, in entryfilescombined.
df['clean_firm_name'] = df['company_name'].apply(get_clean_firm_name)
df = df.sort_values(by='gvkey')
df

Unnamed: 0,gvkey,company_name,hqcountry,clean_firm_name
88163,1000.0,A & E Plastik Pak Inc.,US,A E PLASTIK PAK INC
88568,1001.0,A & M Food Services Inc.,US,A M FOOD SERVICES INC
28053,1002.0,AAI Corporation,US,AAI CORP
29449,1003.0,"A.A. Importing Company, Inc.",US,AA IMPORTING CO INC
5151,1004.0,AAR Corp.,US,AAR CORP
...,...,...,...,...
119639,353451.0,VistaREIT Inc,PH,VISTAREIT INC
116343,353452.0,HigHPoint Service Network Corp.,TW,HIGHPOINT SERVICE NETWORK CORP
119661,353453.0,Tianhong Asset Management Co. Ltd. - CSI Star ...,CN,TIANHONG ASSET MANAGEMENT CO LTD CSI STAR AND ...
109816,353454.0,Golden Rock Global Plc,HK,GOLDEN ROCK GLOBAL PLC


In [8]:
# Check if there are rows with duplicate clean firm names (last run: about 10k duplicate rows)
print("Counting rows of df:")
print("Number of rows:", df.shape[0])
print("Number of unique clean firm names:", df['clean_firm_name'].nunique())
print("Number of unique company names:", df['company_name'].nunique())
print("Number of unique gvkeys:", df['gvkey'].nunique())

# Remove rows with duplicate clean firm names
df = df.drop_duplicates('clean_firm_name')
print("\nRemoved rows with duplicate clean firm names, recounting rows of df:")
print("Number of rows:", df.shape[0])
print("Number of unique clean firm names:", df['clean_firm_name'].nunique())
print("Number of unique company names:", df['company_name'].nunique())
print("Number of unique gvkeys:", df['gvkey'].nunique())

Counting rows of df:
Number of rows: 132228
Number of unique clean firm names: 121898
Number of unique company names: 132228
Number of unique gvkeys: 115035

Removed rows with duplicate clean firm names, recounting rows of df:
Number of rows: 121898
Number of unique clean firm names: 121898
Number of unique company names: 121898
Number of unique gvkeys: 114686


In [9]:
# Save df as .csv
print("This combined df will be the set of Compustat and Hassan firm names with gvkeys.")
print("This file will be used to match against firm names for conference calls, in entryfilescombined.")
df.to_csv(outputfilepath, index=False)
print("df saved to:", outputfilepath)

This combined df will be the set of Compustat and Hassan firm names with gvkeys.
This file will be used to match against firm names for conference calls, in entryfilescombined.
df saved to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.1_process_compustat_and_hassan_files\compustat_and_hassan_firm_names_withgvkeyandcountry\20220705\compustat_and_hassan_firm_names_withgvkeyandcountry.csv
