In [1]:
# Import Packages
import pandas as pd, numpy as np
import os, sys, glob, re
from pathlib import Path
from itertools import compress
import math

In [2]:
exactmatch_yes_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.3_exact_matches\20210101-20220617\exactmatch_yes.csv")
fuzzymatch_filledin_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.4_fuzzy_matches\20210101-20220617\fuzzymatch_manually_filled_in.xlsx")
compustathassan_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.1_process_compustat_and_hassan_files\compustat_and_hassan_firm_names_withgvkeyandcountry\20220705\compustat_and_hassan_firm_names_withgvkeyandcountry.csv")
fuzzymatch_done_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.4_fuzzy_matches\20210101-20220617\fuzzymatch_done.csv")
cleanfirmnames_matched_filepath = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.5_firm_names_matched\20210101-20220617\cleanfirmnames_matched_20210101-20220617.csv")

In [3]:
# Import files
exactmatch = pd.read_csv(exactmatch_yes_filepath)
fuzzymatch = pd.read_excel(fuzzymatch_filledin_filepath)
compustathassan = pd.read_csv(compustathassan_filepath)
compustathassan = compustathassan.rename(columns = {'clean_firm_name': "clean_firm_name_compustathassan"})

In [4]:
exactmatch.head(1)

Unnamed: 0,clean_firm_name_entryfile,clean_firm_name_compustathassan,gvkey,company_name,hqcountry,count_entryfile,exact_match
0,CAMDEN PROPERTY TRUST,CAMDEN PROPERTY TRUST,28629.0,Camden Property Trust,US,41,1


In [5]:
fuzzymatch.head(1)

Unnamed: 0,clean_firm_name_entryfile,count_entryfile,exact_match,choice_1_name,choice_1_score,choice_2_name,choice_2_score,choice_3_name,choice_3_score,best_choice
0,EUROPEAN RELIANCE GENERAL INSURANCE CO S A,1,0,EUROPEAN RELIANCE GENERAL INSURANCE CO SA,98.795181,A E PLASTIK PAK INC,85.5,A M FOOD SERVICES INC,85.5,1


In [6]:
compustathassan.head(1)

Unnamed: 0,gvkey,company_name,hqcountry,clean_firm_name_compustathassan
0,1000.0,A & E Plastik Pak Inc.,US,A E PLASTIK PAK INC


In [7]:
# View value counts of best choices (manually entered)
print("Counts of manually entered best choices in fuzzymatch:")
fuzzymatch['best_choice'].value_counts()

Counts of manually entered best choices in fuzzymatch:


1                                                231
2                                                 16
3                                                  2
INTL MEAL CO ALIMENTACAO SA                        1
TENET HEALTHCARE CORP                              1
NOVA LJUBLJANSKA BANKA DD LJUBLJANA                1
DSV AS                                             1
SWISS RE AG                                        1
PROSUS NV                                          1
REPUBLIC SERVICES INC                              1
PIPER SANDLER COMPANIES                            1
SULZER AG                                          1
SEVEN I HOLDINGS CO LTD                            1
PTT EXPLORATION AND PRODUCTION PCL                 1
HOIST FINANCE AB PUBL                              1
TURK TELEKOMUNIKASYON AS                           1
CSB BANK LTD                                       1
APPLUS SERVICES SA                                 1
BANK POLSKA KASA OPIEKI SA                    

In [8]:
# View value counts of manual matching
one_count = fuzzymatch[fuzzymatch['best_choice'] == 1].shape[0]
two_count = fuzzymatch[fuzzymatch['best_choice'] == 2].shape[0]
three_count = fuzzymatch[fuzzymatch['best_choice'] == 3].shape[0]
out_of_top3_count = fuzzymatch[fuzzymatch['best_choice'].str.len() > 0].shape[0] # identify strings
nan_count = fuzzymatch['best_choice'].isna().sum() # identify NaNs

print("Number of entries in fuzzymatch:", fuzzymatch.shape[0])
print("Number of 1s in fuzzymatch:", one_count)
print("Number of 2s in fuzzymatch:", two_count)
print("Number of 3s in fuzzymatch:", three_count)
print("Number of matches out of top 3 in fuzzymatch:", out_of_top3_count)
print("Number of non-matches in fuzzymatch:", nan_count)
assert(one_count + two_count + three_count + out_of_top3_count + nan_count == fuzzymatch.shape[0])

Number of entries in fuzzymatch: 297
Number of 1s in fuzzymatch: 231
Number of 2s in fuzzymatch: 16
Number of 3s in fuzzymatch: 2
Number of matches out of top 3 in fuzzymatch: 41
Number of non-matches in fuzzymatch: 7


In [9]:
def get_correct_compustathassan_clean_firm_name(row):
    best_choice = row['best_choice']
    if best_choice == 1:
        return row['choice_1_name']
    elif best_choice == 2:
        return row['choice_2_name']
    elif best_choice == 3:
        return row['choice_3_name']
    else:
        return best_choice

In [10]:
# Get the correct clean_firm_name from compustathassan based on the choice ('company_name')
fuzzymatch['clean_firm_name_compustathassan'] = fuzzymatch.apply(get_correct_compustathassan_clean_firm_name, axis=1)

# Check that number of NaNs is correct
assert(fuzzymatch['clean_firm_name_compustathassan'].isna().sum() == fuzzymatch['best_choice'].isna().sum())

# Filter to get the relevant columns from fuzzymatch
fuzzymatch = fuzzymatch[['clean_firm_name_entryfile', 'clean_firm_name_compustathassan', 'count_entryfile', 'exact_match']]
fuzzymatch.head(3)

Unnamed: 0,clean_firm_name_entryfile,clean_firm_name_compustathassan,count_entryfile,exact_match
0,EUROPEAN RELIANCE GENERAL INSURANCE CO S A,EUROPEAN RELIANCE GENERAL INSURANCE CO SA,1,0
1,GENWORTH MORTGAGE INSURANC AUSTRALIA LTD,GENWORTH MORTGAGE INSURANCE AUSTRALIA LTD,1,0
2,CALUMET SPECIALTY PRODUCTS PARTNERS L P,CALUMET SPECIALTY PRODUCTS PARTNERS LP,1,0


In [11]:
# View value counts of clean_firm_name_compustathassan
print("Value counts of clean_firm_name_compustathassan in fuzzymatch:")
fuzzymatch['clean_firm_name_compustathassan'].value_counts()

Value counts of clean_firm_name_compustathassan in fuzzymatch:


FEDERAL REALTY INVESTMENT TRUST    2
APOLLO GLOBAL MANAGEMENT INC       2
MIRVAC PROPERTY TRUST              1
MAGNIT PAO                         1
ROCKWOOL AS                        1
                                  ..
WEATHERFORD INTL PLC               1
BARRATT DEVELOPMENTS PLC           1
CREDIT SUISSE GROUP AG             1
BHP GROUP                          1
VERBUND AG                         1
Name: clean_firm_name_compustathassan, Length: 288, dtype: int64

In [12]:
# Merge with compustathassan to get gvkey, company_name (OG firm name in compustathassan) and hqcountry.
# Validate m:1, since it's possible to have multiple clean_firm_names in entryfile to refer to the same clean_firm_name in compustathassan.
# But the other way around (1:m) should not be possible, because we have dropped duplicates for clean_firm_name in compustathassan.
print("Merging with compustathassan to get gvkey, company_name (OG firm name in compustathassan) and hqcountry:")
fuzzymatch_merge = fuzzymatch.merge(compustathassan, how='left', on='clean_firm_name_compustathassan', validate='m:1')

# Rearrange columns
fuzzymatch_merge = fuzzymatch_merge.reindex(columns=['clean_firm_name_entryfile', 'clean_firm_name_compustathassan', 
                                                'gvkey', 'company_name', 'hqcountry', 'count_entryfile', 'exact_match'])

# Print stats
print("Number of rows in fuzzymatch_merge:", fuzzymatch_merge.shape[0])
print("Number of NaNs in fuzzymatch_merge:", fuzzymatch_merge['clean_firm_name_compustathassan'].isna().sum())
fuzzymatch_merge.head(3)

Merging with compustathassan to get gvkey, company_name (OG firm name in compustathassan) and hqcountry:
Number of rows in fuzzymatch_merge: 297
Number of NaNs in fuzzymatch_merge: 7


Unnamed: 0,clean_firm_name_entryfile,clean_firm_name_compustathassan,gvkey,company_name,hqcountry,count_entryfile,exact_match
0,EUROPEAN RELIANCE GENERAL INSURANCE CO S A,EUROPEAN RELIANCE GENERAL INSURANCE CO SA,243128.0,European Reliance General Insurance Co SA,GR,1,0
1,GENWORTH MORTGAGE INSURANC AUSTRALIA LTD,GENWORTH MORTGAGE INSURANCE AUSTRALIA LTD,317594.0,Genworth Mortgage Insurance Australia Ltd,AU,1,0
2,CALUMET SPECIALTY PRODUCTS PARTNERS L P,CALUMET SPECIALTY PRODUCTS PARTNERS LP,165846.0,"Calumet Specialty Products Partners, L.P.",US,1,0


In [14]:
# Combine exactmatch with fuzzymatch to get full list of clean firm names matched.
cleanfirmnames_matched = pd.concat([exactmatch, fuzzymatch_merge])

# Print stats
print("Number of rows in cleanfirmnames_matched:", cleanfirmnames_matched.shape[0])
print("Number of exact matches in cleanfirmnames_matched:", cleanfirmnames_matched[cleanfirmnames_matched['exact_match'] == 1].shape[0])
print("Number of fuzzy matches in cleanfirmnames_matched:", cleanfirmnames_matched[cleanfirmnames_matched['exact_match'] == 0].shape[0])
print("Number of NaNs in cleanfirmnames_matched:", cleanfirmnames_matched['clean_firm_name_compustathassan'].isna().sum())
cleanfirmnames_matched

Number of rows in cleanfirmnames_matched: 1976
Number of exact matches in cleanfirmnames_matched: 1679
Number of fuzzy matches in cleanfirmnames_matched: 297
Number of NaNs in cleanfirmnames_matched: 7


Unnamed: 0,clean_firm_name_entryfile,clean_firm_name_compustathassan,gvkey,company_name,hqcountry,count_entryfile,exact_match
0,CAMDEN PROPERTY TRUST,CAMDEN PROPERTY TRUST,28629.0,Camden Property Trust,US,41,1
1,BUNGE LTD,BUNGE LTD,144435.0,Bunge Limited,US,34,1
2,ROYALTY PHARMA PLC,ROYALTY PHARMA PLC,36474.0,Royalty Pharma plc,US,34,1
3,SUNRUN INC,SUNRUN INC,24905.0,Sunrun Inc,US,32,1
4,SUNNOVA ENERGY INTL INC,SUNNOVA ENERGY INTL INC,35395.0,Sunnova Energy International Inc.,US,32,1
...,...,...,...,...,...,...,...
292,GRIFFIN INDUSTRIAL REALTY INC,INDUS REALTY TRUST INC,64934.0,Indus Realty Trust Inc,US,1,0
293,INTL MEAL CO HOLDINGS SA,INTL MEAL CO ALIMENTACAO SA,297003.0,International Meal Company Alimentacao SA,BR,1,0
294,ERAMETSLN,ERAMET SA,223520.0,Eramet SA,FR,1,0
295,ZEBU,SELECTQUOTE INC,36234.0,"SelectQuote, Inc.",US,9,0


In [15]:
# Save files
fuzzymatch_merge.to_csv(fuzzymatch_done_filepath, index=False)
print("Saved completed fuzzymatch to:", fuzzymatch_done_filepath)
cleanfirmnames_matched.to_csv(cleanfirmnames_matched_filepath, index=False)
print("Saved matched cleanfirmnames to:", cleanfirmnames_matched_filepath)

Saved completed fuzzymatch to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.4_fuzzy_matches\20210101-20220617\fuzzymatch_done.csv
Saved matched cleanfirmnames to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.5_firm_names_matched\20210101-20220617\cleanfirmnames_matched_20210101-20220617.csv
