In [1]:
# Import Packages
import pandas as pd, numpy as np
import os, sys, glob, re
from pathlib import Path
from itertools import compress
from rapidfuzz import process, fuzz

In [2]:
# Configuration: every input/output location is derived from one root folder,
# one date-range tag and one Compustat/Hassan snapshot tag, so pointing the
# notebook at a different machine or run only requires editing these three values.
OUTPUT_ROOT = Path(r"C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys")
DATE_RANGE = "20210101-20220617"
COMPUSTAT_SNAPSHOT = "20220705"

# Inputs: cleaned firm names to match, and the Compustat/Hassan reference names.
entryfiles_filepath = (
    OUTPUT_ROOT
    / f"04.2_firm_names_to_match_{DATE_RANGE}"
    / f"cleanfirmnames_to_match_{DATE_RANGE}.xlsx"
)
compustathassan_filepath = (
    OUTPUT_ROOT
    / "04.1_process_compustat_and_hassan_files"
    / "compustat_and_hassan_firm_names_withgvkeyandcountry"
    / COMPUSTAT_SNAPSHOT
    / "compustat_and_hassan_firm_names_withgvkeyandcountry.csv"
)

# Outputs: exact-match results (matched / unmatched) and the fuzzy-match worksheet.
exactmatch_yes_filepath = OUTPUT_ROOT / "04.3_exact_matches" / DATE_RANGE / "exactmatch_yes.csv"
exactmatch_no_filepath = OUTPUT_ROOT / "04.3_exact_matches" / DATE_RANGE / "exactmatch_no.csv"
fuzzymatch_filepath = OUTPUT_ROOT / "04.4_fuzzy_matches" / DATE_RANGE / "fuzzymatch_not_manually_filled_in.xlsx"

In [3]:
# Load the two inputs: the firm names to match (Excel) and the
# Compustat/Hassan reference names (CSV).
entryfiles = pd.read_excel(io=entryfiles_filepath)
compustathassan = pd.read_csv(filepath_or_buffer=compustathassan_filepath)

In [4]:
# Peek at the first row to confirm the columns of the names-to-match table.
entryfiles.iloc[:1]

Unnamed: 0,clean_firm_name,count
0,CAMDEN PROPERTY TRUST,41


In [5]:
# Peek at the first row to confirm the columns of the reference table.
compustathassan.iloc[:1]

Unnamed: 0,gvkey,company_name,hqcountry,clean_firm_name
0,1000.0,A & E Plastik Pak Inc.,US,A E PLASTIK PAK INC


# First try exact match

In [6]:
# Left-merge on the cleaned name so every entryfiles row is kept; rows with no
# counterpart in compustathassan come back with NaN in gvkey and the other
# compustathassan columns.
exactmatch = entryfiles.merge(compustathassan, how='left', on='clean_firm_name')

n_input_names = entryfiles.shape[0]
n_merged_rows = exactmatch.shape[0]
n_matched_rows = exactmatch['gvkey'].count()
n_unmatched_rows = exactmatch['gvkey'].isna().sum()

# Report the input size from entryfiles itself: the merged frame can have more
# rows than entryfiles when a clean_firm_name appears more than once in
# compustathassan.
print("Number of clean firm names in entryfiles:", n_input_names)
print("Number of clean firm names with exact matches:", n_matched_rows)
print("Number of clean firm names without exact matches:", n_unmatched_rows)

# Surface merge fan-out explicitly instead of letting it silently inflate the
# counts above. This is a notice rather than an assert so the notebook still runs.
if n_merged_rows != n_input_names:
    print(
        "Warning: merge produced", n_merged_rows, "rows from", n_input_names,
        "entryfiles rows; some clean firm names match more than one row in compustathassan,",
        "so the matched count above is in merged rows, not distinct firm names."
    )

Number of clean firm names in entryfiles: 1976
Number of clean firm names with exact matches: 1679
Number of clean firm names without exact matches: 297


In [7]:
# Split the merged rows by whether a gvkey was found: rows with a gvkey are
# exact matches, rows with a missing gvkey still need matching.
has_gvkey = exactmatch['gvkey'].notna()
exactmatch_yes = exactmatch[has_gvkey]
exactmatch_no = exactmatch[~has_gvkey]

In [8]:
# Save exactmatch dfs to .csv
# Create the destination folders first so a fresh run does not fail on a
# missing output directory.
for out_path in (exactmatch_yes_filepath, exactmatch_no_filepath):
    out_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the files.
exactmatch_yes.to_csv(exactmatch_yes_filepath, index=False)
exactmatch_no.to_csv(exactmatch_no_filepath, index=False)
print("Saved clean firm names with exact matches to:", exactmatch_yes_filepath)
print("Saved clean firm names without exact matches to:", exactmatch_no_filepath)

Saved clean firm names with exact matches to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.3_exact_matches\20210101-20220617\exactmatch_yes.csv
Saved clean firm names without exact matches to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.3_exact_matches\20210101-20220617\exactmatch_no.csv


# Then do fuzzy match on the remaining clean firm names

In [9]:
# Show the first three unmatched rows; their compustathassan columns are empty.
exactmatch_no.iloc[:3]

Unnamed: 0,clean_firm_name,count,gvkey,company_name,hqcountry
13,SONOCO PRODUCTS,23,,,
20,MAGNIT OAO,19,,,
44,BLUESCOPE STEEL,16,,,


In [10]:
# For every firm name without an exact match, collect the top 3 fuzzy matches
# (name and score) from the compustathassan clean names into six parallel lists,
# one entry per row of exactmatch_no, in row order.
choice_1_name, choice_1_score, choice_2_name, choice_2_score, choice_3_name, choice_3_score = [], [], [], [], [], []
name_lists = (choice_1_name, choice_2_name, choice_3_name)
score_lists = (choice_1_score, choice_2_score, choice_3_score)

# The candidate names are the same for every query, so look the column up once.
candidate_names = compustathassan['clean_firm_name']

for firm_name in exactmatch_no['clean_firm_name']:
    # Perform fuzzy matching to get the top 3 choices/matches
    fuzzy = process.extract(firm_name, candidate_names, scorer=fuzz.WRatio, limit=len(name_lists))

    # Record the top 3 choices; element 0 of each result is the matched name and
    # element 1 its score. If fewer than 3 results come back, pad the missing
    # ranks so all six lists stay the same length as exactmatch_no.
    for rank, (names, scores) in enumerate(zip(name_lists, score_lists)):
        if rank < len(fuzzy):
            names.append(fuzzy[rank][0])
            scores.append(fuzzy[rank][1])
        else:
            names.append(None)
            scores.append(float('nan'))

In [11]:
# Build the fuzzymatch worksheet from the unmatched names:
#   - keep only the name and its count,
#   - attach the three fuzzy candidates and their scores (the lists are in the
#     same row order as exactmatch_no),
#   - add a blank best_choice column to be filled in by hand,
#   - sort by choice_1_score, then count, both descending, so the fuzzy matches
#     most likely to be correct are at the top.
fuzzymatch = (
    exactmatch_no[['clean_firm_name', 'count']]
    .copy()
    .assign(
        choice_1_name=choice_1_name,
        choice_1_score=choice_1_score,
        choice_2_name=choice_2_name,
        choice_2_score=choice_2_score,
        choice_3_name=choice_3_name,
        choice_3_score=choice_3_score,
        best_choice='',
    )
    .sort_values(by=['choice_1_score', 'count'], ascending=[False, False])
)

In [12]:
# Save fuzzymatch df to .xlsx
# Create the destination folder first so a fresh run does not fail on a
# missing output directory.
fuzzymatch_filepath.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the workbook.
fuzzymatch.to_excel(fuzzymatch_filepath, index=False)
print("Saved fuzzy-matched clean firm names (not manually filled in) to:", fuzzymatch_filepath)

Saved fuzzy-matched clean firm names (not manually filled in) to: C:\Users\jasonjia\Dropbox\Projects\conference_call\output\04_match_firm_names_to_gvkeys\04.4_fuzzy_matches\20210101-20220617\fuzzymatch_not_manually_filled_in.xlsx


In [14]:
# Print the instructions for filling in the best_choice column by hand,
# one instruction per line.
guide_lines = (
    "Guide for filling in best_choice manually:",
    "If choice 1 is correct, enter 1.",
    "If choice 2 is correct, enter 2.",
    "If choice 3 is correct, enter 3.",
    "If all choices are wrong, enter NaN.",
)
for guide_line in guide_lines:
    print(guide_line)

Guide for filling in best_choice manually:
If choice 1 is correct, enter 1.
If choice 2 is correct, enter 2.
If choice 3 is correct, enter 3.
If all choices are wrong, enter NaN.
