In [1]:
# Doing the required imports
import pandas as pd
import numpy as np
import os
import string
import nltk
import re
from rapidfuzz import process, fuzz
import dask.dataframe as dd

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [7]:
def get_relevant_info(title):
    """
    Split a Conference Call title into a name part, a ticker and trailing text.

    The title is split on its first " - " separator:
        - name: everything before the separator
        - ticker: the last whitespace-delimited token of the name part, with
          any "(" and ")" characters removed
        - other information: everything after the separator (later " - "
          separators are kept inside this text rather than discarded)

    title: str - Title of a conference call

    Returns a tuple (name, ticker, other_info). When the title has no " - "
    separator, or is not a string (e.g. NaN from a missing cell), the title is
    returned unchanged together with np.nan for the ticker and the other
    information. The ticker is np.nan when the name part contains no token.
    """
    # Missing or non-string titles cannot be parsed; treat them like titles
    # without a separator instead of raising a TypeError.
    if not isinstance(title, str) or " - " not in title:
        return title, np.nan, np.nan

    # partition() splits on the first separator only, so text after a second
    # " - " stays part of the miscellaneous information.
    name_info, _, other_info = title.partition(" - ")

    # An empty/whitespace-only name part has no tokens to take a ticker from.
    tokens = name_info.split()
    if tokens:
        ticker_info = tokens[-1].replace("(", "").replace(")", "")
    else:
        ticker_info = np.nan
    return name_info, ticker_info, other_info
    
def tickerBrackets(name, t):
    """
    Takes in the extracted firm name and ticker and outputs a cleaned name for the firm
    by cutting the name off where "(<ticker>)" starts, if that text occurs in it.

    name: str - Extracted name of a firm
    t: str - Extracted ticker of a firm

    Returns the part of the name before "(<ticker>)" with trailing whitespace
    removed. The name is returned unchanged when it is not a string or does
    not contain the bracketed ticker (which is always the case for a NaN
    ticker, since "(nan)" is searched for).
    """
    # Non-string names (e.g. NaN) have nothing to clean.
    if not isinstance(name, str):
        return name

    t_string = "({})".format(t)
    ind = name.find(t_string)
    if ind == -1:
        return name

    # Strip trailing whitespace instead of blindly dropping one character:
    # slicing to ind - 1 removed a real letter when no space preceded the
    # bracket, and wrapped around to -1 when the name started with the ticker.
    return name[:ind].rstrip()
    
def get_names(df):
    """
    Run the name/ticker extraction over every title of a dataset.

    Adds four columns to the given frame (the frame is modified in place and
    also returned):
        - "Name", "Ticker", "Other Information": the three values produced by
          get_relevant_info for each entry of the "Title" column
        - "Cleaned_Name": tickerBrackets applied to each row's Name and Ticker

    df: pd.DataFrame - Conference Call dataset with a "Title" column
    """
    # One (name, ticker, misc) triple per title, in row order.
    extracted = [get_relevant_info(title) for title in df["Title"]]

    df["Name"] = [triple[0] for triple in extracted]
    df["Ticker"] = [triple[1] for triple in extracted]
    df["Other Information"] = [triple[2] for triple in extracted]

    def _strip_ticker(row):
        # Row-wise helper: remove the bracketed ticker from the extracted name.
        return tickerBrackets(row["Name"], row["Ticker"])

    df["Cleaned_Name"] = df.apply(_strip_ticker, axis = 1)
    return df

In [8]:
def clean_title(title, lead_in=r"\bTRANSCRIPT\s+OF\b"):
    """
    This function further cleans up some problematic Conference Call titles,
    e.g. "PAIC.OQ - EVENT TRANSCRIPT OF PETRA ACQUISITION INC CONFERENCE CALL"
    becomes "PETRA ACQUISITION INC".

    Steps:
        1. Everything from the first "CONFER" onwards is dropped (this covers
           both "CONFERENCE" and titles truncated to "CONFER").
        2. If the remaining text contains the lead-in phrase, everything up to
           and including its first occurrence is dropped.
        3. Surrounding whitespace is removed.

    Titles without the lead-in phrase are left intact apart from steps 1 and 3,
    so firm names that merely contain the letters "OF" (e.g. "MICROSOFT CORP",
    "BANK OF AMERICA CORP") are no longer truncated.

    title: str - Conference Call title to clean
    lead_in: str - Regular expression (case-sensitive) for the phrase that
        precedes the firm name; defaults to the words "TRANSCRIPT OF".
        NOTE(review): only the "EVENT TRANSCRIPT OF" form is visible in the
        data shown in this notebook - pass a wider pattern if other lead-ins exist.
    """
    # Cut the trailing "CONFERENCE CALL ..." text (or its truncated form).
    head = title.split("CONFER", 1)[0]

    # Drop the descriptive lead-in only when it is really there; a plain
    # substring split on "OF" would also cut inside ordinary firm names.
    marker = re.search(lead_in, head)
    if marker is not None:
        head = head[marker.end():]

    return head.strip()

In [9]:
# Load the company table and keep only the rows that carry a gvkey.
compustat_df = (
    pd.read_csv("../CC_Keyword_Extraction/ciqcompany_mergedwithgvkeyandcountry.csv")
    .dropna(subset = ["gvkey"])
)
compustat_df

Unnamed: 0,companyid,companyname,countryid,gvkey,country,isocountry2
5,1.850700e+04,2M Invest A/S,55.0,235716.0,Denmark,DK
7,1.851100e+04,3i Group plc,212.0,210835.0,United Kingdom,GB
14,1.852700e+04,ABB Ltd,195.0,210418.0,Switzerland,CH
70,1.867100e+04,Albemarle Corporation,213.0,29751.0,United States,US
86,1.871100e+04,The Allstate Corporation,213.0,28349.0,United States,US
...,...,...,...,...,...,...
25531497,1.679592e+09,PT. Geoprima Solusi Tbk,94.0,350207.0,Indonesia,ID
25531557,1.679596e+09,"Guotai Asset Management Co., Ltd. - Cathay Pac...",43.0,350228.0,China,CN
25535832,1.679868e+09,PT Prima Andalan Mandiri Tbk,94.0,350221.0,Indonesia,ID
25539907,1.680025e+09,Elementia Materiales S.A.B. De C.V.,131.0,350235.0,Mexico,MX


In [10]:
def remove_punc(words):
    """
    This function removes punctuation from a sequence of words and joins the
    result back into a single space-separated string.

    Every character in string.punctuation is deleted from each word. Words
    that consist only of punctuation (e.g. "&" or "-") are dropped entirely,
    so they do not leave doubled spaces in the result.

    words: list of str (or str) - Words to remove punctuation from; a plain
        string is split on whitespace first.

    Returns the cleaned words joined by single spaces ("" for no words).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(words, str):
        words = words.split()

    # Build the deletion table once per call rather than once per word.
    punctuation_table = str.maketrans("", "", string.punctuation)
    stripped_words = (word.translate(punctuation_table) for word in words)

    # Skip words that became empty so the join never emits consecutive spaces.
    return " ".join(word for word in stripped_words if word)

# Upper-case each company name, split it into words and strip the punctuation.
compustat_df["companyname_No_Punctuations"] = (
    compustat_df["companyname"]
    .str.upper()
    .str.split()
    .apply(remove_punc)
)

In [11]:
def parseResults(results):
    """
    Function to parse fuzzy matching results and return 2 neat lists containing
    the closest firms (in terms of similarity) as well as the calculated similarity.

    results: iterable - Fuzzy matching results, one entry per query. Each entry
        is a sequence whose first two items are the matched firm and its
        similarity (any further items, such as the match key, are ignored),
        or None when no match was produced for that query.

    Returns a tuple (firms, similarities) of two lists aligned with the input
    order. Queries without a match contribute None and NaN respectively.
    """
    c_firms, sims = [], []
    for result in results:
        if result is None:
            # Keep the row so both lists stay aligned with the queries; NaN
            # keeps the similarity column numeric for later filtering.
            c_firms.append(None)
            sims.append(float("nan"))
            continue
        c_firm, sim, *_ = result
        c_firms.append(c_firm)
        sims.append(sim)
    return c_firms, sims

#a, b = parseResults(results)
#matched_df = pd.DataFrame({"CC Firm": full_df[:10000]["Final_Cleaned_Name_No_Punctuations"], "Compustat Firm": a, "Similarity": b})
#matched_df

In [12]:
# Load the compiled conference-call records, discard the "0" column, keep one
# row per distinct title and drop rows without a title.
full_cc_df = (
    pd.read_csv("xls_compiled_01_1_xls_1_and_2.csv")
    .drop(columns = "0")
    .drop_duplicates(subset = "Title")
    .dropna(subset = ["Title"])
)
# Add the Name / Ticker / Other Information / Cleaned_Name columns.
full_cc_df = get_names(full_cc_df)
# Firm name derived from the raw title, used as the matching key later on.
full_cc_df["Final_Cleaned_Name"] = full_cc_df["Title"].map(clean_title)
full_cc_df

  full_cc_df = pd.read_csv("xls_compiled_01_1_xls_1_and_2.csv").drop("0", axis = 1)


Unnamed: 0,Analyst,Collection,Contributor,Date,Language,PPV,Pages,Price,Ratings,Report #,Subtitle,TOC,Title,Count,Name,Ticker,Other Information,Cleaned_Name,Final_Cleaned_Name
0,ANON,INV,THOMSON REUTERS STREETEVENTS,11/08/12,English,N,17.0,Subscription,,21044925.0,FSYS.OQ - Event Transcript of Fuel Systems Sol...,Y,FUEL SYSTEMS SOLUTIONS INC,,FUEL SYSTEMS SOLUTIONS INC,,,FUEL SYSTEMS SOLUTIONS INC,FUEL SYSTEMS SOLUTIONS INC
1,ANON,INV,THOMSON REUTERS STREETEVENTS,11/08/12,English,N,18.0,Subscription,,21045694.0,NNBR.OQ - Event Transcript of NN Inc conferenc...,Y,NN INC,,NN INC,,,NN INC,NN INC
2,ANON,INV,THOMSON REUTERS STREETEVENTS,11/08/12,English,N,11.0,Subscription,,21045712.0,CPNO.OQ - Event Transcript of Copano Energy LL...,Y,COPANO ENERGY LLC,,COPANO ENERGY LLC,,,COPANO ENERGY LLC,COPANO ENERGY LLC
3,ANON,INV,THOMSON REUTERS STREETEVENTS,11/08/12,English,N,23.0,Subscription,,21045002.0,EXHO.PA - Event Transcript of Sodexo SA confer...,Y,SODEXO,,SODEXO,,,SODEXO,SODEXO
4,ANON,INV,THOMSON REUTERS STREETEVENTS,11/08/12,English,N,21.0,Subscription,,21045791.0,VMC.N - Event Transcript of Vulcan Materials C...,Y,VULCAN MATERIALS CO.,,VULCAN MATERIALS CO.,,,VULCAN MATERIALS CO.,VULCAN MATERIALS CO.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
588052,ANON,INV,REFINITIV STREETEVENTS,8/30/2021,English,N,10.0,Subscription,,72869118.0,DDL.N - Event Transcript of Dingdong (Cayman) ...,Y,DINGDONG (CAYMAN) LTD,1.0,DINGDONG (CAYMAN) LTD,,,DINGDONG (CAYMAN) LTD,DINGDONG (CAYMAN) LTD
588062,ANON,INV,REFINITIV STREETEVENTS,8/30/2021,English,N,7.0,Subscription,,72912293.0,PAIC.OQ - Event Transcript of Petra Acquisitio...,Y,PAIC.OQ - EVENT TRANSCRIPT OF PETRA ACQUISITIO...,1.0,PAIC.OQ,PAIC.OQ,EVENT TRANSCRIPT OF PETRA ACQUISITION INC CONF...,PAIC.OQ,PETRA ACQUISITION INC
588186,ANON,INV,REFINITIV STREETEVENTS,9/7/2021,English,N,14.0,Subscription,,72939535.0,TCAPI.L - Event Transcript of TP ICAP Group PL...,Y,TP ICAP GROUP PLC,1.0,TP ICAP GROUP PLC,,,TP ICAP GROUP PLC,TP ICAP GROUP PLC
588211,ANON,INV,REFINITIV STREETEVENTS,9/7/2021,English,N,17.0,Subscription,,72918159.0,MLNK.N - Event Transcript of Meridianlink Inc ...,Y,MERIDIANLINK INC,1.0,MERIDIANLINK INC,,,MERIDIANLINK INC,MERIDIANLINK INC


In [13]:
# Punctuation-free matching keys for both datasets.
# Conference-call side: the cleaned title-derived name, used as-is.
full_cc_df["Final_Cleaned_Name_No_Punctuations"] = (
    full_cc_df["Final_Cleaned_Name"]
    .str.split()
    .apply(remove_punc)
)
# Compustat side: the company name, upper-cased first.
compustat_df["companyname_No_Punctuations"] = (
    compustat_df["companyname"]
    .str.upper()
    .str.split()
    .apply(remove_punc)
)

In [14]:
# Fuzzy-match every conference-call firm name against the Compustat names.
# This will take quite a long time, so dask is used for parallelization:
# the frame is split into 100 partitions and evaluated with the "processes" scheduler.
full_df = full_cc_df.copy()
# `small` is just another name for full_df (same object, no further copy).
small = full_df
small_dask = dd.from_pandas(small, npartitions = 100)
# Candidates to match against: the punctuation-free Compustat company names.
match_choices = compustat_df["companyname_No_Punctuations"]
# For each name, process.extractOne is asked for a single result using the fuzz.ratio scorer.
# NOTE(review): meta = "str" is passed to dask as the output description, but parseResults
# later unpacks every result as a 3-item sequence - confirm this meta is what is intended.
results = small_dask["Final_Cleaned_Name_No_Punctuations"].apply(lambda x: process.extractOne(x, match_choices, scorer = fuzz.ratio), meta = "str")
# Nothing has been evaluated up to this point; compute() runs the matching.
results = results.compute(scheduler = "processes")

In [19]:
# Split the fuzzy-matching results into the closest Compustat names (a) and
# their similarity scores (b).
a, b = parseResults(results)

# Materialise the conference-call names explicitly before building the frame.
# Handing the lazy dask Series straight to pd.DataFrame relied on pandas
# converting it implicitly; computing it here yields the same values in the
# same partition order as `results`, and the frame keeps a fresh 0..n-1 index.
cc_firm_names = small_dask["Final_Cleaned_Name_No_Punctuations"].compute().to_numpy()

matched_df = pd.DataFrame({"CC Firm": cc_firm_names, "Compustat Firm": a, "Similarity": b})
matched_df

Unnamed: 0,CC Firm,Compustat Firm,Similarity
0,FUEL SYSTEMS SOLUTIONS INC,FUEL SYSTEMS SOLUTIONS INC,100.000000
1,NN INC,NN INC,100.000000
2,COPANO ENERGY LLC,COPANO ENERGY LLC,100.000000
3,SODEXO,SODERO,83.333333
4,VULCAN MATERIALS CO,VULCAN MATERIALS COMPANY,88.372093
...,...,...,...
65151,DINGDONG CAYMAN LTD,DINGDONG CAYMAN LIMITED,90.476190
65152,PETRA ACQUISITION INC,PETRA ACQUISITION INC,100.000000
65153,TP ICAP GROUP PLC,TP ICAP GROUP PLC,100.000000
65154,MERIDIANLINK INC,MERIDIANLINK INC,100.000000


In [None]:
# Write the current contents of matched_df to an Excel workbook.
# NOTE(review): as positioned, this cell comes before the Similarity >= 93 filter
# further down, so it would export every match - confirm that is the intent.
# index = False is not passed here, unlike the final export of final_matched_df.
matched_df.to_excel("CC_To_Compustat.xlsx")

In [16]:
# Keep only the pairs whose similarity score is at least 93.
# Note: this rebinds matched_df, so the unfiltered matches are no longer
# reachable under that name after this cell has run.
matched_df = matched_df.loc[matched_df["Similarity"].ge(93)]
matched_df

Unnamed: 0,CC Firm,Compustat Firm,Similarity
0,FUEL SYSTEMS SOLUTIONS INC,FUEL SYSTEMS SOLUTIONS INC,100.0
1,NN INC,NN INC,100.0
2,COPANO ENERGY LLC,COPANO ENERGY LLC,100.0
6,HIMAX TECHNOLOGIES INC,HIMAX TECHNOLOGIES INC,100.0
7,HALOZYME THERAPEUTICS INC,HALOZYME THERAPEUTICS INC,100.0
...,...,...,...
65149,PAYCOR HCM INC,PAYCOR HCM INC,100.0
65152,PETRA ACQUISITION INC,PETRA ACQUISITION INC,100.0
65153,TP ICAP GROUP PLC,TP ICAP GROUP PLC,100.0
65154,MERIDIANLINK INC,MERIDIANLINK INC,100.0


In [17]:
# Load the manually prepared matches and discard the "Unnamed: 0" column
# stored in the CSV.
manual_matched_df = (
    pd.read_csv("Manual_CC_To_Compustat.csv")
    .drop(columns = "Unnamed: 0")
)
manual_matched_df

Unnamed: 0,CC Firm,Compustat Firm,Similarity
0,VULCAN MATERIALS CO,VULCAN MATERIALS COMPANY,88.372093
1,DUKE ENERGY CORP,EDEN ENERGY CORP,87.500000
2,INSULET CORP,INET CORP,85.714286
3,AMERICAN TOWER CORP,AMERICAN MOTORS CORP,87.179487
4,SALEM COMMUNICATIONS,SAL COMMUNICATIONS INC,85.714286
...,...,...,...
4461,ADIAL PHARMACEUTICALS LLC,ADIAL PHARMACEUTICALS INC,92.000000
4462,BRII BIOSCIENCES LTD,BRII BIOSCIENCES LIMITED,90.909091
4463,MISSFRESH LTD,MISSFRESH LIMITED,86.666667
4464,CITY CHIC COLLECTIVE LTD,CITY CHIC COLLECTIVE LIMITED,92.307692


In [18]:
# Stack the rows of matched_df on top of the rows loaded from
# Manual_CC_To_Compustat.csv; each frame keeps its own index labels.
final_matched_df = pd.concat([matched_df, manual_matched_df])
# Export the combined table; index = False leaves the index labels out of the workbook.
final_matched_df.to_excel("Matched_CC_To_Compustat.xlsx", index = False)

Unnamed: 0,CC Firm,Compustat Firm,Similarity
0,FUEL SYSTEMS SOLUTIONS INC,FUEL SYSTEMS SOLUTIONS INC,100.000000
1,NN INC,NN INC,100.000000
2,COPANO ENERGY LLC,COPANO ENERGY LLC,100.000000
6,HIMAX TECHNOLOGIES INC,HIMAX TECHNOLOGIES INC,100.000000
7,HALOZYME THERAPEUTICS INC,HALOZYME THERAPEUTICS INC,100.000000
...,...,...,...
4461,ADIAL PHARMACEUTICALS LLC,ADIAL PHARMACEUTICALS INC,92.000000
4462,BRII BIOSCIENCES LTD,BRII BIOSCIENCES LIMITED,90.909091
4463,MISSFRESH LTD,MISSFRESH LIMITED,86.666667
4464,CITY CHIC COLLECTIVE LTD,CITY CHIC COLLECTIVE LIMITED,92.307692
