In [1]:
using CSV
using DataFrames
using StringDistances

# Parse the conference-call (CC) data list and match each firm name to a Compustat gvkey and country.
# Three sources are used for matching: 1) the Hassan dataset, 2) the Compustat Capital IQ companies dataset, 3) Compustat Capital IQ tickers.
# Firm names that remain unmatched are matched by string-distance comparison.

In [3]:
"""
    getTicker(str)

Extract a ticker symbol from the start of `str`.

The ticker is the text before the first space, cut at the first `.` when that token
contains one (`"AAPL.O - ..."` gives `"AAPL"`); every non-word character is then removed.

Returns `""` when `str` contains no space, or when `str` is not a string (for example
`missing`).
"""
function getTicker(str::AbstractString)
    space = findfirst(' ', str)
    space === nothing && return ""
    # `prevind` (rather than `space - 1`) keeps the slice on a valid character boundary
    # when the token ends in a multi-byte character.
    token = SubString(str, firstindex(str), prevind(str, space))
    dot = findfirst('.', token)
    if dot !== nothing
        token = SubString(token, firstindex(token), prevind(token, dot))
    end
    return replace(token, r"[\W]" => "")
end

# Anything that is not a string (e.g. `missing`) has no ticker.
getTicker(str) = ""

getTicker (generic function with 1 method)

In [19]:
# Words and phrases removed from firm names, listed in the order they are applied.
const CORP_WORDS = [
    "earnings conference call", "conference call on productivity",
    "earnings release conference", "financial release conference",
    "conference call regarding", "earnings conference", "comprehensive review",
    "final transcript", "edited transcript",
    "week conference", "conference call", "edited brief", "preliminary brief",
    "earnings call", "earning call",
    "preliminary transcript", "call", "cal", "merger", "c", "earning", "earnings",
    "to discuss",
    "group", "plc", "ltd", "limited", "ag", "corp", "corporation",
    # the capitalised spelling is kept for callers passing mixed-case text; the
    # lowercase one is what matches names that were lowercased beforehand
    "Incorporation", "incorporation",
    "laboratories", "labs", "the", "proposed", "propose",
    "holdings", "oyj", "inc", "conference", "co", "final", "preliminary", "and", "&",
    "company", "trust", "investment", "investments", "sln", "sa", "s.p.a.", "spa",
    "transc",
    "quarter", "st", "nd", "rd", "th",
    "q", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov",
    "dec",
]

# Build the regex that removes one entry of `CORP_WORDS`.
# The entry is quoted with \Q...\E so characters such as `.` are matched literally, and a
# `\b` word boundary is required only on sides where the entry starts/ends with a word
# character (a `\b` next to punctuation such as the final `.` of "s.p.a." or next to a
# standalone "&" can never match when followed by a space).
function _corpwordpattern(w)
    lead = occursin(r"^\w", w) ? "\\b" : ""
    trail = occursin(r"\w$", w) ? "\\b" : ""
    return Regex(string(lead, "\\Q", w, "\\E", trail))
end

# Compiled once instead of on every call.
const CORP_WORD_PATTERNS = map(_corpwordpattern, CORP_WORDS)

"""
    deleteCorpWords(fname)

Remove digits and common corporate / conference-call words from `fname`.

`fname` is padded with one space on each side, every digit is replaced by a space, each
entry of `CORP_WORDS` is replaced by a single space (in list order), and surrounding
whitespace is stripped. Inner whitespace is not collapsed, so the result may contain
runs of spaces.

Matching is case-sensitive; callers are expected to lowercase `fname` first.
"""
function deleteCorpWords(fname)
    fname = string(" ", fname, " ")
    fname = replace(fname, r"[0-9]" => " ")
    for pattern in CORP_WORD_PATTERNS
        fname = replace(fname, pattern => " ")
    end

    return strip(fname)
end

deleteCorpWords (generic function with 1 method)

In [17]:
"""
    prepareName(fname)

Normalise a call title or company name into the key used for name matching.

Steps, in order: lowercase; drop everything up to and including the prefixes
"event transcript of" and "event brief of" when present; keep only the text up to and
including the first " - "; delete everything from the first "(" to the last ")";
remove common words with `deleteCorpWords`; finally delete the characters matched by
the class `[.,'#-/0-9]` and strip surrounding whitespace.
"""
function prepareName(fname)
    name = lowercase(fname)

    # Cut off each known prefix (the first occurrence and everything before it).
    for prefix in ("event transcript of", "event brief of")
        hit = findfirst(prefix, name)
        if hit !== nothing
            name = name[nextind(name, last(hit)):end]
        end
    end

    # Keep only the part up to the end of the first " - " separator.
    separator = findfirst(" - ", name)
    if separator !== nothing
        name = name[1:last(separator)]
    end

    # Greedy on purpose of preserving behaviour: spans first "(" through last ")".
    name = replace(name, r"\((.*)\)" => "")
    name = deleteCorpWords(name)
    # NOTE(review): inside the class, `#-/` is a character range (from '#' to '/'), so
    # this also deletes $ % & ( ) * + — confirm that is intended.
    return strip(replace(name, r"[.,'#-/0-9]" => ""))
end

prepareName (generic function with 1 method)

In [6]:
# Look up the gvkey for `fname` in `gvkey_dict_h` (the Hassan name dictionary), using the
# prepared form of the name as key. Best-effort: any failure, including an unknown name,
# yields 0.
function getGVkeyH(fname)
    gvkey = 0
    try
        gvkey = gvkey_dict_h[prepareName(fname)]
    catch
        # keep the default of 0
    end
    return gvkey
end

# Look up the gvkey for `fname` in `gvkey_dict_c` (the Compustat name dictionary), using
# the prepared form of the name as key. Best-effort: any failure, including an unknown
# name, yields 0.
# The dictionary values come from a column that allows `missing`, so a stored `missing`
# is also turned into 0; otherwise it would break the `== 0` checks made on the result.
function getGVkeyC(fname)
    try
        return coalesce(gvkey_dict_c[prepareName(fname)], 0)
    catch
        return 0
    end
end

# Look up the gvkey for `ticker` in `ticker_gvkey_uniq`. Best-effort: any failure,
# including an unknown ticker, yields 0.
# A stored `missing` gvkey is also turned into 0, so callers can always compare the
# result with `== 0`.
function getGVKey_ticker(ticker)
    try
        return coalesce(ticker_gvkey_uniq[ticker], 0)
    catch
        return 0
    end
end

getGVKey_ticker (generic function with 1 method)

In [7]:
"""
    GuesNameTicker!(dfCC)

For each row of `dfCC` whose `gvkey` is 0, try to identify the firm among the `dfCompT`
companies that share the row's ticker (only when more than one company has that ticker).

The prepared row title is compared with each candidate's prepared company name using
`compare(..., Jaro())`; the best candidate is accepted when its score exceeds 0.8. An
accepted guess fills `prob`, `gues_name`, `gvkey` and sets `gues_by_dticker` to 1, and is
stored in `cash_dict` under the prepared title so later rows with the same title reuse it.
"""
function GuesNameTicker!(dfCC)
    global cash_dict
    for row in eachrow(dfCC)
        row.gvkey == 0 || continue
        key = prepareName(row.Title)

        # Reuse an earlier guess: [score, guessed name, gvkey, matched-by-ticker flag].
        if haskey(cash_dict, key)
            cached = cash_dict[key]
            row.prob = cached[1]
            row.gues_name = cached[2]
            row.gvkey = cached[3]
            row.gues_by_dticker = cached[4] == 1 ? 1 : 0
            continue
        end

        # All companies listed under this row's ticker.
        candidates = dfCompT[dfCompT.tickersymbol .== row.ticker, :]
        size(candidates)[1] > 1 || continue

        top_score = 0
        top_name = ""
        top_gvkey = 0
        for candidate in eachrow(candidates)
            score = compare(prepareName(candidate.companyname), key, Jaro())
            if score > top_score
                top_score = score
                top_name = candidate.companyname
                top_gvkey = candidate.gvkey
            end
        end

        top_score > 0.8 || continue
        row.prob = top_score
        row.gues_name = top_name
        row.gvkey = top_gvkey
        row.gues_by_dticker = 1
        cash_dict[key] = [top_score, top_name, top_gvkey, 1]
    end
end

GuesNameTicker! (generic function with 1 method)

In [8]:
"""
    GuesName!(dfCC)

For each row of `dfCC` whose `gvkey` is 0, assign the entry of `firm_names` that is most
similar to the prepared row title according to `compare(..., Jaro())`.

The best match is always taken, whatever its score; the score is written to `prob`, the
matched name to `gues_name`, and the gvkey (from `gvkey_dict_h` when the name is there,
otherwise from `gvkey_dict_c`) to `gvkey`. Results are stored in `cash_dict` under the
prepared title, and a stored result is reused instead of searching again.
"""
function GuesName!(dfCC)
    global cash_dict
    for row in eachrow(dfCC)
        row.gvkey == 0 || continue
        key = prepareName(row.Title)

        # Reuse an earlier guess: [score, guessed name, gvkey, matched-by-ticker flag].
        if haskey(cash_dict, key)
            cached = cash_dict[key]
            row.prob = cached[1]
            row.gues_name = cached[2]
            row.gvkey = cached[3]
            row.gues_by_dticker = cached[4] == 1 ? 1 : 0
            continue
        end

        # Start from the first name (looked up in gvkey_dict_h) and scan the rest.
        top_name = firm_names[1]
        top_gvkey = gvkey_dict_h[top_name]
        top_score = compare(key, top_name, Jaro())
        for candidate in firm_names[2:end]
            score = compare(key, candidate, Jaro())
            score > top_score || continue
            top_score = score
            top_name = candidate
            if haskey(gvkey_dict_h, top_name)
                top_gvkey = gvkey_dict_h[top_name]
            else
                top_gvkey = gvkey_dict_c[top_name]
            end
        end

        row.prob = top_score
        row.gvkey = top_gvkey
        row.gues_name = top_name
        cash_dict[key] = [top_score, top_name, top_gvkey, 0]
    end
end

GuesName! (generic function with 1 method)

In [14]:
"""
    MergeGvkey!(dfCC)

Create the main `gvkey` column of `dfCC` from the three per-source columns.

Every row starts at 0 and receives the first usable value among `gvkey_c`, `gvkey_h`
and `gvkey_t`, in that order. A value is usable when it is neither 0 nor `missing`.
Rows with no usable value keep 0.
"""
function MergeGvkey!(dfCC)
    dfCC[!, :gvkey] .= 0
    for r in eachrow(dfCC)
        for candidate in (r.gvkey_c, r.gvkey_h, r.gvkey_t)
            # treat a missing key like "not found"
            key = coalesce(candidate, 0)
            if key != 0
                r.gvkey = key
                break
            end
        end
    end
end

MergeGvkey! (generic function with 1 method)

In [15]:
"""
    matchFile(filename)

Read `<filename>.csv`, attach a gvkey to every call it lists, and return the result.

The ticker is parsed from `Subtitle`; gvkeys are looked up by ticker, by Hassan name and
by Compustat name and merged with `MergeGvkey!`; rows still without a gvkey are guessed
with `GuesNameTicker!` and `GuesName!`. The table is then inner-joined with
`dfCompGvkeyUniqu` on `gvkey` to add `countryid` and `country`, a `filename` column is
added and the `Call` column is removed.

Returns the resulting `DataFrame`, or `nothing` when any step fails (the failure is
logged with `@warn`).
"""
function matchFile(filename)
    try
        # The `DataFrame` sink is required by `CSV.read`, as in the other reads of this file.
        dfCC = CSV.read("$filename.csv", DataFrame; copycols=true)

        # parse ticker from Subtitle
        dfCC[!, :ticker] = getTicker.(dfCC.Subtitle)
        # gvkeys from the three sources
        dfCC[!, :gvkey_t] .= getGVKey_ticker.(dfCC[:, :ticker])
        dfCC[!, :gvkey_h] = getGVkeyH.(dfCC.Title)
        dfCC[!, :gvkey_c] = getGVkeyC.(dfCC.Title)

        # merge into one main gvkey
        MergeGvkey!(dfCC)

        # defaults for rows matched exactly; the guessing steps overwrite them
        dfCC[!, :prob] .= 1.0
        dfCC[!, :gues_by_dticker] .= 0
        dfCC[!, :gues_name] .= ""

        GuesNameTicker!(dfCC)
        GuesName!(dfCC)

        # Rows with a missing gvkey can never match and are rejected as join keys by
        # recent DataFrames versions, so drop them from the lookup table first.
        countries = dropmissing(dfCompGvkeyUniqu[:, [:gvkey, :countryid, :country]], :gvkey)
        dfCC = innerjoin(dfCC, countries, on=:gvkey)
        dfCC[!, :filename] .= filename
        select!(dfCC, Not(:Call))
        return dfCC
    catch err
        # Keep the best-effort contract (return nothing) but say what went wrong.
        @warn "matchFile failed; returning nothing" filename exception = (err, catch_backtrace())
        return nothing
    end
end

matchFile (generic function with 1 method)

In [16]:
"""
    DoFolder(year)

Match every call file of `year` found in the current directory and write the result.

Each entry whose name starts with `year` and ends in `.csv` is passed, without its
extension, to `matchFile`. The returned tables are appended, sorted by `prob`, and
written to `CC_List<year>.csv`. Files that fail are reported and skipped.

Returns what `CSV.write` returns, or `nothing` when no file contributed a table.
"""
function DoFolder(year)
    dfList = DataFrame()
    prefix = string(year)

    for file in readdir()
        println(file)

        # `startswith` is safe for names shorter than the prefix (slicing `file[1:4]`
        # throws on those); the extension check skips entries that are not CSV files.
        (startswith(file, prefix) && endswith(lowercase(file), ".csv")) || continue
        try
            dfCC = matchFile(first(splitext(file)))
            if dfCC === nothing
                println("$file skipped: matchFile returned nothing")
            else
                append!(dfList, dfCC)
            end
        catch e
            println("$file error", e)
        end
    end

    # Nothing was appended, so there is no `prob` column to sort by.
    if size(dfList, 2) == 0
        println("no matched files for $year; nothing written")
        return nothing
    end

    sort!(dfList, :prob)
    return CSV.write("CC_List$year.csv", dfList)
end

DoFolder (generic function with 1 method)

In [None]:
println("start linking CC to GVKEY")

# Set the working directory: DoFolder lists this folder, reads the call CSV files from it
# and writes its CC_List<year>.csv output there.
# NOTE(review): hard-coded, machine-specific path — adjust before running elsewhere.
cd("C:\\Users\\jasonjia\\Dropbox\\Projects\\ConferenceCall\\Output\\Csv")

In [9]:
# """Prepare dictionaries and variables"""
# Hassan dataset: one row per firm and earnings call; the cells below use its
# `company_name` and `gvkey` columns to build the name => gvkey dictionary.
# NOTE(review): hard-coded, machine-specific path.
dfSV_hassan=CSV.read("C:\\Users\\jasonjia\\Dropbox\\Projects\\ConferenceCall\\Output\\FirmIdentification\\Hassan\\Hassanfile_raw_updated2019030_truncated.csv", DataFrame)

start linking CC to GVKEY


Unnamed: 0_level_0,gvkey,company_name,ticker,hqcountrycode,date,date_earningscall,isin
Unnamed: 0_level_1,Int64,String,String31?,String3?,String7,String15,String15?
1,1004,AAR Corp,AIR,US,2002q1,20-Mar-02,US0003611052
2,1004,AAR Corp,AIR,US,2002q2,27-Jun-02,US0003611052
3,1004,AAR Corp,AIR,US,2002q3,26-Sep-02,US0003611052
4,1004,AAR Corp,AIR,US,2003q3,17-Sep-03,US0003611052
5,1004,AAR Corp,AIR,US,2003q4,18-Dec-03,US0003611052
6,1004,AAR Corp,AIR,US,2004q1,17-Mar-04,US0003611052
7,1004,AAR Corp,AIR,US,2004q2,29-Jun-04,US0003611052
8,1004,AAR Corp,AIR,US,2004q3,22-Sep-04,US0003611052
9,1004,AAR Corp,AIR,US,2004q4,17-Dec-04,US0003611052
10,1004,AAR Corp,AIR,US,2005q1,16-Mar-05,US0003611052


In [10]:
# Compustat Capital IQ company list (company name, country, gvkey; many rows have no gvkey).
# NOTE(review): hard-coded, machine-specific path.
dfComp=CSV.read("C:\\Users\\jasonjia\\Dropbox\\Projects\\ConferenceCall\\Output\\FirmIdentification\\compustat_csv\\ciqcompany_mergedwithgvkeyandcountry_andnaivetickers.csv",copycols=true, DataFrame)
# copycols=true makes CSV.read copy the parsed columns, so the DataFrame can be edited and isn't read-only.

Unnamed: 0_level_0,companyid,companyname,countryid,gvkey,country
Unnamed: 0_level_1,Float64,String?,Float64?,Float64?,String63?
1,18493.0,DC Venture Partners,213.0,missing,United States
2,18495.0,13i Capital Corporation,213.0,missing,United States
3,18499.0,21 International Holdings Inc.,213.0,missing,United States
4,18501.0,21 Invest Sgr S.p.A.,99.0,missing,Italy
5,18505.0,Palo Alto Venture Partners,213.0,missing,United States
6,18507.0,2M Invest A/S,55.0,235716.0,Denmark
7,18509.0,2nd Generation Capital LLC,213.0,missing,United States
8,18511.0,3i Group plc,212.0,210835.0,United Kingdom
9,18513.0,4C Ventures,213.0,missing,United States
10,18515.0,"Pappas Capital, LLC",213.0,missing,United States


In [11]:
# Display only: `dropmissing` returns a new DataFrame and the result is not assigned, so
# `dfComp` itself still contains the rows with missing values.
dropmissing(dfComp) # Most firms in dfComp are actually missing a gvkey.

Unnamed: 0_level_0,companyid,companyname,countryid,gvkey
Unnamed: 0_level_1,Float64,String,Float64,Float64
1,18507.0,2M Invest A/S,55.0,235716.0
2,18511.0,3i Group plc,212.0,210835.0
3,18527.0,ABB Ltd,195.0,210418.0
4,18671.0,Albemarle Corporation,213.0,29751.0
5,18711.0,The Allstate Corporation,213.0,28349.0
6,18729.0,"The Alpine Group, Inc.",213.0,1331.0
7,18749.0,"Amazon.com, Inc.",213.0,64768.0
8,18759.0,"American Capital, Ltd.",213.0,65345.0
9,18833.0,"Apollo Global Management, Inc.",213.0,184254.0
10,18921.0,AT&T Corp.,213.0,1581.0


In [12]:
# Keep the first row for each distinct gvkey. `missing` counts as one more distinct value,
# so a single row with a missing gvkey is retained (row 1 of the output below).
dfCompGvkeyUniqu=unique(dfComp,:gvkey)

Unnamed: 0_level_0,companyid,companyname,countryid,gvkey
Unnamed: 0_level_1,Float64,String?,Float64?,Float64?
1,18493.0,DC Venture Partners,213.0,missing
2,18507.0,2M Invest A/S,55.0,235716.0
3,18511.0,3i Group plc,212.0,210835.0
4,18527.0,ABB Ltd,195.0,210418.0
5,18671.0,Albemarle Corporation,213.0,29751.0
6,18711.0,The Allstate Corporation,213.0,28349.0
7,18729.0,"The Alpine Group, Inc.",213.0,1331.0
8,18749.0,"Amazon.com, Inc.",213.0,64768.0
9,18759.0,"American Capital, Ltd.",213.0,65345.0
10,18833.0,"Apollo Global Management, Inc.",213.0,184254.0


In [15]:
# Companies that have a ticker: rows whose `ticker` is missing are dropped.
# NOTE(review): the later cells and GuesNameTicker! read a column named `tickersymbol`,
# not `ticker` — confirm which column name the CSV actually provides.
dfCompT=dropmissing(dfComp,:ticker) #drop companies without tickers

Unnamed: 0_level_0,companyid,companyname,countryid,gvkey
Unnamed: 0_level_1,Float64,String?,Float64?,Float64?
1,18507.0,2M Invest A/S,55.0,235716.0
2,18511.0,3i Group plc,212.0,210835.0
3,18527.0,ABB Ltd,195.0,210418.0
4,18671.0,Albemarle Corporation,213.0,29751.0
5,18711.0,The Allstate Corporation,213.0,28349.0
6,18729.0,"The Alpine Group, Inc.",213.0,1331.0
7,18749.0,"Amazon.com, Inc.",213.0,64768.0
8,18759.0,"American Capital, Ltd.",213.0,65345.0
9,18833.0,"Apollo Global Management, Inc.",213.0,184254.0
10,18921.0,AT&T Corp.,213.0,1581.0


In [21]:
# Prepared (normalised) Hassan company name => gvkey. When several rows normalise to the
# same name, the entry from the last such row is the one kept.
gvkey_dict_h = Dict(prepareName(row.company_name) => row.gvkey  for row in eachrow(dfSV_hassan))

Dict{SubString{String}, Int64} with 13123 entries:
  "clear secure"                   => 38954
  "supercom"                       => 177058
  "us bancorp"                     => 4723
  "realty income"                  => 30822
  "cyries energy"                  => 160814
  "acme packet"                    => 175111
  "bluelinx"                       => 161813
  "nordic mining asa"              => 289983
  "goodman sub   australia"        => 203030
  "coca cola hbc"                  => 221261
  "wpp"                            => 14605
  "merus   international"          => 170359
  "oceania healthcare"             => 324125
  "global partners lp"             => 163935
  "ado properties"                 => 319938
  "canal"                          => 2982
  "burckhardt compression holding" => 278299
  "pdf solutions"                  => 144437
  "hoegh lng partners lp"          => 21048
  "tim participacoes"              => 222638
  "trade me"                       => 311319
  "matrix co

In [None]:
# Prepared (normalised) Compustat company name => gvkey.
# Rows without a company name or without a gvkey are left out: they cannot identify a
# firm, a `missing` name cannot be prepared, and a `missing` gvkey stored here would
# later overwrite a valid entry for the same prepared name or leak into the matching.
gvkey_dict_c = Dict(
    prepareName(row.companyname) => row.gvkey
    for row in eachrow(dropmissing(dfComp, [:companyname, :gvkey]))
)

In [None]:
# Distinct values of the `tickersymbol` column of dfCompT.
tickers=unique(dfCompT.tickersymbol) # unique tickers

In [None]:
# ticker => gvkey, filled by the loop in the next cell with the tickers that occur on
# exactly one row of dfCompT; read by getGVKey_ticker.
ticker_gvkey_uniq=Dict()

In [None]:
# Fill ticker_gvkey_uniq with the tickers that occur on exactly one row of dfCompT.
# Counting occurrences first makes this a single pass over the table instead of one
# full-table filter per ticker.
ticker_counts = Dict{Any,Int}()
for t in dfCompT.tickersymbol
    ticker_counts[t] = get(ticker_counts, t, 0) + 1
end
for row in eachrow(dfCompT)
    ticker_counts[row.tickersymbol] == 1 || continue
    # a ticker whose only row has no gvkey identifies nothing; lookups then yield 0
    ismissing(row.gvkey) && continue
    ticker_gvkey_uniq[row.tickersymbol] = row.gvkey
end

In [None]:
# All prepared firm names known to either dictionary: Hassan names first, then Compustat
# names (a name present in both appears twice). GuesName! scans this list.
firm_names_h = collect(keys(gvkey_dict_h))
firm_names_c = collect(keys(gvkey_dict_c))
firm_names = vcat(firm_names_h, firm_names_c)


In [None]:
# Cache of firm names that were already guessed, shared by GuesNameTicker! and GuesName!:
# prepared title => [score, guessed name, gvkey, matched-by-ticker flag (1 or 0)].
global cash_dict = Dict() # "cash" = cache of already matched firm names
# """ ************************************************************************ """

In [7]:
# """ main code """
# cd("C:\\CC2010")
# Process each year once; the loop variable is what gets passed to DoFolder
# (previously every iteration re-ran the year 2010).
for year in 2001:2010
    @time DoFolder(year)
end

# @time dfList=DoFolder()
# sort!(dfList,:prob)
# CSV.write("CC_List.csv",dfList)
# """ END ""

LoadError: UndefVarError: DoFolder not defined