In [1]:
!pip install -qU awswrangler name_matching ray swifter

In [8]:
import awswrangler as wr
import pandas as pd
import swifter
import re
import unidecode
import numpy as np
import os
import glob

# Financials

In [2]:
sql = """SELECT bvdid, closing_date, total_assets, capital, number_of_employees, operating_revenue_turnover, 
sales, gross_profit, added_value, research_and_development_expenses FROM global_financials_and_ratios"""
financials = wr.athena.read_sql_query(sql, database="orbiskof2023")
financials

Unnamed: 0,bvdid,closing_date,total_assets,capital,number_of_employees,operating_revenue_turnover,sales,gross_profit,added_value,research_and_development_expenses
0,AU60820328151,2015,,,,3.0,730600.0,,,
1,AU60820328151,2014,,,,3.0,820200.0,,,
2,AU60820328151,2013,,,,3.0,,,,
3,AU608203327,2015,,,,3.0,,,,
4,AU608203336,2015,,,,3.0,,,,
...,...,...,...,...,...,...,...,...,...,...
602960124,TH0105535169128,20161231,1368.0,242613.0,,,113227.0,,,
602960125,TH0105535169128,20151231,887.0,230461.0,,,109370.0,,13467.0,
602960126,TH0105535169128,20141231,2882.0,237569.0,,,121257.0,,20478.0,
602960127,TH0105535169128,20131231,3017.0,218172.0,,,117025.0,,15969.0,


In [3]:
financials.memory_usage(index=False, deep=True)/1e9

bvdid                                41.454362
closing_date                          3.014801
total_assets                          2.411841
capital                               2.411841
number_of_employees                   2.411841
operating_revenue_turnover            2.411841
sales                                 2.411841
gross_profit                          2.411841
added_value                           2.411841
research_and_development_expenses     2.411841
dtype: float64

In [6]:
financials.isna().sum()/len(financials)*100

bvdid                                 0.000000
closing_date                          0.000000
total_assets                         59.168658
capital                              45.562211
number_of_employees                  99.788665
operating_revenue_turnover           43.169468
sales                                39.406028
gross_profit                         87.838708
added_value                          79.421020
research_and_development_expenses    85.127186
dtype: float64

In [12]:
df = financials[(financials.number_of_employees >= 100) | (financials.operating_revenue_turnover > 10000) | (financials.sales > 10000000)]

In [None]:
financials

# Key financials

In [3]:
sql = """SELECT bvdid, closing_date, total_assets, number_of_employees, operating_revenue_turnover, market_capitalisation_mil FROM key_financials"""
key_financials = wr.athena.read_sql_query(sql, database="orbiskof2023")
key_financials

Unnamed: 0,bvdid,closing_date,total_assets,number_of_employees,operating_revenue_turnover,market_capitalisation_mil
0,UA42012013,20181231,3525,,,
1,UA40918617,20201231,22416,2,26752,
2,UA40918617,20191231,84,2,,
3,UA40918617,20181231,8765,20,,
4,UA40918617,20171231,34987,15,48701,
...,...,...,...,...,...,...
467951975,NL69724687,20201231,18538890,,,
467951976,NL69724687,20191231,17496394,,,
467951977,NL69724687,20181231,20433913,,,
467951978,NL69724687,20171231,179895,,,


In [4]:
key_financials.memory_usage(index=False, deep=True)/1e9

bvdid                         32.274981
closing_date                   2.339760
total_assets                   4.211568
number_of_employees            2.339760
operating_revenue_turnover     4.211568
market_capitalisation_mil      2.339760
dtype: float64

In [17]:
key_financials.isna().sum()/len(key_financials)*100

bvdid                          0.000000
closing_date                   0.000000
total_assets                  43.599474
number_of_employees           34.141406
operating_revenue_turnover    31.734922
market_capitalisation_mil     99.822700
dtype: float64

In [5]:
key_financials.operating_revenue_turnover.describe()

count           319447784.0
mean         19596354.57702
std      79921933355.030823
min        -1685702883376.0
25%                 22445.0
50%                177717.0
75%                719312.0
max      1251616149443280.0
Name: operating_revenue_turnover, dtype: Float64

In [18]:
key_financials.market_capitalisation_mil.describe()

count       829679.0
mean     1698.025295
std      13813.97961
min              0.0
25%             32.0
50%            129.0
75%            577.0
max        2417523.0
Name: market_capitalisation_mil, dtype: Float64

In [7]:
key_financials[key_financials.number_of_employees >= 100].bvdid.nunique()

1797476

In [12]:
key_financials[(key_financials.operating_revenue_turnover > 10000000)].bvdid.nunique()

3641800

In [6]:
df = key_financials[(key_financials.number_of_employees >= 100) | (key_financials.operating_revenue_turnover > 10000000) | (key_financials.market_capitalisation_mil > 0)]
df = df.sort_values(['bvdid', 'closing_date'], ascending=True).copy(deep=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values('operating_revenue_turnover', inplace=True)


In [16]:
df = df.sort_values(['bvdid', 'closing_date'], ascending=True).copy(deep=True)

In [18]:
df.bvdid.nunique()

4495886

In [14]:
df.sort_values('number_of_employees', ascending=False).head(30)

Unnamed: 0,bvdid,closing_date,total_assets,number_of_employees,operating_revenue_turnover,market_capitalisation_mil
344438669,US142591828L,2021,,2700000,21313000000,
344446988,US710415188,20160131,199581000000.0,2300000,482130000000,212477.0
344446987,US710415188,20170131,198825000000.0,2300000,485873000000,205104.0
344446986,US710415188,20180131,204522000000.0,2300000,500343000000,318435.0
344446982,US710415188,20220131,244860000000.0,2300000,572754000000,387815.0
344446983,US710415188,20210131,252496000000.0,2300000,559151000000,397486.0
344446985,US710415188,20190131,219295000000.0,2200000,514405000000,278411.0
344446992,US710415188,20120131,193406000000.0,2200000,446509000000,210139.0
344446991,US710415188,20130131,203105000000.0,2200000,468651000000,233999.0
344446990,US710415188,20140131,204751000000.0,2200000,476294000000,241647.0


In [19]:
df.memory_usage(index=False, deep=True)/1e9

bvdid                         1.006917
closing_date                  0.073186
total_assets                  0.131734
number_of_employees           0.073186
operating_revenue_turnover    0.131734
market_capitalisation_mil     0.073186
dtype: float64

In [20]:
del key_financials

In [None]:
sql = """SELECT bvdid, closing_date, total_assets, number_of_employees, operating_revenue_turnover, market_capitalisation_mil FROM key_financials"""
key_financials = wr.athena.read_sql_query(sql, database="orbiskof2023")
key_financials

In [None]:
    SELECT *
    FROM firms_100_employees_or_10_mil_turnover_any_year
    INNER JOIN 
    (SELECT bvdid, closing_date, total_assets, capital, number_of_employees, operating_revenue_turnover, sales, gross_profit, research_and_development_expenses
    FROM "orbiskof2023"."global_financials_and_ratios")
    USING (bvdid)
    INNER JOIN
    (SELECT bvdid, name_internat, name_native, postcode, city, region_in_country, country_iso_code, website_address
    FROM "orbiskof2023"."contact_info")
    USING (bvdid)

# Firms w/ min. 100 employees or 10 mil turnover

In [None]:
sql = """CREATE TABLE firms_100_employees_or_10_mil_turnover_any_year
        WITH (
            format = 'Parquet',
            write_compression = 'SNAPPY',
            external_location = 's3://orbis-kof-2023/firms_100_employees_or_10_mil_turnover_any_year/'
        )
        AS (
            SELECT DISTINCT(bvdid)
            FROM key_financials
            WHERE number_of_employees >= 100
            OR operating_revenue_turnover >= 10000000
            OR market_capitalisation_mil > 0
        );"""

In [4]:
sql = """SELECT *
    FROM firms_100_employees_or_10_mil_turnover_any_year
    INNER JOIN 
    (SELECT bvdid, closing_date, number_of_employees, operating_revenue_turnover
    FROM "orbiskof2023"."key_financials")
    USING (bvdid)
    LEFT JOIN 
    (SELECT bvdid, closing_date, total_assets, added_value, research_and_development_expenses
    FROM "orbiskof2023"."global_financials_and_ratios")
    USING (bvdid, closing_date)
    INNER JOIN
    (SELECT bvdid, name_internat, name_native, postcode, city, region_in_country, country_iso_code, website_address
    FROM "orbiskof2023"."contact_info")
    USING (bvdid)
    INNER JOIN 
    (SELECT *
    FROM "orbiskof2023"."industry_classifications_small")
    USING (bvdid)
    """
df = wr.athena.read_sql_query(sql, database="orbiskof2023")
df

Unnamed: 0,bvdid,closing_date,number_of_employees,operating_revenue_turnover,total_assets,added_value,research_and_development_expenses,name_internat,name_native,postcode,city,region_in_country,country_iso_code,website_address,ussic_primary_code,nace_rev_2_core_code_4_digits
0,MY654509-A,20161231,,21120375,,,,Jumbo Arena Sdn. Bhd.,Jumbo Arena Sdn. Bhd.,83000,"Batu Pahat, Johor",Johor,MY,www.isuzu3s-sales.com,5511,
1,MY654509-A,20161231,,21120375,,,,Jumbo Arena Sdn. Bhd.,Jumbo Arena Sdn. Bhd.,83000,"Batu Pahat, Johor",Johor,MY,www.isuzu3s-sales.com,5012,4511
2,MY654509-A,20171231,,24974889,158267.0,466204.0,,Jumbo Arena Sdn. Bhd.,Jumbo Arena Sdn. Bhd.,83000,"Batu Pahat, Johor",Johor,MY,www.isuzu3s-sales.com,5511,
3,MY654509-A,20171231,,24974889,158267.0,466204.0,,Jumbo Arena Sdn. Bhd.,Jumbo Arena Sdn. Bhd.,83000,"Batu Pahat, Johor",Johor,MY,www.isuzu3s-sales.com,5012,4511
4,MY654509-A,20171231,,24974889,,,,Jumbo Arena Sdn. Bhd.,Jumbo Arena Sdn. Bhd.,83000,"Batu Pahat, Johor",Johor,MY,www.isuzu3s-sales.com,5511,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75991421,RU76993305,20121231,132,2832675,178483.0,,,Limited Liability Company Chukotzhilservis-Ugo...,Чукотжилсервис-Угольные Копи,689501,Pgt Ugolnye Kopi,Far Eastern federal region|Chukotka Autonomous...,RU,,,
75991422,RU76993305,20121231,132,2832675,178483.0,,,Limited Liability Company Chukotzhilservis-Ugo...,Чукотжилсервис-Угольные Копи,689501,Pgt Ugolnye Kopi,Far Eastern federal region|Chukotka Autonomous...,RU,,,
75991423,RU76993305,20121231,132,2832675,178483.0,,,Limited Liability Company Chukotzhilservis-Ugo...,Чукотжилсервис-Угольные Копи,689501,Pgt Ugolnye Kopi,Far Eastern federal region|Chukotka Autonomous...,RU,,,
75991424,RU76993305,20121231,132,2832675,178483.0,,,Limited Liability Company Chukotzhilservis-Ugo...,Чукотжилсервис-Угольные Копи,689501,Pgt Ugolnye Kopi,Far Eastern federal region|Chukotka Autonomous...,RU,,,


In [15]:
df['count_na'] = df.isnull().sum(axis=1)
df.sort_values(['bvdid', 'closing_date', 'number_of_employees', 'operating_revenue_turnover', 
                'research_and_development_expenses', 'count_na'], 
               ascending=[True, True, False, False, False, False], 
               inplace=True)
df.drop(columns='count_na', inplace=True)
df.drop_duplicates(subset=['bvdid', 'closing_date'], inplace=True)
df

Unnamed: 0,bvdid,closing_date,number_of_employees,operating_revenue_turnover,total_assets,added_value,research_and_development_expenses,name_internat,name_native,postcode,city,region_in_country,country_iso_code,website_address,ussic_primary_code,nace_rev_2_core_code_4_digits
41708417,ADFEB18424,20051231,,154719400,,,,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619
41708418,ADFEB18424,20061231,,196295277,,,,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619
41708419,ADFEB18424,20071231,,224536363,,,,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619
41708420,ADFEB18424,20081231,,163303389,,,,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619
41708421,ADFEB18424,20091231,,196551196,,,,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30539797,ZWFEI1025337,20161231,,13841000,,,,Grand Reinsurance (Private) Ltd,Grand Reinsurance (Private) Ltd,,Harare,Harare,ZW,www.grandre.co.zw,,
30539802,ZWFEI1025337,20171231,,15267000,,,,Grand Reinsurance (Private) Ltd,Grand Reinsurance (Private) Ltd,,Harare,Harare,ZW,www.grandre.co.zw,,
30539807,ZWFEI1025337,20181231,,27,,,,Grand Reinsurance (Private) Ltd,Grand Reinsurance (Private) Ltd,,Harare,Harare,ZW,www.grandre.co.zw,,
30539812,ZWFEI1025337,20191231,,247,,,,Grand Reinsurance (Private) Ltd,Grand Reinsurance (Private) Ltd,,Harare,Harare,ZW,www.grandre.co.zw,,


In [18]:
df.bvdid.nunique()

4062446

In [16]:
df.name_internat.nunique()

3892412

In [17]:
df.closing_date.apply(str).str[:4].value_counts().sort_index()

closing_date
1935          1
1936          1
1968          8
1970          3
1972          1
1973          2
1974          1
1975          1
1976          2
1977          1
1979          1
1980          9
1981        183
1982        639
1983       1674
1984       3073
1985       3688
1986       4607
1987       5999
1988       7786
1989       9099
1990      13635
1991      18770
1992      24563
1993      31977
1994      37134
1995      45489
1996      56203
1997      67445
1998      80912
1999      98536
2000     116555
2001     143325
2002     171638
2003     195951
2004     212434
2005     235395
2006     423676
2007     474597
2008     533562
2009     550444
2010     580958
2011     734620
2012    1187073
2013    1381345
2014    1461614
2015    1380680
2016    1349141
2017    1382679
2018    1380776
2019    1342563
2020    1463894
2021    2062085
2022     144972
Name: count, dtype: int64

In [72]:
df.dtypes

bvdid                                 object
closing_date                           Int32
number_of_employees                    Int32
operating_revenue_turnover             Int64
total_assets                         float32
added_value                          float32
research_and_development_expenses    float32
name_internat                         object
name_native                           object
postcode                              object
city                                  object
region_in_country                     object
country_iso_code                      object
website_address                       object
ussic_primary_code                    object
nace_rev_2_core_code_4_digits         object
dtype: object

In [13]:
wr.s3.to_parquet(df, path="""s3://orbis-kof-2023/firms_100_employees_or_10_mil_turnover_any_year_full/
                             firms_100_employees_or_10_mil_turnover_any_year_full.parquet""")

{'paths': ['s3://orbis-kof-2023/firms_100_employees_or_10_mil_turnover_any_year_full/\n                             firms_100_employees_or_10_mil_turnover_any_year_full.parquet'],
 'partitions_values': {}}

# Read orbis (firms w/ min. 100 employees or 10 mil turnover)

In [56]:
import pandas as pd
import awswrangler as wr

In [57]:
df = wr.s3.read_parquet('s3://orbis-kof-2023/firms_100_employees_or_10_mil_turnover_any_year_full/')

In [58]:
for col, dtype in df.dtypes.items():
    if dtype=='string':
        df[col] = df[col].astype(str)

In [59]:
df['cleaned_name'] = df.name_internat.swifter.apply(firm_name_clean)

Pandas Apply:   0%|          | 0/19421420 [00:00<?, ?it/s]

# Read compustat

In [60]:
def firm_name_clean(firm_name, lower=True, remove_punc=True, remove_legal=True, remove_parentheses=True):
    # make string
    firm_name = str(firm_name)
    firm_name = unidecode.unidecode(firm_name)
    # lowercase
    if lower:
        firm_name = firm_name.lower()
    # remove punctuation
    if remove_punc:
        firm_name = firm_name.translate(str.maketrans('', '', '!"#$%\\\'*+,./:;<=>?@^_`{|}~'))
    # remove legal identifiers
    if remove_legal:
        legal_identifiers = ["co", "inc", "ag", "ltd", "lp", "llc", "pllc", "llp", "plc", "ltdplc", "corp",
                             "corporation", "ab", "cos", "cia", "sa", "company", "companies", "consolidated",
                             "stores", "limited", "srl", "kk", "gmbh", "pty", "group", "yk", "bhd",
                             "limitada", "holdings", "kg", "bv", "pte", "sas", "ilp", "nl", "genossenschaft",
                             "gesellschaft", "aktiengesellschaft", "ltda", "nv", "oao", "holding", "se",
                             "oy", "plcnv", "the", "neft", "& co", "&co", "(u.k.)", "uk", "south africa",
                             "vietnman", "S/a", "& co"]
        pattern = '|'.join(legal_identifiers)
        pattern = '\\b(' + pattern + ')\\b'  # match only word boundaries
        firm_name = re.sub(pattern, '', firm_name)
    # remove parentheses and anything in them: Bayerische Motoren Werke (BMW) -> Bayerische Motoren Werke
    if remove_parentheses:
        firm_name = re.sub(r'\([^()]*\)', '', firm_name)

    # make hyphens consistent
    firm_name = firm_name.replace(' - ', '-')

    # remove ampersand symbol
    firm_name = firm_name.replace('&amp;', '&')
    firm_name = firm_name.replace('&amp', '&')

    # strip
    firm_name = firm_name.strip()

    return firm_name


def firm_name_matching(df, df_lookup, firm_name_col='company', clean_lookup=True):
    assert not df[firm_name_col].duplicated().any(), 'Firm names to match contain duplicates!'
    assert not df_lookup[firm_name_col].duplicated().any(), 'Lookup firm list contains duplicates!'

    df['match_col'] = df[firm_name_col].apply(firm_name_clean)
    if clean_lookup:
        df_lookup['match_col'] = df_lookup[firm_name_col].apply(firm_name_clean)
    else:
        df_lookup['match_col'] = df_lookup[firm_name_col]

    res = df.merge(df_lookup, on='match_col', how='left', indicator=True)
    print(f'Matched {(res._merge == "both").sum()/len(df)*100} percent of companies')

    # res = res.drop(columns=['_merge', 'match_col'])

    return res

In [61]:
compustat_na = pd.read_csv('compustat_na_lexis.csv')
compustat_global = pd.read_csv('compustat_global_lexis.csv')
compustat = pd.concat([compustat_global, compustat_na]).drop_duplicates(subset=['gvkey', 'fyear'])
compustat

Unnamed: 0,gvkey,fyear,datadate,at,emp,revt,sale,xrd,conm,addzip,...,naics,sic,weburl,indfmt,consol,popsrc,datafmt,tic,curcd,costat
0,1166,1996.0,1996-12-31,611.980,4.140,658.238,658.238,57.599,ASM INTERNATIONAL NV,1322 AP,...,333242.0,3559.0,www.asm.com,,,,,,,
1,1166,1997.0,1997-12-31,724.115,4.514,708.673,708.673,85.944,ASM INTERNATIONAL NV,1322 AP,...,333242.0,3559.0,www.asm.com,,,,,,,
2,1166,1998.0,1998-12-31,623.539,4.436,634.913,634.913,79.945,ASM INTERNATIONAL NV,1322 AP,...,333242.0,3559.0,www.asm.com,,,,,,,
3,1166,1999.0,1999-12-31,425.035,5.426,414.495,414.495,47.145,ASM INTERNATIONAL NV,1322 AP,...,333242.0,3559.0,www.asm.com,,,,,,,
4,1166,2000.0,2000-12-31,777.940,6.491,935.212,935.212,73.800,ASM INTERNATIONAL NV,1322 AP,...,333242.0,3559.0,www.asm.com,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605298,351491,2019.0,2019-12-31,17847.000,32.525,13408.000,13408.000,521.000,IVECO GROUP N V,10156,...,336120.0,3711.0,www.ivecogroup.com,INDL,C,D,STD,IVCGF,USD,A
605299,351491,2020.0,2020-12-31,18841.000,32.632,12549.000,12549.000,526.000,IVECO GROUP N V,10156,...,336120.0,3711.0,www.ivecogroup.com,INDL,C,D,STD,IVCGF,USD,A
605300,351491,2021.0,2021-12-31,18834.114,34.132,14388.308,14388.308,547.054,IVECO GROUP N V,10156,...,336120.0,3711.0,www.ivecogroup.com,INDL,C,D,STD,IVCGF,USD,A
605301,351491,2022.0,2022-12-31,17113.769,35.611,15343.932,15343.932,505.515,IVECO GROUP N V,10156,...,336120.0,3711.0,www.ivecogroup.com,INDL,C,D,STD,IVCGF,USD,A


In [62]:
compustat.columns

Index(['gvkey', 'fyear', 'datadate', 'at', 'emp', 'revt', 'sale', 'xrd',
       'conm', 'addzip', 'city', 'conml', 'loc', 'naics', 'sic', 'weburl',
       'indfmt', 'consol', 'popsrc', 'datafmt', 'tic', 'curcd', 'costat'],
      dtype='object')

In [63]:
compustat_renamed = compustat[['gvkey', 'fyear', 'emp', 'revt', 'at', 'xrd', 'conm', 'addzip', 
                              'city', 'loc', 'weburl', 'sic', 'naics']].copy()
compustat_renamed.columns = ['bvdid', 'closing_date', 'number_of_employees',
       'operating_revenue_turnover', 'total_assets', 'research_and_development_expenses', 'name_internat',
       'postcode', 'city', 'country_iso_code',
       'website_address', 'ussic_primary_code',
       'naics_primary_code']
compustat_renamed['from_compustat'] = 1
compustat_renamed['cleaned_name'] = compustat_renamed.name_internat.swifter.apply(firm_name_clean)
compustat_renamed['number_of_employees'] = compustat_renamed.number_of_employees * 1e3
compustat_renamed['total_assets'] = compustat_renamed.total_assets * 1e6
compustat_renamed['operating_revenue_turnover'] = compustat_renamed.operating_revenue_turnover * 1e6
compustat_renamed['research_and_development_expenses'] = compustat_renamed.research_and_development_expenses * 1e6



Pandas Apply:   0%|          | 0/1411998 [00:00<?, ?it/s]

In [64]:
orbis_merged_compustat = pd.concat([df, compustat_renamed])
orbis_merged_compustat['from_compustat'] = orbis_merged_compustat.from_compustat.fillna(0)
orbis_merged_compustat['bvdid'] = orbis_merged_compustat.bvdid.astype(str)
orbis_merged_compustat

Unnamed: 0,bvdid,closing_date,number_of_employees,operating_revenue_turnover,total_assets,added_value,research_and_development_expenses,name_internat,name_native,postcode,city,region_in_country,country_iso_code,website_address,ussic_primary_code,nace_rev_2_core_code_4_digits,cleaned_name,naics_primary_code,from_compustat
0,ADFEB18424,20051231.0,,154719400.0,,,,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619,andorra banc agricol reig -andbank,,0.0
1,ADFEB18424,20061231.0,,196295277.0,,,,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619,andorra banc agricol reig -andbank,,0.0
2,ADFEB18424,20071231.0,,224536363.0,,,,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619,andorra banc agricol reig -andbank,,0.0
3,ADFEB18424,20081231.0,,163303389.0,,,,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619,andorra banc agricol reig -andbank,,0.0
4,ADFEB18424,20091231.0,,196551196.0,,,,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619,andorra banc agricol reig -andbank,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605298,351491,2019.0,32525.0,13408000000.0,1.784700e+10,,521000000.0,IVECO GROUP N V,,10156,Turin,,ITA,www.ivecogroup.com,3711.0,,iveco n v,336120.0,1.0
605299,351491,2020.0,32632.0,12549000000.0,1.884100e+10,,526000000.0,IVECO GROUP N V,,10156,Turin,,ITA,www.ivecogroup.com,3711.0,,iveco n v,336120.0,1.0
605300,351491,2021.0,34132.0,14388308000.0,1.883411e+10,,547054000.0,IVECO GROUP N V,,10156,Turin,,ITA,www.ivecogroup.com,3711.0,,iveco n v,336120.0,1.0
605301,351491,2022.0,35611.0,15343932000.0,1.711377e+10,,505515000.0,IVECO GROUP N V,,10156,Turin,,ITA,www.ivecogroup.com,3711.0,,iveco n v,336120.0,1.0


In [65]:
orbis_merged_compustat.columns

Index(['bvdid', 'closing_date', 'number_of_employees',
       'operating_revenue_turnover', 'total_assets', 'added_value',
       'research_and_development_expenses', 'name_internat', 'name_native',
       'postcode', 'city', 'region_in_country', 'country_iso_code',
       'website_address', 'ussic_primary_code',
       'nace_rev_2_core_code_4_digits', 'cleaned_name', 'naics_primary_code',
       'from_compustat'],
      dtype='object')

In [66]:
df_static = orbis_merged_compustat[['bvdid', 'name_internat', 'name_native',
       'postcode', 'city', 'region_in_country', 'country_iso_code',
       'website_address', 'ussic_primary_code',
       'nace_rev_2_core_code_4_digits', 'cleaned_name', 'from_compustat']]
df_static

Unnamed: 0,bvdid,name_internat,name_native,postcode,city,region_in_country,country_iso_code,website_address,ussic_primary_code,nace_rev_2_core_code_4_digits,cleaned_name,from_compustat
0,ADFEB18424,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619,andorra banc agricol reig -andbank,0.0
1,ADFEB18424,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619,andorra banc agricol reig -andbank,0.0
2,ADFEB18424,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619,andorra banc agricol reig -andbank,0.0
3,ADFEB18424,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619,andorra banc agricol reig -andbank,0.0
4,ADFEB18424,Andorra Banc Agricol Reig SA - AndBank,Andorra Banc Agricol Reig SA - AndBank,,Escaldes-Engordany,,AD,www.andbank.com,6282,6619,andorra banc agricol reig -andbank,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
605298,351491,IVECO GROUP N V,,10156,Turin,,ITA,www.ivecogroup.com,3711.0,,iveco n v,1.0
605299,351491,IVECO GROUP N V,,10156,Turin,,ITA,www.ivecogroup.com,3711.0,,iveco n v,1.0
605300,351491,IVECO GROUP N V,,10156,Turin,,ITA,www.ivecogroup.com,3711.0,,iveco n v,1.0
605301,351491,IVECO GROUP N V,,10156,Turin,,ITA,www.ivecogroup.com,3711.0,,iveco n v,1.0


In [67]:
df_dynamic = orbis_merged_compustat[['bvdid', 'closing_date', 'number_of_employees',
       'operating_revenue_turnover', 'total_assets', 'added_value',
       'research_and_development_expenses']]
df_dynamic

Unnamed: 0,bvdid,closing_date,number_of_employees,operating_revenue_turnover,total_assets,added_value,research_and_development_expenses
0,ADFEB18424,20051231.0,,154719400.0,,,
1,ADFEB18424,20061231.0,,196295277.0,,,
2,ADFEB18424,20071231.0,,224536363.0,,,
3,ADFEB18424,20081231.0,,163303389.0,,,
4,ADFEB18424,20091231.0,,196551196.0,,,
...,...,...,...,...,...,...,...
605298,351491,2019.0,32525.0,13408000000.0,1.784700e+10,,521000000.0
605299,351491,2020.0,32632.0,12549000000.0,1.884100e+10,,526000000.0
605300,351491,2021.0,34132.0,14388308000.0,1.883411e+10,,547054000.0
605301,351491,2022.0,35611.0,15343932000.0,1.711377e+10,,505515000.0


In [68]:
df_static = df_static.set_index('bvdid').sort_index()

In [69]:
max_per_bvdid_all_years = df_dynamic.groupby('bvdid')[['number_of_employees', 'operating_revenue_turnover']].max()
max_per_bvdid_all_years.columns = ['max_number_of_employees', 'max_turnover']
df_static = df_static.merge(max_per_bvdid_all_years, left_index=True, right_index=True)

In [70]:
df_static = df_static.sort_values(['from_compustat', 'max_number_of_employees', 'max_turnover'], ascending=False)
df_static = df_static.reset_index().drop_duplicates(subset=['cleaned_name'], keep='first')
df_static

Unnamed: 0,bvdid,name_internat,name_native,postcode,city,region_in_country,country_iso_code,website_address,ussic_primary_code,nace_rev_2_core_code_4_digits,cleaned_name,from_compustat,max_number_of_employees,max_turnover
0,210637,UCOM-UNITED COMMUNICATON IND,,10900,Bangkok,,THA,,4810.0,,ucom-united communicaton ind,1.0,4855000.0,36815892000.0
10,11259,WALMART INC,,72716,Bentonville,,USA,www.stock.walmart.com,5331.0,,walmart,1.0,2300000.0,608481000000.0
63,64768,AMAZON.COM INC,,98109-5210,Seattle,,USA,www.aboutamazon.com,5961.0,,amazoncom,1.0,1608000.0,513983000000.0
91,61994,UNITED STATES POSTAL SERVICE,,20260-0546,Washington,,USA,www.usps.gov,4210.0,,united states postal service,1.0,905766.0,78620000000.0
120,203177,HON HAI PRECISION IND CO LTD,,236,New Taipei City,,TWN,www.foxconn.com,3674.0,,hon hai precision ind,1.0,878429.0,6626996750000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20833362,US69204NU,Global X FDS Autonomous and Electric Vehicles ETF,Global X FDS Autonomous and Electric Vehicles ETF,10022,New York,New York,US,www.globalxfunds.com,6722,6420,global x fds autonomous and electric vehicles etf,0.0,,487000.0
20833366,MN40136OI,Bayan Aldar Joint Stock Company,Bayan Aldar Joint Stock Company,,Uliastai Sum,,MN,,2099,1089,bayan aldar joint stock,0.0,,485376.0
20833376,NP124272R,Chhyangdi Hydropower Limited,Chhyangdi Hydropower Limited,,"Kathmandu, Kathmandu",,NP,chpl.com.np,,,chhyangdi hydropower,0.0,,484996.0
20833381,US841384961,"Hinto Energy, Inc.","Hinto Energy, Inc.",80111,Greenwood Village,Colorado,US,www.hintoenergy.com,6726,6430,hinto energy,0.0,,475630.0


# Firm name matching

In [71]:
lexis_firm_names_clean = pd.read_csv('lexis_firm_names_clean.csv')
lexis_firm_names_clean

Unnamed: 0,cleaned_name
0,microsoft
1,international business machines
2,siemens
3,mitsubishi
4,american express
...,...
132023,compania paraguaya de comunicaciones
132024,cencar
132025,censu
132026,bebidas del paraguay


In [72]:
df_static.cleaned_name.nunique()

3784619

In [73]:
to_remove = ['federal reserve', 'eu', 'fed', 'treasury', 'congress', 'european central bank',
             'international monetary fund', 'central bank', 'senate', 'white house', 'house', 'sec',
             'ecb', 'european commission', 'state', 'un', 'bank of england', 'opec', 'supreme court',
             'world bank', 'pentagon', 'cabinet', 'web service', 'us senate', 'imf', 'defense',
             'federal reserve bank' 'euro', 'house of representatives', 'bank', 'journal',
             'us bankruptcy court', 'medicare', 'american international', 'finance', 's&p', 's&p 500',
             'news', 'united nations', 'nasdaq', 'parliament', 'us treasury department', 'romney',
             'draghi', 'usda', 'cotton', 'district court', 'army', '', '&', np.nan, None, 'NYSE',
             'Newstex', 'NASDAQ', 'GLOBE NEWSWIRE', 'Zacks Investment Research', 'Reuters', 'ASX',
             'Nasdaq', 'TSX', 'LSE', 'JV', 'Thomson Reuters', 'Wall Street Journal', 'OTC', 'M',
             'Financial Times', 'The European Commission', 'T', 'NYSE MKT', 'TSXV', 'OTCQB',
             'BUSINESS TIMES MALAYSIA', 'fdch', 'WORLDSOURCES', 'WORLD TIMES', 'ASX',
             'WORLDSOURCES ONLINE', 'OTCBB', 'ASIA WorldSources Online', 'GLOBE NEWSWIRE',
             'AFX', 'PRNewswire-FirstCall', 'ASIA WorldSources', 'Reuters', 'ANSA', 'PR Newswire',
             'TSX VENTURE', 'RTTNews', 'ENP Newswire', 'M2 COMMUNICATIONS', 'OTCQB',
             'OTC Bulletin Board', 'Xinhua', 'CSE', '-', 'Joint Venture', 'Alliance News',
             'Interfax', 'JAKARTA POST', 'SeeNews', 'EU', 'Company', 'Asia Pulse', 'dpa-AFX',
             'JAKARTA POST ASIA WorldSources', 'XFN-ASIA', 'Financial Times', 'FDA', 'News Corp',
             'WORLD TIMES', 'RWE Australian Business News', 'TSXV', 'AAP', 'BUSINESS TIMES',
             'TSE', 'AP', 'HT Digital Content Services', 'ASIA WorldSources Online', 'EPA',
             'AIM', 'TSX VENTURE', 'Government', 'ICB', 'ICB', 'European Union', 'RWE Aust Business News',
             'JAKARTA POST INDONESIA', 'NRL', 'Newsfile', '', 'BSE', '', 'NSE', 'isis', 'ase',
             'united states navy', 'ule', 'aex', 'Undisclosed JV Partner', 'Undisclosed Chinese Co', 
             'Undisclosed JV Partner', 'Undisclosed Chinese Co', 'Peoples Republic of China', 'Malaysia',
                  'Ministry of Finance Singapore', 'Singapore', 'Russian Federation',
                  'Undisclosed Japanese Partner', 'Saudi Arabia', 'Indonesia Republic', 'Republic of Korea',
                  'Undisclosed Companies', 'Philippines', 'Algerian Government', 'Undisclosed Japanese Co(s)',
                  'Seeking Partner', 'Vietnam', 'Soviet Union', 'Kazakhstan', 'Undisclosed US Partner',
                  'Uzbekistan', 'Undisclosed Russian Partner', 'Kingdom of Spain', 'Iran', 'Shareholders',
                  'Undisclosed Thai Co', 'Pakistan', 'South Africa', 'Undisclosed Australian Partner',
                  'Republic of Ireland', 'Russian Regional Government', 'Cambodia', 'Ukraine',
                  'Undisclosed American Co', 'Chile', 'Undisclosed', 'joint venture electronics', 'european commission', 'news']
to_remove = [firm_name_clean(elem) for elem in to_remove]
lexis_firm_names_clean.drop(lexis_firm_names_clean[lexis_firm_names_clean.cleaned_name.isin(to_remove)].index, inplace=True)

# drop firm names with only one character
lexis_firm_names_clean.drop(lexis_firm_names_clean[lexis_firm_names_clean.cleaned_name.str.len() < 2].index, inplace=True)

lexis_firm_names_clean.drop_duplicates(inplace=True)

In [74]:
res = firm_name_matching(lexis_firm_names_clean, 
                             df_static, 
                             firm_name_col='cleaned_name', clean_lookup=False)
res

Matched 61.33457564463837 percent of companies


Unnamed: 0,cleaned_name_x,match_col,bvdid,name_internat,name_native,postcode,city,region_in_country,country_iso_code,website_address,ussic_primary_code,nace_rev_2_core_code_4_digits,cleaned_name_y,from_compustat,max_number_of_employees,max_turnover,_merge
0,microsoft,microsoft,12141,MICROSOFT CORP,,98052-6399,Redmond,,USA,www.microsoft.com,7372.0,,microsoft,1.0,221000.0,198270000000.0,both
1,international business machines,international business machines,US130871985,International Business Machines Corp,International Business Machines Corp,10504,Armonk,New York,US,www.ibm.com,,,international business machines,0.0,434246.0,106916000000.0,both
2,siemens,siemens,19349,SIEMENS AG,,80333,Munich,,DEU,www.siemens.com,9997.0,,siemens,1.0,484000.0,134134000000.0,both
3,mitsubishi,mitsubishi,100555,MITSUBISHI CORP,,100-8086,Tokyo,,JPN,www.mitsubishicorp.com,5099.0,,mitsubishi,1.0,86098.0,23103043000000.0,both
4,american express,american express,1447,AMERICAN EXPRESS CO,,10285,New York,,USA,www.americanexpress.com,6141.0,,american express,1.0,114352.0,55625000000.0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132007,compania paraguaya de comunicaciones,compania paraguaya de comunicaciones,,,,,,,,,,,,,,,left_only
132008,cencar,cencar,TH0105537065095,Cencar LTD,บจ.เซ็นคาร์ จำกัด,12130,"Lam Luk Ka, Pathumthani",,TH,www.cencar.com,6531,6832,cencar,0.0,,1067612799.0,both
132009,censu,censu,,,,,,,,,,,,,,,left_only
132010,bebidas del paraguay,bebidas del paraguay,,,,,,,,,,,,,,,left_only


In [75]:
i=4
res.iloc[40*i:40*(i+1)]

Unnamed: 0,cleaned_name_x,match_col,bvdid,name_internat,name_native,postcode,city,region_in_country,country_iso_code,website_address,ussic_primary_code,nace_rev_2_core_code_4_digits,cleaned_name_y,from_compustat,max_number_of_employees,max_turnover,_merge
160,raytheon,raytheon,8972,RAYTHEON CO,,02173,Lexington,,USA,,3812.0,,raytheon,1.0,119200.0,29176000000.0,both
161,newmont,newmont,7881,NEWMONT CORP,,80237,Denver,,USA,www.newmont.com,1040.0,,newmont,1.0,17100.0,12222000000.0,both
162,daimlerchrysler,daimlerchrysler,,,,,,,,,,,,,,,left_only
163,saudi basic industries,saudi basic industries,248963,SAUDI BASIC INDUSTRIES CORP,,11422,Riyadh,,SAU,www.sabic.com,2860.0,,saudi basic industries,1.0,40000.0,198466764000.0,both
164,borealis,borealis,200460,BOREALIS,,1020,Vienna,,AUT,www.borealisgroup.com,2821.0,,borealis,1.0,7228.0,20418000000.0,both
165,cppib,cppib,HK0009135277,Cppib (Hong Kong) Limited,Cppib (Hong Kong) Limited,,,,HK,www.cppib.com,6726.0,,cppib,0.0,113.0,492000.0,both
166,qatar petroleum,qatar petroleum,,,,,,,,,,,,,,,left_only
167,singapore airlines,singapore airlines,100713,SINGAPORE AIRLINES LTD,,819829,Singapore,,SGP,www.singaporeair.com,4512.0,,singapore airlines,1.0,31834.0,17774800000.0,both
168,dsm,dsm,CN9361219289,Dsm (china) limited,帝斯曼(中国)有限公司,200120,Shanghai,East China|Shanghai,CN,www.dsm.com/countrysites/dsm-china/zh_cn/home....,7389.0,7490.0,dsm,0.0,540.0,130597138.0,both
169,sparton,sparton,9922,SPARTON CORP,,32130,De Leon Springs,,USA,www.sparton.com,3679.0,,sparton,1.0,3900.0,419362000.0,both


In [76]:
res.name_internat.notna().sum()

80969

In [77]:
res[res.name_internat.isna()]

Unnamed: 0,cleaned_name_x,match_col,bvdid,name_internat,name_native,postcode,city,region_in_country,country_iso_code,website_address,ussic_primary_code,nace_rev_2_core_code_4_digits,cleaned_name_y,from_compustat,max_number_of_employees,max_turnover,_merge
20,gazprom,gazprom,,,,,,,,,,,,,,,left_only
26,saudi aramco,saudi aramco,,,,,,,,,,,,,,,left_only
37,motorola,motorola,,,,,,,,,,,,,,,left_only
48,board of directors,board of directors,,,,,,,,,,,,,,,left_only
92,fiat,fiat,,,,,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132005,spectrarep,spectrarep,,,,,,,,,,,,,,,left_only
132006,compania panamericana de servicios,compania panamericana de servicios,,,,,,,,,,,,,,,left_only
132007,compania paraguaya de comunicaciones,compania paraguaya de comunicaciones,,,,,,,,,,,,,,,left_only
132009,censu,censu,,,,,,,,,,,,,,,left_only


In [78]:
## split df into batches and use multiprocessing to speed up the process
import numpy as np
import ray
from name_matching.name_matcher import NameMatcher
ray.init(ignore_reinit_error=True)


matcher = NameMatcher(low_memory=False, top_n=5, common_words=False, legal_suffixes=True,
                      distance_metrics=['editex', 'discounted_levenshtein',
                                        'refined_soundex'])
matcher.load_and_process_master_data(df_matching_data=df_static.head(500000),
                                         column='cleaned_name')

ref_matcher = ray.put(matcher)

def split_df(df, chunk_size):
    num_chunks = len(df) // chunk_size + 1
    return np.array_split(df, num_chunks)

@ray.remote
def match_names(df_matching, column_matching, matcher):
    res = matcher.match_names(to_be_matched=df_matching, column_matching=column_matching)
    matches = res[res['score'] > 95]
    return matches

def match_names_multiprocessing(df_matching, column_matching, matcher, chunk_size=1000):
    df_batches = split_df(df_matching, chunk_size)
    futures = [match_names.remote(df, column_matching, matcher) for df in df_batches]
    matches = ray.get(futures)
    return pd.concat(matches)

matches = match_names_multiprocessing(res[res.name_internat.isna()], 'cleaned_name_x', ref_matcher)

2023-07-21 01:53:10,429	INFO worker.py:1474 -- Calling ray.init() again after it has already been called.
  idf = np.log(n_samples / df) + 1


In [79]:
matches

Unnamed: 0,original_name,match_name,score,match_index
1116,lmt,lmt,100.000000,146450.0
1482,cargill incorporated,cargill,100.000000,111882.0
1811,rr,rr,100.000000,117173.0
2470,statoil asa,statoil,100.000000,192117.0
3888,caisse de depot et placement du quebec,caisse de depot et de placement du quebec,95.058981,175757.0
...,...,...,...,...
131617,serene spa,serene,100.000000,388705.0
131723,stearns,stearns,100.000000,289040.0
131770,komercni banka,komercni banka as,100.000000,4925.0
131805,l3 communications,l3 communications,100.000000,159634.0


In [80]:
res_fuzzy = matches[['original_name', 'match_name']].merge(df_static, left_on='match_name', right_on='cleaned_name')
res_fuzzy.rename(columns={'original_name': 'cleaned_name_x', 'match_name': 'cleaned_name_y'}, inplace=True)
res_fuzzy

Unnamed: 0,cleaned_name_x,cleaned_name_y,bvdid,name_internat,name_native,postcode,city,region_in_country,country_iso_code,website_address,ussic_primary_code,nace_rev_2_core_code_4_digits,cleaned_name,from_compustat,max_number_of_employees,max_turnover,match_col
0,lmt,lmt,DE7370062061,LMT GmbH & Co. KG,LMT GmbH & Co. KG,21493,Schwarzenbek,Schleswig-Holstein|Lauenburg,DE,www.lmt-tools.com,6722,,lmt,0.0,2148.0,495455238.0,lmt &
1,cargill incorporated,cargill,GB01387437,Cargill PLC,Cargill PLC,KT13 0SL,Weybridge,England|London Outer|Kingston Upon Thames (KT)...,GB,www.cargill.co.uk,,,cargill,0.0,6076.0,2825848430.0,cargill
2,rr,rr,NL67275974,RR Holding B.V.,RR Holding B.V.,6546 BE,Nijmegen,Gelderland|Nijmegen,NL,www.directezorgnijmegen.nl,6719,,rr,0.0,4759.0,,rr
3,statoil asa,statoil,AO100005282,Statoil,Statoil,,Luanda,Luanda,AO,www.statoil.com,1389,0910,statoil,0.0,1200.0,3610000000.0,statoil
4,caisse de depot et placement du quebec,caisse de depot et de placement du quebec,CAFEB21716,Caisse de depot et de placement du Quebec,Caisse de depot et de placement du Quebec,QC G1R 3X5,Quebec,,CA,www.cdpq.com,,,caisse de depot et de placement du quebec,0.0,1454.0,38435288684.0,caisse de depot et de placement du quebec
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487,serene spa,serene,GB03024917,Serene Group Limited,Serene Group Limited,EC1V 4PY,London,England|London Inner|London (EC)|London EC1V,GB,,,,serene,0.0,472.0,8203010.0,serene
488,stearns,stearns,US113407022GN,Stearns,Stearns,90807-2910,Long Beach,California|Los Angeles County,US,www.stearnsnet.com,5411,4711,stearns,0.0,696.0,382000.0,stearns
489,komercni banka,komercni banka as,204719,KOMERCNI BANKA AS,,114 07,Prague,,CZE,www.kb.cz,6020.0,,komercni banka as,1.0,14843.0,103479000000.0,komercni banka as
490,l3 communications,l3 communications,DE2050486935,L-3 Communications Holding GmbH,L-3 Communications Holding GmbH,72793,Pfullingen,Baden-Wuerttemberg|Tuebingen|Reutlingen,DE,www.l-3com.com,6722,,l3 communications,0.0,1775.0,527987497.0,l-3 communications


In [81]:
res_full = pd.concat([res[res.name_internat.notna()], res_fuzzy])
res_full.drop(columns=['match_col', 'cleaned_name_y', '_merge', 'cleaned_name'], inplace=True)
res_full.rename(columns={'cleaned_name_x': 'cleaned_name'}, inplace=True)
res_full

Unnamed: 0,cleaned_name,bvdid,name_internat,name_native,postcode,city,region_in_country,country_iso_code,website_address,ussic_primary_code,nace_rev_2_core_code_4_digits,from_compustat,max_number_of_employees,max_turnover
0,microsoft,12141,MICROSOFT CORP,,98052-6399,Redmond,,USA,www.microsoft.com,7372.0,,1.0,221000.0,198270000000.0
1,international business machines,US130871985,International Business Machines Corp,International Business Machines Corp,10504,Armonk,New York,US,www.ibm.com,,,0.0,434246.0,106916000000.0
2,siemens,19349,SIEMENS AG,,80333,Munich,,DEU,www.siemens.com,9997.0,,1.0,484000.0,134134000000.0
3,mitsubishi,100555,MITSUBISHI CORP,,100-8086,Tokyo,,JPN,www.mitsubishicorp.com,5099.0,,1.0,86098.0,23103043000000.0
4,american express,1447,AMERICAN EXPRESS CO,,10285,New York,,USA,www.americanexpress.com,6141.0,,1.0,114352.0,55625000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487,serene spa,GB03024917,Serene Group Limited,Serene Group Limited,EC1V 4PY,London,England|London Inner|London (EC)|London EC1V,GB,,,,0.0,472.0,8203010.0
488,stearns,US113407022GN,Stearns,Stearns,90807-2910,Long Beach,California|Los Angeles County,US,www.stearnsnet.com,5411,4711,0.0,696.0,382000.0
489,komercni banka,204719,KOMERCNI BANKA AS,,114 07,Prague,,CZE,www.kb.cz,6020.0,,1.0,14843.0,103479000000.0
490,l3 communications,DE2050486935,L-3 Communications Holding GmbH,L-3 Communications Holding GmbH,72793,Pfullingen,Baden-Wuerttemberg|Tuebingen|Reutlingen,DE,www.l-3com.com,6722,,0.0,1775.0,527987497.0


In [82]:
res_full.to_pickle('matching_result_lexis_orbis2023_compustat.pkl')

In [85]:
df_dynamic[df_dynamic.bvdid.isin(res_full.bvdid)].to_pickle('matching_result_dynamic_lexis_orbis2023_compustat.pkl')

In [158]:
# matches.to_pickle('dnb_matches1.pkl')

In [20]:
# matches = pd.read_pickle('dnb_matches.pkl')

In [159]:
# res.to_pickle('res1.pkl')

In [209]:
names_ids = lexis_firm_names_clean[['cleaned_name']].merge(res_full[['cleaned_name', 'bvdid']],
                                         on='cleaned_name', how='inner')
names_ids = names_ids.dropna().set_index('cleaned_name').squeeze().to_dict()

In [None]:
important_labels = ['StrategicAlliance', 'JointVenture', 'Marketing', 'Manufacturing',
                    'ResearchandDevelopment', 'Licensing']

# read lexis nexis articles with detected orgs and relations
df = pd.read_pickle('/Users/Jakob/Documents/financial_news_data/lexisnexis_preds_robust_vortex_99.pkl')

df.drop(columns=['index_x', 'index_y'], inplace=True)


df = df[['publication', 'publication_date', 'firms', 'rels_pred', 'country', 'industry']]

df['cleaned_firms'] = df.firms.apply(lambda firms: [firm_name_clean(firm) for firm in firms])


names_ids = lexis_firm_names_clean.merge(res_full[['cleaned_name', 'bvdid']],
                                         on='cleaned_name', how='left')

rels = df[['publication_date', 'cleaned_firms', 'rels_pred']].copy()
rels['firm_a'] = rels.cleaned_firms.str[0]
rels['firm_b'] = rels.cleaned_firms.str[1]
rels.drop(columns=['cleaned_firms'], inplace=True)

rels['firm_a'] = rels.firm_a.map(names_ids)
rels['firm_b'] = rels.firm_b.map(names_ids)

rels.dropna(inplace=True)

# remove terminated
rels = rels[rels.rels_pred.apply(lambda rels: 'Terminated' not in rels)]

# remove firms where both participants are the same
rels = rels[rels.firm_a != rels.firm_b]

# remove duplicate relationships (same participants, same type, same year)
rels['year'] = rels.publication_date.dt.year
rels = rels.groupby(['firm_a', 'firm_b', 'year']).agg(list)
rels.reset_index(inplace=True)

# from itertools import chain
# rels['rels_pred'] = rels.rels_pred.apply(chain.from_iterable).apply(list).apply(set).apply(list)

rels = rels.explode('rels_pred')

orbis.to_csv(os.path.join(output_path, 'rel_database', 'lexis_orbis_match.csv'), index=False)

# save separate csvs for each relation type
for rel_name in important_labels:
    rels[rels.rels_pred.apply(lambda rel: rel==rel_name)].drop(columns=['rels_pred']).to_csv(
            os.path.join(output_path, 'rel_database', f'{rel_name}_LexisNexis.csv'), index=False)

In [71]:
merged_orbis = merged.merge(df_static, left_on='matched_name', right_on='cleaned_name', how='inner')
merged_orbis = merged_orbis.drop(columns=['index'])
merged_orbis

Unnamed: 0,original_name,matched_name,bvdid,name_internat,name_native,postcode,city,region_in_country,country_iso_code,website_address,ussic_primary_code,nace_rev_2_core_code_4_digits,cleaned_name,max_number_of_employees,max_turnover
0,microsoft,microsoft,US911144442,Microsoft Corporation,Microsoft Corporation,98052,Redmond,Washington,US,www.microsoft.com,,,microsoft,221000,198270000000
1,international business machines,international business machines,US130871985,International Business Machines Corp,International Business Machines Corp,10504,Armonk,New York,US,www.ibm.com,,,international business machines,434246,106916000000
2,siemens,siemens,DE2010000581,Siemens AG,Siemens AG,80333,Munchen,,DE,www.siemens.com,,,siemens,477100,113278278484
3,mitsubishi,mitsubishi,JP5010001008771,Mitsubishi Corporation,三菱商事株式会社,100-0005,Chiyoda-Ku,Kanto|Tokyo,JP,www.mitsubishicorp.com,,,mitsubishi,86098,195485450403
4,american express,american express,US134922250,American Express Company,American Express Company,10285,New York,New York,US,www.americanexpress.com,,,american express,64611,43645000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78212,objectiva,objective,AU050539350,Objective Corporation Limited,Objective Corporation Limited,2060,North Sydney,New South Wales,AU,www.objective.com,7371,6201,objective,240,73276251
78213,k-air,kair,RU60611419,Kair,Каир,610046,Kirov,Volga federal region|Kirov region,RU,,,,kair,35,45984051
78214,f&h,fh,US189804411L,FH CO INC,FH CO INC,67067-9001,Kechi,Kansas|Sedgwick County,US,www.f-hcompanies_pristine_pristine.com,1743,4399,fh,130,500000
78215,software602,software602 as,CZ63078236,Software602 a.s.,Software602 a.s.,14000,Praha 4,Prague - capital of the CR|District of Prague 4,CZ,www.602.cz,,,software602 as,110,10435134


In [72]:
df_dynamic[df_dynamic.bvdid.isin(merged_orbis.bvdid)]

Unnamed: 0,bvdid,closing_date,number_of_employees,operating_revenue_turnover,total_assets,added_value,research_and_development_expenses
73,AE0000000904,20141231,195,220000000,,,
74,AE0000000904,20201231,195,220000000,,,
76,AE0000001238,20130331,25,78869679,,,
77,AE0000001238,20140331,25,73534249,,,
78,AE0000001238,20150331,25,54949375,,,
...,...,...,...,...,...,...,...
19421185,ZWFEB23823,20171231,823,89420000,,,
19421186,ZWFEB23823,20181231,,120394000,,,
19421187,ZWFEB23823,20191231,,383836371,,,
19421188,ZWFEB23823,20201231,25,20295000,,,
