In [2]:
!nvidia-smi

Mon Apr  4 21:57:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 495.46       CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:81:00.0 Off |                  N/A |
| 32%   43C    P8    18W / 350W |      1MiB / 24268MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
!pip install -q -U 'spacy[cuda115]'
!python -m spacy download en_core_web_trf
!pip install -q pandas unidecode more_itertools ipynb

Collecting en-core-web-trf==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.2.0/en_core_web_trf-3.2.0-py3-none-any.whl (460.2 MB)
[K     |████████████████████████████████| 460.2 MB 1.0 MB/s eta 0:00:0103    |█████████████████████████████▍  | 422.0 MB 1.0 MB/s eta 0:00:38
[?25hCollecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.5-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 123 kB/s eta 0:00:011
Collecting transformers<4.18.0,>=3.4.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 16.1 MB/s eta 0:00:01
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 45.8 MB/s eta 0:00:01
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp38-cp38-manylinux_2_1



In [4]:
import spacy
from tqdm.auto import tqdm
tqdm.pandas()
import pandas as pd
import unidecode
import re

In [5]:
spacy.require_gpu()

True

In [6]:
nlp = spacy.load("en_core_web_trf") # spacy model

In [7]:
def get_orgs(text):
    '''
    This function takes a text. Uses the Spacy model.
    The model will tokenize, POS-tag and recognize the entities named in the text.
    Returns a list of entities in the text that were recognized as organizations.
    '''
    # Apply the model
    tags = nlp(text)
    ents = [(ent.text.replace('\'', ''), (ent.start_char, ent.end_char)) for ent in tags.ents if ent.label_=='ORG'] # also remove apostrophes
    # Return the list of entities recognized as organizations
    return ents

# test run
get_orgs('Apple and Microsoft plan to form a joint venture for the development of cloud-based '
             'computing infrastrucutre.')

[('Apple', (0, 5)), ('Microsoft', (10, 19))]

In [8]:
# # read lexis nexis news data
# df = pd.read_pickle("lexisnexis")
# df.shape

In [9]:
# df['ents_pred'] = df.content.str[:600].progress_apply(get_orgs)

In [10]:
df = pd.read_pickle("lexisnexis")

In [11]:
def firm_name_clean(firm_name, lower=True, remove_punc=True, remove_legal=True, remove_parentheses=True):
    # make string
    firm_name = str(firm_name)
    firm_name = unidecode.unidecode(firm_name)
    # lowercase
    if lower:
        firm_name = firm_name.lower()
    # remove punctuation
    if remove_punc:
        firm_name = firm_name.translate(str.maketrans('', '', '!"#$%\\\'*+,./:;<=>?@^_`{|}~'))
    # remove legal identifiers
    if remove_legal:
        legal_identifiers = ["co", "inc", "ag", "ltd", "lp", "llc", "pllc", "llp", "plc", "ltdplc", "corp",
                             "corporation", "ab", "cos", "cia", "sa", "company", "companies", "consolidated",
                             "stores", "limited", "srl", "kk", "gmbh", "pty", "group", "yk", "bhd",
                             "limitada", "holdings", "kg", "bv", "pte", "sas", "ilp", "nl", "genossenschaft",
                             "gesellschaft", "aktiengesellschaft", "ltda", "nv", "oao", "holding", "se",
                             "oy", "plcnv", "the", "neft", "& co", "&co"]
        pattern = '|'.join(legal_identifiers)
        pattern = '\\b(' + pattern + ')\\b'  # match only word boundaries
        firm_name = re.sub(pattern, '', firm_name)
    # remove parentheses and anything in them: Bayerische Motoren Werke (BMW) -> Bayerische Motoren Werke
    if remove_parentheses:
        firm_name = re.sub(r'\([^()]*\)', '', firm_name)

    # make hyphens consistent
    firm_name = firm_name.replace(' - ', '-')

    # remove ampersand symbol
    firm_name = firm_name.replace('&amp;', '&')
    firm_name = firm_name.replace('&amp', '&')

    # strip
    firm_name = firm_name.strip()

    return firm_name

def extract_firm_name_and_spans(text, names, clean_names=True):
    if clean_names:
        cleaned_names = [firm_name_clean(name) for name in names]
        names += cleaned_names
    pattern = r'|'.join(re.escape(word.strip()) for word in names)

    res = re.finditer(pattern, text, flags=re.IGNORECASE)

    return [(match.group(), match.span()) for match in res]

def clean_unique_entities(ents):
    seen_ents = []
    res = []
    for ent in ents:
        cleaned_ent = firm_name_clean(ent[0])
        if cleaned_ent not in seen_ents:
            res.append(ent + (cleaned_ent,))
            seen_ents.append(cleaned_ent)


    return res


In [12]:
df.iloc[:10].progress_apply(lambda x: extract_firm_name_and_spans(x.content, x.company), axis=1)

  0%|          | 0/10 [00:00<?, ?it/s]

0    [(Honda Motor, (186, 197)), (Honda Motor, (505...
1    [(Royal Dutch Shell PLC, (45, 66)), (PetroChin...
2    [(Microsoft Corp, (2756, 2770)), (VHA Inc, (28...
3    [(Kandi Technologies, (78, 96)), (Geely Automo...
4    [(Fujitsu Electronics Inc, (193, 216)), (Fujit...
5    [(, (0, 0)), (, (1, 1)), (, (2, 2)), (, (3, 3)...
6    [(Eros International Media, (84, 108)), (Eros ...
7    [(Fertoz Ltd, (46, 56)), (Fertoz, (278, 284)),...
8    [(Intermatic Inc, (265, 279)), (Intermatic, (5...
9    [(Honda Motor, (13, 24)), (Honda motor, (297, ...
dtype: object

In [13]:
df['ents_matched'] = df.progress_apply(lambda x: extract_firm_name_and_spans(x.content[:600], x.company), axis=1)

  0%|          | 0/1493945 [00:00<?, ?it/s]

In [14]:
orbis = pd.read_csv('https://www.dropbox.com/s/bsq4m09j3ovqsy1/firm_lookup_list.csv.gzip?dl=1', compression='gzip')
orbis.head()

Unnamed: 0,0
0,polytec france
1,ati france
2,neubau kompass
3,acupuncture chiropractic clinic
4,protocor twenty two cc


In [15]:
orbis = set(orbis.squeeze().to_list())

In [16]:
df.ents_pred.apply(len).mean()

5.8734960122360595

In [17]:
df['ents'] = df.ents_matched + df.ents_pred
df.ents.apply(len).mean()

131.72732195629692

In [18]:
df['ents'] = df.ents.progress_apply(clean_unique_entities)
df.ents.apply(len).mean()

  0%|          | 0/1493945 [00:00<?, ?it/s]

4.728095077128007

In [19]:
# filter out entities not in orbis
df['ents'] = df.ents.apply(lambda ents: [ent for ent in ents if ent[2] in orbis])
df.ents.apply(len).mean()

2.3208511692197504

In [20]:
df.shape

(1493945, 18)

In [21]:
df = df[df.ents.apply(len) > 1].copy() # need at least two unique identified participants
df.shape

(997100, 18)

In [22]:
# calc number of combinations
import math
df.ents.apply(len).apply(lambda n: math.factorial(n)/(2*math.factorial(n-2))).sum()

4303138.0

In [23]:
# df['ents'] = df.ents.str[:4] # take only first four entities for each article

In [24]:
#We have multiple detected entities in each row. However, LUKE needs a pair of exactly two entities as an
# input. Therefore, we need to create rows for all combinations of entities in a document:
import itertools
df["ent_comb"] = df.ents.apply(lambda ents: [list(comb) for comb in itertools.combinations(ents, 2)])
df = df.explode("ent_comb")
df.shape

(4303138, 19)

In [25]:
common_firms = df.ents.explode().apply(lambda ent: ent[0]).value_counts()

In [26]:
common_firms.iloc[100:150]

Dow Chemical Co           10443
BBC                       10408
Vivendi                   10360
Sony Corp                 10228
Nissan Motor              10191
Reliance Industries       10176
Merrill Lynch             10122
British Airways           10106
Dow Chemical              10049
Viacom                     9964
Virgin                     9939
Verizon Communications     9867
Sanofi                     9863
Telefonica                 9798
BA                         9648
VW                         9640
LG                         9612
Commission                 9535
Thales                     9519
RWE                        9202
ABC                        9184
Sumitomo Corp              9170
Walt Disney Co             9165
Bank of America            9095
Nokia Siemens Networks     9092
PetroChina                 9077
Peugeot                    9075
Pfizer                     9049
Mazda                      9049
Barclays                   9020
MGM                        8945
Siemens 

In [27]:
# take only combinations of entities close to each other
df = df[df.ent_comb.apply(lambda ent_pair: abs(ent_pair[0][1][0]-ent_pair[1][1][0]) < 200)]

In [28]:
df["firms"] = df.ent_comb.apply(lambda ents: [ent[0] for ent in ents])
df["spans"] = df.ent_comb.apply(lambda ents: [ent[1] for ent in ents])

In [30]:
df.drop(columns=['ents_pred', 'ents_matched', 'ents', 'ent_comb'], inplace=True)
df.rename(columns={'content': 'document'}, inplace=True)
df.head()

Unnamed: 0,title,document,publication,word_count,publication_date,publication_date_text,author,copyright,subject,country,city,person,industry,company,lang,firms,firms.1,spans
3,![['ELectricCar News: Kandi's (NASDAQ GS: $KND...,"! Jinhua, China - July 25, 2016 (Newsfile Corp...",['MENA English (Middle East and North Africa F...,4514,2016-07-26,"July 26, 2016 Tuesday",,Copyright 2016 MENAFN.COM All Rights Reserved,"[JOINT VENTURES, ELECTRIC VEHICLES, HOLDING CO...","[CHINA, NORTHERN AFRICA, MIDDLE EAST]","[DONGGUAN, GUANGDONG, CHINA]",[],"[NAICS336211 MOTOR VEHICLE BODY MANUFACTURING,...","[KANDI TECHNOLOGIES GROUP INC, GEELY AUTOMOBIL...",en,"[(Kandi Technologies, (78, 96), kandi technolo...","[Kandi Technologies, Geely Automobile Holdings...","[(78, 96), (274, 303)]"
3,![['ELectricCar News: Kandi's (NASDAQ GS: $KND...,"! Jinhua, China - July 25, 2016 (Newsfile Corp...",['MENA English (Middle East and North Africa F...,4514,2016-07-26,"July 26, 2016 Tuesday",,Copyright 2016 MENAFN.COM All Rights Reserved,"[JOINT VENTURES, ELECTRIC VEHICLES, HOLDING CO...","[CHINA, NORTHERN AFRICA, MIDDLE EAST]","[DONGGUAN, GUANGDONG, CHINA]",[],"[NAICS336211 MOTOR VEHICLE BODY MANUFACTURING,...","[KANDI TECHNOLOGIES GROUP INC, GEELY AUTOMOBIL...",en,"[(Kandi Technologies, (78, 96), kandi technolo...","[Kandi Technologies, GS]","[(78, 96), (117, 119)]"
3,![['ELectricCar News: Kandi's (NASDAQ GS: $KND...,"! Jinhua, China - July 25, 2016 (Newsfile Corp...",['MENA English (Middle East and North Africa F...,4514,2016-07-26,"July 26, 2016 Tuesday",,Copyright 2016 MENAFN.COM All Rights Reserved,"[JOINT VENTURES, ELECTRIC VEHICLES, HOLDING CO...","[CHINA, NORTHERN AFRICA, MIDDLE EAST]","[DONGGUAN, GUANGDONG, CHINA]",[],"[NAICS336211 MOTOR VEHICLE BODY MANUFACTURING,...","[KANDI TECHNOLOGIES GROUP INC, GEELY AUTOMOBIL...",en,"[(Kandi Technologies, (78, 96), kandi technolo...","[Kandi Technologies, Kandi]","[(78, 96), (146, 151)]"
3,![['ELectricCar News: Kandi's (NASDAQ GS: $KND...,"! Jinhua, China - July 25, 2016 (Newsfile Corp...",['MENA English (Middle East and North Africa F...,4514,2016-07-26,"July 26, 2016 Tuesday",,Copyright 2016 MENAFN.COM All Rights Reserved,"[JOINT VENTURES, ELECTRIC VEHICLES, HOLDING CO...","[CHINA, NORTHERN AFRICA, MIDDLE EAST]","[DONGGUAN, GUANGDONG, CHINA]",[],"[NAICS336211 MOTOR VEHICLE BODY MANUFACTURING,...","[KANDI TECHNOLOGIES GROUP INC, GEELY AUTOMOBIL...",en,"[(Kandi Technologies, (78, 96), kandi technolo...","[Kandi Technologies, Kandi Electric Vehicles G...","[(78, 96), (175, 214)]"
3,![['ELectricCar News: Kandi's (NASDAQ GS: $KND...,"! Jinhua, China - July 25, 2016 (Newsfile Corp...",['MENA English (Middle East and North Africa F...,4514,2016-07-26,"July 26, 2016 Tuesday",,Copyright 2016 MENAFN.COM All Rights Reserved,"[JOINT VENTURES, ELECTRIC VEHICLES, HOLDING CO...","[CHINA, NORTHERN AFRICA, MIDDLE EAST]","[DONGGUAN, GUANGDONG, CHINA]",[],"[NAICS336211 MOTOR VEHICLE BODY MANUFACTURING,...","[KANDI TECHNOLOGIES GROUP INC, GEELY AUTOMOBIL...",en,"[(Geely Automobile Holdings Ltd, (274, 303), g...","[Geely Automobile Holdings Ltd, GS]","[(274, 303), (117, 119)]"


In [31]:
df.to_pickle('lexisnexis_for_inference')