In [1]:
# !apt-get -qq install sudo
# !sudo apt -qq install wget -y

In [2]:
# # download model
# !wget -O glowing-planet-85 'https://www.dropbox.com/s/0whlizgnw6s4eb7/glowing-planet-85.ckpt?dl=1'

In [3]:
# # download lexis data
# !wget -O lexisnexis 'https://www.dropbox.com/s/ew8rjok128rof1x/lexisnexis_with_ents_spacy_transformer.pkl?dl=1'

In [4]:
!ls

Firm_relation_extraction_LUKE.ipynb  lexis_entity_extraction_spacy.ipynb
LUKE_model.py			     lexisnexis
__pycache__			     utils.py
glowing-planet-85


In [5]:
!nvidia-smi

Mon Apr  4 19:31:52 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.39.01    Driver Version: 510.39.01    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:19:00.0 Off |                  N/A |
|  0%   41C    P8    22W / 250W |      1MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [6]:
# !pip install -q -U 'spacy[cuda114]'
# !python -m spacy download en_core_web_trf
# !pip install -q pandas unidecode more_itertools ipynb

In [7]:
import spacy
from tqdm.auto import tqdm
tqdm.pandas()
import pandas as pd
import unidecode
import re

In [8]:
spacy.require_gpu()

True

In [9]:
nlp = spacy.load("en_core_web_trf") # spacy model

In [10]:
def get_orgs(text):
    '''
    This function takes a text. Uses the Spacy model.
    The model will tokenize, POS-tag and recognize the entities named in the text.
    Returns a list of entities in the text that were recognized as organizations.
    '''
    # Apply the model
    tags = nlp(text)
    ents = [(ent.text.replace('\'', ''), (ent.start_char, ent.end_char)) for ent in tags.ents if ent.label_=='ORG'] # also remove apostrophes
    # Return the list of entities recognized as organizations
    return ents

# test run
get_orgs('Apple and Microsoft plan to form a joint venture for the development of cloud-based '
             'computing infrastrucutre.')

[('Apple', (0, 5)), ('Microsoft', (10, 19))]

In [11]:
# # read lexis nexis news data
# df = pd.read_pickle("lexisnexis")
# df.shape

In [12]:
# df['ents_pred'] = df.content.str[:600].progress_apply(get_orgs)

In [13]:
df = pd.read_pickle("lexisnexis")

In [14]:
def firm_name_clean(firm_name, lower=True, remove_punc=True, remove_legal=True, remove_parentheses=True):
    # make string
    firm_name = str(firm_name)
    firm_name = unidecode.unidecode(firm_name)
    # lowercase
    if lower:
        firm_name = firm_name.lower()
    # remove punctuation
    if remove_punc:
        firm_name = firm_name.translate(str.maketrans('', '', '!"#$%\\\'*+,./:;<=>?@^_`{|}~'))
    # remove legal identifiers
    if remove_legal:
        legal_identifiers = ["co", "inc", "ag", "ltd", "lp", "llc", "pllc", "llp", "plc", "ltdplc", "corp",
                             "corporation", "ab", "cos", "cia", "sa", "company", "companies", "consolidated",
                             "stores", "limited", "srl", "kk", "gmbh", "pty", "group", "yk", "bhd",
                             "limitada", "holdings", "kg", "bv", "pte", "sas", "ilp", "nl", "genossenschaft",
                             "gesellschaft", "aktiengesellschaft", "ltda", "nv", "oao", "holding", "se",
                             "oy", "plcnv", "the", "neft", "& co", "&co"]
        pattern = '|'.join(legal_identifiers)
        pattern = '\\b(' + pattern + ')\\b'  # match only word boundaries
        firm_name = re.sub(pattern, '', firm_name)
    # remove parentheses and anything in them: Bayerische Motoren Werke (BMW) -> Bayerische Motoren Werke
    if remove_parentheses:
        firm_name = re.sub(r'\([^()]*\)', '', firm_name)

    # make hyphens consistent
    firm_name = firm_name.replace(' - ', '-')

    # remove ampersand symbol
    firm_name = firm_name.replace('&amp;', '&')
    firm_name = firm_name.replace('&amp', '&')

    # strip
    firm_name = firm_name.strip()

    return firm_name

def extract_firm_name_and_spans(text, names, clean_names=True):
    if clean_names:
        cleaned_names = [firm_name_clean(name) for name in names]
        names += cleaned_names
    pattern = r'|'.join(re.escape(word.strip()) for word in names)

    res = re.finditer(pattern, text, flags=re.IGNORECASE)

    return [(match.group(), match.span()) for match in res]

def clean_unique_entities(ents):
    seen_ents = []
    res = []
    for ent in ents:
        cleaned_ent = firm_name_clean(ent[0])
        if cleaned_ent not in seen_ents:
            res.append(ent + (cleaned_ent,))
            seen_ents.append(cleaned_ent)


    return res


In [15]:
df.iloc[:10].progress_apply(lambda x: extract_firm_name_and_spans(x.content, x.company), axis=1)

  0%|          | 0/10 [00:00<?, ?it/s]

0    [(Honda Motor, (186, 197)), (Honda Motor, (505...
1    [(Royal Dutch Shell PLC, (45, 66)), (PetroChin...
2    [(Microsoft Corp, (2756, 2770)), (VHA Inc, (28...
3    [(Kandi Technologies, (78, 96)), (Geely Automo...
4    [(Fujitsu Electronics Inc, (193, 216)), (Fujit...
5    [(, (0, 0)), (, (1, 1)), (, (2, 2)), (, (3, 3)...
6    [(Eros International Media, (84, 108)), (Eros ...
7    [(Fertoz Ltd, (46, 56)), (Fertoz, (278, 284)),...
8    [(Intermatic Inc, (265, 279)), (Intermatic, (5...
9    [(Honda Motor, (13, 24)), (Honda motor, (297, ...
dtype: object

In [None]:
df['ents_matched'] = df.progress_apply(lambda x: extract_firm_name_and_spans(x.content[:600], x.company), axis=1)

  0%|          | 0/1493945 [00:00<?, ?it/s]

In [None]:
orbis = pd.read_csv('https://www.dropbox.com/s/bsq4m09j3ovqsy1/firm_lookup_list.csv.gzip?dl=1', compression='gzip')
orbis.head()

In [None]:
orbis = set(orbis.squeeze().to_list())

In [None]:
df.ents_pred.apply(len).mean()

In [None]:
df['ents'] = df.ents_matched + df.ents_pred
df.ents.apply(len).mean()

In [None]:
df['ents'] = df.ents.progress_apply(clean_unique_entities)
df.ents.apply(len).mean()

In [None]:
# filter out entities not in orbis
df['ents'] = df.ents.apply(lambda ents: [ent for ent in ents if ent[2] in orbis])
df.ents.apply(len).mean()

In [None]:
df.shape

In [None]:
df = df[df.ents.apply(len) > 1].copy() # need at least two unique identified participants
df.shape

In [None]:
# calc number of combinations
import math
df.ents.apply(len).apply(lambda n: math.factorial(n)/(2*math.factorial(n-2))).sum()

In [45]:
# df['ents'] = df.ents.str[:4] # take only first four entities for each article

In [46]:
#We have multiple detected entities in each row. However, LUKE needs a pair of exactly two entities as an
# input. Therefore, we need to create rows for all combinations of entities in a document:
import itertools
df["ent_comb"] = df.ents.apply(lambda ents: [list(comb) for comb in itertools.combinations(ents, 2)])
df = df.explode("ent_comb")
df.shape

(7147919, 20)

In [65]:
common_firms = df.ents.explode().apply(lambda ent: ent[0]).value_counts()

In [75]:
common_firms.iloc[150:200]

Sony Corp                        7922
Honda                            7899
Blackstone                       7858
Toshiba Corp                     7857
Alcoa                            7802
Vale                             7800
LON                              7795
NNPC                             7786
Credit Suisse                    7738
CP                               7690
Eni                              7661
TSX VENTURE:                     7658
Royal Bank of Scotland           7506
Government                       7487
Pfizer                           7433
China National Petroleum Corp    7334
Intel                            7301
ICB                              7295
Fujitsu                          7269
Delta Air Lines                  7260
Philips                          7207
LG                               7177
European Union                   7136
Bombardier                       7036
Sanofi                           7006
RWE Aust Business News           6982
PDVSA       

In [72]:
firm_name_clean('/PRNewswire-FirstCall/ ')

'prnewswire-firstcall'

In [61]:
df[df.ent_comb.apply(lambda ent_pair: abs(ent_pair[0][1][0]-ent_pair[1][1][0]) < 100)]

'Travel company Expedia (EXPE) said it has acquired Travelocity from technology provider Sabre (SABR) for $280 million in cash. The acquisition follows'

In [None]:
# take only combinations of entities close to each other
df = df[df.ent_comb.apply(lambda ent_pair: abs(ent_pair[0][1][0]-ent_pair[1][1][0]) < 100)]

In [43]:
df["firms"] = df.ent_comb.apply(lambda ents: [ent[0] for ent in ents])
df["spans"] = df.ent_comb.apply(lambda ents: [ent[1] for ent in ents])

In [58]:
df.drop(columns=['ents_pred', 'firms_pred', 'spans_pred', 'matched_ents', 'ents', 'ent_comb'], inplace=True)
df.rename(columns={'content': 'document', 'ent_comb': 'firms'}, inplace=True)
df.head()

[('RTTNews', (1, 8)),
 ('Qualcomm Inc.', (12, 25)),
 ('QCOM', (27, 31)),
 ('TDK Corp.', (49, 58)),
 ('TTDKF.PK', (60, 68)),
 ('TTDKY.PK', (70, 78)),
 ('RF360 Holdings Singapore PTE. Ltd.', (159, 193)),
 ('Qualcomm', (224, 232)),
 ('RFFE Business Unit', (235, 253)),
 ('TDK SAW Business Group', (535, 557)),
 ('RF360 Holdings', (584, 598))]

In [49]:
from ipynb.fs.defs.Firm_relation_extraction_LUKE import LUKE

NameError: name 'rel_label_names' is not defined