In [None]:
import itertools
import json
import os
import string
import urllib
import urllib.parse
import urllib.request

import nltk
import opennre
import pandas as pd
import torch
from transformers import LukeForEntityPairClassification, LukeTokenizer

In [None]:
# Define some global variables

# Wikidata class labels (the `enLabel` values checked in `wikifier`) that mark an
# annotation as an entity worth keeping.
ENTITY_TYPES = ["human", "person", "company", "enterprise", "business", "geographic region",
                "human settlement", "geographic entity", "territorial entity type", "organization"]

# Wikifier API user key (sent as the `userKey` request parameter).
# Prefer supplying it through the WIKIFIER_KEY environment variable so that the
# credential does not have to live in the notebook.
# NOTE(review): the literal fallback is kept only for backward compatibility;
# it is a hard-coded credential and should be rotated and removed.
WIKIFIER_KEY = os.environ.get("WIKIFIER_KEY") or "sosnideoztzyizecctjwupsbpabuft"

In [None]:
# wikifier function

# Output label -> Wikidata class labels that imply it. The tuple order is the
# priority order: the first group that matches wins.
_LABEL_CLASSES = (
    ("Person", frozenset({"human", "person"})),
    ("Organization", frozenset({"company", "enterprise", "business", "organization"})),
    ("Location", frozenset({"geographic region", "human settlement",
                            "geographic entity", "territorial entity type"})),
)


def _entity_label(class_labels):
    """Return the highest-priority label whose classes intersect `class_labels`, else None."""
    for label, classes in _LABEL_CLASSES:
        if not classes.isdisjoint(class_labels):
            return label
    return None


def wikifier(text, lang="en", threshold=0.8):
    """Fetch entity-linking results for `text` from the Wikifier API.

    Args:
        text: The text to annotate.
        lang: Language code sent as the `lang` request parameter.
        threshold: Value sent as `pageRankSqThreshold` (formatted with `%g`).

    Returns:
        A list with one dict per annotation that has at least one Wikidata class
        whose `enLabel` is listed in `ENTITY_TYPES`. Each dict has the keys:
        `title`, `wikiId` (the annotation's `wikiDataItemId`), `label`
        ('Person', 'Organization', 'Location', or None when no label group
        matches) and `characters` (a list of `(chFrom, chTo)` tuples taken from
        the annotation's `support` entries).

    Raises:
        KeyError: If the decoded response has no `annotations` field, or an
            annotation lacks one of the fields read above.
        Whatever `urllib.request.urlopen` / `json.loads` raise on transport or
        decoding failures is propagated unchanged.
    """
    # Build the form-encoded request body.
    query = urllib.parse.urlencode([
        ("text", text), ("lang", lang),
        ("userKey", WIKIFIER_KEY),
        ("pageRankSqThreshold", f"{threshold:g}"), ("applyPageRankSqThreshold", "true"),
        ("nTopDfValuesToIgnore", "100"), ("nWordsToIgnoreFromList", "100"),
        ("wikiDataClasses", "true"), ("wikiDataClassIds", "false"),
        ("support", "true"), ("ranges", "false"), ("minLinkFrequency", "2"),
        ("includeCosines", "false"), ("maxMentionEntropy", "3")
    ])
    # NOTE(review): the user key travels over plain HTTP here; switch to HTTPS
    # if the service supports it.
    url = "http://www.wikifier.org/annotate-article"

    # Call the Wikifier and decode the JSON response.
    req = urllib.request.Request(url, data=query.encode("utf8"), method="POST")
    with urllib.request.urlopen(req, timeout=60) as f:
        response = json.loads(f.read().decode("utf8"))

    # Fail with a readable message (still a KeyError, as before) when the
    # payload carries no annotations, instead of a bare KeyError('annotations').
    if "annotations" not in response:
        raise KeyError(
            "Wikifier response has no 'annotations' field: %.200r" % (response,))

    results = []
    for annotation in response["annotations"]:
        # Collect the class labels once; annotations without classes yield an empty set.
        class_labels = {el['enLabel'] for el in annotation.get('wikiDataClasses') or []}
        # Keep only annotations that belong to one of the desired entity classes.
        if class_labels.isdisjoint(ENTITY_TYPES):
            continue

        results.append({
            'title': annotation['title'],
            'wikiId': annotation['wikiDataItemId'],
            'label': _entity_label(class_labels),
            'characters': [(el['chFrom'], el['chTo']) for el in annotation['support']],
        })
    return results

In [None]:
# Smoke-test the wikifier on a sample paragraph.
# `test_text` is also reused further down as the input of the relation-extraction cells.
test_text = """
Elon Musk is a business magnate, industrial designer, and engineer. 
Elon Musk is the founder, CEO, CTO, and chief designer of SpaceX. 
Elon Musk is also early investor, CEO, and product architect of Tesla, Inc. Elon Musk is also the founder of The Boring Company and the co-founder of Neuralink. A centibillionaire, Musk became the richest person in the world in January 2021, with an estimated net worth of $185 billion at the time, surpassing Jeff Bezos. Musk was born to a Canadian mother and South African father and raised in Pretoria, South Africa. Elon Musk briefly attended the University of Pretoria before moving to Canada aged 17 to attend Queen's University. Elon Musk transferred to the University of Pennsylvania two years later, where Elon Musk received dual bachelor's degrees in economics and physics. Elon Musk moved to California in 1995 to attend Stanford University, but decided instead to pursue a business career. Elon Musk went on co-founding a web software company Zip2 with Elon Musk brother Kimbal Musk.
"""

# Performs a live HTTP request to the Wikifier service.
test_results = wikifier(test_text)
# Uncomment to inspect the linked entities as a table:
# pd.DataFrame(test_results)

In [None]:
# NOTE(review): `nltk` is already imported in the first cell; this repeat is
# harmless but redundant.
import nltk
# The notebook was written against spaCy 2.1.0 (see the version print below).
import spacy

# Print the installed spaCy version. Per the original author it must be 2.1.0,
# otherwise issues are to be expected.
print(spacy.__version__)


In [None]:
# Load the spaCy English model.
# Install it beforehand with: python -m spacy download en
# NOTE(review): `nlp` is not referenced by any of the cells visible in this notebook.
nlp = spacy.load('en')

# Load the OpenNRE relation-extraction model.
# This will take quite a while to download.
relation_model = opennre.get_model('wiki80_bert_softmax')

# Download NLTK data.
# "punkt" is a tokenizer that splits text into sentences.
nltk.download('punkt')

In [None]:
# Minimum score a predicted relation must exceed in order to be kept.
relation_threshold = 0.5

text = test_text
entities_list = []
relations_list = []
sentences = nltk.sent_tokenize(text)

# Only the first two sentences are processed in this cell.
for sentence in sentences[:2]:
    entities = wikifier(sentence, threshold=0.8)
    # Record the entities without their character spans.
    entities_list.extend(
        {field: entity[field] for field in ('title', 'wikiId', 'label')}
        for entity in entities
    )
    # Every ordered (head, tail) pair of entities found in the sentence ...
    for head, tail in itertools.permutations(entities, 2):
        # ... combined with every pairing of their mention spans.
        for head_span, tail_span in itertools.product(head['characters'], tail['characters']):
            # The end offsets are shifted by one before being handed to the model
            # (presumably Wikifier's `chTo` is inclusive; confirm against its docs).
            prediction = relation_model.infer({
                'text': sentence,
                'h': {'pos': [head_span[0], head_span[1] + 1]},
                't': {'pos': [tail_span[0], tail_span[1] + 1]},
            })
            # prediction[0] is the relation type, prediction[1] its score.
            if prediction[1] > relation_threshold:
                relations_list.append({
                    'source': head['title'],
                    'target': tail['title'],
                    'type': prediction[0],
                })

print(relations_list)

In [None]:
# Full extraction pass: link entities and extract relations for every sentence of
# `text`, then collect the de-duplicated results in `output`.
#
# NOTE(review): `entities_threshold`, `strip_punctuation` and `deduplicate_dict`
# were not defined anywhere in the visible notebook, so this cell could not run
# on a fresh kernel. Minimal definitions are provided here; replace them with the
# project's own versions if those exist elsewhere.

# Same Wikifier threshold as the one passed explicitly in the two-sentence demo cell.
entities_threshold = 0.8


def strip_punctuation(s):
    """Return `s` with ASCII punctuation removed; hyphens are kept so compound names stay intact."""
    return s.translate(str.maketrans('', '', string.punctuation.replace('-', '')))


def deduplicate_dict(d):
    """Return the dicts in `d` without exact duplicates, preserving first-seen order.

    Assumes every dict value is hashable (strings / None in this notebook).
    """
    seen = set()
    unique = []
    for item in d:
        key = tuple(item.items())
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique


# Reset both accumulators so that re-running the cell starts from a clean state
# (previously only `entities_list` was reset).
entities_list = list()
relations_list = list()
for sentence in nltk.sent_tokenize(text):
    sentence = strip_punctuation(sentence)
    entities = wikifier(sentence, threshold=entities_threshold)
    # Fixed: the comprehension iterated with `e` but read `el`, raising NameError.
    entities_list.extend(
        [{'title': el['title'], 'wikiId': el['wikiId'], 'label': el['label']} for el in entities])
    # Iterate over every permutation pair of entities
    for permutation in itertools.permutations(entities, 2):
        for source in permutation[0]['characters']:
            for target in permutation[1]['characters']:
                # Relationship extraction with OpenNRE
                data = relation_model.infer(
                    {'text': sentence, 'h': {'pos': [source[0], source[1] + 1]}, 't': {'pos': [target[0], target[1] + 1]}})
                if data[1] > relation_threshold:
                    # Fixed: this call was missing its closing parenthesis (SyntaxError).
                    relations_list.append(
                        {'source': permutation[0]['title'], 'target': permutation[1]['title'], 'type': data[0]})
output = {'entities': deduplicate_dict(entities_list), 'relations': deduplicate_dict(relations_list)}