In [None]:
import itertools
import json
import os
import urllib
import torch
import nltk
import opennre
import nltk
import spacy
import pandas as pd
from dotenv import load_dotenv

# This is the development notebook

## Major components of the pipeline so far

### 1. Create Named Entity Recognition (NER) solution
 - For this exercise, [Wikifier](http://wikifier.org/info.html) has been identified as a possible avenue for extracting entities of interest. Wikifier is an API endpoint that takes a chunk of text and annotates the entities that exist in the "wiki" corpus.
 - #TODO: explore additional NER solutions such as Facebook's [BLINK](https://github.com/facebookresearch/BLINK) or potentially a custom solution.


 ### 2. Load Relation Extraction (RE) Model
- For this exercise we leverage [OpenNRE's](https://github.com/thunlp/OpenNRE) framework of pretrained relationship extraction models.
- This pipeline initially uses the `wiki80_bert_softmax` model, which is trained on the Wiki80 dataset with a BERT encoder.
- #TODO: explore additional RE models such as LUKE's `studio-ousia/luke-large-finetuned-tacred`

In [None]:
# Notebook-wide configuration

# English Wikidata class labels we keep from the NER step (wikifier).
# Any annotation without at least one of these classes is discarded.
ENTITY_TYPES = [
    "human",
    "person",
    "company",
    "enterprise",
    "business",
    "geographic region",
    "human settlement",
    "geographic entity",
    "territorial entity type",
    "organization",
]

# Read the Wikifier API key from the .env file one directory above the notebook.
# WIKIFIER_KEY is None when the variable is not set.
load_dotenv('../.env')
WIKIFIER_KEY = os.getenv("WIKIFIER_KEY")

In [None]:
# wikifier function

def wikifier(text, lang="en", threshold=0.8):
    """
    Fetch entity-linking results for ``text`` from the Wikifier API
    (http://www.wikifier.org/annotate-article).

    Only annotations that carry at least one Wikidata class whose English
    label is listed in ``ENTITY_TYPES`` are returned.

    Args:
        text: Text to annotate.
        lang: Language code, sent as the ``lang`` request parameter.
        threshold: Value sent as the ``pageRankSqThreshold`` request parameter.

    Returns:
        A list with one dict per retained annotation:
          - 'title': the annotation's title
          - 'wikiId': the annotation's 'wikiDataItemId' (None when absent)
          - 'label': 'Person', 'Organization', 'Location' or None
          - 'characters': list of (chFrom, chTo) tuples, one per supporting mention

    Raises:
        KeyError: if the decoded response has no 'annotations' entry.
        Errors raised by ``urllib.request.urlopen`` / ``json.loads`` propagate unchanged.

    Example (illustrative output; requires network access and a valid key):
    >>> wikifier("Elon Musk is the founder, CEO, CTO, and chief designer of SpaceX.")  # doctest: +SKIP
    [{'title': 'Elon Musk', 'wikiId': 'Q317521', 'label': 'Person', 'characters': [(0, 8), (5, 8)]},
     {'title': 'SpaceX', 'wikiId': 'Q193701', 'label': 'Organization', 'characters': [(58, 63)]}]
    """
    # `import urllib` on its own does not load the `parse` / `request`
    # submodules, so import them explicitly before they are used.
    import urllib.parse
    import urllib.request

    # Prepare the form-encoded request body.
    data = urllib.parse.urlencode([
        ("text", text), ("lang", lang),
        ("userKey", WIKIFIER_KEY),
        ("pageRankSqThreshold", "%g" % threshold),
        ("applyPageRankSqThreshold", "true"),
        ("nTopDfValuesToIgnore", "100"), ("nWordsToIgnoreFromList", "100"),
        ("wikiDataClasses", "true"), ("wikiDataClassIds", "false"),
        ("support", "true"), ("ranges", "false"), ("minLinkFrequency", "2"),
        ("includeCosines", "false"), ("maxMentionEntropy", "3")
    ])
    url = "http://www.wikifier.org/annotate-article"

    # Call the Wikifier and decode the JSON response.
    req = urllib.request.Request(url, data=data.encode("utf8"), method="POST")
    with urllib.request.urlopen(req, timeout=60) as f:
        response = json.loads(f.read().decode("utf8"))

    # Fail with a readable message (same exception type as before) when the
    # service answered with something other than an annotation list.
    if "annotations" not in response:
        raise KeyError("Wikifier response contains no 'annotations': %r" % (response,))

    # Output label -> Wikidata class labels that map to it, in priority order.
    label_by_type = (
        ('Person', {"human", "person"}),
        ('Organization', {"company", "enterprise", "business", "organization"}),
        ('Location', {"geographic region", "human settlement",
                      "geographic entity", "territorial entity type"}),
    )

    results = list()
    for annotation in response["annotations"]:
        # Collect the class labels once instead of re-scanning them per check.
        class_labels = {el.get('enLabel') for el in annotation.get('wikiDataClasses', ())}

        # Keep only annotations that belong to a desired entity class.
        if class_labels.isdisjoint(ENTITY_TYPES):
            continue

        # First matching group wins; None if ENTITY_TYPES contains a class
        # that none of the groups above covers.
        label = next((name for name, types in label_by_type if class_labels & types), None)

        results.append({
            'title': annotation['title'],
            # Not every annotation is guaranteed to carry a Wikidata item id.
            'wikiId': annotation.get('wikiDataItemId'),
            'label': label,
            'characters': [(el['chFrom'], el['chTo']) for el in annotation.get('support', [])],
        })
    return results

In [None]:
# Example of wikifier output
# Sample passage; it is reused further down as the input text of the pipeline.
test_text = """
Elon Musk is a business magnate, industrial designer, and engineer. 
Elon Musk is the founder, CEO, CTO, and chief designer of SpaceX. 
Elon Musk is also early investor, CEO, and product architect of Tesla, Inc. Elon Musk is also the founder of The Boring Company and the co-founder of Neuralink. A centibillionaire, Musk became the richest person in the world in January 2021, with an estimated net worth of $185 billion at the time, surpassing Jeff Bezos. Musk was born to a Canadian mother and South African father and raised in Pretoria, South Africa. Elon Musk briefly attended the University of Pretoria before moving to Canada aged 17 to attend Queen's University. Elon Musk transferred to the University of Pennsylvania two years later, where Elon Musk received dual bachelor's degrees in economics and physics. Elon Musk moved to California in 1995 to attend Stanford University, but decided instead to pursue a business career. Elon Musk went on co-founding a web software company Zip2 with Elon Musk brother Kimbal Musk.
"""

# Annotate the whole passage in one request (wikifier() calls the remote API)
# and show the returned entity dicts as a table.
test_results = wikifier(test_text)
pd.DataFrame(test_results)

# Load ML models

* Model versions are very important here. We use `spacy==2.1.0`; the `spacy.load('en')` shortcut used below is not available in spaCy 3.x.

In [None]:
# Load the spaCy English model.
# Install it first with: python -m spacy download en
# NOTE(review): `nlp` is not referenced by the other cells visible here - confirm it is still needed.
nlp = spacy.load('en')

# Load the OpenNRE relation-extraction model. This will take quite a while to download.
relation_model = opennre.get_model('wiki80_bert_softmax')

# Load NLTK data.
# punkt is a tokenizer that splits text into sentences.
nltk.download('punkt')

In [None]:
# Cut-offs: `entity_threshold` is forwarded to wikifier(), and a relation is
# kept only when its score is strictly greater than `relation_threshold`.
entity_threshold = 0.8
relation_threshold = 0.5

# Use the test text from above. This will be the future source of input.
text = test_text
entities_list = []
relations_list = []

# Process the text one sentence at a time.
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    # Entities found in this sentence (mention offsets are relative to the sentence).
    entities = wikifier(sentence, threshold=entity_threshold)
    for entity in entities:
        entities_list.append(
            {'title': entity['title'], 'wikiId': entity['wikiId'], 'label': entity['label']}
        )

    # Every ordered (head, tail) pair of distinct entities in the sentence ...
    for head, tail in itertools.permutations(entities, 2):
        # ... and every combination of their mentions.
        for head_span, tail_span in itertools.product(head['characters'], tail['characters']):
            # The end offset is passed as `chTo + 1`.
            prediction = relation_model.infer({
                'text': sentence,
                'h': {'pos': [head_span[0], head_span[1] + 1]},
                't': {'pos': [tail_span[0], tail_span[1] + 1]},
            })
            # prediction[0] is stored as the relation type, prediction[1] is compared to the threshold.
            if prediction[1] > relation_threshold:
                relations_list.append(
                    {'source': head['title'], 'target': tail['title'], 'type': prediction[0]}
                )

Below you can see the results for both the extracted entities and the extracted relations. In their initial state there are many duplicates.

In [None]:
# Raw pipeline results as DataFrames (duplicates not yet removed)

entities_df = pd.DataFrame(entities_list)

relations_df = pd.DataFrame(relations_list)

# Last expression of the cell: show the relations table
relations_df


In [None]:
# simple util for deduplication
def deduplicate_dict(d):
    """
    Remove duplicate dicts from a list of flat dicts.

    Two dicts count as duplicates when they hold the same key/value pairs,
    regardless of key insertion order. The first occurrence of each dict is
    kept and the input order is preserved, so the result is deterministic
    across runs.

    Args:
        d: Iterable of dicts whose values are all hashable.

    Returns:
        A new list of new dict objects (shallow copies); the input is not modified.

    Raises:
        TypeError: if any value is unhashable (e.g. a list).
    """
    seen = set()
    unique = []
    for record in d:
        # frozenset makes the identity independent of key insertion order.
        identity = frozenset(record.items())
        if identity not in seen:
            seen.add(identity)
            unique.append(dict(record))
    return unique

# De-duplicated entities and relations bundled into a single result dict
output = dict(
    entities=deduplicate_dict(entities_list),
    relations=deduplicate_dict(relations_list),
)

# final output ~ish
print(output)