In [None]:
def resolve_references(doc):
    """Return the text of ``doc`` with coreferent mentions replaced by their first mention.

    Every entry of ``doc.spans`` whose key starts with ``"coref_cluster"`` is
    treated as one cluster of mentions of the same entity. The first mention of
    a cluster is kept as-is; each later mention is replaced by the text of the
    first one.

    Args:
        doc: An object that iterates over tokens (each with ``idx``, ``text``
            and ``whitespace_``) and exposes a ``spans`` mapping of cluster
            name to a sequence of mention spans, e.g. a spaCy ``Doc`` produced
            by a coreference pipeline.

    Returns:
        The rewritten text as a single string.
    """
    # Maps a token's character offset in the original text to the string that
    # must be emitted instead of that token.
    replacements = {}

    for key, cluster in doc.spans.items():
        if not key.startswith("coref_cluster"):
            continue
        mentions = list(cluster)
        if not mentions:
            continue
        first_mention_text = mentions[0].text

        for mention_span in mentions[1:]:
            # The replacement is emitted at the position of the mention's first
            # token, followed by the whitespace that trailed the mention as a
            # whole (i.e. that of its LAST token). Using the first token's
            # whitespace would turn "the man, ..." into "John , ...".
            replacements[mention_span[0].idx] = (
                first_mention_text + mention_span[-1].whitespace_
            )
            # The remaining tokens of the mention are swallowed entirely.
            for token in mention_span[1:]:
                replacements[token.idx] = ""

    # Rebuild the text token by token, substituting where a replacement exists.
    return "".join(
        replacements[token.idx]
        if token.idx in replacements
        else token.text + token.whitespace_
        for token in doc
    )
                

In [None]:
import requests
from string import punctuation
import json

# Translation table from the English name of a Wikidata class (the 'enLabel'
# field of a class entry) to the coarse label used for that kind of entity.
label_map = {
    # people
    "person": "Person",
    # education
    "school": "School",
    "higher education institution": "University",
    # places
    "city/town": "Location",
    "country": "Location",
    "geographic region": "Location",
    "location": "Location",
    # politics
    "political party": "Party",
    # organisations
    "company": "Organisation",
    "business": "Organisation",
    "organization": "Organisation",
}


def get_label(annotation_classes):
    """Return the label of the first class in ``annotation_classes`` known to ``label_map``.

    Each element must be a mapping with an ``'enLabel'`` key. Classes are
    checked in order; ``None`` is returned when none of them maps to a label.
    """
    candidate_labels = (
        label_map.get(wiki_class['enLabel']) for wiki_class in annotation_classes
    )
    return next((label for label in candidate_labels if label), None)

'''
pageRankSqThreshold prunes annotations based on their page ranks
applyPageRankSqThreshold discards all annotations that have been pruned
wikiDataClasses returns wikidata list (concept ID, concept name) for all classes that the concept belongs to
maxMentionEntropy ignores highly ambiguous mentions
'''

def entity_naming(text, threshold=0.8):
    """Fetch entity-linking annotations for ``text`` from the wikifier.org API.

    Args:
        text: The text to annotate.
        threshold: Value sent as ``pageRankSqThreshold``.

    Returns:
        A list with one dict per annotation whose Wikidata classes map to a
        known label (see ``get_label``). Each dict has the keys ``'title'``,
        ``'label'`` and ``'characters'``, the latter being a list of
        ``(chFrom, chTo)`` tuples taken from the annotation's ``support``
        entries. Returns ``None`` (after printing a message) when the API
        answers with a status code other than 200.
    """
    # Imported locally so this function does not depend on import order
    # elsewhere in the file.
    import os

    url = "http://www.wikifier.org/annotate-article"
    payload = {
        "text": text,
        "lang": "en",
        # Prefer a key supplied through the environment; the literal is kept
        # only as a backward-compatible fallback.
        # TODO: remove the hard-coded fallback once the key lives in .env
        "userKey": os.getenv("WIKIFIER_USER_KEY", "bknexcqfanbxjxnubamwxgdnzybwyz"),
        # prune annotations based on page rank
        "pageRankSqThreshold": threshold,
        # discard all annotations that have been pruned
        "applyPageRankSqThreshold": "true",
        # The loop below only keeps annotations carrying 'wikiDataClasses',
        # so request that field explicitly instead of relying on a default.
        "wikiDataClasses": "true",
        "support": "true",
        # NOTE(review): "true" looks like an odd value for a frequency
        # parameter - confirm against the Wikifier documentation.
        "minLinkFrequency": "true",
        "ranges": "false",
        "includeCosines": "false",
        # ignore ambiguous mentions
        "maxMentionEntropy": "3"
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(url, data=payload, headers=headers, timeout=60)

    if response.status_code != 200:
        print(f"Error: {response.status_code} when using wikifier API")
        return None
    body = json.loads(response.content.decode('utf8'))

    results = []
    # A 200 response without an 'annotations' key yields no results rather
    # than a KeyError.
    for annotation in body.get('annotations', []):
        # only annotations with Wikidata classes can be labelled
        if 'wikiDataClasses' not in annotation:
            continue
        label = get_label(annotation['wikiDataClasses'])
        if label is None:
            continue
        results.append({
            'title': annotation['title'],
            'label': label,
            'characters': [
                (span['chFrom'], span['chTo'])
                for span in annotation.get('support', [])
            ],
        })
    return results

In [None]:
import opennre
import spacy

def nlp_pipeline(doc, mp_name):
    """Extract relations between ``mp_name`` and other entities mentioned in ``doc``.

    The document is first passed through ``resolve_references``, then split
    into sentences. For every sentence, entities are fetched with
    ``entity_naming`` and the relation model is asked for the relation between
    each occurrence of ``mp_name`` and each occurrence of every other entity
    (excluding entities labelled ``'Party'``).

    Args:
        doc: The document handed to ``resolve_references``.
        mp_name: Entity title used as the source of every relation; it must
            also match the sentence text at the reported character positions.

    Returns:
        A tuple ``(entities_dict, unique_relations)`` where ``entities_dict``
        maps target entity title to its label and ``unique_relations`` is a
        de-duplicated list of ``{'source', 'target', 'type'}`` dicts.
    """
    relation_model = opennre.get_model('tacred_bertentity_softmax')
    tokeniser = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    tokeniser.add_pipe('sentencizer')

    # minimum score (second element of the model output) to keep a relation
    relation_threshold = 0.9

    resolved_txt = resolve_references(doc)

    entities_dict = {}
    relation_dict_list = []

    for sentence_span in tokeniser(resolved_txt).sents:
        # strip punctuation
        sentence = ''.join(char for char in sentence_span.text if char not in punctuation)
        if not sentence.strip():
            # nothing left to annotate; avoid a pointless API round trip
            continue

        # entity_naming returns None when the API call fails; treat that as
        # "no entities in this sentence" instead of crashing on iteration.
        entities = entity_naming(sentence) or []

        # should only be 1 entity with mp_name as title, so take the characters
        # of that entry, or an empty list if not found
        mp_positions = next(
            (entity['characters'] for entity in entities if entity['title'] == mp_name),
            [],
        )
        # keep only positions whose text really is mp_name; the end index is
        # used inclusively, hence the +1
        valid_mp_pos = [t for t in mp_positions if sentence[t[0]:t[1] + 1] == mp_name]

        for entity in entities:
            # don't want target to be the MP themselves, or their political party
            if entity['title'] == mp_name or entity['label'] == 'Party':
                continue
            for mp_pos in valid_mp_pos:
                for target in entity['characters']:
                    data = relation_model.infer(
                        {'text': sentence,
                         'h': {'pos': [mp_pos[0], mp_pos[1] + 1]},
                         't': {'pos': [target[0], target[1] + 1]}}
                    )
                    # data[0] is the relation name, data[1] its score
                    if data[1] > relation_threshold and data[0] != 'NA':
                        relation_dict_list.append(
                            {'source': mp_name, 'target': entity['title'], 'type': data[0]})
                        entities_dict[entity['title']] = entity['label']

    # deduplicate the list of dicts: frozenset(items) is a hashable key that is
    # equal for dicts with equal contents
    unique_relations = list({frozenset(d.items()): d for d in relation_dict_list}.values())

    return entities_dict, unique_relations

In [None]:
def _quote_cypher_identifier(name):
    """Backtick-quote ``name`` so it can be used as a Cypher label or relationship type."""
    # A literal backtick inside a quoted identifier is written as two backticks.
    return "`" + str(name).replace("`", "``") + "`"


def create_new_rel_work(tx, source_name, target_label, target_name, relation_type):
    """Transaction function: link an existing MP node to a (possibly new) target node.

    Matches the ``MP`` node named ``source_name``, merges a node with label
    ``target_label`` named ``target_name``, and merges a relationship of type
    ``relation_type`` from the MP to that node.

    Args:
        tx: Transaction object providing ``run(query, **params)``.
        source_name: Value of the MP node's ``name`` property.
        target_label: Label of the target node.
        target_name: Value of the target node's ``name`` property.
        relation_type: Type of the relationship to merge.

    Returns:
        Whatever ``.single()`` yields on the result of ``tx.run`` for the
        ``RETURN m, t`` clause.
    """
    # Labels and relationship types cannot be passed as query parameters, so
    # they are interpolated - but quoted, so that names containing characters
    # such as '/' stay valid and cannot change the structure of the query.
    label = _quote_cypher_identifier(target_label)
    rel_type = _quote_cypher_identifier(relation_type)
    query = (
        "MATCH (m:MP {name: $source_name}) "
        f"MERGE (t:{label} {{name: $target_name}}) "
        f"MERGE (m)-[:{rel_type}]->(t) "
        "RETURN m, t"
    )
    return tx.run(query, source_name=source_name, target_name=target_name).single()

def create_new_rel(driver, source_name, target_label, target_name, relation_type):
    """Create the relationship in a write transaction and print the target node.

    Opens a session on ``driver``, runs ``create_new_rel_work`` through
    ``session.execute_write`` and prints the ``'t'`` entry of the returned
    record. The session is closed even when the transaction raises.

    Args:
        driver: Object providing ``session()``.
        source_name: Name of the MP node the relationship starts from.
        target_label: Label of the target node.
        target_name: Name of the target node.
        relation_type: Type of the relationship.
    """
    session = driver.session()
    try:
        record = session.execute_write(create_new_rel_work,
                                       source_name=source_name, target_label=target_label,
                                       target_name=target_name, relation_type=relation_type)

        if record is None:
            # No record came back from the query (e.g. the MATCH on the MP
            # node found nothing), so there is no target to show.
            print(f"No result for MP {source_name!r}; relationship to {target_name!r} not created")
            return

        target = record['t']
        print(target)
    finally:
        # release the session on every path, including exceptions
        session.close()

In [None]:
from database import Database
import os
import pprint
import wikipedia
import spacy

# Pipeline whose output document carries the "coref_cluster*" span groups
# consumed by resolve_references.
nlp = spacy.load("en_coreference_web_trf")

mp_name = 'Emily Thornberry'

wiki = wikipedia.page(mp_name)
text = wiki.content
# prepend wiki text with MP name as in graph as sometimes the wikipedia articles include middle names
text = f'{mp_name}. ' + text
# only interested in wikipedia content before the references section
text = text.split("== References ==")[0]

doc = nlp(text)
# resolve_references(doc) is not called here: its result was discarded and
# nlp_pipeline performs the same resolution itself.

for i, pipe in enumerate(nlp.pipe_names):
    print(f"{i}, {pipe}")

entities, relations = nlp_pipeline(doc, mp_name)
# pprint.pprint prints and returns None, so it must not be wrapped in print()
pprint.pprint(entities)

for r in relations:
    pprint.pprint(r)

In [None]:
# Connection settings are read from the environment.
# NOTE(review): the driver is never closed in this cell - confirm whether
# Database manages its lifetime.
driver = Database.init_driver(os.getenv("NEO4J_URI"), os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))

for trip in relations:
    target_name = trip['target']
    target_label = entities[target_name]
    # strip and capitalise relation type to only include type, so 'EMPLOYEE_OF' instead of 'per:employee_of'
    type_parts = trip['type'].split(':')
    # a relation type without a "prefix:" part is used as-is instead of
    # raising IndexError
    relation_type = (type_parts[1] if len(type_parts) > 1 else type_parts[0]).upper()
    create_new_rel(driver, mp_name, target_label, target_name, relation_type)
