In [3]:
import spacy
from spacy.tokens import Doc

nlp = spacy.load("en_coreference_web_trf")

In [8]:
import pprint
def resolve_references(doc):
    token_mention_mapper = {}
    output_str = ""
    clusters = [
        val for key, val in doc.spans.items() if key.startswith("coref_cluster")
    ]
    
    for cluster in clusters:
        first_mention = cluster[0]
        # replace mentions of an entity with the text of the first mention
        for mention_span in list(cluster)[1:]:
            # key is the index of the mention in the original string
            # value is the first mention + the whitespace of the current mention
            token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_
            
            # if there are any other words in the mention, then replace them with the empty string
            for token in mention_span[1:]:
                token_mention_mapper[token.idx] = ""
    
    # loop through all tokens in original string, if the word/phrase has been identified as an entity
    # then replace it with the value given above
    print(pprint.pprint(token_mention_mapper))
    for token in doc:
        if token.idx in token_mention_mapper:
            output_str += token_mention_mapper[token.idx]
        else:
            output_str += token.text + token.whitespace_
    return output_str
                

In [5]:
import requests
from string import punctuation
import json

label_map = {
        "person": "Person",
        "school": "School",
        "higher education institution": "University",
        "city/town": "Location",
        "country": "Location",
        "geographic region": "Location",
        "location": "Location",
        "company": "Organisation",
        "business": "Organisation",
        "organization": "Organisation",
    }

def get_label(annotation_classes):
    for wiki_class in annotation_classes:
        label = label_map.get(wiki_class['enLabel'])
        if label:
            return label
    return None

'''
pageRankSqThreshold prunes annotations based on their page ranks
applyPageRankSqThreshold discards all annotations that have been pruned
wikiDataClasses returns wikidata list (concept ID, concept name) for all classes that the concept belongs to
maxMentionEntropy ignore highly ambiguous mentions
'''

def entity_naming(text, threshold=0.8):
    print(text)
    """Function that fetches entity linking results from wikifier.com API"""
    # Prepare the URL.
    #TODO: move API key to .env file
    url = "http://www.wikifier.org/annotate-article"
    data = {
        "text": text,
        "lang": "en",
        "userKey": "bknexcqfanbxjxnubamwxgdnzybwyz",
        # prune annotations based on page rank
        "pageRankSqThreshold": threshold,
        # discard all annotations that have been pruned
        "applyPageRankSqThreshold": "true",
        "support": "true",
        "minLinkFrequency": "true",
        "ranges": "false",
        "includeCosines": "false",
        # ignore ambiguous mentions
        "maxMentionEntropy": "3"
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(url, data=data, headers=headers, timeout=60)
    # call wikifier api and read the response
    if response.status_code == 200:
        response = json.loads(response.content.decode('utf8'))
    else:
        print(f"Error: {response.status_code} when using wikifier API")
        return None
    # output the annotations
    results = []
    for annotation in response['annotations']:
        # only get desired annotations
        if ('wikiDataClasses' in annotation):
            # Get entity label
            label = get_label(annotation['wikiDataClasses'])
            # If label is returned then add to results dict
            if label is not None:
                results.append({'title': annotation['title'], 'label': label,
                                'characters': [(data['chFrom'], data['chTo']) for data in annotation['support']]})
    return results

In [17]:
import opennre
import nltk
import itertools
import pprint
import time

def strip_punctuation(s):
    """Removes all punctuation from a string"""
    return ''.join(c for c in s if c not in punctuation)

def deduplicate_dict(d):
    return [dict(y) for y in set(tuple(x.items()) for x in d)]

def nlp_pipeline(doc):
    relation_model = opennre.get_model('tacred_bertentity_softmax')
    tokeniser = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    tokeniser.add_pipe('sentencizer')
    relation_threshold = 0.9

    entities_dict = {}
    relation_dict_list = []
    resolved_txt = resolve_references(doc)
    
    tokenised_txt = tokeniser(resolved_txt)

    for sentence in tokenised_txt.sents:
        sentence = strip_punctuation(sentence.text)
        entities = entity_naming(sentence)
        for entity in entities:
            entities_dict[entity['title']] = entity['label']
        # TODO: only do this for source nodes with title = MP name
        for p in itertools.permutations(entities, 2):
            for source in p[0]['characters']:
                for target in p[1]['characters']:
                    data = relation_model.infer(
                        {'text': sentence, 
                        'h': {'pos': [source[0], source[1] + 1]}, 
                        't': {'pos': [target[0], target[1] + 1]}})
                    # if confident in infered relationship
                    # data = (str:type of relationship, int:confidence)
                    if data[1] > relation_threshold and data[0] != 'NA':
                        relation_dict_list.append(
                            {'source': p[0]['title'], 'target': p[1]['title'], 'type': data[0]})
    return entities_dict, deduplicate_dict(relation_dict_list)

In [None]:
def create_new_rel_work(tx, source_name, target_label, target_name, relation_type):
    return tx.run(f"MATCH (m:MP {{name: $source_name}}) \
                  MERGE (t:{target_label} {{name: $target_name}}) \
                  MERGE (m)-[:{relation_type}]->(t) \
                  RETURN m, t",
                  source_name=source_name, target_name=target_name).single()

def create_new_rel(driver, source_name, target_label, target_name, relation_type):
    session = driver.session()

    record = session.execute_write(create_new_rel_work,
                                   source_name=source_name, target_label=target_label,
                                   target_name=target_name, relation_type=relation_type)
    
    target = record['t']
    print(target)

    session.close()

In [18]:
from database import Database
import os

import wikipedia

wiki = wikipedia.page('Emily Thornberry')
text = wiki.content
text = 'Emily Thornberry. ' + text
# only interested in wikipedia content before the references section
text = text.split("== References ==")[0]

# text = ("""Emily Anne Thornberry (born 27 July 1960) is a British politician who has been Member of Parliament (MP) for Islington South and Finsbury since 2005. Emily Thornberry is member of the Labour Party, she has served as Shadow Attorney General for England and Wales since 2021, and previously from 2011 to 2014. She has also served as Shadow Secretary of State for Foreign and Commonwealth Affairs from 2016 to 2020, Shadow First Secretary of State from 2017 to 2020 and Shadow Secretary of State for International Trade from 2020 to 2021.""")

doc = nlp(text)
resolve_references(doc)

for i, pipe in enumerate(nlp.pipe_names):
    print(f"{i}, {pipe}")

entities, relations = nlp_pipeline(doc)
print(entities)

filtered_triples = [triple for triple in relations if triple['source'] == 'Emily Thornberry']
print(filtered_triples)

2023-04-13 15:34:29,429 - root - INFO - Loading BERT pre-trained checkpoint.


{180: 'Emily Anne Thornberry (born 27 July 1960) ',
 290: 'Emily Anne Thornberry (born 27 July 1960) ',
 440: '2020 ',
 504: '2020 ',
 512: '2021',
 517: 'Emily Anne Thornberry (born 27 July 1960)',
 518: '',
 522: '',
 531: '',
 534: '',
 536: '',
 544: '',
 548: '',
 550: '',
 560: 'Emily Anne Thornberry (born 27 July 1960) ',
 708: 'Emily Anne Thornberry (born 27 July 1960) ',
 757: '2005 ',
 816: 'Emily Anne Thornberry (born 27 July 1960) ',
 848: 'Parliament ',
 862: '2005 ',
 909: 'England and Wales ',
 917: '',
 921: '',
 964: '2011 ',
 975: 'Emily Anne Thornberry (born 27 July 1960) ',
 991: '2014 ',
 1087: 'the Labour Party ',
 1094: '',
 1121: 'Emily Anne Thornberry (born 27 July 1960) ',
 1340: 'Emily Anne Thornberry (born 27 July 1960) ',
 1371: 'Jeremy Corbyn ',
 1391: 'the Labour Party ',
 1395: '',
 1402: '',
 1411: 'the 2015 Labour Party leadership election ',
 1415: '',
 1420: '',
 1431: '',
 1531: 'Emily Anne Thornberry (born 27 July 1960) ',
 1701: 'Emily Anne Thornb

Some weights of the model checkpoint at /Users/fyjca2/.opennre/pretrain/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{180: 'Emily Anne Thornberry (born 27 July 1960) ',
 290: 'Emily Anne Thornberry (born 27 July 1960) ',
 440: '2020 ',
 504: '2020 ',
 512: '2021',
 517: 'Emily Anne Thornberry (born 27 July 1960)',
 518: '',
 522: '',
 531: '',
 534: '',
 536: '',
 544: '',
 548: '',
 550: '',
 560: 'Emily Anne Thornberry (born 27 July 1960) ',
 708: 'Emily Anne Thornberry (born 27 July 1960) ',
 757: '2005 ',
 816: 'Emily Anne Thornberry (born 27 July 1960) ',
 848: 'Parliament ',
 862: '2005 ',
 909: 'England and Wales ',
 917: '',
 921: '',
 964: '2011 ',
 975: 'Emily Anne Thornberry (born 27 July 1960) ',
 991: '2014 ',
 1087: 'the Labour Party ',
 1094: '',
 1121: 'Emily Anne Thornberry (born 27 July 1960) ',
 1340: 'Emily Anne Thornberry (born 27 July 1960) ',
 1371: 'Jeremy Corbyn ',
 1391: 'the Labour Party ',
 1395: '',
 1402: '',
 1411: 'the 2015 Labour Party leadership election ',
 1415: '',
 1420: '',
 1431: '',
 1531: 'Emily Anne Thornberry (born 27 July 1960) ',
 1701: 'Emily Anne Thornb

In [20]:
import pprint
print(pprint.pprint(relations))

[{'source': 'Emily Thornberry',
  'target': "Transport and General Workers' Union",
  'type': 'per:employee_of'},
 {'source': 'Emily Thornberry',
  'target': 'University of Kent',
  'type': 'per:schools_attended'},
 {'source': 'Caroline Flint',
  'target': 'Labour Party (UK)',
  'type': 'per:employee_of'},
 {'source': 'Charles Hendry',
  'target': 'Department of Energy and Climate Change',
  'type': 'per:employee_of'},
 {'source': 'Emily Thornberry',
  'target': 'Department for Work and Pensions',
  'type': 'per:title'},
 {'source': 'Emily Thornberry',
  'target': 'Liberal Democrats (UK)',
  'type': 'per:employee_of'},
 {'source': 'Emily Thornberry',
  'target': 'Department of Energy and Climate Change',
  'type': 'per:employee_of'},
 {'source': 'Emily Thornberry',
  'target': 'Islington',
  'type': 'per:cities_of_residence'},
 {'source': 'Labour Party (UK)',
  'target': 'Keir Starmer',
  'type': 'org:top_members/employees'},
 {'source': 'Ed Miliband',
  'target': 'Labour Party (UK)',


In [None]:
from pprint import pprint

driver = Database.init_driver(os.getenv("NEO4J_URI"), os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))

filtered_triples = [triple for triple in relations if triple['source'] == 'Chris Bryant']

for trip in filtered_triples:
    target_name = trip['target']
    target_label = entities[target_name]
    # stripand capitalise relation type to only include type, so 'EMPLOYEE_OF' instead of 'per:employee_of'
    relation_type = trip['type'].split(':')[1].upper()
    create_new_rel(driver, 'Chris Bryant', target_label, target_name, relation_type)
