In [None]:
import spacy
from spacy.tokens import Doc

# Load the spaCy pipeline named "en_coreference_web_trf". `nlp` is applied to the
# article text further down to build the Doc whose "coref_cluster*" span groups
# are read by resolve_references().
nlp = spacy.load("en_coreference_web_trf")

In [None]:
import wikipedia

# Fetch the Wikipedia article for the MP under analysis.
# auto_suggest=False makes the lookup use the exact title: with the package
# default (auto_suggest=True) the title is first passed through Wikipedia's
# search suggestion, which can silently load a different page than requested.
wiki = wikipedia.page('Tulip Siddiq', auto_suggest=False)

# Text content of the article; this is the input to the NLP pipeline below.
text = wiki.content

In [None]:
# https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&redirects=1&titles=Chris_Bryant
# TODO: prepend the summary with "X is an MP" so that the first instance of the entity will be the given name, not the full name

In [None]:
def resolve_references(doc):
    """Return the text of ``doc`` with coreferent mentions replaced by their first mention.

    Every span group in ``doc.spans`` whose key starts with ``"coref_cluster"``
    is treated as one cluster. The first span of a cluster is left as written;
    each later span is replaced by the text of that first span.

    Args:
        doc: A processed document. It must expose ``spans`` (a mapping of
            key -> sequence of spans) and be iterable as tokens that carry
            ``idx``, ``text`` and ``whitespace_``.

    Returns:
        str: The rebuilt document text with references resolved.
    """
    # Maps a token's character offset (token.idx) to the text emitted in its place.
    replacements = {}
    clusters = [
        spans for key, spans in doc.spans.items() if key.startswith("coref_cluster")
    ]

    for cluster in clusters:
        mentions = list(cluster)
        if len(mentions) < 2:
            # Nothing to substitute: the cluster has no later mention.
            continue

        antecedent = mentions[0]
        for mention in mentions[1:]:
            if len(mention) == 0:
                continue
            # The replacement stands in for the WHOLE mention, so it must carry
            # the whitespace that followed the mention's last token. Using the
            # first token's whitespace turns "the minister," into "Alice ,".
            replacements[mention[0].idx] = antecedent.text + mention[-1].whitespace_

            # The remaining tokens of the mention are swallowed by the replacement.
            for token in mention[1:]:
                replacements[token.idx] = ""

    pieces = [
        replacements[token.idx] if token.idx in replacements
        else token.text + token.whitespace_
        for token in doc
    ]
    return "".join(pieces)
                

In [None]:
import urllib
from string import punctuation
import json

# Wikidata class labels (``enLabel``) that mark an annotation as an entity of interest.
ENTITY_TYPES = ["human", "person", "company", "enterprise", "business", "geographic region",
                "human settlement", "geographic entity", "territorial entity type", "organization"]

# Label assigned to an annotation, checked in priority order: the first rule
# with a matching Wikidata class wins.
_LABEL_RULES = (
    ("Person", ("human", "person")),
    ("Organization", ("company", "enterprise", "business", "organization")),
    ("Location", ("geographic region", "human settlement", "geographic entity",
                  "territorial entity type")),
)

# Notes on the Wikifier request/response fields used below:
#   pageRankSqThreshold       prunes annotations based on their page ranks
#   applyPageRankSqThreshold  discards all annotations that have been pruned
#   wikiDataClasses           returns the Wikidata list (concept ID, concept name)
#                             of all classes that the concept belongs to
#   maxMentionEntropy         ignores highly ambiguous mentions


def _label_for_classes(class_labels):
    """Return the first label in ``_LABEL_RULES`` matching ``class_labels``, else None."""
    for label, class_names in _LABEL_RULES:
        if any(name in class_labels for name in class_names):
            return label
    return None


def entity_naming(text, lang="en", threshold=0.8, user_key=None):
    """Fetch entity-linking results for ``text`` from the Wikifier web API.

    Args:
        text: Text to annotate.
        lang: Language code sent as the ``lang`` request field.
        threshold: Value sent as ``pageRankSqThreshold``.
        user_key: Wikifier user key. When omitted, the ``WIKIFIER_USER_KEY``
            environment variable is used, falling back to the key that was
            previously embedded in this function.

    Returns:
        list[dict]: One dict per annotation having at least one Wikidata class
        in ``ENTITY_TYPES``, with keys ``'title'``, ``'label'`` (``'Person'``,
        ``'Organization'``, ``'Location'`` or ``None``) and ``'characters'``
        (a list of ``(chFrom, chTo)`` tuples taken from the annotation's
        ``support`` entries).

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        KeyError: If the response carries no ``"annotations"`` field.
    """
    # Imported here so the function does not rely on another library having
    # already loaded the urllib submodules: a bare `import urllib` does not
    # guarantee that `urllib.parse` / `urllib.request` exist.
    import os
    import urllib.parse
    import urllib.request

    if user_key is None:
        # NOTE(review): the literal fallback keeps existing runs working, but a
        # key committed to source should be rotated and supplied through the
        # WIKIFIER_USER_KEY environment variable instead.
        user_key = os.environ.get("WIKIFIER_USER_KEY", "bknexcqfanbxjxnubamwxgdnzybwyz")

    # Prepare the form-encoded request body.
    data = urllib.parse.urlencode([
        ("text", text), ("lang", lang),
        ("userKey", user_key),
        ("pageRankSqThreshold", "%g" % threshold),
        ("applyPageRankSqThreshold", "true"),
        ("support", "true"),
        ("minLinkFrequency", "2"), ("ranges", "false"),
        ("includeCosines", "false"), ("maxMentionEntropy", "3")
    ])
    url = "http://www.wikifier.org/annotate-article"

    # Call the Wikifier API and decode the JSON response.
    req = urllib.request.Request(url, data=data.encode("utf8"), method="POST")
    with urllib.request.urlopen(req, timeout=60) as f:
        response = json.loads(f.read().decode("utf8"))

    results = []
    for annotation in response["annotations"]:
        # Collect the class labels once instead of rescanning them per rule.
        class_labels = {
            wd_class.get('enLabel')
            for wd_class in annotation.get('wikiDataClasses', ())
        }
        # Only keep annotations belonging to a desired entity type.
        if class_labels.isdisjoint(ENTITY_TYPES):
            continue

        results.append({
            'title': annotation['title'],
            'label': _label_for_classes(class_labels),
            'characters': [(e['chFrom'], e['chTo']) for e in annotation.get('support', [])],
        })
    return results

In [40]:
import opennre
import nltk
import itertools
import pprint

def strip_punctuation(s):
    """Return ``s`` with every character found in ``string.punctuation`` dropped."""
    kept = []
    for ch in s:
        if ch in punctuation:
            continue
        kept.append(ch)
    return ''.join(kept)

def deduplicate_dict(d):
    """Return the distinct dicts of ``d`` in first-seen order.

    Two dicts count as duplicates when they hold the same key/value pairs,
    regardless of key insertion order. The result is deterministic: going
    through a plain ``set`` of item tuples, as before, yielded an arbitrary
    order and kept equal dicts whose keys were inserted in a different order.

    Args:
        d: Iterable of dicts whose values are hashable.

    Returns:
        list[dict]: Shallow copies of the first occurrence of each distinct dict.

    Raises:
        TypeError: If a dict contains an unhashable value.
    """
    seen = set()
    unique = []
    for item in d:
        fingerprint = frozenset(item.items())
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        unique.append(dict(item))
    return unique

def nlp_pipeline(doc, relation_model=None, relation_threshold=0.9, verbose=False):
    """Extract linked entities and pairwise relations from a coreference-annotated doc.

    The document text is rebuilt with ``resolve_references``, split into
    sentences, and each sentence (with punctuation stripped) is sent to
    ``entity_naming``. Every ordered pair of distinct entities in a sentence is
    then scored by the relation model, once per pair of character ranges.

    Args:
        doc: Document accepted by ``resolve_references``.
        relation_model: Object with an ``infer`` method. When omitted, the
            OpenNRE ``'tacred_bertentity_softmax'`` model is loaded; pass an
            already-loaded model to avoid reloading it on every call.
        relation_threshold: A relation is kept only if its confidence is
            strictly greater than this value.
        verbose: When true, print the entities found in each sentence.

    Returns:
        tuple: ``(entities_dict, relations)`` where ``entities_dict`` maps an
        entity title to its label and ``relations`` is a de-duplicated list of
        ``{'source', 'target', 'type'}`` dicts.
    """
    if relation_model is None:
        relation_model = opennre.get_model('tacred_bertentity_softmax')
    nltk.download('punkt')

    entities_dict = {}
    relation_dict_list = []
    resolved_txt = resolve_references(doc)
    for sentence in nltk.sent_tokenize(resolved_txt):
        sentence = strip_punctuation(sentence)
        if not sentence.strip():
            # A sentence made only of punctuation is empty by now; there is
            # nothing to link, so skip the web request.
            continue

        entities = entity_naming(sentence)
        if verbose:
            print(entities)
        for entity in entities:
            entities_dict[entity['title']] = entity['label']

        # Ordered pairs: (a, b) and (b, a) are scored separately.
        for head, tail in itertools.permutations(entities, 2):
            for source, target in itertools.product(head['characters'], tail['characters']):
                # The +1 presumably turns an inclusive end offset into the
                # exclusive end the model expects - TODO confirm against the
                # Wikifier and OpenNRE docs.
                prediction = relation_model.infer(
                    {'text': sentence,
                     'h': {'pos': [source[0], source[1] + 1]},
                     't': {'pos': [target[0], target[1] + 1]}})
                # prediction = (relation type, confidence score)
                relation, confidence = prediction[0], prediction[1]
                # Keep only confident, non-'NA' relations.
                if confidence > relation_threshold and relation != 'NA':
                    relation_dict_list.append(
                        {'source': head['title'], 'target': tail['title'], 'type': relation})
    return entities_dict, deduplicate_dict(relation_dict_list)

In [None]:
from database import Database
import os

# driver = Database.init_driver(os.getenv("NEO4J_URI"), os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))
# Run the loaded spaCy pipeline over the article text.
doc = nlp(text)

# Print each pipeline component name with its position in the pipeline.
for i, pipe in enumerate(nlp.pipe_names):
    print(f"{i}, {pipe}")

# nlp_pipeline returns a 2-tuple: (dict of entity title -> label,
# de-duplicated list of relation dicts).
results = nlp_pipeline(doc)

In [34]:
# An assignment cannot appear inside a list comprehension (SyntaxError), so the
# mapping is filled with a plain loop instead.
# NOTE(review): `entities` and `entities_dict` are only visible as locals of
# nlp_pipeline in this notebook; confirm they exist at top level before running.
for entity in entities:
    entities_dict[entity['title']] = entity['label']

Unite the Union Organization employee_of
The Royal School, Hampstead Organization schools_attended
West Hampstead Organization cities_of_residence
King's College London Organization schools_attended
University College London Organization schools_attended
Hampstead Organization cities_of_residence
Royal Society Organization employee_of
Labour Party (UK) Organization employee_of
Royal Society of Arts Organization employee_of
Mill Hill School Organization schools_attended
Sheikh Rehana Person parents
Young Labour (UK) Organization employee_of


In [37]:
def _quote_cypher_identifier(identifier):
    """Backtick-quote a label or relationship type for safe splicing into Cypher."""
    if not isinstance(identifier, str) or not identifier:
        raise ValueError(f"Invalid Cypher identifier: {identifier!r}")
    # Inside a backtick-quoted identifier a literal backtick is written twice.
    return "`" + identifier.replace("`", "``") + "`"


def create_new_rel_work(tx, source_name, target_label, target_name, relation_type):
    """Transaction function: link an existing MP node to a (merged) target node.

    Matches the ``MP`` node named ``source_name``, merges a node with label
    ``target_label`` named ``target_name``, and merges a ``relation_type``
    relationship from the MP to it.

    Labels and relationship types cannot be passed as query parameters, so
    they are backtick-quoted before being placed in the query text. This keeps
    values such as ``TOP_MEMBERS/EMPLOYEES`` valid and stops a crafted value
    from altering the query.

    Args:
        tx: Transaction object exposing ``run(query, **params)``.
        source_name: ``name`` property of the MP node to match.
        target_label: Label of the target node.
        target_name: ``name`` property of the target node.
        relation_type: Type of the relationship to merge.

    Returns:
        The value of ``.single()`` on the query result (the record holding
        ``m`` and ``t``).

    Raises:
        ValueError: If ``target_label`` or ``relation_type`` is not a
            non-empty string.
    """
    label = _quote_cypher_identifier(target_label)
    rel_type = _quote_cypher_identifier(relation_type)
    query = (
        "MATCH (m:MP {name: $source_name}) "
        f"MERGE (t:{label} {{name: $target_name}}) "
        f"MERGE (m)-[:{rel_type}]->(t) "
        "RETURN m, t"
    )
    return tx.run(query, source_name=source_name, target_name=target_name).single()

def create_new_rel(driver, source_name, target_label, target_name, relation_type):
    """Create (merge) a relationship from an MP to a target node and print the target.

    Runs ``create_new_rel_work`` in a write transaction on a fresh session.

    Args:
        driver: Driver object exposing ``session()``.
        source_name: ``name`` of the MP node the relationship starts from.
        target_label: Label of the target node.
        target_name: ``name`` of the target node.
        relation_type: Type of the relationship to merge.

    Raises:
        LookupError: If the write returned no record, i.e. no MP node named
            ``source_name`` was matched.
    """
    # The context manager closes the session even when the write raises;
    # previously an exception skipped session.close() and leaked the session.
    with driver.session() as session:
        record = session.execute_write(create_new_rel_work,
                                       source_name=source_name, target_label=target_label,
                                       target_name=target_name, relation_type=relation_type)

    if record is None:
        # .single() yields None when the MATCH found nothing; fail with a
        # clear message instead of a TypeError on record['t'].
        raise LookupError(f"No MP node named {source_name!r}; nothing was created")

    target = record['t']
    print(target)

In [39]:
from pprint import pprint

driver = Database.init_driver(os.getenv("NEO4J_URI"), os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))

# nlp_pipeline returns a tuple (entities, relations) in which `entities` is
# already a dict of title -> label, so unpack it rather than indexing by key
# (results['entities'] raises TypeError on a tuple).
entity_dict, relations = results

# Only relations that start at the MP under analysis are written to the graph.
MP_NAME = 'Tulip Siddiq'
filtered_triples = [triple for triple in relations if triple['source'] == MP_NAME]

for trip in filtered_triples:
    target_name = trip['target']
    target_label = entity_dict[target_name]
    # Strip the prefix and capitalise the relation type so that only the type
    # remains: 'EMPLOYEE_OF' instead of 'per:employee_of'. split(':', 1)[-1]
    # also copes with a type that has no prefix.
    relation_type = trip['type'].split(':', 1)[-1].upper()
    create_new_rel(driver, MP_NAME, target_label, target_name, relation_type)


Already intialised
<Node element_id='767' labels=frozenset({'Organization'}) properties={'name': 'Unite the Union'}>
<Node element_id='768' labels=frozenset({'Organization'}) properties={'name': 'The Royal School, Hampstead'}>
<Node element_id='769' labels=frozenset({'Organization'}) properties={'name': 'West Hampstead'}>
<Node element_id='770' labels=frozenset({'Organization'}) properties={'name': "King's College London"}>
<Node element_id='771' labels=frozenset({'Organization'}) properties={'name': 'University College London'}>
<Node element_id='772' labels=frozenset({'Organization'}) properties={'name': 'Hampstead'}>
<Node element_id='773' labels=frozenset({'Organization'}) properties={'name': 'Royal Society'}>
<Node element_id='774' labels=frozenset({'Organization'}) properties={'name': 'Labour Party (UK)'}>
<Node element_id='775' labels=frozenset({'Organization'}) properties={'name': 'Royal Society of Arts'}>
<Node element_id='776' labels=frozenset({'Organization'}) properties={'n