In [23]:
import plac
import random
import pandas as pd 
import textacy.extract
from pathlib import Path
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

In [24]:
# Extract Relationships section

In [25]:
def extract_event_loc_relations(doc):
    # merge entities and noun chunks into one token
    spans = list(doc.ents) + list(doc.noun_chunks)
    for span in spans:
        span.merge()

    relations = []
    for location in filter(lambda w: w.ent_type_ == 'LOC', doc):
        if location.dep_ in ('attr', 'dobj'):
            subject = [w for w in location.head.lefts if w.dep_ == 'nsubj']
            if subject:
                subject = subject[0]
                relations.append((subject, location))
        elif location.dep_ == 'pobj' and location.head.dep_ == 'prep':
            relations.append((location.head.head, location))
    return relations

In [26]:
def extract_person_loc_relations(doc):
    # merge entities and noun chunks into one token
    spans = list(doc.ents) + list(doc.noun_chunks)
    for span in spans:
        span.merge()

    relations_person = []
    for person in filter(lambda w: w.ent_type_ == 'PERSON', doc):
        if person.dep_ in ('attr', 'dobj'):
            subject = [w for w in person.head.lefts if w.dep_ == 'nsubj']
            if subject:
                subject = subject[0]
                relations_person.append((subject, person))
        elif person.dep_ == 'pobj' and person.head.dep_ == 'prep':
            person.append((person.head.head, person))
    return relations_person

In [27]:
# new entity label
LABEL = 'RANK'

# training data
TRAIN_DATA = [
    ('Mandro training centre', {
        'entities': [(0, 22, 'LOC')]
    }),
    ('My name is Marcus BRODY. I am 15 years old', {
        'entities': [(11, 24, 'PERSON')]
    }),
    ('visited the camp one time', {
        'entities': [(0, 25, 'EVENT')]
    }),
    ('I went to a rally in Goma', {
        'entities': [(21, 25, 'LOC')]
    }),
    ('In late 2002', {
        'entities': [(8, 12, 'DATE')]
    }),
    ('there was a lot of fighting in Goma', {
        'entities': [(31, 35, 'LOC')]
    }), 
    ("I remember one of the commanders who spoke was Chief KOBONO", {
        'entities': [(47, 52, 'RANK')]
    }),
    ("The President, Ule MATOBO GOBO", {
        'entities': [(0, 13, 'RANK')]
    }),
     ('there was a lot of fighting in Goma', {
        'entities': [(19, 35, 'EVENT')]
    })
    
]

In [28]:
#add new entity
nlp = spacy.load('en')
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
# otherwise, get it, so we can add labels to it
else:
    ner = nlp.get_pipe('ner')
    ner.add_label(LABEL)   # add new entity label to entity recognizer

optimizer = nlp.begin_training()
for i in range(20):
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], sgd=optimizer)
nlp.to_disk('model')
        
# test the trained model
for text, _ in TRAIN_DATA:
        doc = nlp(text)
        if text != "":
            print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        #print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
        
#df = pd.DataFrame.from_dict(ent.text, ent.label_)
#df.drop_duplicates(keep=False, inplace=True)
#df.head()
#df.to_csv('WITNESS_OUT_NODES.csv',header=['label','node_type'],index_label='Id')

Entities [('Mandro training centre', 'LOC')]
Entities [('The President', 'RANK')]
Entities [('fighting in', 'EVENT'), ('Goma', 'LOC')]
Entities [('fighting in', 'EVENT'), ('Goma', 'LOC')]
Entities [('Chief', 'RANK')]
Entities [('Marcus BRODY.', 'PERSON')]
Entities [('visited the camp one time', 'EVENT')]
Entities [('2002', 'DATE')]
Entities [('Goma', 'LOC')]


In [29]:
#import text file
witness_file = open('witness_text_clean.txt','r')
#witness_file = open('witness2.txt','r')
text_witness = ""
text_witness = witness_file.read() 
if text_witness != "":
   text = text_witness

#load spacy model
nlp = spacy.load('model')
#nlp = spacy.load('en_core_web_lg')
#doc = nlp(text_witness)

#create relationships
doc = nlp(text_witness)
relations = extract_event_loc_relations(doc)
for r1, r2 in relations:
    print('RELATIONSHIP :'+'{:<10}\t{}\t{}'.format(r1.text,r2.ent_type_,r2.text))
    
#if relations !="":
    df = pd.DataFrame.from_dict(relations)
    df.head()
    df.to_csv('WITNESS_STAT_OUTPUT_NODES.csv',header=['EVENT_Took_place_at','LOC'],index_label='Id')
#else:
#    print('NO RELATIONS FOUND')

        
# Print the results
displacy.render(doc, style='ent',jupyter=True)

RELATIONSHIP :fighting  	LOC	Goma
RELATIONSHIP :a rally   	LOC	Goma
RELATIONSHIP :your question	LOC	the Mandro training centre
RELATIONSHIP :the  training centre	LOC	Mandro
