In [46]:
import plac
import random
import pandas as pd
import textacy.extract
from pathlib import Path
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

In [47]:
def extract_lives_in_relations(doc):
    # merge entities and noun chunks into one token
    spans = list(doc.ents) + list(doc.noun_chunks)
    for span in spans:
        span.merge()

    relations = []
    for location in filter(lambda w: w.ent_type_ == 'LOC', doc):
        if location.dep_ in ('attr', 'dobj'):
            subject = [w for w in location.head.lefts if w.dep_ == 'nsubj']
            if subject:
                subject = subject[0]
                relations.append((subject, location))
        elif location.dep_ == 'pobj' and location.head.dep_ == 'prep':
            relations.append((location.head.head, location))
    return relations

In [48]:
# training data

# new entity label
LABEL = 'RANK'

TRAIN_DATA = [
    ('My name is Marcus BRODY. I am 15 years old', {
        'entities': [(11, 24, 'PERSON')]
    }),
    ('I went to a rally in Goma', {
        'entities': [(21, 25, 'LOC')]
    }),
    ('In late 2002', {
        'entities': [(8, 12, 'DATE')]
    }),
    ('there was a lot of fighting in Goma', {
        'entities': [(31, 35, 'LOC')]
    }), 
    ("I remember one of the commanders who spoke was Chief KOBONO", {
        'entities': [(47, 52, 'RANK')]
    }),
    ("Chief", {
        'entities': [(0, 5, 'RANK')]
    }),
     ('there was a lot of fighting in Goma', {
        'entities': [(19, 35, 'EVENT')]
    })
    
]

#add new entity
nlp = spacy.load('en')
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
# otherwise, get it, so we can add labels to it
else:
    ner = nlp.get_pipe('ner')
    ner.add_label(LABEL)   # add new entity label to entity recognizer

optimizer = nlp.begin_training()
for i in range(20):
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], sgd=optimizer)
nlp.to_disk('model')
        
# test the trained model
for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        #print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
        
#import text file
witness_file = open('witness_text_clean.txt','r')
text_witness = ""
text_witness = witness_file.read() 
if text_witness != "":
   text = text_witness

#load spacy model
nlp = spacy.load('model')
#nlp = spacy.load('en_core_web_lg')
doc = nlp(text_witness)

for ent in doc.ents:
    print('LABEL: ',ent.text,'|| NODE TYPE:', ent.label_)
    # Extract semi-structured statements
    #print("Here are the things I know about Person:")
    #statements = textacy.extract.semistructured_statements(doc, "I")
    #print('LABEL: ',ent.text,'|| NODE TYPE:', ent.label_)
    #for statement in statements:
    #    subject, verb, fact = statement
    #    print({fact})

#create relationships
#for text in text_witness:
doc = nlp(text_witness)
relations = extract_lives_in_relations(doc)
for r1, r2 in relations:
    print('RELATIONSHIP :'+'{:<10}\t{}\t{}'.format(r1.text,r2.ent_type_,r2.text))   
    
    
import csv    
with open('WITNESS_OUTPUT_NODES.csv', 'w') as csvfile:
    fieldnames = ['label', 'node_type']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for ent in doc.ents:
        writer.writerow({'label': ent.text,'node_type': ent.label_})
        
#with open('WITNESS_OUTPUT_RELATIONS.csv', 'w') as csvfile:
relations = extract_lives_in_relations(doc)
#    for r1, r2 in relations:
#        #print('RELATIONSHIP :'+'{:<10}\t{}\t{}'.format(r1.text,r2.ent_type_,r2.text))   
#        fieldnames = ['event','LOC']
#        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#        writer.writeheader()
        #for ent in doc.ents:
#        writer.writerow({'event': r1.text,'LOC': r2.text})
        
df = pd.DataFrame.from_dict(relations)
df.drop_duplicates(keep=False, inplace=True)
df.head()
df.to_csv('WITNESS_OUTPUT_RELATIONS.csv',header=['label','node_type'],index_label='Id')
#df.dropna(subset=['name', 'born'])
        
# Print the results

#displacy.render(doc, style='ent',jupyter=True)

Entities [('Goma', 'LOC')]
Entities [('Marcus BRODY.', 'PERSON')]
Entities [('Chief', 'RANK')]
Entities [('Goma', 'LOC')]
Entities [('Chief', 'RANK')]
Entities [('Goma', 'LOC')]
Entities [('2002', 'DATE')]
LABEL:  Marcus BRODY. || NODE TYPE: PERSON
LABEL:  2002 || NODE TYPE: DATE
LABEL:  Goma || NODE TYPE: LOC
LABEL:    || NODE TYPE: LOC
LABEL:  Rebels || NODE TYPE: PRODUCT
LABEL:  Goma || NODE TYPE: LOC
LABEL:  Government || NODE TYPE: LOC
LABEL:  Congo || NODE TYPE: GPE
LABEL:    || NODE TYPE: LOC
LABEL:  Chief || NODE TYPE: RANK
LABEL:  Rebels || NODE TYPE: FAC
LABEL:    || NODE TYPE: ORG
LABEL:  Goma || NODE TYPE: LOC
LABEL:  Government || NODE TYPE: LOC
LABEL:  Congo || NODE TYPE: GPE
LABEL:    || NODE TYPE: LOC
LABEL:  Chief || NODE TYPE: RANK
LABEL:  Rebels || NODE TYPE: FAC
LABEL:    || NODE TYPE: ORG
LABEL:  Kalemie || NODE TYPE: GPE
LABEL:    || NODE TYPE: ORDINAL
LABEL:    || NODE TYPE: LOC
LABEL:  first || NODE TYPE: ORDINAL
LABEL:  first || NODE TYPE: ORDINAL
LABEL:    || 