In [19]:
import json
import logging
import sys
import argparse
import os
import json
import pickle
import random
import spacy

import pandas as pd
from pathlib import Path
from spacy.util import minibatch, compounding
from spacy import displacy

In [2]:
# Rule based NER
s = "From: John Smith \n To: Jim Bob \n Hello Jim, I am writing to wish you a happy birthday. Best, John "
#(?<=From:)(.*)(?=\n)
import re
sender = re.search(r'From:(.*?)\n', s).group(1) # Get strings between From: and \n
receiver = re.search(r'To:(.*?)\n', s).group(1) # Get strings between To: and \n
content = re.search(r'.*\n(.*)$',s).group(1) # Get strings after the last occurence of \n
print('Sender: ' + sender , '\nReceiver: '+ receiver, '\nContent: '+ content)

Sender:  John Smith  
Receiver:  Jim Bob  
Content:  Hello Jim, I am writing to wish you a happy birthday. Best, John 


In [1]:
# Feature based
with open('train.txt', 'r') as file :
    filedata = file.read()

In [15]:
# Only keep the word and the tag column and save the data as 'tsv' file
data = pd.read_csv("train.txt", delimiter= ' ', header = None)
data = data.drop([1, 2], axis=1)
data.to_csv("sec_good.tsv", sep = "\t", index=False)


In [16]:
# Transform data in 'tsv' format to data in 'json' format
def tsv_to_json_format(input_path,output_path,unknown_label):
    try:
        f=open(input_path,'r') # input file
        fp=open(output_path, 'w') # output file
        data_dict={}
        annotations =[]
        label_dict={}
        s=''
        start=0
        for line in f:
            if line[0:len(line)-1]!='.\tO':
                word,entity=line.split('\t')
                s+=word+" "
                entity=entity[:len(entity)-1]
                if entity!=unknown_label:
                    if len(entity) != 1:
                        #print(len(entity),"Yes")
                        d={}
                        d['text']=word
                        #print(d['text'])
                        d['start']=start
                        d['end']=start+len(word)-1  
                        #print(d['start'],d['end'])
                        try:
                            label_dict[entity].append(d)
                        except:
                            label_dict[entity]=[]
                            label_dict[entity].append(d) 
                start+=len(word)+1
            else:
                data_dict['content']=s
                s=''
                label_list=[]
                for ents in list(label_dict.keys()):
                    for i in range(len(label_dict[ents])):
                        if(label_dict[ents][i]['text']!=''):
                            l=[ents,label_dict[ents][i]]
                            for j in range(i+1,len(label_dict[ents])): 
                                if(label_dict[ents][i]['text']==label_dict[ents][j]['text']):  
                                    di={}
                                    di['start']=label_dict[ents][j]['start']
                                    di['end']=label_dict[ents][j]['end']
                                    di['text']=label_dict[ents][i]['text']
                                    l.append(di)
                                    label_dict[ents][j]['text']=''
                            label_list.append(l)                          
                            
                for entities in label_list:
                    label={}
                    label['label']=[entities[0]]
                    label['points']=entities[1:]
                    annotations.append(label)
                data_dict['annotation']=annotations
                annotations=[]
                json.dump(data_dict, fp)
                fp.write('\n')
                data_dict={}
                start=0
                label_dict={}
    except Exception as e:
        logging.exception("Unable to process file" + "\n" + "error = " + str(e))
        return None

tsv_to_json_format("sec_good.tsv",'sec_good.json','abc')

In [17]:
# Load data in 'json' format and transform it to the required format of spaCy
training_data = []
lines=[]
with open("sec_good.json", 'r') as f:
    lines = f.readlines()

for line in lines:
    data = json.loads(line)
    text = data['content']
    entities = []
    for annotation in data['annotation']:
        point = annotation['points'][0]
        labels = annotation['label']
        if not isinstance(labels, list):
            labels = [labels]

        for label in labels:
            entities.append((point['start'], point['end'] + 1 ,label))


    training_data.append((text, {"entities" : entities}))

print(training_data)

with open('train', 'wb') as fp:
    pickle.dump(training_data, fp)

[('0 Subordinated Loan Agreement - Silicium de Provence SAS and Evergreen Solar Inc ', {'entities': [(32, 40, 'I-ORG'), (41, 43, 'I-ORG'), (44, 52, 'I-ORG'), (53, 56, 'I-ORG'), (61, 70, 'I-ORG'), (71, 76, 'I-ORG'), (77, 80, 'I-ORG')]}), ('7 - December 2007 [ HERBERT SMITH LOGO ] ................................ 2007 SILICIUM DE PROVENCE SAS and EVERGREEN SOLAR , INC ', {'entities': [(20, 27, 'I-PER'), (28, 33, 'I-PER'), (79, 87, 'I-ORG'), (88, 90, 'I-ORG'), (91, 99, 'I-ORG'), (100, 103, 'I-ORG'), (108, 117, 'I-ORG'), (118, 123, 'I-ORG')]}), ('SUBORDINATED LOAN AGREEMENT HERBERT SMITH LLP Page 1 of 12 7 - December 2007 TABLE OF CONTENTS Clause Headings Page 1 ', {'entities': [(28, 35, 'I-PER'), (36, 41, 'I-PER')]}), ('INTERPRETATION 3 2 ', {'entities': []}), ('LOAN 4 3 ', {'entities': []}), ('INTEREST AND REDEMPTION 4 4 ', {'entities': []}), ('EARLY REDEMPTIONS 5 5 ', {'entities': []}), ('REPRESENTATIONS 5 6 ', {'entities': []}), ('UNDERTAKINGS 5 7 ', {'entities': []}), ('SUBORDINATION 

In [18]:
# Save training data as TRAIN_DATA
with open ('train', 'rb') as f:
    TRAIN_DATA = pickle.load(f)

# Load pretrained word embeddings
nlp = spacy.load('en_core_web_lg-2.1.0')

ner = nlp.get_pipe("ner")

# Add new labels in the training data set
nlp.entity.add_label('I-PER')
nlp.entity.add_label('I-MISC')
nlp.entity.add_label('I-ORG')
nlp.entity.add_label('I-LOC')


other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):  # only train NER
    nlp.begin_training()
    for itn in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
            # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.9,  # dropout - make it harder to memorise data
                    losses=losses,
                )

In [None]:
# Save model to local repository
nlp.to_disk("/Users/Jenny/Desktop/NLP-Primer/model") # Change path to you own repository

# Display result using 'displacy'

pdf = open("pdf.txt","r") # Give path or name to the document you want to test NER on
data = pdf.read()
pdf.close()

# Load custom trained model
nlp2 = spacy.load("model")
sen = nlp2(data)
html = displacy.render(sen, style='ent', jupyter=False)
output_path = Path("/Users/Jenny/Desktop/NLP-Primer/pdf_sec.html") # Change path to your own
output_path.open("w", encoding="utf-8").write(html)