In [1]:
import spacy
import sys
from collections import defaultdict

nlp = spacy.load('en_core_web_md')

In [12]:
chat_text = ('Hi my name is Donald Duck, I am American, and my bank account 32345128 with sort code 12-15-18. \
                My VISA is 4444333322221111 how do I get access to online banking? \
                I think I have $325 on my account when I checked on 10-05-2011 \
                I have 2 accounts with you. \
                You can reach me on phone 01234 000 001 or donaldduck@gmail.com. \
                Ohh and my address is 1313 Webfoot Walk, Duckburg, Calisota')
chat_test_doc = nlp(chat_text)
ner = nlp.get_pipe('ner')
print(ner.predict(chat_test_doc))
print(type(ner))
for ent in chat_test_doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_, spacy.explain(ent.label_))

[<spacy.syntax.stateclass.StateClass object at 0x000001DAADA22F78>]
<class 'spacy.pipeline.pipes.EntityRecognizer'>
Donald Duck 14 25 PERSON People, including fictional
American 32 40 NORP Nationalities or religious or political groups
32345128 62 70 CARDINAL Numerals that do not fall under another type
12 86 88 CARDINAL Numerals that do not fall under another type
VISA 115 119 ORG Companies, agencies, institutions, etc.
4444333322221111 123 139 DATE Absolute or relative dates or periods
325 211 214 MONEY Monetary values, including unit
10-05-2011 247 257 DATE Absolute or relative dates or periods
2 281 282 CARDINAL Numerals that do not fall under another type
Webfoot Walk 426 438 FAC Buildings, airports, highways, bridges, etc.
Duckburg 440 448 GPE Countries, cities, states
Calisota 450 458 GPE Countries, cities, states


In [13]:
from spacy import displacy

displacy.serve(chat_test_doc, style='ent')

  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [18]:
#nlp = spacy.load('en')
text = u'Will Japan join the European Union? If yes, we should move to United States. Fasten your belts, America we are coming'


with nlp.disable_pipes('ner'):
    doc = nlp(chat_text)

threshold = 0.2
beams = nlp.entity.beam_parse([ doc ], beam_width = 16, beam_density = 0.0001)

entity_scores = defaultdict(float)
for beam in beams:
    for score, ents in nlp.entity.moves.get_beam_parses(beam):
        #print ('ents: {}, Score: {}'.format(ents, score))
        for start, end, label in ents:
            entity_scores[(start, end, label)] += score

print ('Entities and scores (detected with beam search)')
for key in entity_scores:
    start, end, label = key
    score = entity_scores[key]
    if ( score > threshold):
        print ('Label: {}, Text: {}, Score: {}'.format(label, doc[start:end], score))

Entities and scores (detected with beam search)
Label: PERSON, Text: Donald Duck, Score: 0.9357815980788531
Label: NORP, Text: American, Score: 1.0
Label: CARDINAL, Text: 32345128, Score: 1.0
Label: MONEY, Text: 325, Score: 1.0
Label: DATE, Text: 10-05-2011, Score: 1.0
Label: CARDINAL, Text: 2, Score: 1.0
Label: FAC, Text: Webfoot Walk, Score: 1.0
Label: GPE, Text: Duckburg, Score: 1.0
Label: GPE, Text: Calisota, Score: 0.8795162506840635


### Let's create a Custom Named Entity 

#### Capture Email, Phone, Bank Account and Sort Code

In [None]:
data1 = 'My bank account is 09876543 with sort code 11-01-45.'
    
data2 = 'You can reach me on my email abc@hotmail.co.uk or call me on 07726 000 123'

data3 = 'Bank Account 09780909, Sort Code 12-34-56'

data4 = 'Contact me on mail mickeymouse@yahoo.com or 01480 345 345'

data5 = 'Hi bank people. Need help with my account, account is 45674567, call me on 01345 567567'

data6 = 'Can you help me with my sort 34-23-12 and contact me using xyz@googlemail.dk'

data7 = 'I live here in the UK and need help with accessing my account. Bank acc is 09876543, 13-20-54. Let me know on 1290 344 456 or helpme@outlook.com'

data8 = 'Sort Code 33-34-35, Bank Account 33445566'

data9 = 'Email: Guffy@yahoo.com; phone: 01346 022 773'

data10 = 'Help, my sort code is 22-23-71, contact me using Daisy@googlemail.co.uk, ohh account no is 44435987'



In [None]:
TRAIN_DATA = [
    (data1, {
        'entities': [(19, 27, 'BANK_ACCOUNT'), (43, 51, 'SORT_CODE')]
    }),
     (data2, {
         'entities': [(29, 46, 'EMAIL'), (61, 74, 'PHONE')]
    }),
     (data3, {
         'entities': [(13, 21, 'BANK_ACCOUNT'), (33, 41, 'SORT_CODE')]
    }),
     (data4, {
         'entities': [(19, 40, 'EMAIL'), (44, 57, 'PHONE')]
    }),
     (data5, {
         'entities': [(54, 62, 'BANK_ACCOUNT'), (75, 87, 'PHONE')]
    }),
     (data6, {
         'entities': [(29, 37, 'SORT_CODE'), (59, 76, 'EMAIL')]
    }),
     (data7, {
         'entities': [(75, 83, 'BANK_ACCOUNT'), (85, 93, 'SORT_CODE'), (110, 122, 'PHONE'), (126, 144, 'EMAIL')]
    }),
     (data8, {
         'entities': [(10, 18, 'SORT_CODE'), (33, 41, 'BANK_ACCOUNT')]
    }),
     (data9, {
         'entities': [(7, 22, 'EMAIL'), (31, 44, 'PHONE')]
    }),
     (data10, {
         'entities': [(22, 30, 'SORT_CODE'), (49, 71, 'EMAIL'), (91, 99, 'BANK_ACCOUNT')]
    }),

]

In [None]:
import random
from tqdm import tqdm

ner = nlp.get_pipe('ner')

n_iter=200
# add labels, Trains data based on annotations 
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        print(ent[2])
        ner.add_label(ent[2])

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.5,  # dropout 
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses)

In [None]:
# internal test to check mappings of input test data, that is, string locations of entities
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

In [None]:
new_chat_test_doc = nlp(chat_text)
displacy.serve(new_chat_test_doc, style='ent')

#### Finding the confidence score for an entity 

In [None]:
for ent in new_chat_test_doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_, spacy.explain(ent.label_))