In [1]:
import spacy

nlp = spacy.load('en_core_web_md')

In [2]:
chat_text = ('Hi my name is Donald Duck, I am American, and my bank account 12345678 with sort code 12-15-18. \
                My VISA is 4444333322221111 how do I get access to online banking? \
                I think I have $325 on my account when I checked on 10-05-2011 \
                I have 2 accounts with you. \
                You can reach me on phone 01234 000 001 or donaldduck@gmail.com. \
                Ohh and my address is 1313 Webfoot Walk, Duckburg, Calisota')
chat_test_doc = nlp(chat_text)
for ent in chat_test_doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_, spacy.explain(ent.label_))

Donald Duck 14 25 PERSON People, including fictional
American 32 40 NORP Nationalities or religious or political groups
12 86 88 CARDINAL Numerals that do not fall under another type
VISA 115 119 ORG Companies, agencies, institutions, etc.
4444333322221111 123 139 DATE Absolute or relative dates or periods
325 211 214 MONEY Monetary values, including unit
10-05-2011 247 257 DATE Absolute or relative dates or periods
2 281 282 CARDINAL Numerals that do not fall under another type
Webfoot Walk 426 438 FAC Buildings, airports, highways, bridges, etc.
Duckburg 440 448 GPE Countries, cities, states
Calisota 450 458 GPE Countries, cities, states


In [3]:
from spacy import displacy

displacy.serve(chat_test_doc, style='ent')

  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


### Let's create a Custom Named Entity 

#### Capture Email, Phone, Bank Account and Sort Code

In [4]:
data1 = 'My bank account is 09876543 with sort code 11-01-45'
    
data2 = 'You can reach me on my email abc@hotmail.co.uk or call me on 07726 000 123'

data3 = 'Bank Account 09780909, Sort Code 12-34-56'

data4 = 'Contact me on mail mickeymouse@yahoo.com or 01480 345 345'

data5 = 'Hi bank people. Need help with my account, account is 45674567, call me on 01345 567567'

data6 = 'Can you help me with my sort 34-23-12, contact me using xyz@googlemail.dk'

data7 = 'I live here in the UK and need help with accessing my account. Bank acc is 09876543, 13-20-54. Let me know on 1290 344 456 or helpme@outlook.com'

data8 = 'Sort Code 33-34-35, Bank Account 33445566'

data9 = 'Email: Guffy@yahoo.com; phone: 01346 022 773'


In [5]:
TRAIN_DATA = [
    (data1, {
        'entities': [(19, 27, 'BANK_ACCOUNT'), (43, 51, 'SORT_CODE')]
    }),
     (data2, {
         'entities': [(29, 46, 'EMAIL'), (61, 74, 'PHONE')]
    }),
     (data3, {
         'entities': [(13, 21, 'BANK_ACCOUNT'), (33, 41, 'SORT_CODE')]
    }),
     (data4, {
         'entities': [(19, 40, 'EMAIL'), (44, 57, 'PHONE')]
    }),
     (data5, {
         'entities': [(54, 62, 'BANK_ACCOUNT'), (75, 87, 'PHONE')]
    }),
     (data6, {
         'entities': [(29, 37, 'SORT_CODE'), (56, 73, 'EMAIL')]
    }),
     (data7, {
         'entities': [(75, 83, 'BANK_ACCOUNT'), (85, 93, 'SORT_CODE'), (110, 122, 'PHONE'), (126, 144, 'EMAIL')]
    }),
     (data8, {
         'entities': [(10, 18, 'SORT_CODE'), (33, 41, 'BANK_ACCOUNT')]
    }),
     (data9, {
         'entities': [(7, 22, 'EMAIL'), (31, 44, 'PHONE')]
    }),

]

In [6]:
import random
from tqdm import tqdm

ner = nlp.get_pipe('ner')

n_iter=200
# add labels, Trains data based on annotations 
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        print(ent[2])
        ner.add_label(ent[2])

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        #random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.5,  # dropout 
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses)

BANK_ACCOUNT
SORT_CODE
EMAIL
PHONE
BANK_ACCOUNT
SORT_CODE
EMAIL
PHONE
BANK_ACCOUNT
PHONE
SORT_CODE
EMAIL
SORT_CODE
EMAIL


100%|██████████| 7/7 [00:00<00:00,  8.95it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.26it/s]

{'ner': 104.59795580594924}


100%|██████████| 7/7 [00:00<00:00,  8.85it/s]
 14%|█▍        | 1/7 [00:00<00:00,  7.81it/s]

{'ner': 108.62217126160914}


100%|██████████| 7/7 [00:00<00:00,  8.76it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.85it/s]

{'ner': 126.06563625065857}


100%|██████████| 7/7 [00:00<00:00,  9.08it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.40it/s]

{'ner': 116.57214191334286}


100%|██████████| 7/7 [00:00<00:00,  8.27it/s]
 14%|█▍        | 1/7 [00:00<00:00,  6.67it/s]

{'ner': 117.6837270708686}


100%|██████████| 7/7 [00:00<00:00,  8.35it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.77it/s]

{'ner': 110.1592922315021}


100%|██████████| 7/7 [00:00<00:00,  8.53it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.00it/s]

{'ner': 105.18961475673132}


100%|██████████| 7/7 [00:00<00:00,  8.29it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.26it/s]

{'ner': 98.74492625542916}


100%|██████████| 7/7 [00:00<00:00,  8.65it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.35it/s]

{'ner': 100.45593319166801}


100%|██████████| 7/7 [00:00<00:00,  8.61it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.70it/s]

{'ner': 97.13326164707541}


100%|██████████| 7/7 [00:00<00:00,  8.80it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.52it/s]

{'ner': 86.6890414913305}


100%|██████████| 7/7 [00:00<00:00,  8.84it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.33it/s]

{'ner': 93.26830441487255}


100%|██████████| 7/7 [00:00<00:00,  8.77it/s]
 14%|█▍        | 1/7 [00:00<00:00,  7.14it/s]

{'ner': 87.94931669632206}


100%|██████████| 7/7 [00:00<00:00,  8.58it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.20it/s]

{'ner': 89.50449027016293}


100%|██████████| 7/7 [00:00<00:00,  8.61it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.90it/s]

{'ner': 87.05374589143321}


100%|██████████| 7/7 [00:00<00:00,  9.02it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.09it/s]

{'ner': 86.5992323316168}


100%|██████████| 7/7 [00:00<00:00,  8.69it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.85it/s]

{'ner': 78.49612959683873}


100%|██████████| 7/7 [00:00<00:00,  8.49it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.52it/s]

{'ner': 100.62608844228089}


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.80it/s]

{'ner': 87.14180809538811}


100%|██████████| 7/7 [00:00<00:00,  8.74it/s]
 14%|█▍        | 1/7 [00:00<00:00,  6.93it/s]

{'ner': 98.43485201057047}


100%|██████████| 7/7 [00:00<00:00,  8.70it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.09it/s]

{'ner': 88.38353066971467}


100%|██████████| 7/7 [00:00<00:00,  8.72it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.93it/s]

{'ner': 93.46097383089364}


100%|██████████| 7/7 [00:00<00:00,  8.71it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.52it/s]

{'ner': 89.9357967665419}


100%|██████████| 7/7 [00:00<00:00,  8.80it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.09it/s]

{'ner': 92.26203690376133}


100%|██████████| 7/7 [00:00<00:00,  8.12it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.62it/s]

{'ner': 93.43802884314209}


100%|██████████| 7/7 [00:00<00:00,  8.25it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.00it/s]

{'ner': 81.06096187140793}


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.09it/s]

{'ner': 93.6473264824599}


100%|██████████| 7/7 [00:00<00:00,  8.84it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

{'ner': 85.54540045734029}


100%|██████████| 7/7 [00:00<00:00,  9.02it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.40it/s]

{'ner': 96.6026690271101}


100%|██████████| 7/7 [00:00<00:00,  8.28it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.52it/s]

{'ner': 89.89873925130814}


100%|██████████| 7/7 [00:00<00:00,  8.97it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.09it/s]

{'ner': 88.06868830416352}


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.93it/s]

{'ner': 87.90201162855374}


100%|██████████| 7/7 [00:00<00:00,  8.66it/s]
 14%|█▍        | 1/7 [00:00<00:00,  6.62it/s]

{'ner': 94.49768003821373}


100%|██████████| 7/7 [00:00<00:00,  8.72it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

{'ner': 87.49808548178407}


100%|██████████| 7/7 [00:00<00:00,  8.84it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.35it/s]

{'ner': 85.45217104395851}


100%|██████████| 7/7 [00:00<00:00,  8.96it/s]
 14%|█▍        | 1/7 [00:00<00:00,  7.14it/s]

{'ner': 96.54805378243327}


100%|██████████| 7/7 [00:00<00:00,  8.83it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.17it/s]

{'ner': 77.52908518095501}


100%|██████████| 7/7 [00:00<00:00,  8.88it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.71it/s]

{'ner': 91.00726347242016}


100%|██████████| 7/7 [00:00<00:00,  8.98it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.01it/s]

{'ner': 92.14050911765662}


100%|██████████| 7/7 [00:00<00:00,  8.61it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.62it/s]

{'ner': 86.67934483411955}


100%|██████████| 7/7 [00:00<00:00,  8.33it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.16it/s]

{'ner': 88.65608314162819}


100%|██████████| 7/7 [00:00<00:00,  8.51it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.09it/s]

{'ner': 79.09643124649301}


100%|██████████| 7/7 [00:00<00:00,  8.37it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.90it/s]

{'ner': 87.3242555167817}


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
 14%|█▍        | 1/7 [00:00<00:00,  6.41it/s]

{'ner': 85.38062029727735}


100%|██████████| 7/7 [00:00<00:00,  8.41it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.15it/s]

{'ner': 85.90615152008832}


100%|██████████| 7/7 [00:00<00:00,  8.62it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.85it/s]

{'ner': 83.03337930352427}


100%|██████████| 7/7 [00:00<00:00,  8.66it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.77it/s]

{'ner': 82.69863230257761}


100%|██████████| 7/7 [00:00<00:00,  8.93it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

{'ner': 75.02753624439356}


100%|██████████| 7/7 [00:00<00:00,  9.30it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.52it/s]

{'ner': 89.9640091606525}


100%|██████████| 7/7 [00:00<00:00,  9.04it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.80it/s]

{'ner': 84.30709452592419}


100%|██████████| 7/7 [00:00<00:00,  8.58it/s]
 14%|█▍        | 1/7 [00:00<00:00,  7.87it/s]

{'ner': 83.79758027330172}


100%|██████████| 7/7 [00:00<00:00,  8.86it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.93it/s]

{'ner': 86.2034178157337}


100%|██████████| 7/7 [00:00<00:00,  9.24it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.35it/s]

{'ner': 86.32686004857533}


100%|██████████| 7/7 [00:00<00:00,  8.96it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

{'ner': 79.02991656836821}


100%|██████████| 7/7 [00:00<00:00,  8.82it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.00it/s]

{'ner': 84.06599815332447}


100%|██████████| 7/7 [00:00<00:00,  9.01it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.74it/s]

{'ner': 76.91675732069416}


100%|██████████| 7/7 [00:00<00:00,  8.92it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.93it/s]

{'ner': 77.23554283719932}


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

{'ner': 86.76051216202904}


100%|██████████| 7/7 [00:00<00:00,  9.04it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.09it/s]

{'ner': 87.25073209343827}


100%|██████████| 7/7 [00:00<00:00,  8.79it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.09it/s]

{'ner': 92.06026715354528}


100%|██████████| 7/7 [00:00<00:00,  8.89it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.35it/s]

{'ner': 78.8226740170503}


100%|██████████| 7/7 [00:00<00:00,  8.51it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.54it/s]

{'ner': 89.60833028401248}


100%|██████████| 7/7 [00:00<00:00,  9.13it/s]
 14%|█▍        | 1/7 [00:00<00:00,  7.52it/s]

{'ner': 92.42599503893871}


100%|██████████| 7/7 [00:00<00:00,  9.09it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.77it/s]

{'ner': 80.76993755160947}


100%|██████████| 7/7 [00:00<00:00,  8.36it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.99it/s]

{'ner': 86.07983497466194}


100%|██████████| 7/7 [00:00<00:00,  8.68it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.17it/s]

{'ner': 87.4583772670012}


100%|██████████| 7/7 [00:00<00:00,  8.80it/s]
 14%|█▍        | 1/7 [00:00<00:00,  7.75it/s]

{'ner': 85.58053899277002}


100%|██████████| 7/7 [00:00<00:00,  8.93it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.77it/s]

{'ner': 85.05292116419878}


100%|██████████| 7/7 [00:00<00:00,  9.19it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.52it/s]

{'ner': 85.68686280422844}


100%|██████████| 7/7 [00:00<00:00,  8.89it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

{'ner': 92.66119417152368}


100%|██████████| 7/7 [00:00<00:00,  9.00it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.17it/s]

{'ner': 75.34159397287294}


100%|██████████| 7/7 [00:00<00:00,  9.21it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.35it/s]

{'ner': 79.0200684380834}


100%|██████████| 7/7 [00:00<00:00,  9.49it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.71it/s]

{'ner': 89.27893297583796}


100%|██████████| 7/7 [00:00<00:00,  9.20it/s]
 14%|█▍        | 1/7 [00:00<00:00,  7.25it/s]

{'ner': 83.01203274610452}


100%|██████████| 7/7 [00:00<00:00,  8.96it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.93it/s]

{'ner': 80.86702255040291}


100%|██████████| 7/7 [00:00<00:00,  8.97it/s]
 29%|██▊       | 2/7 [00:00<00:00,  9.70it/s]

{'ner': 78.8364296733198}


100%|██████████| 7/7 [00:00<00:00,  9.13it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.17it/s]

{'ner': 75.4144716772862}


100%|██████████| 7/7 [00:00<00:00,  9.19it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

{'ner': 79.91630489256931}


100%|██████████| 7/7 [00:00<00:00,  8.94it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.47it/s]

{'ner': 87.23829974769615}


100%|██████████| 7/7 [00:00<00:00,  8.80it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.62it/s]

{'ner': 82.84492272557691}


100%|██████████| 7/7 [00:00<00:00,  9.13it/s]
 14%|█▍        | 1/7 [00:00<00:00,  7.25it/s]

{'ner': 85.45149611054512}


100%|██████████| 7/7 [00:00<00:00,  8.95it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.35it/s]

{'ner': 81.7398422350816}


100%|██████████| 7/7 [00:00<00:00,  9.19it/s]
 14%|█▍        | 1/7 [00:00<00:00,  7.69it/s]

{'ner': 88.84994732297491}


100%|██████████| 7/7 [00:00<00:00,  9.19it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.76it/s]

{'ner': 81.62843078072183}


100%|██████████| 7/7 [00:00<00:00,  8.64it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.26it/s]

{'ner': 78.2738105738681}


100%|██████████| 7/7 [00:00<00:00,  8.20it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.71it/s]

{'ner': 76.77051728708466}


100%|██████████| 7/7 [00:00<00:00,  9.22it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.70it/s]

{'ner': 83.21131028523087}


100%|██████████| 7/7 [00:00<00:00,  9.39it/s]
 14%|█▍        | 1/7 [00:00<00:00,  6.76it/s]

{'ner': 82.62183006180567}


100%|██████████| 7/7 [00:00<00:00,  9.06it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.47it/s]

{'ner': 79.89064077254443}


100%|██████████| 7/7 [00:00<00:00,  9.38it/s]
 14%|█▍        | 1/7 [00:00<00:00,  8.77it/s]

{'ner': 76.35201338086335}


100%|██████████| 7/7 [00:00<00:00,  9.31it/s]
 14%|█▍        | 1/7 [00:00<00:00,  7.81it/s]

{'ner': 87.58151656843256}


100%|██████████| 7/7 [00:00<00:00,  9.38it/s]
 29%|██▊       | 2/7 [00:00<00:00, 10.26it/s]

{'ner': 89.07652002116083}


100%|██████████| 7/7 [00:00<00:00,  9.10it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

{'ner': 79.95092494998698}


100%|██████████| 7/7 [00:00<00:00,  9.26it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.43it/s]

{'ner': 80.77309059750405}


100%|██████████| 7/7 [00:00<00:00,  9.13it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.43it/s]

{'ner': 88.24201777600683}


100%|██████████| 7/7 [00:00<00:00,  9.00it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.80it/s]

{'ner': 89.98832385886635}


100%|██████████| 7/7 [00:00<00:00,  9.14it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.01it/s]

{'ner': 84.77746089613356}


100%|██████████| 7/7 [00:00<00:00,  9.30it/s]
  0%|          | 0/7 [00:00<?, ?it/s]

{'ner': 73.65101680575754}


100%|██████████| 7/7 [00:00<00:00,  9.33it/s]
 14%|█▍        | 1/7 [00:00<00:00,  9.80it/s]

{'ner': 78.12456577876583}


100%|██████████| 7/7 [00:00<00:00,  9.41it/s]

{'ner': 82.50096433673752}





In [7]:
# test the trained model
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities []
Entities [('45674567', 'BANK_ACCOUNT')]
Entities []
Entities [('01480 345 345', 'PHONE')]
Entities [('09780909', 'BANK_ACCOUNT')]
Entities []
Entities []


In [8]:
new_chat_test_doc = nlp(chat_text)
displacy.serve(new_chat_test_doc, style='ent')

  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
