### Import libraries

In [38]:
import spacy
import random
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import pandas as pd
import ast

### Import the training data

In [34]:
data = pd.read_csv('data/training_data.csv')
data

Unnamed: 0.1,Unnamed: 0,Summary,Annotations
0,0,In continuation to our letter dated 26th Octob...,"{'entities': [(350, 358, 'Div'), (145, 153, 'D..."
1,1,Balkrishna Industries Ltd has informed BSE tha...,"{'entities': [(156, 167, 'Dec')]}"
2,2,Vaibhav Global Ltd has informed BSE that the B...,"{'entities': [(208, 216, 'Div'), (220, 225, 'O..."
3,3,K.Z.LEASING &amp; FINANCE LTD.has informed BSE...,"{'entities': [(121, 131, 'SDate'), (108, 120, ..."
4,4,KALYANI INVESTMENT COMPANY LTD.has informed BS...,"{'entities': [(122, 132, 'SDate'), (109, 121, ..."
...,...,...,...
295,295,IVP LTD.has informed BSE that the meeting of t...,"{'entities': [(99, 109, 'SDate'), (86, 98, 'In..."
296,296,LADDERUP FINANCE LTD.has informed BSE that the...,"{'entities': [(112, 122, 'SDate'), (99, 111, '..."
297,297,RSWM LTD.has informed BSE that the meeting of ...,"{'entities': [(100, 110, 'SDate'), (87, 99, 'I..."
298,298,AJMERA REALTY &amp; INFRA INDIA LTD.has inform...,"{'entities': [(127, 137, 'SDate'), (114, 126, ..."


In [43]:
'''
convert the csv data into specific format of the training data
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]
'''

train_data = []
temp_set = ()
temp_dict = {}

for i in range(len(data)):
    temp_dict['entities'] = ast.literal_eval(data['Annotations'][0][13:-1])
    temp_set = (data['Summary'][0] , temp_dict)
    train_data.append(temp_set)

In [45]:
'''
print to check train_data
'''
#train_data

'\nprint to check train_data\n'

### Load NER (Name Entity Recognition)spacy model

In [47]:
nlp = spacy.blank('en')

In [48]:
'''
creating and adding "ner" pipeline if not exist
'''

if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")
    


In [49]:
'''
fixing number of iterations model has to do in training
At each iteration, the training data is shuffled to ensure the model doesn’t make any generalizations based on the order of examples.
'''

n_iter=100

### Add the above made labels in the ner (name entity recognition) pipe

In [50]:
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [51]:
'''
get names of other pipes to disable them during training
'''
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

### Train the model

In [None]:
count=0
exception_summary=[]
exception_annotations=[]
with nlp.disable_pipes(*other_pipes):  # only train NER pipeline
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        try:
            for text, annotations in tqdm(train_data):
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.35,  # dropout makes it harder for the model to memorize the training data.
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
        except:
            count+=1
            exception_summary.append(text)
            exception_annotations.append(annotations)   
        print(losses)

  **kwargs
100%|██████████| 300/300 [00:23<00:00, 12.80it/s]
  0%|          | 1/300 [00:00<00:30,  9.84it/s]

{'ner': 1215.8859523992728}


100%|██████████| 300/300 [00:34<00:00,  8.78it/s]
  0%|          | 1/300 [00:00<00:33,  8.93it/s]

{'ner': 7.981945408799414e-05}


100%|██████████| 300/300 [00:30<00:00,  9.78it/s]
  0%|          | 1/300 [00:00<00:32,  9.14it/s]

{'ner': 4.279735676733278e-06}


100%|██████████| 300/300 [00:29<00:00, 10.03it/s]
  1%|          | 2/300 [00:00<00:25, 11.64it/s]

{'ner': 8.324015926857244e-07}


100%|██████████| 300/300 [00:32<00:00,  9.16it/s]
  0%|          | 1/300 [00:00<00:46,  6.40it/s]

{'ner': 8.629334604848203e-06}


100%|██████████| 300/300 [00:44<00:00,  6.70it/s]
  0%|          | 1/300 [00:00<00:46,  6.40it/s]

{'ner': 3.0261564833907983e-06}


100%|██████████| 300/300 [00:49<00:00,  6.12it/s]
  0%|          | 1/300 [00:00<00:44,  6.65it/s]

{'ner': 9.014956076167244e-08}


100%|██████████| 300/300 [00:46<00:00,  6.41it/s]
  0%|          | 1/300 [00:00<00:42,  7.11it/s]

{'ner': 1.0034206652543012e-07}


100%|██████████| 300/300 [00:46<00:00,  6.40it/s]
  0%|          | 1/300 [00:00<00:46,  6.37it/s]

{'ner': 2.7754662299534823e-08}


100%|██████████| 300/300 [00:47<00:00,  6.28it/s]
  0%|          | 1/300 [00:00<00:41,  7.28it/s]

{'ner': 1.8242138048316337e-08}


100%|██████████| 300/300 [00:46<00:00,  6.39it/s]
  0%|          | 1/300 [00:00<00:43,  6.83it/s]

{'ner': 5.429243313677803e-06}


100%|██████████| 300/300 [00:51<00:00,  5.85it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 10.618253563628093}


100%|██████████| 300/300 [01:02<00:00,  4.81it/s]
  0%|          | 1/300 [00:00<00:57,  5.21it/s]

{'ner': 2.7805534152095515}


100%|██████████| 300/300 [01:00<00:00,  4.92it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 0.00024359136205635986}


100%|██████████| 300/300 [01:02<00:00,  4.82it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 2.0227634089544272e-23}


100%|██████████| 300/300 [01:01<00:00,  4.84it/s]
  0%|          | 1/300 [00:00<00:56,  5.34it/s]

{'ner': 1.666480078982984e-24}


100%|██████████| 300/300 [01:00<00:00,  4.93it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 5.395793123246451e-14}


100%|██████████| 300/300 [01:00<00:00,  4.95it/s]
  0%|          | 1/300 [00:00<00:56,  5.33it/s]

{'ner': 1.974362897536095e-10}


100%|██████████| 300/300 [01:02<00:00,  4.83it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 5.841880899742984e-19}


100%|██████████| 300/300 [01:05<00:00,  4.57it/s]
  0%|          | 1/300 [00:00<00:56,  5.26it/s]

{'ner': 1.164124772123504e-20}


100%|██████████| 300/300 [01:06<00:00,  4.53it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 4.945826870148887e-21}


100%|██████████| 300/300 [01:03<00:00,  4.72it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 6.779368256825402e-21}


100%|██████████| 300/300 [01:06<00:00,  4.53it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 3.2609232114350744e-17}


100%|██████████| 300/300 [01:07<00:00,  4.47it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 2.258284984807895e-19}


100%|██████████| 300/300 [01:03<00:00,  4.69it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 1.755336516844742e-19}


100%|██████████| 300/300 [01:02<00:00,  4.77it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 3.3627720344441613e-18}


100%|██████████| 300/300 [01:04<00:00,  4.66it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 2.8511201449935273e-18}


100%|██████████| 300/300 [01:04<00:00,  4.69it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 2.0425136118016664e-22}


100%|██████████| 300/300 [01:03<00:00,  4.70it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 1.2621807424407188e-17}


100%|██████████| 300/300 [01:05<00:00,  4.55it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 2.3019988499859058e-20}


100%|██████████| 300/300 [01:04<00:00,  4.62it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 3.283635014126568e-13}


 18%|█▊        | 54/300 [00:11<00:52,  4.71it/s]

In [None]:
### Save/Download the model 

In [None]:
nlp.to_disk(r'ner_spacy_model')

In [None]:
### Load the model for testing

In [None]:
nlp2 = spacy.load(r'ner_spacy_model')

In [None]:
### Test model

In [None]:
text = input()

In [None]:
'''
tokenize the input text
'''
sentences = sent_tokenize(text)
print(sentences)

In [None]:
'''
test the model
'''
for sentence in sentences:
    doc2 = nlp2(sentence)
    #print(doc2)
    for ent in doc2.ents:
        print(ent.label_, "->", ent.text)