### Import libraries

In [135]:
import spacy
import random
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import pandas as pd
import re
import numpy as np

### Read data

In [136]:
data = pd.read_csv('data/tagging.csv')

In [137]:
data.head(5)

Unnamed: 0,Date,Company Name,Title,Corporate Action,Web-Link,Summary,Dividend,Dividend percentage (DP),Dividend Offer Value,Share Face Value (‘FV’),...,Divided Decision,Bonus/ Right Ratio,Bonus/Right Ex-date,Bonus Require,Bonus Provided,Bonus Decision,Meeting Scheduled Date,Intimation/Confirmed,Type of Meeting,KW
0,01-Nov-20,NACL Industries Ltd,Corporate Action-Board to consider Dividend,Dividend,https://www.bseindia.com/stockinfo/AnnPdfOpen....,In continuation to our letter dated 26th Octob...,dividend,,,,...,consider,,,,,,,,,
1,31-Oct-20,BALKRISHNA INDUSTRIES LTD.,Board to Consider Q2 results and Second Interi...,Dividend,https://www.bseindia.com/stockinfo/AnnPdfOpen....,Balkrishna Industries Ltd has informed BSE tha...,dividend,,,,...,to consider,,,,,,,,,
2,30-Oct-20,Vaibhav Global Ltd,Board declares Second Interim Dividend,Dividend,https://www.bseindia.com/xml-data/corpfiling/A...,Vaibhav Global Ltd has informed BSE that the B...,Dividend,,Rs. 5,Rs. 10,...,considered and approved,,,,,,,,,
3,30-Oct,K.Z.LEASING & FINANCE LTD.,Board Meeting Intimation for Consideration Of ...,Meetings,https://www.bseindia.com/xml-data/corpfiling/A...,K.Z.LEASING &amp; FINANCE LTD.has informed BSE...,,,,,...,,,,,,,05-11-2020,scheduled on,meeting of the Board of Directors,Board Meeting Intimation
4,30-Oct,KALYANI INVESTMENT COMPANY LTD.,Board Meeting Intimation for Consider And Appr...,Meetings,https://www.bseindia.com/xml-data/corpfiling/A...,KALYANI INVESTMENT COMPANY LTD.has informed BS...,,,,,...,,,,,,,09-11-2020,scheduled on,meeting of the Board of Directors,


### Data Processing

Convert the data into specific format of the training data which will be fed in the spacy model 

In [138]:
''' 
Getting the list of columns in dataframe 
'''
data.columns

Index(['Date', 'Company Name', 'Title', 'Corporate Action', 'Web-Link',
       'Summary', 'Dividend', 'Dividend percentage (DP)',
       'Dividend Offer Value', 'Share Face Value (‘FV’)', 'Dividend Ex- Date',
       'Dividend Announce Date', 'Divided Decision ', 'Bonus/ Right Ratio',
       'Bonus/Right Ex-date', 'Bonus Require ', 'Bonus Provided',
       'Bonus Decision', 'Meeting Scheduled Date', 'Intimation/Confirmed',
       'Type of Meeting', 'KW'],
      dtype='object')

In [139]:
'''
LABELS:

Div = Dividend
Percent = Dividend percentage (DP)
OfferV = Dividend Offer Value
FaceV = Share Face Value (‘FV’)
DivExDate = Dividend Ex- Date
AnnDate = Dividend Announce Date
Dec = Divided Decision
Ratio = Bonus/ Right Ratio
BonusExDate = Bonus/Right Ex-date
BonusReq = Bonus Require
BonusProv = Bonus Provided 
BonusDec = Bonus Decision
SDate = Meeting Scheduled Date
Int = Intimation/Confirmed
Type = Type of Meeting 

'''


'\nLABELS:\n\nDiv = Dividend\nPercent = Dividend percentage (DP)\nOfferV = Dividend Offer Value\nFaceV = Share Face Value (‘FV’)\nDivExDate = Dividend Ex- Date\nAnnDate = Dividend Announce Date\nDec = Divided Decision\nRatio = Bonus/ Right Ratio\nBonusExDate = Bonus/Right Ex-date\nBonusReq = Bonus Require\nBonusProv = Bonus Provided \nBonusDec = Bonus Decision\nSDate = Meeting Scheduled Date\nInt = Intimation/Confirmed\nType = Type of Meeting \n\n'

In [140]:
'''
Changing the format of the date to match the contents of summary
'''

date = list(data['Meeting Scheduled Date'])
for i in range(len(date)):
    if not (date[i] is np.nan):
        date[i] = date[i].replace('-','/') 
        
data['Meeting Scheduled Date'] = date

In [141]:
'''
list of the enities columns of data = columns[]
list of the labels given to these entities = labels[]
'''
columns = ['Dividend', 'Dividend percentage (DP)','Dividend Offer Value', 'Share Face Value (‘FV’)', 'Dividend Ex- Date', 'Dividend Announce Date', 'Divided Decision ', 'Bonus/ Right Ratio', 'Bonus/Right Ex-date', 'Bonus Require ', 'Bonus Provided','Bonus Decision', 'Meeting Scheduled Date', 'Intimation/Confirmed','Type of Meeting']
labels = ['Div', 'Percent', 'OfferV', 'FaceV', 'DivExDate', 'AnnDate', 'Dec', 'Ratio', 'BonusExDate', 'BonusReq', 'BonusProv', 'BonusDec', 'SDate', 'Int', 'Type']

In [142]:
'''
list of summaries (column in data)
'''
summaries = list(data['Summary'])

In [143]:
'''
drop the rest of the columns other than entities 
'''
data = data[columns]
data.sample(n=5)

Unnamed: 0,Dividend,Dividend percentage (DP),Dividend Offer Value,Share Face Value (‘FV’),Dividend Ex- Date,Dividend Announce Date,Divided Decision,Bonus/ Right Ratio,Bonus/Right Ex-date,Bonus Require,Bonus Provided,Bonus Decision,Meeting Scheduled Date,Intimation/Confirmed,Type of Meeting
57,,,,,,,,,,,,,06/11/2020,scheduled on,meeting of the Board of Directors
123,dividend,40%,Rs. 0.80,Rs. 2,,"June 30, 2020",recommended,,,,,,,,
152,dividend,100%,Rs. 10,Rs. 10,,,Recommended,,,,,,,,
77,Dividend,,,,,,consider,,,,,,,,
295,,,,,,,,,,,,,09/11/2020,scheduled on,meeting of the Board of Directors


In [144]:
'''
converting each row into a list and then appending it to a final 2D list
'''

tags=[] #2D List

for i in range(len(data)):
    temp = [str(x) for x in data.iloc[i].tolist()]
    tags.append(temp)
    temp=[]
    
#print(tags)


In [145]:
'''
convert the data into specific format of the training data
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]
'''

train_data = []

count = 0;
for summary,tag in zip(summaries, tags):
    temp=[]
    for index in range(len(tag)):
        entity = tag[index]
        if entity != 'nan':
            pos = summary.find(entity)
            if(pos != -1):
                entity_set = (pos, pos+len(entity), labels[index]) #(350, 358, 'Summary')
                temp.append(entity_set) #list of entitity (described above)
        
    #print(temp)
    
    ''' temp = [(350, 358, 'Summary'), (None, None, 'Dividend'), (None, None, 'Dividend percentage (DP)'), (None, None, 'Dividend Offer Value'), (None, None, 'Share Face Value (‘FV’)'), (None, None, 'Dividend Ex- Date'), (145, 153, 'Dividend Announce Date'), (None, None, 'Divided Decision'), (None, None, 'Bonus/ Right Ratio'), (None, None, 'Bonus/Right Ex-date'), (None, None, 'Bonus Require '), (None, None, 'Bonus Provided'), (None, None, 'Bonus Decision'), (None, None, 'Meeting Scheduled Date'), (None, None, 'Intimation/Confirmed')]'''
    
    inner_dict = {'entities': temp}
    inner_set = (summary, inner_dict)
    train_data.append(inner_set)
    
    count+=1

In [158]:
'''
training data
'''
print(train_data[24])

('Pursuant to Regulation 42 of the SEBI (Listing Obligations and Disclosure Requirements) Regulations, 2015, we wish to inform you that the Register of Members and Share Transfer Register shall remain closed from 9th December, 2020 to 15th December, 2020 (both days inclusive) for the purpose of 32nd Annual General Meeting of the Company which is scheduled to be held on Tuesday, the 15th December, 2020 through Video Conferencing (VC) or Other Audio-Visual Means (OAVM) in compliance with Circular dated May 5, 2020 read with Circulars dated April 8, 2020 and April 13, 2020 issued by the Ministry of Corporate Affairs.&lt;BR&gt; &lt;BR&gt; Kindly take the above information on record and acknowledge receipt of the same.', {'entities': [(233, 252, 'SDate'), (346, 369, 'Int'), (299, 321, 'Type')]})


In [159]:
for i in range(len(train_data)):
    if(train_data[i][0] == 'Recommended a dividend of Rs. 10/- per Ordinary Share of Rs. 10/- each (100%) and Rs. 2.504 per partly paid Ordinary Share of Rs. 10/- each (paid-up Rs. 2.504 per share) (100%) to the shareholders for the Financial Year ended March 31, 2020.'):
        print(i)
        print(train_data[i][1])

152
{'entities': [(14, 22, 'Div'), (72, 76, 'Percent'), (26, 32, 'OfferV'), (26, 32, 'FaceV'), (0, 11, 'Dec')]}


In [147]:
exception_summary=[]
exception_annotations=[]

In [148]:
l=[]
s=()
i=0
index=[]
for set_temp in train_data:
    annotations = set_temp[1]['entities']
    l=[]
    for entity in annotations:
        l.append(entity[0])
        l.append(entity[1])
    if( len(l) > len(set(l)) ):
        print(set_temp[0])
        print(annotations)
        print("-------------------------------------------")
        exception_summary.append(set_temp[0])
        exception_annotations.append(set_temp[1])
        index.append(i)
    i+=1

Thyrocare Technologies Ltd has informed BSE that the Board of Directors of the Company at its meeting held on October 28, 2020, inter alia, have approved payment of interim dividend for the financial year 2020-21, at the rate of Rs. 10/- (Rupees Ten only) per share (100% of the face value of Rs. 10/? each).
[(173, 181, 'Div'), (267, 271, 'Percent'), (229, 235, 'OfferV'), (229, 235, 'FaceV'), (145, 153, 'Dec')]
-------------------------------------------
The Board declared an interim dividend of Rs.10 (100%) per equity share of Rs.10 each. &lt;BR&gt; &lt;BR&gt; As intimated vide our letter dated October 13, 2020, Thursday, October 29, 2020 is fixed as the Record Date to ascertain the list of shareholders eligible for the aforesaid interim dividend to be paid on or before November 19, 2020.&lt;BR&gt;
[(30, 38, 'Div'), (49, 53, 'Percent'), (42, 47, 'OfferV'), (42, 47, 'FaceV'), (323, 340, 'DivExDate'), (10, 18, 'Dec')]
-------------------------------------------
We would like to inform yo

In [149]:
index

[18, 102, 113, 129, 135, 143, 152, 180, 215, 217, 242, 248, 253, 254, 255]

In [206]:
x=0
for i in index:
    train_data[i][1]['entities'] = ast.literal_eval(exception['Annotations'][x][13:-1])
    x+=1

In [209]:
t = pd.DataFrame(columns=['Summary', 'Annotations'])

In [210]:
s=[]
a=[]
for set_temp in train_data:
    s.append(set_temp[0])
    a.append(set_temp[1])

In [211]:
t['Summary']=s
t['Annotations']=a

In [212]:
t.to_csv('data/training_data.csv')

### Load NER (Name Entity Recognition)spacy model

In [14]:
nlp = spacy.blank('en')

In [15]:
'''
creating and adding "ner" pipeline if not exist
'''

if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")
    


In [16]:
'''
fixing number of iterations model has to do in training
At each iteration, the training data is shuffled to ensure the model doesn’t make any generalizations based on the order of examples.
'''

n_iter=100

### Add the above made labels in the ner (name entity recognition) pipe

In [17]:
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [18]:
'''
get names of other pipes to disable them during training
'''
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

### Train the model

In [19]:
count=0
exception_summary=[]
exception_annotations=[]
with nlp.disable_pipes(*other_pipes):  # only train NER pipeline
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        try:
            for text, annotations in tqdm(train_data):
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.5,  # dropout makes it harder for the model to memorize the training data.
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
        except:
            count+=1
            exception_summary.append(text)
            exception_annotations.append(annotations)   
        print(losses)

  **kwargs
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
 12%|█▏        | 37/300 [00:03<00:22, 11.83it/s]
  0%|          | 1/300 [00:00<00:34,  8.71it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 1662.6420784432219}
{'ner': 5.883854251024134}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  7%|▋         | 21/300 [00:01<00:19, 14.14it/s]
  1%|          | 2/300 [00:00<00:19, 15.01it/s]

{'ner': 110.2978538758804}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
 15%|█▍        | 44/300 [00:03<00:17, 14.27it/s]
  gold = GoldParse(doc, **gold)
  1%|          | 2/300 [00:00<00:23, 12.87it/s]

{'ner': 222.54969501170098}


  1%|          | 3/300 [00:00<00:21, 13.56it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  1%|          | 2/300 [00:00<00:20, 14.35it/s]

{'ner': 13.792146088391117}
{}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  7%|▋         | 20/300 [00:01<00:22, 12.29it/s]
  1%|          | 2/300 [00:00<00:21, 13.68it/s]

{'ner': 92.48975672786078}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
 12%|█▏        | 35/300 [00:02<00:21, 12.60it/s]
  gold = GoldParse(doc, **gold)


{'ner': 140.8620941598906}


  gold = GoldParse(doc, **gold)
  8%|▊         | 25/300 [00:01<00:20, 13.75it/s]
  gold = GoldParse(doc, **gold)
  1%|          | 2/300 [00:00<00:20, 14.80it/s]

{'ner': 156.30044409058553}


  1%|▏         | 4/300 [00:00<00:21, 14.01it/s]
  1%|          | 2/300 [00:00<00:18, 15.79it/s]

{'ner': 131.98133654905416}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  8%|▊         | 24/300 [00:01<00:19, 13.87it/s]
  0%|          | 1/300 [00:00<00:25, 11.79it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 301.3091120531806}
{'ner': 6.932655032112962}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
 10%|█         | 30/300 [00:02<00:19, 13.73it/s]
  1%|          | 2/300 [00:00<00:21, 13.62it/s]

{'ner': 437.95410744788745}


  4%|▍         | 13/300 [00:00<00:18, 15.79it/s]
  0%|          | 1/300 [00:00<00:33,  8.99it/s]

{'ner': 99.56795012117759}


  gold = GoldParse(doc, **gold)
  5%|▌         | 16/300 [00:01<00:20, 13.86it/s]
  1%|          | 2/300 [00:00<00:19, 15.17it/s]

{'ner': 115.40457832409082}


  8%|▊         | 24/300 [00:01<00:18, 14.70it/s]
  1%|          | 2/300 [00:00<00:24, 12.07it/s]

{'ner': 135.37662722726827}


  gold = GoldParse(doc, **gold)
  5%|▍         | 14/300 [00:00<00:20, 14.23it/s]
  1%|          | 2/300 [00:00<00:24, 12.08it/s]

{'ner': 99.54021357340883}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
 13%|█▎        | 40/300 [00:02<00:17, 15.19it/s]
  1%|          | 2/300 [00:00<00:24, 12.25it/s]

{'ner': 216.25744278530843}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
 30%|███       | 91/300 [00:06<00:14, 14.41it/s]
  1%|          | 2/300 [00:00<00:19, 15.45it/s]

{'ner': 762.8107225349428}


  4%|▎         | 11/300 [00:00<00:18, 15.37it/s]
  1%|          | 2/300 [00:00<00:19, 15.18it/s]

{'ner': 45.81857069577885}


  9%|▉         | 27/300 [00:01<00:18, 14.46it/s]
  1%|          | 3/300 [00:00<00:19, 15.23it/s]

{'ner': 187.08931114414656}



  1%|          | 2/300 [00:00<00:17, 17.15it/s]
  1%|          | 2/300 [00:00<00:23, 12.45it/s]

{'ner': 9.623247069138117}
{'ner': 5.9660571940414675}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
 11%|█         | 33/300 [00:02<00:18, 14.71it/s]
  1%|          | 2/300 [00:00<00:21, 13.60it/s]

{'ner': 137.6896155948741}


  gold = GoldParse(doc, **gold)
 16%|█▌        | 48/300 [00:03<00:17, 14.36it/s]
  1%|          | 2/300 [00:00<00:18, 15.80it/s]

{'ner': 270.35414545209983}


  4%|▍         | 12/300 [00:00<00:21, 13.66it/s]
  1%|          | 2/300 [00:00<00:23, 12.93it/s]

{'ner': 63.89605446433404}


  gold = GoldParse(doc, **gold)
 12%|█▏        | 35/300 [00:02<00:18, 14.63it/s]
  1%|          | 2/300 [00:00<00:15, 19.75it/s]

{'ner': 149.02208213978705}


  3%|▎         | 10/300 [00:00<00:19, 14.72it/s]
  1%|          | 2/300 [00:00<00:20, 14.69it/s]

{'ner': 33.28426048046905}


  1%|▏         | 4/300 [00:00<00:25, 11.84it/s]
  1%|          | 2/300 [00:00<00:19, 15.37it/s]

{'ner': 53.947477680032804}


  4%|▍         | 13/300 [00:00<00:17, 16.24it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  1%|          | 2/300 [00:00<00:23, 12.43it/s]

{'ner': 50.87890301559068}
{}


 20%|█▉        | 59/300 [00:04<00:16, 14.28it/s]
  1%|          | 2/300 [00:00<00:19, 14.96it/s]

{'ner': 262.1729116448219}


  3%|▎         | 10/300 [00:00<00:20, 13.98it/s]
  0%|          | 1/300 [00:00<00:33,  8.87it/s]

{'ner': 34.83231701615687}


  1%|▏         | 4/300 [00:00<00:24, 12.02it/s]
  0%|          | 1/300 [00:00<00:33,  9.04it/s]

{'ner': 10.395462860245942}


 11%|█▏        | 34/300 [00:02<00:17, 14.79it/s]
  1%|          | 2/300 [00:00<00:15, 19.78it/s]

{'ner': 141.0725912387887}


  gold = GoldParse(doc, **gold)
  4%|▍         | 13/300 [00:00<00:20, 13.93it/s]
  1%|          | 2/300 [00:00<00:22, 13.46it/s]

{'ner': 61.01532532846931}


 13%|█▎        | 38/300 [00:02<00:17, 15.29it/s]
  gold = GoldParse(doc, **gold)
  1%|          | 2/300 [00:00<00:18, 16.04it/s]

{'ner': 164.21775605695635}


  gold = GoldParse(doc, **gold)
 18%|█▊        | 55/300 [00:03<00:16, 14.81it/s]
  1%|          | 2/300 [00:00<00:18, 16.09it/s]

{'ner': 239.48181831681833}


 24%|██▍       | 72/300 [00:04<00:15, 14.66it/s]
  1%|          | 2/300 [00:00<00:20, 14.66it/s]

{'ner': 311.4444881692498}


  gold = GoldParse(doc, **gold)
  6%|▋         | 19/300 [00:01<00:21, 13.33it/s]
  1%|          | 2/300 [00:00<00:19, 15.40it/s]

{'ner': 128.5284902102716}


  2%|▏         | 6/300 [00:00<00:22, 13.02it/s]
  1%|          | 2/300 [00:00<00:20, 14.68it/s]

{'ner': 3.1817312188847473}


  6%|▋         | 19/300 [00:01<00:20, 13.76it/s]
  1%|          | 2/300 [00:00<00:22, 13.24it/s]

{'ner': 152.31307876153542}


  4%|▍         | 13/300 [00:00<00:19, 14.37it/s]
  1%|          | 2/300 [00:00<00:23, 12.52it/s]

{'ner': 49.725340784669285}


  gold = GoldParse(doc, **gold)
 13%|█▎        | 38/300 [00:02<00:19, 13.26it/s]
  1%|          | 2/300 [00:00<00:26, 11.14it/s]

{'ner': 355.7866893321188}


  gold = GoldParse(doc, **gold)
  7%|▋         | 22/300 [00:01<00:19, 14.18it/s]
  1%|          | 2/300 [00:00<00:24, 12.38it/s]

{'ner': 155.11806462526314}


  1%|▏         | 4/300 [00:00<00:22, 13.18it/s]
  1%|          | 2/300 [00:00<00:16, 18.14it/s]

{'ner': 22.10650945184433}


  7%|▋         | 21/300 [00:01<00:18, 14.90it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  1%|          | 2/300 [00:00<00:16, 17.63it/s]

{'ner': 137.51541574173834}
{}


  8%|▊         | 24/300 [00:01<00:18, 14.70it/s]
  1%|          | 2/300 [00:00<00:17, 17.41it/s]

{'ner': 77.33303929336263}


  6%|▋         | 19/300 [00:01<00:19, 14.20it/s]
  1%|          | 2/300 [00:00<00:19, 15.54it/s]

{'ner': 138.18782321006358}


  6%|▌         | 17/300 [00:01<00:18, 15.53it/s]
  1%|          | 2/300 [00:00<00:19, 15.24it/s]

{'ner': 27.385408400373436}


  2%|▏         | 5/300 [00:00<00:18, 16.20it/s]
  1%|          | 2/300 [00:00<00:24, 11.92it/s]

{'ner': 16.026326401426623}


  gold = GoldParse(doc, **gold)
 12%|█▏        | 35/300 [00:02<00:18, 14.46it/s]
  1%|          | 2/300 [00:00<00:19, 15.01it/s]

{'ner': 172.26056180155587}


  5%|▌         | 16/300 [00:01<00:20, 13.86it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  1%|          | 2/300 [00:00<00:19, 15.42it/s]

{'ner': 181.8544292216065}
{}


  3%|▎         | 8/300 [00:00<00:20, 14.01it/s]
  gold = GoldParse(doc, **gold)
  1%|          | 2/300 [00:00<00:20, 14.42it/s]

{'ner': 8.564969359011974}


 20%|██        | 61/300 [00:04<00:16, 14.39it/s]
  1%|          | 2/300 [00:00<00:24, 12.15it/s]

{'ner': 180.714077290873}


  9%|▉         | 28/300 [00:02<00:19, 13.90it/s]
  1%|          | 2/300 [00:00<00:19, 14.98it/s]

{'ner': 109.9464033864501}


  2%|▏         | 7/300 [00:00<00:20, 13.97it/s]
  1%|          | 2/300 [00:00<00:21, 13.66it/s]

{'ner': 14.96956118888146}


  gold = GoldParse(doc, **gold)
 19%|█▉        | 57/300 [00:04<00:17, 13.79it/s]
  1%|          | 2/300 [00:00<00:19, 15.51it/s]

{'ner': 201.27206963053953}


 12%|█▏        | 36/300 [00:02<00:19, 13.42it/s]
  1%|          | 2/300 [00:00<00:26, 11.39it/s]

{'ner': 212.6010529451749}


 21%|██▏       | 64/300 [00:04<00:17, 13.63it/s]
  1%|          | 2/300 [00:00<00:20, 14.68it/s]

{'ner': 373.9472283591279}


  8%|▊         | 24/300 [00:01<00:21, 12.87it/s]
  1%|          | 2/300 [00:00<00:25, 11.83it/s]

{'ner': 134.30186874554812}


  1%|          | 3/300 [00:00<00:23, 12.44it/s]
  1%|          | 2/300 [00:00<00:26, 11.08it/s]

{'ner': 15.886272284931538}


  2%|▏         | 6/300 [00:00<00:23, 12.60it/s]
  0%|          | 1/300 [00:00<00:32,  9.08it/s]

{'ner': 33.88168843694724}


  3%|▎         | 10/300 [00:00<00:23, 12.46it/s]
  1%|          | 2/300 [00:00<00:19, 15.09it/s]

{'ner': 37.58096563969956}


  2%|▏         | 6/300 [00:00<00:20, 14.32it/s]
  1%|          | 2/300 [00:00<00:24, 12.32it/s]

{'ner': 3.6363718793121738}


 12%|█▏        | 35/300 [00:02<00:18, 14.22it/s]
  0%|          | 1/300 [00:00<00:33,  8.91it/s]

{'ner': 173.19875393941825}


 12%|█▏        | 36/300 [00:02<00:20, 12.69it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  1%|          | 2/300 [00:00<00:29, 10.05it/s]

{'ner': 121.61737159966455}
{}



  3%|▎         | 9/300 [00:00<00:21, 13.55it/s]
  1%|          | 2/300 [00:00<00:17, 17.14it/s]

{'ner': 0.00013678678401190736}
{'ner': 53.962981295880276}


 33%|███▎      | 98/300 [00:07<00:15, 13.13it/s]
  1%|          | 2/300 [00:00<00:24, 12.15it/s]

{'ner': 607.1445304091515}


  2%|▏         | 7/300 [00:00<00:23, 12.35it/s]
  1%|          | 2/300 [00:00<00:21, 13.84it/s]

{'ner': 6.073534949661808}


  8%|▊         | 25/300 [00:01<00:20, 13.52it/s]
  1%|          | 2/300 [00:00<00:24, 12.35it/s]

{'ner': 73.08976367789441}


  7%|▋         | 20/300 [00:01<00:20, 13.79it/s]
  1%|          | 2/300 [00:00<00:21, 13.59it/s]

{'ner': 87.26485465614626}


 19%|█▉        | 57/300 [00:04<00:18, 13.06it/s]
  1%|          | 2/300 [00:00<00:24, 12.21it/s]

{'ner': 329.8338724045824}


  6%|▋         | 19/300 [00:01<00:20, 13.56it/s]
  1%|          | 2/300 [00:00<00:19, 15.21it/s]

{'ner': 70.97264340921846}


 12%|█▏        | 37/300 [00:03<00:22, 11.46it/s]
  0%|          | 1/300 [00:00<00:30,  9.91it/s]

{'ner': 93.06164936015111}


 13%|█▎        | 38/300 [00:02<00:19, 13.26it/s]
  1%|          | 2/300 [00:00<00:24, 12.20it/s]

{'ner': 118.8463942088503}


  7%|▋         | 20/300 [00:01<00:22, 12.68it/s]
  1%|          | 2/300 [00:00<00:20, 14.47it/s]

{'ner': 36.23972865905825}


  1%|          | 3/300 [00:00<00:21, 13.78it/s]
  1%|          | 2/300 [00:00<00:23, 12.89it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 49.614135271363146}
{'ner': 11.492070455854044}


  0%|          | 1/300 [00:00<00:14, 20.89it/s]
  1%|          | 2/300 [00:00<00:23, 12.42it/s]

{'ner': 2.670395540073514}


  3%|▎         | 10/300 [00:00<00:21, 13.77it/s]
  1%|          | 2/300 [00:00<00:20, 14.82it/s]

{'ner': 77.7956406961088}


 13%|█▎        | 39/300 [00:02<00:19, 13.36it/s]
  0%|          | 1/300 [00:00<00:30,  9.93it/s]

{'ner': 148.16297967564185}


  1%|          | 3/300 [00:00<00:24, 11.94it/s]
  1%|          | 2/300 [00:00<00:22, 13.32it/s]

{'ner': 6.639977722869116}


  4%|▍         | 12/300 [00:00<00:20, 13.90it/s]
  1%|          | 2/300 [00:00<00:15, 19.16it/s]

{'ner': 79.7591277036381}


  8%|▊         | 25/300 [00:01<00:20, 13.41it/s]
  0%|          | 1/300 [00:00<00:31,  9.54it/s]

{'ner': 76.8976089790339}


  1%|▏         | 4/300 [00:00<00:25, 11.51it/s]
  0%|          | 1/300 [00:00<00:19, 15.65it/s]
  0%|          | 0/300 [00:00<?, ?it/s]

{'ner': 41.07084092331687}
{'ner': 0.055027961644701244}


  8%|▊         | 23/300 [00:01<00:21, 12.71it/s]
  0%|          | 0/300 [00:00<?, ?it/s]
  1%|          | 2/300 [00:00<00:26, 11.39it/s]

{'ner': 128.47241259579425}
{}


  8%|▊         | 24/300 [00:01<00:20, 13.17it/s]
  1%|          | 2/300 [00:00<00:22, 13.47it/s]

{'ner': 119.01099802834395}


  7%|▋         | 21/300 [00:01<00:20, 13.55it/s]
  1%|          | 2/300 [00:00<00:21, 13.56it/s]

{'ner': 65.36366403815063}


  2%|▏         | 5/300 [00:00<00:23, 12.42it/s]
  1%|          | 2/300 [00:00<00:22, 13.52it/s]

{'ner': 8.027108539081224}


  4%|▍         | 13/300 [00:01<00:23, 12.22it/s]
  1%|          | 2/300 [00:00<00:22, 13.38it/s]

{'ner': 24.730119351650142}


  2%|▏         | 5/300 [00:00<00:22, 12.92it/s]
  1%|          | 2/300 [00:00<00:24, 12.36it/s]

{'ner': 8.006540469188016}


  2%|▏         | 6/300 [00:00<00:22, 13.06it/s]
  1%|          | 2/300 [00:00<00:20, 14.32it/s]

{'ner': 11.839158513420703}


  8%|▊         | 24/300 [00:01<00:21, 13.08it/s]

{'ner': 115.31049517375882}





In [20]:
print(count)

100


### Save/Download the model 

In [21]:
nlp.to_disk(r'ner_spacy_model')

### Load the model for testing

In [22]:
nlp2 = spacy.load(r'ner_spacy_model')

### Test model

In [23]:
text = input()

 KALYANI INVESTMENT COMPANY LTD.has informed BSE that the meeting of the Board of Directors of the Company is scheduled on 09/11/2020 ,inter alia, to consider and approve Pursuant to Regulation 29 of SEBI (Listing Obligations and Disclosure Requirements) Regulations, 2015, this is to inform you that the meeting of the Board of Directors of the Company is scheduled to be held on Monday, November 9, 2020, inter alia, to consider and approve the Unaudited (Standalone &amp; Consolidated) Financial Results for the quarter and half year ended September 30, 2020. &lt;BR&gt; &lt;BR&gt; The Trading Window for dealing in securities of the Company, for all Designated and Connected Persons including their immediate relatives, shall be re-opened from Thursday, November 12, 2020. &lt;BR&gt;


In [24]:
'''
tokenize the input text
'''
sentences = sent_tokenize(text)
print(sentences)

['KALYANI INVESTMENT COMPANY LTD.has informed BSE that the meeting of the Board of Directors of the Company is scheduled on 09/11/2020 ,inter alia, to consider and approve Pursuant to Regulation 29 of SEBI (Listing Obligations and Disclosure Requirements) Regulations, 2015, this is to inform you that the meeting of the Board of Directors of the Company is scheduled to be held on Monday, November 9, 2020, inter alia, to consider and approve the Unaudited (Standalone &amp; Consolidated) Financial Results for the quarter and half year ended September 30, 2020.', '&lt;BR&gt; &lt;BR&gt; The Trading Window for dealing in securities of the Company, for all Designated and Connected Persons including their immediate relatives, shall be re-opened from Thursday, November 12, 2020.', '&lt;BR&gt;']


In [25]:
'''
test the model
'''
for sentence in sentences:
    doc2 = nlp2(sentence)
    #print(doc2)
    for ent in doc2.ents:
        print(ent.label_, "->", ent.text)

Type -> meeting of the Board of Directors
Int -> scheduled on
SDate -> 09/11/2020


In [26]:
doc2 = nlp2(text)
    #print(doc2)
for ent in doc2.ents:
    print(ent.label_, "->", ent.text)

Type -> meeting of the Board of Directors
Int -> scheduled on
SDate -> 09/11/2020


In [27]:
temp_count=0
for i,j in zip(exception_summary,exception_annotations):
    if(temp_count==5):
        break
    print(i)
    print(j)
    print("******************************************************")
    temp_count+=1

Force Motors Ltd has informed BSE that the Board of Directors of the Company at its meeting held on June 29, 2020, inter alia, has recommended a dividend of Rs. 10/- (Rupees ten only) per equity share of Rs. 10/- each, for the Financial Year ended March 31, 2020.
{'entities': [(145, 153, 'Div'), (157, 163, 'OfferV'), (157, 163, 'FaceV'), (100, 113, 'AnnDate'), (131, 142, 'Dec')]}
******************************************************
The Board of Directors of the Company has also recommended, subject to the approval of the Shareholders at the Annual General Meeting dividend as under: Dividend @ 125% i.e Rs. 2.50 /- per Equity Share of the face value of Rs. 2/- each for the financial year ended 31° March, 2020. 
{'entities': [(134, 142, 'Div'), (164, 168, 'Percent'), (173, 181, 'OfferV'), (173, 178, 'FaceV'), (47, 58, 'Dec')]}
******************************************************
This is to inform you that the Board of Directors at its Meeting held today has recommended Bonus Issue of 

In [28]:
import pandas as pd
df = pd.DataFrame(columns=['Summary','annotations'])
df['Summary']=exception_summary
df['annotations']=exception_annotations
df.to_csv("exception.csv",index=False)


In [29]:
test="rs 10 and rs 10 writing shit rifafas now and rs 10"

In [30]:
def findOccurrences(test_str, test_sub):
    len_test=len(test_sub)
    res = [i for i in range(len(test_str)) if test_str.startswith(test_sub, i)] 
    for _ in res:
        print(_," ",_+len_test)
findOccurrences(test,'rs 10')

0   5
10   15
45   50


In [79]:
def findOccurrences(test_str, test_sub):
    len_test=len(test_sub)
    res = [i for i in range(len(test_str)) if test_str.startswith(test_sub, i)] 
    count=0
    ret=[]
    for _ in res:
        count+=1
        ret.append([_,_+len_test])
        #print(_," ",_+len_test)    
    return(count,ret)

new=[]
d = dict()

temp_count=1
for i,j in zip(exception_summary,exception_annotations):
    print(temp_count)
    print(i,'\n')
    print(j,'\n')
    
    #print(j['entities'])
    for k in j['entities']:
        print(k ," --->   ",i[k[0]:k[1]])
        
        cnt,ret=findOccurrences(i,i[k[0]:k[1]])
        if(cnt>1):
            for _ in ret:
                print(_[0],",",_[1])
            #print('\n')
            
        #print('\n')
    
    temp_count+=1
    print("*******************************************************************************************")
    
    
    s = (i)
    

1
Essel Propack Ltd has informed BSE that Board of Directors of the Company at its Meeting held on April 26, 2014 has recommended Bonus Issue of Equity Shares in the ratio of 1 (One) Equity Share of Rs. 2/- each for every 1 (One) Equity Share of Rs. 2/- each held by the shareholders of the Company as on the record date/date fixed for this purpose, subject to the approval of the Members. 

{'entities': [(173, 180, 'BonusReq'), (173, 180, 'BonusProv'), (116, 127, 'BonusDec')]} 

(173, 180, 'BonusReq')  --->    1 (One)
173 , 180
220 , 227
(173, 180, 'BonusProv')  --->    1 (One)
173 , 180
220 , 227
(116, 127, 'BonusDec')  --->    recommended
*******************************************************************************************
2
We would like to inform you that at the Board Meeting held today, the Directors have declared a second interim dividend of Rs. 12 per Equity Share of Rs. 1 each of the Company.&lt;BR&gt; &lt;BR&gt; The second interim dividend shall be paid on Tuesday, Novembe

In [80]:
exception_annotations

[{'entities': [(173, 180, 'BonusReq'),
   (173, 180, 'BonusProv'),
   (116, 127, 'BonusDec')]},
 {'entities': [(111, 119, 'Div'),
   (123, 129, 'OfferV'),
   (123, 128, 'FaceV'),
   (252, 268, 'DivExDate'),
   (85, 93, 'Dec')]},
 {'entities': [(597, 604, 'BonusReq'),
   (597, 604, 'BonusProv'),
   (235, 246, 'BonusDec')]},
 {'entities': [(134, 142, 'Div'),
   (164, 168, 'Percent'),
   (173, 181, 'OfferV'),
   (173, 178, 'FaceV'),
   (47, 58, 'Dec')]},
 {'entities': [(145, 153, 'Div'),
   (157, 163, 'OfferV'),
   (157, 163, 'FaceV'),
   (100, 113, 'AnnDate'),
   (131, 142, 'Dec')]},
 {'entities': [(443, 451, 'Div'),
   (753, 756, 'Percent'),
   (556, 560, 'OfferV'),
   (556, 561, 'FaceV'),
   (429, 440, 'Dec')]},
 {'entities': [(14, 22, 'Div'),
   (72, 76, 'Percent'),
   (26, 32, 'OfferV'),
   (26, 32, 'FaceV'),
   (0, 11, 'Dec')]},
 {'entities': [(385, 392, 'BonusReq'),
   (385, 392, 'BonusProv'),
   (137, 147, 'BonusDec')]},
 {'entities': [(97, 100, 'BonusReq'),
   (97, 100, 'BonusPro

In [81]:
exception_summary

['Essel Propack Ltd has informed BSE that Board of Directors of the Company at its Meeting held on April 26, 2014 has recommended Bonus Issue of Equity Shares in the ratio of 1 (One) Equity Share of Rs. 2/- each for every 1 (One) Equity Share of Rs. 2/- each held by the shareholders of the Company as on the record date/date fixed for this purpose, subject to the approval of the Members.',
 'We would like to inform you that at the Board Meeting held today, the Directors have declared a second interim dividend of Rs. 12 per Equity Share of Rs. 1 each of the Company.&lt;BR&gt; &lt;BR&gt; The second interim dividend shall be paid on Tuesday, November 3, 2020 to the equity shareholders of the Company, whose names appear on the Register of Members of the Company or in the records of the Depositories as beneficial owners of the shares as on Thursday, October 15, 2020 which is the Record Date fixed for the purpose.',
 'LG Balakrishnan &amp; Bros Ltd has informed BSE that the Board of Directors

In [213]:
l

"[(173, 181, 'Div'), (267, 271, 'Percent'), (229, 235, 'OfferV'), (293, 299, 'FaceV'), (145, 153, 'Dec')]"