In [1]:
#Import all required libraries
import spacy
import random
import time
import pickle
import pandas as pd
import numpy as np
from spacy.util import minibatch, compounding
import sys
from spacy import displacy
from itertools import chain
import matplotlib.pyplot as plt 
from matplotlib.ticker import MaxNLocator

In [2]:
dframe = pd.read_csv("ner.csv", encoding = "ISO-8859-1", error_bad_lines=False)

b'Skipping line 281837: expected 25 fields, saw 34\n'


In [3]:
dataset=dframe.drop(['Unnamed: 0', 'lemma', 'next-lemma', 'next-next-lemma', 'next-next-pos',
       'next-next-shape', 'next-next-word', 'next-pos', 'next-shape',
       'next-word', 'prev-iob', 'prev-lemma', 'prev-pos',
       'prev-prev-iob', 'prev-prev-lemma', 'prev-prev-pos', 'prev-prev-shape',
       'prev-prev-word', 'prev-shape', 'prev-word',"pos",'shape'],axis=1)

In [129]:
dataset.to_csv('annotate_corpus.csv',index=False)

In [20]:
dataset=pd.read_csv('annotate_corpus.csv')

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050795 entries, 0 to 1050794
Data columns (total 3 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   sentence_idx  1050794 non-null  float64
 1   word          1050794 non-null  object 
 2   tag           1050794 non-null  object 
dtypes: float64(1), object(2)
memory usage: 24.1+ MB


In [28]:
tag=list(dataset.tag.unique())

In [64]:
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w,t in zip(s["word"].values.tolist(),
                                                        s["tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [132]:
getter = SentenceGetter(dataset)

In [133]:
sentences = getter.sentences

In [134]:
len(sentences)

35177

In [65]:
def convert_format(sent):
    data=[]
    for i in sent:
        sentence=[]
        start=0
        end=0
        trigger=0
        entitles=[]
        for j in i:
            sentence.append(j[0])
            if trigger==0:
                end+=len(j[0])
                trigger+=1
                entitles.append((start,end,j[1]))
            else:
                start=end+1
                end=start+len(j[0])
                entitles.append((start,end,j[1]))

        d=' '.join(sentence)
        data.append((d,{'entities':entitles}))
    return data

In [137]:
data=convert_format(sentences)

In [142]:
d=random.sample(data,2000)

In [152]:
len(d)

2000

In [153]:
pickle.dump(d,open('spacy_blank_training.pickle','wb'))

### train with blank model

In [77]:
def ner_blank(d):

    TRAIN_DATA = d
    
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    # add labels
    for text, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  
        optimizer = nlp.begin_training()# only train NER
        for itn in range(20):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses={}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                  # callable to update weights
                    losses=losses)
            print(losses)

Statring iteration 0
{'ner': 8399.06561235348}
Statring iteration 1
{'ner': 5561.670450416847}
Statring iteration 2
{'ner': 4627.349648685211}
Statring iteration 3
{'ner': 4095.175961650999}
Statring iteration 4
{'ner': 3561.444163593866}
Statring iteration 5
{'ner': 3232.1402157207604}
Statring iteration 6
{'ner': 2979.754204158834}
Statring iteration 7
{'ner': 2885.036320095188}
Statring iteration 8
{'ner': 2395.0915484483075}
Statring iteration 9
{'ner': 2260.5362868493785}
Statring iteration 10
{'ner': 2255.1655550164237}
Statring iteration 11
{'ner': 1992.5990144615023}
Statring iteration 12
{'ner': 1854.8667081208223}
Statring iteration 13
{'ner': 1823.7338688142208}
Statring iteration 14
{'ner': 1663.5948840292888}
Statring iteration 15
{'ner': 1474.2891270572677}
Statring iteration 16
{'ner': 1457.9582199962522}
Statring iteration 17
{'ner': 1291.1375534874576}
Statring iteration 18
{'ner': 1336.6704331669966}
Statring iteration 19
{'ner': 1184.7581889149883}


In [93]:
from pathlib import Path
output_dir=Path("ner")

In [81]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to ner


In [145]:
nlp=spacy.load('ner')

In [7]:
#evaluate 
from spacy.gold import GoldParse
from spacy.scorer import Scorer
def evaluate(ner_model, examples):

    scorer = Scorer()
    for sents, ents in examples:
        doc_gold = ner_model.make_doc(sents)
        gold = GoldParse(doc_gold, entities=ents['entities'])
        pred_value = ner_model(sents)
        scorer.score(pred_value, gold)
    return scorer.score

In [155]:
results = evaluate(nlp, d)

In [156]:
results

<bound method Scorer.score of <spacy.scorer.Scorer object at 0x0000021BE51C0BE0>>

### retrain

In [33]:
tag=tag[:-1]


In [54]:
l=['a','bb','abd','k']

In [56]:
np.split(l,[3])

[array(['a', 'bb', 'abd'], dtype='<U3'), array(['k'], dtype='<U3')]

In [81]:
def retrain_data(l):
    d=[]
    retrain=[]
    test=[]
    for i in l:
        train=dataset[dataset['tag']==i]['sentence_idx'].to_list()
        train=list(set(train))
        train_,test_=np.split(train,[int(len(train)*0.2)])
        d.append(train_)
        test.append(test_)
    retrain=[j for i in d for j in i]
    test_=[j for i in test for j in i]
    t=[]
    for i in retrain:
        if i not in t:
            t.append(i)
    data=dataset[dataset['sentence_idx'].isin(t)]
    test_d=dataset[dataset['sentence_idx'].isin(test_)]
    
    return data,test_d
    

In [82]:
retrain,test=retrain_data(tag)

In [83]:
len(retrain)

411922

In [84]:
len(test)

904952

In [85]:
getter = SentenceGetter(retrain)

In [86]:
sentences = getter.sentences

In [87]:
retrain_d=convert_format(sentences)

In [88]:
len(retrain_d)

11111

In [89]:
def retrain_ner(data):     
    
    TRAIN_DATA=data
    nlp=spacy.load("ner") 

    # Getting the ner component
    ner=nlp.get_pipe('ner')
       
    # add labels
    for text, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(20):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses={}
            for texts, annotations in TRAIN_DATA:
                nlp.update(
                    [texts],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                  # callable to update weights
                    
                    losses=losses)
            print(losses)


In [90]:
retrain_ner(retrain_d)

Statring iteration 0


  gold = GoldParse(doc, **gold)


{'ner': 35300.300735924546}
Statring iteration 1
{'ner': 30149.484013022666}
Statring iteration 2
{'ner': 27850.340095281164}
Statring iteration 3
{'ner': 26886.93461522671}
Statring iteration 4
{'ner': 26494.943516155166}
Statring iteration 5
{'ner': 25682.587797234035}
Statring iteration 6
{'ner': 25411.61490940239}
Statring iteration 7
{'ner': 24582.443030167997}
Statring iteration 8
{'ner': 23921.132220119198}
Statring iteration 9
{'ner': 24029.681311608656}
Statring iteration 10
{'ner': 23306.16035063829}
Statring iteration 11
{'ner': 23077.420118945203}
Statring iteration 12
{'ner': 22474.846542156836}
Statring iteration 13
{'ner': 22495.096729559504}
Statring iteration 14
{'ner': 21846.21764334765}
Statring iteration 15
{'ner': 21906.165624637404}
Statring iteration 16
{'ner': 21371.9170684983}
Statring iteration 17
{'ner': 21291.52914118827}
Statring iteration 18
{'ner': 21154.28981090123}
Statring iteration 19
{'ner': 20963.068749496237}


In [95]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to ner


### evaluation

In [102]:
getter = SentenceGetter(test)
sentences = getter.sentences
test=convert_format(sentences)

In [103]:
t=random.sample(test,200)

In [105]:
import spacy
from spacy.gold import GoldParse
from spacy.scorer import Scorer
def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot['entities'])
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

In [112]:
# retrain on small sample adding optimizer
def retrain_ner(data):     
    
    TRAIN_DATA=data
    nlp=spacy.load("ner") 

    # Getting the ner component
    ner=nlp.get_pipe('ner')
       
    # add labels
    for text, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    optimizer = nlp.begin_training()
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(20):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses={}
            
            for texts, annotations in TRAIN_DATA:
                nlp.update(
                    [texts],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,
                    sgd=optimizer,
                  # callable to update weights
                    
                    losses=losses)
            print(losses)

In [113]:
retrain_ner(t)

  proc.begin_training(


Statring iteration 0
{'ner': 683.099425269623}
Statring iteration 1
{'ner': 460.15715160107203}
Statring iteration 2
{'ner': 289.013202121412}
Statring iteration 3
{'ner': 199.15601010778005}
Statring iteration 4
{'ner': 132.3267414417312}
Statring iteration 5
{'ner': 79.32180101299417}
Statring iteration 6
{'ner': 56.05614689030193}
Statring iteration 7
{'ner': 58.11145833690745}
Statring iteration 8
{'ner': 45.37361945653317}
Statring iteration 9
{'ner': 34.61627961099172}
Statring iteration 10
{'ner': 41.868522352570245}
Statring iteration 11
{'ner': 33.65719534584754}
Statring iteration 12
{'ner': 22.791765024701895}
Statring iteration 13
{'ner': 23.46970282084221}
Statring iteration 14
{'ner': 19.08025796348342}
Statring iteration 15
{'ner': 24.411872420383425}
Statring iteration 16
{'ner': 18.033285509444053}
Statring iteration 17
{'ner': 16.417678022951296}
Statring iteration 18
{'ner': 18.829629871943595}
Statring iteration 19
{'ner': 17.63433071618141}


In [121]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    optimizer = nlp.begin_training()
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

Saved model to ner


In [114]:
a=random.sample(test,100)
t_re=[]
for i in a:
    if i not in t:
        t_re.append(i)


In [117]:
results = evaluate(nlp,test
                )

  gold = GoldParse(doc_gold_text, entities=annot['entities'])


In [118]:
results

{'uas': 0.0,
 'las': 0.0,
 'las_per_type': {'': {'p': 0.0, 'r': 0.0, 'f': 0.0}},
 'ents_p': 95.31313820054703,
 'ents_r': 95.16303362789607,
 'ents_f': 95.23802676933609,
 'ents_per_type': {'O': {'p': 98.658400430825,
   'r': 98.62251791278179,
   'f': 98.64045590855038},
  'B-tim': {'p': 85.65512671856882,
   'r': 85.51347775756574,
   'f': 85.58424362793778},
  'B-gpe': {'p': 93.40059790732437,
   'r': 88.38048090523338,
   'f': 90.82122093023257},
  'I-geo': {'p': 76.70120898100173,
   'r': 62.93042369278731,
   'f': 69.13676344671909},
  'B-geo': {'p': 76.20546163849154,
   'r': 86.62015549708812,
   'f': 81.07973490875384},
  'B-nat': {'p': 40.0, 'r': 14.953271028037381, 'f': 21.768707482993193},
  'I-gpe': {'p': 54.87804878048781,
   'r': 20.930232558139537,
   'f': 30.303030303030305},
  'I-org': {'p': 65.20393299344501,
   'r': 73.14631987471914,
   'f': 68.94714886243301},
  'B-org': {'p': 68.91235736829327,
   'r': 64.0022547914318,
   'f': 66.36661211129297},
  'I-per': {'p'