# Load Packages

In [13]:
import pandas as pd
import spacy
from spacy.tokens import DocBin
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

# Processing

Data source: https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus

In [14]:
# Read the token-level corpus; rows with a missing 'Sentence #' inherit the
# most recent non-missing value above them.
df = pd.read_csv('data/ner_dataset.csv').assign(
    **{'Sentence #': lambda frame: frame['Sentence #'].ffill()}
)
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [16]:
def get_spacy_file(df, sentences, train_test="train"):
    """Serialize the selected sentences to ``./{train_test}.spacy`` as a spaCy DocBin.

    The text of each sentence is rebuilt by joining its ``Word`` values with
    single spaces and tokenized with a blank English pipeline. Every row whose
    ``Tag`` is not ``'O'`` becomes its own entity span, labelled with the raw
    tag value (prefixes such as ``B-``/``I-`` stay part of the label).

    Parameters
    ----------
    df : pandas.DataFrame
        Token-level table with ``'Sentence #'``, ``'Word'`` and ``'Tag'``
        columns. It is not modified.
    sentences : iterable
        Sentence ids (values of ``'Sentence #'``) to export; docs are added in
        this order.
    train_test : str, default "train"
        Stem of the output file name.

    Returns
    -------
    None
        The DocBin is written to disk as a side effect.
    """
    nlp = spacy.blank("en")  # blank pipeline: only its tokenizer is used here
    db = DocBin()  # container that collects the docs before they are written
    sentence_ids = list(sentences)

    # Split the frame into per-sentence groups once, instead of scanning every
    # row of df again for each sentence id.
    selected = df[df['Sentence #'].isin(sentence_ids)]
    rows_by_sentence = dict(iter(selected.groupby('Sentence #', sort=False)))
    no_rows = selected.iloc[:0]  # an id without rows still yields an (empty) doc

    for sentence_id in tqdm(sentence_ids):
        rows = rows_by_sentence.get(sentence_id, no_rows)
        words = rows['Word'].tolist()
        tags = rows['Tag'].tolist()

        # get content
        doc = nlp.make_doc(' '.join(words))

        # get entities: walk the words while tracking each one's character
        # offsets inside the space-joined text
        ents = []
        start = 0
        for word, tag in zip(words, tags):
            end = start + len(word)
            if tag != 'O':
                span = doc.char_span(start, end, label=tag, alignment_mode="contract")
                if span:  # skip offsets that do not map onto any token
                    ents.append(span)
            start = end + 1  # +1 for the joining space
        doc.ents = ents
        db.add(doc)
    db.to_disk(f"./{train_test}.spacy")

# Sequential split over the sentence ids, in their order of first appearance:
# the first 80% go to train, the next 10% to dev. No shuffling is applied.
train_pct = 0.8
dev_pct = 0.1
sentences = df['Sentence #'].unique()
n = len(sentences)
n_train = int(train_pct*n)
n_dev = int(dev_pct*n)
# Slice the ids computed above rather than calling unique() again on the full column
train_sentences = sentences[:n_train]
dev_sentences = sentences[n_train:n_train+n_dev]
get_spacy_file(df, train_sentences, 'train')
get_spacy_file(df, dev_sentences, 'dev')


  0%|          | 0/38367 [00:00<?, ?it/s]

  0%|          | 0/4795 [00:00<?, ?it/s]

# Fill Config

In [17]:
# Expand base_config.cfg into a complete training config and save it as config.cfg
!python3 -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


# Train Model

In [18]:
# Train the pipeline described by config.cfg using train.spacy / dev.spacy as the
# train and dev paths; the output directory is ./output
!python3 -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy

[38;5;4mℹ Using CPU[0m
[1m
[2021-04-26 21:07:56,772] [INFO] Set up nlp object from config
[2021-04-26 21:07:56,790] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-04-26 21:07:56,797] [INFO] Created vocabulary
[2021-04-26 21:07:56,797] [INFO] Finished initializing nlp object
[2021-04-26 21:08:31,612] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     57.59    0.00    0.00    0.00    0.00
  0     200         82.50   3479.50   55.03   58.81   51.69    0.55
  0     400        272.74   2419.22   69.91   70.38   69.45    0.70
  0     600        242.02   2463.52   74.89   75.21   74.57    0.75
  0     800        375.82   2787.09   77.27   80.01   74.72    0.77
  0    1000        386.50   3389.05   78

# Prediction

In [92]:
# Score the trained pipeline on the sentences left over after the train/dev split.
# type1_error counts predicted (text, label) pairs that are not in the gold pairs,
# type2_error counts gold pairs that were not predicted, and n_words counts the
# gold rows whose tag is not 'O'.
nlp = spacy.load("./output/model-best")
type1_error, type2_error, n_words = 0, 0, 0
test_sentences = df['Sentence #'].unique()[n_train+n_dev:]

# Select the held-out rows once and split them per sentence in a single pass,
# instead of masking the whole frame again for every sentence id.
test_rows = df[df['Sentence #'].isin(test_sentences)]
test_groups = test_rows.groupby('Sentence #', sort=False)
for _, sentence_rows in tqdm(test_groups, total=len(test_sentences)):
    sentence = ' '.join(sentence_rows['Word'])
    entity_rows = sentence_rows[sentence_rows['Tag'] != 'O']
    n_words += len(entity_rows)

    # truth
    truths = list(zip(entity_rows['Word'], entity_rows['Tag']))

    # predict
    doc = nlp(sentence)
    predictions = [(ent.text, ent.label_) for ent in doc.ents]

    # type 1 error
    type1_error += sum(prediction not in truths for prediction in predictions)

    # type 2 error
    type2_error += sum(truth not in predictions for truth in truths)
    

  0%|          | 0/4797 [00:00<?, ?it/s]

In [94]:
# Normalise the raw counts by the number of gold entity tokens. The rates are
# stored under new names so that re-running this cell does not divide the
# counters a second time.
type1_rate = type1_error/n_words if n_words else float('nan')
type2_rate = type2_error/n_words if n_words else float('nan')
print('Type 1 error: {}'.format(type1_rate))
print('Type 2 error: {}'.format(type2_rate))

Type 1 error: 0.14954776359806715
Type 2 error: 0.15208772147193655


In [100]:
# visualize: show the annotated rows of the first held-out sentence
df2 = df.loc[df['Sentence #'].eq(test_sentences[0])]
df2

Unnamed: 0,Sentence #,Word,POS,Tag
944610,Sentence: 43163,At,IN,O
944611,Sentence: 43163,the,DT,O
944612,Sentence: 43163,Group,NNP,B-org
944613,Sentence: 43163,of,IN,I-org
944614,Sentence: 43163,Eight,CD,I-org
944615,Sentence: 43163,summit,NN,I-org
944616,Sentence: 43163,in,IN,O
944617,Sentence: 43163,Scotland,NNP,B-geo
944618,Sentence: 43163,",",",",O
944619,Sentence: 43163,Japanese,JJ,B-gpe


In [101]:
# Rebuild the sentence text from its words, run the trained pipeline on it and
# render the predicted entities inline.
sentence = ' '.join(df2['Word'].tolist())
doc = nlp(sentence)
spacy.displacy.render(doc, style="ent", jupyter=True)