### spaCy NER with Statistical Modelling

In [1]:
import spacy
import pandas as pd
from spacy import displacy
from spacy.matcher import Matcher

In [2]:
# Load the `en_core_web_md` model package as the working pipeline.
nlp = spacy.load("en_core_web_md")

In [3]:
# Show the (name, component) pairs currently in the pipeline.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1ebbe65ac40>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1ebbe62afa0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1ebbe767040>)]

![](https://spacy.io/pipeline-7a14d4edd18f3edfee8f34393bff2992.svg) 

### The tokenizer is a fixed part of the pipeline; the remaining components can be customized

In [4]:
# Drop the loaded model's `ner` component; the call returns the removed (name, component) pair.
nlp.remove_pipe('ner')

('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1ebbe767040>)

In [5]:
# Confirm that `ner` is no longer listed in the pipeline.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x1ebbe65ac40>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x1ebbe62afa0>)]

### Matcher and Entity-Offset Parsing Function
https://spacy.io/usage/training#training-simple-style

In [8]:
from common import create_patterns  # common.py supplies the patterns that are registered on the matcher below
# Build the matcher on the loaded pipeline's vocabulary, with validation switched on.
matcher = Matcher(nlp.vocab, validate=True)
# Register every pattern under the single key "PROG_LANG".
# NOTE(review): the positional `None` is presumably the on-match callback slot
# of the spaCy v2 `Matcher.add` signature - confirm against the installed version.
matcher.add("PROG_LANG", None, *create_patterns()) 

In [9]:
doc = nlp("I do code with datastuff using python and golang.")

# Each hit is a (match_id, start, end) triple; start/end slice `doc` into the
# matched span. `start` and `end` stay bound after the loop finishes.
for match_id, start, end in matcher(doc):
    matched_span = doc[start:end]
    print(matched_span)

python
golang


In [10]:
# `start` and `end` are left over from the last iteration of the loop in the
# previous cell, so this inspects the type of the final match.
type(doc[start:end])

spacy.tokens.span.Span

In [11]:
def parse_train_data(doc, label='PROGLANG', span_matcher=None):
    """Convert matcher hits on ``doc`` into a ``(text, annotations)`` training pair.

    Args:
        doc: processed document; it must support ``doc[start:end]`` slicing
            (yielding an object with ``start_char`` / ``end_char``) and
            expose ``.text``.
        label: entity label attached to every hit. Defaults to ``'PROGLANG'``.
        span_matcher: callable that returns ``(match_id, start, end)`` triples
            for ``doc``. When omitted, the notebook-level ``matcher`` is used.

    Returns:
        ``(doc.text, {'entities': [(start_char, end_char, label), ...]})`` with
        one entry per hit, in the order the matcher reported them.
    """
    if span_matcher is None:
        # Fall back to the notebook-level matcher so existing calls keep working.
        span_matcher = matcher

    detections = []
    for _match_id, start, end in span_matcher(doc):
        span = doc[start:end]  # slice once, then read both character offsets
        detections.append((span.start_char, span.end_char, label))
    return (doc.text, {'entities': detections})

# Sanity-check the helper on a sample sentence.
parse_train_data(nlp("i like python, javascript and golang"))

('i like python, javascript and golang',
 {'entities': [(7, 13, 'PROGLANG'),
   (15, 25, 'PROGLANG'),
   (30, 36, 'PROGLANG')]})

### Training Data


In [13]:
import random
import datetime as dt

In [57]:
# Read the first 5,000 rows of the tab-separated file, keeping only the
# two columns used below.
df = pd.read_csv(
    "../data/have_label.txt",
    sep='\t',
    usecols=['Label', 'Title'],
    nrows=5_000,
)

# Titles of the rows whose Label equals 1.
titles = df.loc[df['Label'] == 1, 'Title']

In [60]:
# Keep only the titles on which the matcher fires exactly once, and convert
# each of them into a (text, annotations) training pair.
TRAIN_DATA = [
    parse_train_data(title_doc)
    for title_doc in nlp.pipe(titles)
    if len(matcher(title_doc)) == 1
]
TRAIN_DATA[5:8]

[('How to set up unit testing for Visual Studio C++',
  {'entities': [(45, 48, 'PROGLANG')]}),
 ('How do you pack a visual studio c++ project for release?',
  {'entities': [(32, 35, 'PROGLANG')]}),
 ('How do you get leading wildcard full-text searches to work in SQL Server?',
  {'entities': [(62, 65, 'PROGLANG')]})]

### Training Loop (Blank English Model + NER)

https://spacy.io/usage/training#training-simple-style



In [65]:
def create_blank_nlp(train_data):
    """Build a blank "en" pipeline whose only component is a new NER.

    Every entity label found in ``train_data`` is registered on the NER
    component before the pipeline is returned.

    Args:
        train_data: iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is a dict that may carry an ``"entities"`` list
            whose items hold the label at index 2.

    Returns:
        The newly created pipeline object.
    """
    nlp = spacy.blank("en")  # local name; does not touch the notebook-level `nlp`
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)  # `ner` is the object just added, so no re-lookup is needed
    for _text, annotations in train_data:
        # Treat a missing or empty "entities" entry as "no entities" instead of
        # failing while iterating over None.
        for entity in annotations.get("entities") or ():
            ner.add_label(entity[2])
    return nlp

In [69]:
from spacy.util import minibatch, compounding

In [71]:
N_ITERATIONS = 20  # number of passes over TRAIN_DATA

# NOTE: this rebinds the notebook-level `nlp`; earlier cells that call
# `nlp(...)` will use this new pipeline if they are re-run afterwards.
nlp = create_blank_nlp(TRAIN_DATA)
optimizer = nlp.begin_training()
for i in range(N_ITERATIONS):
    # Visit the examples in a fresh random order on every pass, working on a
    # shuffled copy so TRAIN_DATA itself is left untouched.
    epoch_data = random.sample(TRAIN_DATA, len(TRAIN_DATA))
    losses = {}
    batches = minibatch(epoch_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
            texts,  # batch of texts
            annotations,  # batch of annotations
            sgd=optimizer,  # the optimizer returned by begin_training above
            drop=0.1,  # dropout - make it harder to memorise data
            losses=losses,
        )
    print(f"Losses at iteration {i} - {dt.datetime.now()} {losses}")

Losses at iteration 0 - 2020-02-29 16:13:35.823776 {'ner': 421.81081383064986}
Losses at iteration 1 - 2020-02-29 16:13:40.236429 {'ner': 16.171604070858784}
Losses at iteration 2 - 2020-02-29 16:13:45.031095 {'ner': 10.869232156674228}
Losses at iteration 3 - 2020-02-29 16:13:50.309758 {'ner': 5.347369765463781}
Losses at iteration 4 - 2020-02-29 16:13:54.814064 {'ner': 5.267283654703734}
Losses at iteration 5 - 2020-02-29 16:13:59.583930 {'ner': 7.034331411273773}
Losses at iteration 6 - 2020-02-29 16:14:04.977785 {'ner': 20.55244086534093}
Losses at iteration 7 - 2020-02-29 16:14:11.207178 {'ner': 16.854737952514622}
Losses at iteration 8 - 2020-02-29 16:14:16.702827 {'ner': 12.846826920458023}
Losses at iteration 9 - 2020-02-29 16:14:22.886344 {'ner': 7.316021861073125}
Losses at iteration 10 - 2020-02-29 16:14:29.519257 {'ner': 0.20566945497729483}
Losses at iteration 11 - 2020-02-29 16:14:36.143884 {'ner': 3.7788202090958585}
Losses at iteration 12 - 2020-02-29 16:14:42.415683 {'

In [72]:
# Inspect the components of the pipeline now bound to `nlp`.
nlp.pipeline

[('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x13cabd2e8>)]

### Room for Improvement

In [74]:
# Run the current `nlp` pipeline on a sample sentence (the next cell repeats this call).
doc = nlp("i write code in python")

In [81]:
# Process a sample sentence with `nlp` and render its entities with displaCy.
doc = nlp("i write code in python")
displacy.render(doc, style="ent")

In [83]:
# Same check on a longer sentence that ends in "and go".
doc = nlp("i write code in python and go")
displacy.render(doc, style="ent")