In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Text file of known entity names; it is read line by line, one entity per line.
ENTITIES_PATH = "entities.txt"
# CSV file holding the text that will be searched for those entities.
DATASET_PATH = "sentences.csv"

In [3]:
# Load the dataset. `header=0` marks the file's first row as a header that is
# replaced by `names`, and `index_col=0` takes the row index from the file.
df = pd.read_csv(
    DATASET_PATH,
    header=0,
    index_col=0,
    names=["name"],
)
df.head()

Unnamed: 0,name
0,the minister for social and family development...
1,mr speaker
2,mr desmond lee
3,ms denise phua lay peng jalan besar
4,ms sylvia lim aljunied


In [4]:
def _normalize_entity(raw_line):
    """Reduce one raw line of the entity file to a clean, matchable name.

    The line is lower-cased, every character other than ``a-z``, ``.`` and
    space is removed, runs of spaces are collapsed to one and the result is
    trimmed. Returns ``""`` when nothing usable is left.
    """
    cleaned = re.sub(r"[^a-z. ]", "", raw_line.lower())
    # Whitespace is tidied AFTER filtering: dropping characters (digits,
    # punctuation, tabs, the trailing newline) can expose leading/trailing or
    # doubled spaces that a strip() done beforehand would miss.
    return re.sub(r" {2,}", " ", cleaned).strip()


with open(ENTITIES_PATH) as f:
    # Entities that normalise to "" are dropped: an empty string is a
    # substring of every text and would produce bogus matches downstream.
    entities = np.array(
        [entity for entity in map(_normalize_entity, f) if entity],
        dtype=str,
    )

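Special case: the dataset refers to the prime minister by title only, so substitute the name before matching against the entity list.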

In [5]:
# Rows that hold only the title are rewritten to the office holder's name.
df.loc[df["name"] == "the prime minister", "name"] = "lee hsien loong"

In [6]:
def map_text_to_mp(text, entities):
    """Find the first entity from ``entities`` that occurs in ``text``.

    Entities are tried in iteration order and the first one found as a literal
    substring wins. Every occurrence of it is padded with single spaces so it
    is separated from neighbouring characters, runs of spaces are collapsed,
    and the result is trimmed.

    Args:
        text: String to search.
        entities: Iterable of entity strings, treated as literal text.

    Returns:
        ``(content, (start, end))`` where ``content`` is the re-spaced text
        and ``(start, end)`` are the character offsets of the first occurrence
        of the entity inside ``content``; ``np.nan`` when no entity is found.
    """
    for entity in entities:
        # An empty entity is a substring of every text and would "match" at
        # every position, so it is never a meaningful hit.
        if not entity or entity not in text:
            continue
        # The entity is literal text but may contain "." (a regex wildcard).
        # Escape it so the regex matches exactly what the substring test found.
        pattern = re.escape(entity)
        content = re.sub(pattern, lambda m: f" {m.group()} ", text)
        # Collapse every run of spaces, not just one level of doubling.
        content = re.sub(r" {2,}", " ", content).strip()
        match = re.search(pattern, content)
        if match is None:
            # Whitespace normalisation altered the entity itself (e.g. it
            # contained a doubled or edge space); try the next candidate
            # instead of crashing on `None.span()`.
            continue
        return content, match.span()
    return np.nan

In [7]:
# Each row gets either a (content, span) tuple or NaN when no entity matched.
df["data"] = df["name"].map(
    lambda sentence: map_text_to_mp(sentence, entities)
)
df.head()

Unnamed: 0,name,data
0,the minister for social and family development...,(the minister for social and family developmen...
1,mr speaker,
2,mr desmond lee,"(mr desmond lee, (3, 14))"
3,ms denise phua lay peng jalan besar,"(ms denise phua lay peng jalan besar, (3, 23))"
4,ms sylvia lim aljunied,"(ms sylvia lim aljunied, (3, 13))"


In [8]:
# Drop every row containing a missing value, in particular rows whose `data`
# is NaN because no entity matched.
df = df.dropna(axis="index", how="any")

Create the training and validation sets (spaCy `DocBin` files)

In [9]:
import spacy
from spacy.tokens import DocBin

LABEL = "PERSON"


def make_example(content, span):
    """Pair a text with a single entity annotation labelled ``LABEL``.

    Args:
        content: The text the annotation refers to.
        span: Indexable whose first two items are the start and end offsets.

    Returns:
        ``(content, [(start, end, LABEL)])``.
    """
    start, end = span[0], span[1]
    annotations = [(start, end, LABEL)]
    return content, annotations

# Rows whose index label is <= SPLIT_LABEL go to training, the rest to validation.
SPLIT_LABEL = 550


def write_docbin(nlp, examples, output_path):
    """Convert annotated examples to spaCy docs and save them as one DocBin file.

    Args:
        nlp: spaCy pipeline used to tokenise each text.
        examples: Iterable of ``(text, [(start, end, label), ...])`` items with
            character offsets into ``text``.
        output_path: File the DocBin is written to.

    Raises:
        AssertionError: If ``doc.char_span`` returns ``None`` for an annotation.
    """
    db = DocBin()
    for text, annotations in examples:
        doc = nlp(text)
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            assert span is not None, f"span is None: {text} {start} {end} {label}"
            ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)


nlp = spacy.blank("en")

# `.loc` label slices include BOTH endpoints, so `df.loc[:550]` together with
# `df.loc[550:]` puts row 550 in both files and leaks a training example into
# validation. A boolean mask and its complement give a disjoint split.
# NOTE(review): assumes the index holds the integer row labels read from the CSV.
is_train = df.index <= SPLIT_LABEL

training_data = [make_example(content, span) for content, span in df.loc[is_train, 'data']]
write_docbin(nlp, training_data, "./train.spacy")

validation_data = [make_example(content, span) for content, span in df.loc[~is_train, 'data']]
write_docbin(nlp, validation_data, "./val.spacy")

In [10]:
# Expand base_config.cfg into a complete training config saved as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [11]:
# Train the pipeline described by config.cfg; results are saved under ./models.
# NOTE(review): no --paths.train / --paths.dev overrides are passed, so config.cfg
# presumably already points at ./train.spacy and ./val.spacy — confirm.
!python -m spacy train config.cfg --output ./models

[i] Saving to output directory: models
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     40.50    0.00    0.00    0.00    0.00
  4     200         37.11   1270.77   97.06   98.51   95.65    0.97
  9     400          0.33      0.34   97.06   98.51   95.65    0.97
 16     600          8.79      3.36   97.06   98.51   95.65    0.97
 24     800          2.83      0.99   97.81   98.53   97.10    0.98
 34    1000         55.29     17.91   96.35   97.06   95.65    0.96
 46    1200          0.35      0.22   96.35   97.06   95.65    0.96
 61    1400          0.00      0.00   96.35   97.06   95.65    0.96
 79    1600          0.00      0.00   96.35   97.06   95.65    0.96
101    1800          0.00      0.00   96.35   97.06   95.65    0.96
128    2000          0.00      0.

[2022-03-19 18:19:49,796] [INFO] Set up nlp object from config
[2022-03-19 18:19:49,805] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-03-19 18:19:49,809] [INFO] Created vocabulary
[2022-03-19 18:19:49,810] [INFO] Finished initializing nlp object
[2022-03-19 18:19:50,266] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [23]:
# Smoke test: load the saved "model-best" pipeline and print the entities it
# finds in one lower-cased sample string.
trained_nlp = spacy.load("models/model-best")
text = "mr edwin tong".lower()
doc = trained_nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

edwin tong PERSON
