In [1]:
from spacy import displacy
import spacy
spacy.__version__

'3.7.5'

In [2]:
nlp = spacy.load('grc_odycy_joint_trf')

  from .autonotebook import tqdm as notebook_tqdm


# Read data

In [3]:
import pandas as pd

df = pd.read_csv('acts.csv')
df['Verse'] = df['Verse'].astype(int)
df['Proper Noun'] = df['Proper Noun'].str.replace('TRUE,', '')

# Inspect

In [4]:
df['Word'].head(20)

0            Τὸν
1            μὲν
2         πρῶτον
3          λόγον
4     ἐποιησάμην
5           περὶ
6         πάντων
7              ὦ
8        Θεόφιλε
9             ὧν
10        ἤρξατο
11             ὁ
12        Ἰησοῦς
13        ποιεῖν
14            τε
15           καὶ
16     διδάσκειν
17          ἄχρι
18            ἧς
19        ἡμέρας
Name: Word, dtype: object

In [5]:
df['Proper Noun'].value_counts()

Proper Noun
PERSON    374
GPE       177
DIVINE    157
GROUP     141
TITLE     117
ORG        22
LOC        21
DEITY      11
EVENT       8
BOOK        7
LANG        2
TIME        1
Name: count, dtype: int64

In [6]:
len(df)

10038

# Preprocessing


### Split train vs. test

Take the first 7 chapters as training data

In [7]:
df_train = df[df['Verse'].astype(int) < 50801]
df_test = df[df['Verse'].astype(int) >= 50801]

### Generate NER inputs

In [28]:
def generate_ner_inputs(df):
    inputs, verse_ids = [], []
    for _, df_verse in df.groupby('Verse'):
        verse_id = df_verse['Verse'].iloc[0]
        verse_ids.append(verse_id)
        verse = ' '.join(df_verse['Word'])
        df_verse['Word Size'] = df_verse['Word'].str.len()
        df_verse['Character Count Cumsum'] = (df_verse['Word Size'] + 1).cumsum()
        # df_verse.at[df_verse.index[-1], 'Character Count Cumsum']  1
        df_verse['Word Start Index'] = df_verse['Character Count Cumsum'].shift(1).fillna(0).astype(int)
        df_verse['Word End Index'] = df_verse['Character Count Cumsum'] - 1
        entities = []
        df_entities = df_verse[~df_verse['Proper Noun'].isna()]
        for row_idx, row in df_entities.iterrows():
            entities.append((row['Word Start Index'], row['Word End Index'], row['Proper Noun']))
        inputs.append((verse, {'entities': entities}))
    return inputs, verse_ids

ner_train_inputs, train_verse_ids = generate_ner_inputs(df_train)
ner_test_inputs, test_verse_ids = generate_ner_inputs(df_test)

In [9]:
ner_train_inputs[0:5]

[('Τὸν μὲν πρῶτον λόγον ἐποιησάμην περὶ πάντων ὦ Θεόφιλε ὧν ἤρξατο ὁ Ἰησοῦς ποιεῖν τε καὶ διδάσκειν',
  {'entities': [(46, 53, 'PERSON'), (66, 72, 'PERSON')]}),
 ('ἄχρι ἧς ἡμέρας ἐντειλάμενος τοῖς ἀποστόλοις διὰ πνεύματος ἁγίου οὓς ἐξελέξατο ἀνελήμφθη',
  {'entities': [(33, 43, 'GROUP'), (48, 63, 'DIVINE')]}),
 ('οἷς καὶ παρέστησεν ἑαυτὸν ζῶντα μετὰ τὸ παθεῖν αὐτὸν ἐν πολλοῖς τεκμηρίοις δι’ ἡμερῶν τεσσεράκοντα ὀπτανόμενος αὐτοῖς καὶ λέγων τὰ περὶ τῆς βασιλείας τοῦ θεοῦ',
  {'entities': [(154, 158, 'DIVINE')]}),
 ('καὶ συναλιζόμενος παρήγγειλεν αὐτοῖς ἀπὸ Ἱεροσολύμων μὴ χωρίζεσθαι ἀλλὰ περιμένειν τὴν ἐπαγγελίαν τοῦ πατρὸς ἣν ἠκούσατέ μου',
  {'entities': [(41, 52, 'GPE'), (102, 108, 'DIVINE')]}),
 ('ὅτι Ἰωάννης μὲν ἐβάπτισεν ὕδατι ὑμεῖς δὲ ἐν πνεύματι βαπτισθήσεσθε ἁγίῳ οὐ μετὰ πολλὰς ταύτας ἡμέρας',
  {'entities': [(4, 11, 'PERSON'), (44, 52, 'DIVINE')]})]

In [10]:
nlp.pipe_names

['transformer',
 'tagger',
 'morphologizer',
 'parser',
 'trainable_lemmatizer',
 'frequency_lemmatizer']

# Create an NER pipeline in odyCy

In [11]:
# ner_pipe = nlp.add_pipe("ner")

In [13]:
if 'ner' in nlp.pipe_names:
    nlp.remove_pipe("ner")
ner_pipe = nlp.add_pipe("ner")
# ner_pipe = nlp.get_pipe('ner')
for label in df['Proper Noun'].unique():
    if str(label) != 'nan':
        ner_pipe.add_label(label)

In [14]:
ner_pipe.labels

('BOOK',
 'DEITY',
 'DIVINE',
 'EVENT',
 'GPE',
 'GROUP',
 'LANG',
 'LOC',
 'ORG',
 'PERSON',
 'TIME',
 'TITLE')

# Train the NER pipeline

In [15]:
if "frequency_lemmatizer" in nlp.pipe_names:
    nlp.remove_pipe("frequency_lemmatizer")
if "trainable_lemmatizer" in nlp.pipe_names:
    nlp.remove_pipe("trainable_lemmatizer")
print("Updated pipeline:", nlp.pipe_names)

Updated pipeline: ['transformer', 'tagger', 'morphologizer', 'parser', 'ner']


In [16]:
# Disable other pipeline components during training (if any)
enabled_pipes = ['ner']
other_pipes = list(set(nlp.pipe_names) - set(enabled_pipes))
other_pipes

['parser', 'transformer', 'tagger', 'morphologizer']

In [17]:
len(ner_train_inputs)

253

In [18]:
import random
from spacy.training.example import Example
from spacy.util import minibatch, compounding

ner = nlp.get_pipe("ner")
ner.cfg["entity_tagger"] = "bio"

try:
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()  # Initialize the model
        optimizer.learn_rate = 0.01
        for i in range(30):  # Number of training iterations (adjust as needed)
            if i >= 9: 
                optimizer.learn_rate = 1e-3
            if i >= 18:
                optimizer.learn_rate = 1e-4
            random.shuffle(ner_train_inputs)
            losses = {}
            batches = minibatch(ner_train_inputs, size=8)
            for batch in batches:
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update([example], drop=0.1, sgd=optimizer, losses=losses)
            print(f"Iteration {i}, Losses: {losses}")
except Exception as e:
    print(e)

Iteration 0, Losses: {'ner': 745.3154391540772}
Iteration 1, Losses: {'ner': 584.0769424879805}
Iteration 2, Losses: {'ner': 457.5183563799422}
Iteration 3, Losses: {'ner': 674.4264358493301}
Iteration 4, Losses: {'ner': 448.86567710825176}
Iteration 5, Losses: {'ner': 335.05419278878526}
Iteration 6, Losses: {'ner': 305.8969668723571}
Iteration 7, Losses: {'ner': 293.32673063683245}
Iteration 8, Losses: {'ner': 426.3595177714365}
Iteration 9, Losses: {'ner': 164.19233191771946}
Iteration 10, Losses: {'ner': 123.4704915248998}
Iteration 11, Losses: {'ner': 115.38630447694601}
Iteration 12, Losses: {'ner': 107.13690338553319}
Iteration 13, Losses: {'ner': 82.53998280391474}
Iteration 14, Losses: {'ner': 116.1989592844416}
Iteration 15, Losses: {'ner': 69.82931312230033}
Iteration 16, Losses: {'ner': 75.55982103294356}
Iteration 17, Losses: {'ner': 72.26496800415195}
Iteration 18, Losses: {'ner': 66.88909319751268}
Iteration 19, Losses: {'ner': 37.73250009120546}
Iteration 20, Losses: {'

In [76]:
try:
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.resume_training()  # Initialize the model
        optimizer.learn_rate = 1e-8
        for i in range(30):  # Number of training iterations (adjust as needed)
            if i >= 9: 
                optimizer.learn_rate = 1e-9
            if i >= 18:
                optimizer.learn_rate = 1e-10
            random.shuffle(ner_train_inputs)
            losses = {}
            batches = minibatch(ner_train_inputs, size=8)
            for batch in batches:
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update([example], drop=0.1, sgd=optimizer, losses=losses)
            print(f"Iteration {i}, Losses: {losses}")
except Exception as e:
    print(e)

Iteration 0, Losses: {'ner': 19.677851563674444}
Iteration 1, Losses: {'ner': 26.02974700061989}
Iteration 2, Losses: {'ner': 17.964574074179716}
Iteration 3, Losses: {'ner': 19.660696260087306}
Iteration 4, Losses: {'ner': 17.445558394918905}
Iteration 5, Losses: {'ner': 27.796410740120077}
Iteration 6, Losses: {'ner': 23.36597270980115}
Iteration 7, Losses: {'ner': 21.427785733789484}
Iteration 8, Losses: {'ner': 28.36631520661781}
Iteration 9, Losses: {'ner': 24.104098416128178}
Iteration 10, Losses: {'ner': 26.763916227431476}
Iteration 11, Losses: {'ner': 25.79023606844047}


KeyboardInterrupt: 

# Save model

In [20]:
nlp.to_disk('./odycy_with_ner/')

# Reload model

In [354]:
nlp.pipe_names

['transformer',
 'tagger',
 'morphologizer',
 'parser',
 'trainable_lemmatizer',
 'frequency_lemmatizer']

In [21]:
# nlp = spacy.load('grc_odycy_joint_trf')
# ner = nlp.create_pipe('ner')
# ner.from_disk('./ner_model')

# ner_pipe = nlp.get_pipe('ner')
# ner_pipe.labels

# Test examples

In [31]:
import warnings
warnings.filterwarnings('ignore')

In [74]:
idx = 298
text, annotations = ner_test_inputs[idx]
verse_id = test_verse_ids[idx]
doc = nlp(text)

spacy.displacy.render(doc, style='ent')
print('Verse id:', verse_id)
print('Correct annotations:')
print(annotations)

Verse id: 51535
Correct annotations:
{'entities': [(0, 6, 'PERSON'), (14, 22, 'PERSON'), (36, 45, 'GPE'), (115, 121, 'TITLE')]}


Translation (ESV): But Paul and Barnabas remained in Antioch, teaching and preaching the word of the Lord, with many others also. 

# Test scores

In [None]:
from spacy.scorer import Scorer
scorer = Scorer()
examples = []
for text, annot in ner_test_inputs:
    doc = nlp(text)
    example = Example.from_dict(nlp.make_doc(text), annot)
    examples.append(example)

In [78]:
scores = scorer.score(examples)

In [79]:
scores

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'sents_p': None,
 'sents_r': None,
 'sents_f': None,
 'tag_acc': None,
 'pos_acc': None,
 'morph_acc': None,
 'morph_micro_p': None,
 'morph_micro_r': None,
 'morph_micro_f': None,
 'morph_per_feat': None,
 'dep_uas': None,
 'dep_las': None,
 'dep_las_per_type': None,
 'ents_p': 0.0,
 'ents_r': 0.0,
 'ents_f': 0.0,
 'ents_per_type': {'GROUP': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'ORG': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'TITLE': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'DIVINE': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'GPE': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'PERSON': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'LOC': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'LANG': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'EVENT': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'DEITY': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'BOOK': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'TIME': {'p': 0.0, 'r': 0.0, 'f': 0.0}},
 'cats_score': 0.0,
 'cats_score_desc': 'macro F',
 'cats_micro_p': 0.0,
 'cats_micro