After extracting specimen candidates from the publications, I annotated the candidates in Prodigy. 

The annotations were then exported from Prodigy in jsonl. Here, I convert the jsonl training data into a format compatible with spaCy v3.4. The result is two files, train.spacey and valid.spacy.

In [1]:
import json
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import random

In [2]:
prodigyannotations = []
with open('./data/prodigyannotations.jsonl') as f:
    for line in f:
        annotation = json.loads(line)  
        prodigyannotations.append(annotation)

In [3]:
random.shuffle(prodigyannotations)

In [4]:
print(len(prodigyannotations))

2251


In [5]:
print(prodigyannotations[1])

{'text': 'Each specimen used in this study can be traced by a unique specimen identifier affixed to the pin (e.g. CASENT0053630).', '_input_hash': -487040886, '_task_hash': -255990888, '_is_binary': False, 'tokens': [{'text': 'Each', 'start': 0, 'end': 4, 'id': 0, 'ws': True}, {'text': 'specimen', 'start': 5, 'end': 13, 'id': 1, 'ws': True}, {'text': 'used', 'start': 14, 'end': 18, 'id': 2, 'ws': True}, {'text': 'in', 'start': 19, 'end': 21, 'id': 3, 'ws': True}, {'text': 'this', 'start': 22, 'end': 26, 'id': 4, 'ws': True}, {'text': 'study', 'start': 27, 'end': 32, 'id': 5, 'ws': True}, {'text': 'can', 'start': 33, 'end': 36, 'id': 6, 'ws': True}, {'text': 'be', 'start': 37, 'end': 39, 'id': 7, 'ws': True}, {'text': 'traced', 'start': 40, 'end': 46, 'id': 8, 'ws': True}, {'text': 'by', 'start': 47, 'end': 49, 'id': 9, 'ws': True}, {'text': 'a', 'start': 50, 'end': 51, 'id': 10, 'ws': True}, {'text': 'unique', 'start': 52, 'end': 58, 'id': 11, 'ws': True}, {'text': 'specimen', 'start':

In [6]:
trainingdata = prodigyannotations[0:1126]
validationdata = prodigyannotations[1126:2251]

In [7]:
print(len(trainingdata))
print(len(validationdata))

1126
1125


In [8]:
TRAIN_DATA = []
for annotation in trainingdata:
    entities = {}
    entitylist = []
    text = annotation.get("text")
    spans = annotation.get("spans")
    if spans is not None:
        for i in spans:
            start = i.get("start")
            end = i.get("end")
            entity = (start, end,"SPECIMEN")  #tuple
            entitylist.append(entity) #list of tuples
        entities['entities'] = entitylist #dictionary where the value is the list above
        traindata = [str(text), entities]
        TRAIN_DATA.append(traindata)

In [9]:
for annotation in trainingdata:
    entities = {}
    entitylist = []
    text = annotation.get("text")
    spans = annotation.get("spans")
    if spans is None:
        entity = (0, 0,"SPECIMEN")  #tuple
        entitylist.append(entity) #list of tuples
        entities['entities'] = entitylist #dictionary where the value is the list above
        traindata = [str(text), entities]
        TRAIN_DATA.append(traindata)

In [10]:
print(TRAIN_DATA)

[['Delta Amacuro: RÃ\xado Orinoco, shallow river, north side of river across from Isla Tres CaÃ±os, 131.8 nautical miles (= 243.8 km) from sea buoy (8Â°39â€²48â€³N, 62Â°01â€²W); USNM 228787, 8 (1 CS, 207-314).', {'entities': [(172, 183, 'SPECIMEN')]}], ['Among nondidelphid taxa with premolariform teeth at both loci, P3 is taller than P2 in â€\xa0Pucadelphys, â€\xa0Allqokirus, â€\xa0Mayulestes, â€\xa0Herpetotherium, paucituberculatans (e.g., Caenolestes (B, AMNH 196686) , and Echymipera kalubu (C, AMNH 221654).', {'entities': [(194, 205, 'SPECIMEN'), (235, 246, 'SPECIMEN')]}], ['C&S: 26 mm); MCZ 129373, 2 spp.', {'entities': [(13, 23, 'SPECIMEN')]}], ['A) Camptosaurus dispar, left pes, USNM 5473; B) Camptosaurus dispar, right pes, USNM 4277; C) Camptosaurus dispar, left pes, USNM 4697; D) Corythosaurus casuarius, right pes, USNM 15578; E) Saurolophus sp.', {'entities': [(34, 43, 'SPECIMEN'), (80, 89, 'SPECIMEN'), (175, 185, 'SPECIMEN')]}], ['These fluid specimens were labelled as types 

In [11]:
print(len(TRAIN_DATA))

1126


In [12]:
nlp = spacy.blank("en") # load a new spacy model
db = DocBin() # create a DocBin object

In [13]:
for text, annot in tqdm(TRAIN_DATA): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk("./train.spacy") # save the docbin object

 97%|█████████▋| 1088/1126 [00:00<00:00, 2314.49it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

100%|██████████| 1126/1126 [00:00<00:00, 2417.07it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity





In [14]:
TRAIN_DATA = []
for annotation in validationdata:
    entities = {}
    entitylist = []
    text = annotation.get("text")
    spans = annotation.get("spans")
    if spans is not None:
        for i in spans:
            start = i.get("start")
            end = i.get("end")
            entity = (start, end,"SPECIMEN")  #tuple
            entitylist.append(entity) #list of tuples
        entities['entities'] = entitylist #dictionary where the value is the list above
        traindata = [str(text), entities]
        TRAIN_DATA.append(traindata)

In [15]:
for annotation in validationdata:
    entities = {}
    entitylist = []
    text = annotation.get("text")
    spans = annotation.get("spans")
    if spans is None:
        entity = (0, 0,"SPECIMEN")  #tuple
        entitylist.append(entity) #list of tuples
        entities['entities'] = entitylist #dictionary where the value is the list above
        traindata = [str(text), entities]
        TRAIN_DATA.append(traindata)

In [16]:
for text, annot in tqdm(TRAIN_DATA): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk("./valid.spacy") # save the docbin object

 65%|██████▍   | 728/1125 [00:00<00:00, 2917.08it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

100%|██████████| 1125/1125 [00:00<00:00, 2744.92it/s]



Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skippin