In [1]:
import json
import spacy
from spacy.tokens import DocBin
from pathlib import Path

# Load the annotated examples from admin.jsonl: one JSON object per line, each
# with a "text" string and a "label" list of (start, end, label) triples.
data_path = Path("admin.jsonl")
# Decode explicitly as UTF-8: relying on the platform default encoding can
# mis-decode non-ASCII text and shift the character offsets in "label".
with open(data_path, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
    data = [json.loads(line) for line in f if line.strip()]

# Lowercase all text
for example in data:
    lowered = example["text"].lower()
    # str.lower() can change the length of a few Unicode characters
    # (e.g. "İ" becomes two code points); the (start, end) offsets in
    # example["label"] would then point at the wrong characters.
    if len(lowered) != len(example["text"]):
        print(
            "Warning: lowercasing changed the text length; "
            f"entity offsets may be misaligned: {example['text']!r}"
        )
    example["text"] = lowered

In [2]:
# Create a blank English NLP pipeline and add NER
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Add all labels to the NER component
for example in data:
    for start, end, label in example["label"]:
        ner.add_label(label)

# Convert examples to Doc objects with annotated entities
doc_bin = DocBin()
skipped_spans = 0
for example in data:
    doc = nlp.make_doc(example["text"])
    ents = []
    for start, end, label in example["label"]:
        span = doc.char_span(start, end, label=label)
        # char_span returns None when the character offsets do not line up
        # with token boundaries. Report it instead of dropping the annotation
        # silently, otherwise the training set shrinks without any signal.
        if span is None:
            skipped_spans += 1
            print(
                f"Skipping misaligned span [{start}:{end}] {label!r} "
                f"in: {example['text']!r}"
            )
            continue
        ents.append(span)
    # doc.ents rejects overlapping spans with a ValueError; filter_spans keeps
    # the longest span (the first one on ties) so overlapping annotations do
    # not abort the conversion.
    doc.ents = spacy.util.filter_spans(ents)
    doc_bin.add(doc)

if skipped_spans:
    print(f"Skipped {skipped_spans} misaligned entity span(s)")

# Save to .spacy file
output_path = Path("train.spacy")
doc_bin.to_disk(output_path)
print(f"Saved training data to {output_path}")

Saved training data to train.spacy


In [None]:
# Run spaCy training using your existing config
# NOTE(review): no --paths.train / --paths.dev overrides are passed, so this
# presumably relies on config.cfg already pointing at the training and dev
# corpora (e.g. the train.spacy file written earlier) — confirm.
# Checkpoints are written under dummy_models/env_ner; a later cell loads
# dummy_models/env_ner/model-best from there.
!python -m spacy train config.cfg --output dummy_models/env_ner

In [13]:
# Load the best checkpoint written by the training run.
nlp = spacy.load("./dummy_models/env_ner/model-best")

# Sample sentences used to spot-check the trained entity recognizer.
texts = [
    "Florentis is a resilient species found in alpine meadows.",
    "The team discovered Graviton nesting near the glacier edge.",
    "Researchers recorded growth patterns of Luroxis in volcanic soils.",
    "Veltara was identified as a pioneer organism in the post-fire zone.",
    "Micranta thrives in nutrient-poor tundra slopes and resists wind erosion.",
    "Samples of Zarna were stored at −20°C for analysis.",
    "Lion was misclassified in earlier taxonomy records.",
]

for text in texts:
    # The model is fed the lowercased sentence; the original casing is only
    # used for display, so entity text is printed in lowercase.
    lowered_text = text.lower()
    doc = nlp(lowered_text)
    entity_pairs = [(span.text, span.label_) for span in doc.ents]
    print(f"\nText: {text}")
    print("Entities:", entity_pairs)



Text: Florentis is a resilient species found in alpine meadows.
Entities: []

Text: The team discovered Graviton nesting near the glacier edge.
Entities: []

Text: Researchers recorded growth patterns of Luroxis in volcanic soils.
Entities: []

Text: Veltara was identified as a pioneer organism in the post-fire zone.
Entities: []

Text: Micranta thrives in nutrient-poor tundra slopes and resists wind erosion.
Entities: []

Text: Samples of Zarna were stored at −20°C for analysis.
Entities: [('zarna', 'TAXONOMY')]

Text: Lion was misclassified in earlier taxonomy records.
Entities: []
