In [18]:
import os
import json
from pathlib import Path
import re

# Project folders, addressed relative to the notebook's working directory.
BASE_DIR = Path("..", "data", "raw_data")
OUTPUT_DIR = Path("..", "data", "processed_data")
VOCAB_DIR = Path("..", "vocabularies")
os.makedirs(OUTPUT_DIR, exist_ok=True)  # the output folder may not exist yet

# Read the raw corpus: one text per line, trimmed and lower-cased.
# Lines that are empty after trimming are dropped.
preprocessed_texts = []
with open(BASE_DIR / "env_data.txt", encoding="utf-8") as f:
    for line in f:
        stripped = line.strip()
        if stripped:
            preprocessed_texts.append(stripped.lower())


In [20]:

def fix_missing_spaces(text):
    """Insert the space that is missing after sentence punctuation.

    A ``.``, ``?`` or ``!`` that directly follows a letter or digit and is
    directly followed by a letter gets a single space appended, so
    ``"soil.water"`` becomes ``"soil. water"``. Punctuation followed by a
    digit (``"3.5"``), by whitespace, or by the end of the string is left
    untouched.

    Args:
        text: The string to repair.

    Returns:
        The repaired string.
    """
    # The punctuation mark has to be captured rather than merely looked ahead
    # at: a pattern made only of look-arounds matches the empty position
    # *before* the mark and would produce "soil .water" instead.
    return re.sub(r'(?<=[a-zA-Z0-9])([.?!])(?=[a-zA-Z])', r'\1 ', text)

# Reload the corpus, passing every trimmed, lower-cased line through
# fix_missing_spaces. Blank lines are ignored.
preprocessed_texts = []
with open(BASE_DIR / "env_data.txt", encoding="utf-8") as f:
    for line in f:
        stripped = line.strip()
        if not stripped:
            continue
        preprocessed_texts.append(fix_missing_spaces(stripped.lower()))

In [22]:
import ahocorasick

def build_automaton(vocab_terms):
    """Compile vocabulary terms into a searchable Aho-Corasick automaton.

    Each term is registered as both the key and its associated value, so a
    search hit hands back the matched term itself.

    Args:
        vocab_terms: Iterable of strings to register.

    Returns:
        The ``ahocorasick.Automaton`` after ``make_automaton()`` was called.
    """
    automaton = ahocorasick.Automaton()
    for word in vocab_terms:
        automaton.add_word(word, word)
    # Finalise the structure once all words are in.
    automaton.make_automaton()
    return automaton

def is_inside_hyphenated_word(text, start, end):
    """Tell whether the span ``text[start:end]`` is glued to a hyphen.

    Args:
        text: The full string the span belongs to.
        start: Index of the first character of the span.
        end: Index one past the last character of the span.

    Returns:
        True if the character right before ``start`` or the character at
        ``end`` is ``'-'``; False otherwise (including at the string edges).
    """
    preceded_by_hyphen = start > 0 and text[start - 1] == '-'
    followed_by_hyphen = end < len(text) and text[end] == '-'
    return preceded_by_hyphen or followed_by_hyphen


def annotate_text_with_vocab(text, automaton, label):
    """Collect whole-word vocabulary matches in ``text`` as labelled spans.

    A hit reported by the automaton is kept only when it is delimited by
    non-alphanumeric characters (or the string edges) on both sides and is
    not attached to a neighbouring token through a hyphen. Overlapping hits
    are all returned; nothing is filtered for overlap here.

    Args:
        text: The string to search.
        automaton: Object whose ``iter(text)`` yields ``(end_index, term)``
            pairs, ``end_index`` being the index of the last matched character.
        label: Label attached to every returned span.

    Returns:
        List of ``[start, end, label]`` lists with ``end`` exclusive, ordered
        by start offset and, for equal starts, longest span first.
    """
    text_length = len(text)
    matches = []

    for end_index, term in automaton.iter(text):
        start_index = end_index - len(term) + 1
        span_end = end_index + 1  # exclusive end offset

        # Whole-word check: both neighbours must be non-alphanumeric or absent.
        starts_on_boundary = start_index == 0 or not text[start_index - 1].isalnum()
        ends_on_boundary = span_end == text_length or not text[span_end].isalnum()
        if not (starts_on_boundary and ends_on_boundary):
            continue

        # A hyphen passes the alphanumeric test above, so "toxic" inside
        # "non-toxic" has to be rejected separately.
        if is_inside_hyphenated_word(text, start_index, span_end):
            continue

        matches.append([start_index, span_end, label])

    # Start offset ascending; the negated length puts the longer span first
    # when several spans share a start.
    matches.sort(key=lambda span: (span[0], -(span[1] - span[0])))
    return matches


In [22]:
# import inflect
# from pathlib import Path

# p = inflect.engine()

# # Load vocab file
# VOCAB_DIR = Path("..") / "vocabularies"
# input_path = VOCAB_DIR / "pollutant.txt"
# output_path = VOCAB_DIR / "pollutant_with_plurals.txt"

# # Read and pluralise
# with open(input_path, "r", encoding="utf-8") as f:
#     original_terms = {line.strip().lower() for line in f if line.strip()}

# expanded_terms = set()

# for term in original_terms:
#     expanded_terms.add(term)
#     # Try to pluralise single words
#     if len(term.split()) == 1:
#         plural = p.plural(term)
#         if plural and plural != term:
#             expanded_terms.add(plural)

# # Write expanded vocab
# with open(output_path, "w", encoding="utf-8") as f:
#     for term in sorted(expanded_terms):
#         f.write(term + "\n")

# print(f"✅ Expanded {len(original_terms)} terms to {len(expanded_terms)} (with plurals added).")
# print(f"📁 Saved to: {output_path}")


✅ Expanded 196 terms to 312 (with plurals added).
📁 Saved to: ..\vocabularies\meas_with_plurals.txt


In [23]:
# Vocabulary files, one per entity category. The annotation loop walks this
# list in order.
theme_files = [
    "measurement.txt", "pollutant.txt", "env_process.txt",
    "habitat.txt", "taxonomy.txt",
]


In [24]:
def resolve_overlaps(entities):
    """Reduce entity spans to a non-overlapping subset.

    Spans are visited by ascending start offset, the longer span first when
    several share a start. A span is kept only if none of its character
    positions is already covered by a span kept before it.

    Args:
        entities: Iterable of ``(start, end, label)`` triples, ``end`` exclusive.

    Returns:
        List of ``[start, end, label]`` lists ordered by start offset.
    """
    by_priority = sorted(entities, key=lambda span: (span[0], span[0] - span[1]))
    kept = []
    covered = set()
    for begin, stop, tag in by_priority:
        positions = range(begin, stop)
        if covered.isdisjoint(positions):
            kept.append([begin, stop, tag])
            covered.update(positions)
    kept.sort(key=lambda span: span[0])
    return kept


In [25]:
from collections import defaultdict
import time

# Wall-clock timer for the whole annotation run (reported at the end).
start_time = time.time()

# Maps each distinct text to its list of [start, end, label] annotations.
# Because the text itself is the key, duplicate input lines share one entry,
# and texts without any match never get an entry at all.
text_to_annotations = defaultdict(list)

for fname in theme_files:
    # The entity label is the upper-cased file name without ".txt",
    # e.g. "pollutant.txt" -> "POLLUTANT".
    theme_name = fname.replace(".txt", "")
    label = theme_name.upper()
    print(f"Annotating category: {label} from {fname}")

    # One vocabulary term per line, trimmed and lower-cased; blank lines dropped.
    with open(VOCAB_DIR / fname, encoding="utf-8") as f:
        vocab_terms = [line.strip().lower() for line in f if line.strip()]

    automaton = build_automaton(vocab_terms)

    for i, text in enumerate(preprocessed_texts):
        annotations = annotate_text_with_vocab(text, automaton, label)
        if annotations:
            text_to_annotations[text].extend(annotations)
            # Overlaps are resolved immediately, i.e. once per category.
            # NOTE(review): a span discarded in an earlier pass cannot come
            # back even if the span that displaced it is itself displaced by
            # a later category - confirm this is intended, or resolve once
            # after all categories have been collected.
            text_to_annotations[text] = resolve_overlaps(text_to_annotations[text])
        # Progress heartbeat every 100,000 texts.
        if (i + 1) % 100000 == 0:
            print(f"Processed {i + 1:,}/{len(preprocessed_texts):,} texts...")

# Save final output as JSON Lines: one {"text": ..., "label": [...]} object per
# annotated text. ensure_ascii=False keeps non-ASCII characters unescaped.
annotated_path = OUTPUT_DIR / "training_data.jsonl"
with open(annotated_path, "w", encoding="utf-8") as f:
    for text, annotations in text_to_annotations.items():
        json.dump({"text": text, "label": annotations}, f, ensure_ascii=False)
        f.write("\n")

end_time = time.time()
print(f"✅ Done! Annotated {len(text_to_annotations):,} unique texts.")
print(f"📁 Saved to: {annotated_path}")
print(f"⏱ Total time: {end_time - start_time:.2f} seconds")


Annotating category: MEASUREMENT from measurement.txt
Processed 100,000/564,547 texts...
Processed 200,000/564,547 texts...
Processed 300,000/564,547 texts...
Processed 400,000/564,547 texts...
Processed 500,000/564,547 texts...
Annotating category: POLLUTANT from pollutant.txt
Processed 100,000/564,547 texts...
Processed 200,000/564,547 texts...
Processed 300,000/564,547 texts...
Processed 400,000/564,547 texts...
Processed 500,000/564,547 texts...
Annotating category: ENV_PROCESS from env_process.txt
Processed 100,000/564,547 texts...
Processed 200,000/564,547 texts...
Processed 300,000/564,547 texts...
Processed 400,000/564,547 texts...
Processed 500,000/564,547 texts...
Annotating category: HABITAT from habitat.txt
Processed 100,000/564,547 texts...
Processed 200,000/564,547 texts...
Processed 300,000/564,547 texts...
Processed 400,000/564,547 texts...
Processed 500,000/564,547 texts...
Annotating category: TAXONOMY from taxonomy.txt
Processed 100,000/564,547 texts...
Processed 200

In [26]:
import random
import json
from pathlib import Path

INPUT_FILE = Path("..") / "data" / "processed_data" / "training_data.jsonl"
OUTPUT_FILE = Path("..") / "data" / "processed_data" / "sample_for_manual_testing.jsonl"

# Sampling parameters. The fixed seed makes the manual-review sample
# reproducible: re-running the cell on the same input yields the same items.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42

all_data = []

# Load safely, skip blank or bad lines
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            data = json.loads(line)
            all_data.append(data)
        except json.JSONDecodeError:
            print("⚠️ Skipped malformed line")

# Sample without replacement, capped at the number of available records.
# A dedicated Random instance is used so the module-level generator (and with
# it any other cell relying on `random`) is left untouched.
sampled = random.Random(SAMPLE_SEED).sample(all_data, min(SAMPLE_SIZE, len(all_data)))

# Save sample as JSON Lines, one record per line.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for item in sampled:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

print(f"✅ Sampled {len(sampled)} items to {OUTPUT_FILE}")


✅ Sampled 1000 items to ../data/processed_data/sample_for_manual_testing.jsonl


In [None]:
import os
import json
import spacy
from pathlib import Path
from spacy.training.example import Example

# --- Directory Setup ---
# Every other cell of this notebook reaches the data folder through "..";
# these paths must do the same, otherwise the cell looks for
# training_data.jsonl in a "data" folder next to the notebook rather than in
# the folder the annotation step wrote to.
BASE_DIR = Path("..") / "data" / "raw_data"
OUTPUT_DIR = Path("..") / "data" / "processed_data"
annotated_path = OUTPUT_DIR / "training_data.jsonl"
cleaned_path = OUTPUT_DIR / "cleaned_training_data.jsonl"

# --- Helper Functions ---
def has_overlapping_entities(entities):
    """Report whether any entity span runs into the one that starts next.

    The spans are ordered by start offset and each one is compared with its
    immediate successor only.

    Args:
        entities: Iterable of ``(start, end, label)`` triples.

    Returns:
        True as soon as a span's end lies beyond the next span's start,
        False if no such pair exists (also for zero or one span).
    """
    ordered = sorted(entities, key=lambda span: span[0])
    return any(
        this_end > following_start
        for (_, this_end, _), (following_start, _, _) in zip(ordered, ordered[1:])
    )

def resolve_overlaps(entities):
    """Reduce entity spans to a non-overlapping subset.

    Spans are visited by ascending start offset, the longer span first when
    several share a start. A span is kept only if none of its character
    positions is already covered by a span kept before it.

    Args:
        entities: Iterable of ``(start, end, label)`` triples, ``end`` exclusive.

    Returns:
        List of ``[start, end, label]`` lists ordered by start offset.
    """
    by_priority = sorted(entities, key=lambda span: (span[0], span[0] - span[1]))
    kept = []
    covered = set()
    for begin, stop, tag in by_priority:
        positions = range(begin, stop)
        if covered.isdisjoint(positions):
            kept.append([begin, stop, tag])
            covered.update(positions)
    kept.sort(key=lambda span: span[0])
    return kept

# --- SpaCy Setup ---
# Blank English pipeline; its text-length limit is raised to 5,000,000.
nlp = spacy.blank("en")
nlp.max_length = 5_000_000

# --- Load Annotated Data ---
# Blank lines are skipped, like in the other JSONL loaders of this notebook;
# json.loads on an empty line would otherwise abort the whole cell.
with open(annotated_path, "r", encoding="utf-8") as f:
    raw_data = [json.loads(line) for line in f if line.strip()]

valid_data = []    # records spaCy accepted, with overlaps already resolved
invalid_data = []  # records spaCy rejected, kept with index and error text

for i, example in enumerate(raw_data):
    text = example["text"]
    annotations = example["label"]

    # Only pay for overlap resolution when an overlap is actually present.
    if has_overlapping_entities(annotations):
        annotations = resolve_overlaps(annotations)

    doc = nlp(text)
    try:
        # Building an Example serves purely as a validity check; the object
        # itself is discarded.
        Example.from_dict(doc, {"entities": annotations})
        valid_data.append({"text": text, "label": annotations})
    except Exception as e:
        # Deliberately broad: whatever spaCy objects to is recorded instead
        # of stopping the run.
        invalid_data.append({
            "index": i,
            "error": str(e),
            "text": text,
            "label": annotations
        })

# --- Save Cleaned Data ---
# Only the valid records are written; invalid ones are merely counted below.
with open(cleaned_path, "w", encoding="utf-8") as f:
    for item in valid_data:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

print(f"Cleaned {len(valid_data)} valid samples.")
print(f"Skipped {len(invalid_data)} invalid samples.")
print(f"Saved cleaned annotations to: {cleaned_path}")


In [13]:
import spacy
from pathlib import Path

# Project folders and the two vocabulary files involved.
BASE_DIR = Path("..", "data", "raw_data")
OUTPUT_DIR = Path("..", "data", "processed_data")
VOCAB_DIR = Path("..", "vocabularies")

input_path = VOCAB_DIR / "taxonomy.txt"
output_path = VOCAB_DIR / "taxonomy_lemmatized.txt"

# Pipeline used to obtain token lemmas.
nlp = spacy.load("en_core_web_sm")

# Original taxonomy terms: one per line, trimmed, lower-cased, blanks dropped.
with open(input_path, "r", encoding="utf-8") as f:
    terms = [line.strip().lower() for line in f if line.strip()]

# Replace every token of a term by its lemma and re-join with single spaces.
# The set collapses terms that end up with the same lemmatised form.
lemmatised_terms = {
    " ".join(token.lemma_ for token in nlp(term))
    for term in terms
}

# Write the unique lemmatised terms in sorted order, one per line.
lemmatised_sorted = sorted(lemmatised_terms)

with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(term + "\n" for term in lemmatised_sorted)

print(f"Lemmatized {len(terms)} terms down to {len(lemmatised_terms)} unique ones.")
print(f"Saved to: {output_path}")


Lemmatized 4961 terms down to 4554 unique ones.
Saved to: ..\vocabularies\taxonomy_lemmatized.txt


# Training an example model

In [27]:
from pathlib import Path
import os

# Project folders, addressed relative to the notebook's working directory.
BASE_DIR = Path("..", "data", "raw_data")
OUTPUT_DIR = Path("..", "data", "processed_data")
VOCAB_DIR = Path("..", "vocabularies")
SPACY_DIR = Path("..", "data", "spacy_data")

# Both folders that get written to must exist before the following cells run.
for directory in (OUTPUT_DIR, SPACY_DIR):
    os.makedirs(directory, exist_ok=True)

# Annotated corpus in, train/dev JSONL splits out.
INPUT_FILE = OUTPUT_DIR / "training_data.jsonl"
TRAIN_JSONL = SPACY_DIR / "train.jsonl"
DEV_JSONL = SPACY_DIR / "dev.jsonl"

In [28]:
import json
import random

# Split settings. The fixed seed makes the train/dev partition reproducible,
# so a re-run does not move examples between the two sets.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.9

# Load every non-blank JSONL record.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    all_data = [json.loads(line) for line in f if line.strip()]

# Shuffle in place with a dedicated, seeded generator; the module-level
# `random` state is left untouched.
random.Random(SPLIT_SEED).shuffle(all_data)

# The first TRAIN_FRACTION of the shuffled records train, the rest is dev.
split = int(len(all_data) * TRAIN_FRACTION)
train_data, dev_data = all_data[:split], all_data[split:]

with open(TRAIN_JSONL, "w", encoding="utf-8") as f:
    for item in train_data:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

with open(DEV_JSONL, "w", encoding="utf-8") as f:
    for item in dev_data:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

print(f"Saved {len(train_data)} train and {len(dev_data)} dev examples.")


Saved 245556 train and 27284 dev examples.


In [30]:
import spacy
from spacy.tokens import DocBin
import json
from pathlib import Path

def convert_to_docbin(input_path, output_path, nlp):
    """Convert a JSONL annotation file into a serialized spaCy ``DocBin``.

    Each non-blank input line must be a JSON object with a ``"text"`` string
    and a ``"label"`` list of ``[start, end, label]`` character spans. The
    text is tokenized with ``nlp.make_doc`` and the spans become the doc's
    entities. Spans for which ``Doc.char_span`` returns ``None`` cannot be
    used as entities; they are left out and their number is reported once
    the file has been written.

    Args:
        input_path: Path of the JSONL file to read.
        output_path: Path of the ``.spacy`` file to write.
        nlp: spaCy pipeline used for tokenization.

    Returns:
        None. The result is written to ``output_path``.
    """
    doc_bin = DocBin()
    dropped_spans = 0
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline) is not a record.
            if not line.strip():
                continue
            item = json.loads(line)
            doc = nlp.make_doc(item["text"])
            ents = []
            for start, end, label in item["label"]:
                span = doc.char_span(start, end, label=label)
                if span is None:
                    dropped_spans += 1
                else:
                    ents.append(span)
            doc.ents = ents
            doc_bin.add(doc)
    doc_bin.to_disk(output_path)

    # Make the data loss visible instead of discarding spans silently.
    if dropped_spans:
        print(f"⚠️ {input_path}: dropped {dropped_spans} entity span(s) that char_span could not align")

# Blank English pipeline, used by convert_to_docbin for tokenization.
nlp = spacy.blank("en")

# Convert both splits; each .spacy file lands next to its .jsonl source.
for jsonl_path, spacy_path in (
    (Path("../data/spacy_data/train.jsonl"), Path("../data/spacy_data/train.spacy")),
    (Path("../data/spacy_data/dev.jsonl"), Path("../data/spacy_data/dev.spacy")),
):
    convert_to_docbin(jsonl_path, spacy_path, nlp)

print("✅ Converted to .spacy format")


✅ Converted to .spacy format


In [34]:
# Generate an English, NER-only training config optimised for accuracy and
# write it to ../config.cfg.
!python -m spacy init config ../config.cfg --lang en --pipeline ner --optimize accuracy

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
../config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy train config.cfg --output models/env_ner


In [40]:
import spacy

# Load the trained pipeline from the "model-best" folder of the training
# output directory (adjust the path if the model was written elsewhere).
nlp = spacy.load("../models/env_ner/model-best")

# Long test text mixing common names and scientific (Latin) names, used to
# eyeball what the model recognises. The asterisks are markdown-style
# emphasis markers and are part of the input string.
text = """
In a remote corner of Madagascar, conservationists have discovered a new population of the aye-aye, a rare lemur species known for its elongated middle finger. Meanwhile, sightings of the elusive okapi (*Okapia johnstoni*) have increased near the Ituri forest, prompting renewed efforts in habitat preservation.

Botanists at Kew Gardens have cultivated the corpse flower (*Amorphophallus titanum*), which bloomed for the first time in over a decade, attracting thousands of visitors. In nearby plots, the dragon’s blood tree (*Dracaena cinnabari*) and the queen of the Andes (*Puya raimondii*) are showing promising signs of seed viability.

Marine biologists studying coastal reefs in Australia reported the presence of the ocean sunfish (*Mola mola*), often referred to as the heaviest bony fish, alongside the rarer leafy seadragon. Additionally, deep-sea ROV footage captured what appears to be a specimen of *Leptonychotes weddellii*, a Weddell seal far from its usual Antarctic range.

In fungi research, samples of *Clathrus archeri*, or devil’s fingers, were found in pine forests previously thought unsuitable for its growth. The zombie-ant fungus (*Ophiocordyceps unilateralis*) is also showing unusual activity due to increased humidity levels in Central American rainforests.

On the microbiological front, *Bacillus subtilis* strains are being engineered to combat soil degradation, while *Prochlorococcus marinus*, one of the smallest photosynthetic organisms, continues to intrigue oceanographers for its role in carbon cycling.

In local news, a badger sett was disturbed by hikers in the Peak District, raising concerns among naturalists. Meanwhile, schools in Norfolk are teaching children to identify native British species such as the common frog, red kite, and silver birch tree.

The National Trust has launched a new awareness campaign featuring hedgehogs, tawny owls, and ancient oak trees to rekindle interest in local biodiversity. Interestingly, a small group of dormice was spotted nesting in a restored hazel coppice near the Chiltern Hills.

Elsewhere, climate scientists in Greenland observed behavioral shifts in *Tardigrada*, microscopic organisms renowned for surviving extreme conditions, hinting at yet-unknown responses to melting permafrost.

The Wildlife Photographer of the Year exhibition featured dramatic captures of a *Nannospalax leucodon* tunneling near an archaeological dig site, and a nocturnal glimpse of *Desmodus rotundus*, the common vampire bat, feeding on a stray cow near a rainforest outpost.
"""

# Process the text with the model. It is lower-cased first, matching the
# lower-casing applied to the corpus when the training data was built.
doc = nlp(text.lower())

# Print predicted entities and their labels, one "<text> -> <LABEL>" per line.
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")


okapi -> TAXONOMY
forest -> HABITAT
habitat -> HABITAT
gardens -> HABITAT
reefs -> HABITAT
ocean sunfish -> TAXONOMY
weddell seal -> TAXONOMY
forests -> HABITAT
humidity -> MEASUREMENT
carbon -> POLLUTANT
badger -> TAXONOMY
red kite -> TAXONOMY
hedgehogs -> TAXONOMY
owls -> TAXONOMY
dormice -> TAXONOMY
coppice -> HABITAT
hills -> HABITAT
climate -> ENV_PROCESS
bat -> TAXONOMY
cow -> TAXONOMY
