In [63]:
from pathlib import Path
import joblib

import spacy
from spacy.lang.en import English
from spacy.scorer import Scorer
from spacy.tokens import Span
from spacy.training.example import Example
from spacy.tokens import DocBin

from sklearn.metrics import classification_report, confusion_matrix

from collections import defaultdict, Counter

TEST_DATA_PATH = Path("../data/spaCy/test.spacy")

CRF_MODEL_PATH = Path("../models/crf/final_crf_model.joblib")
CNN_MODEL_PATH = Path("../models/spaCy/cnn_best/model-best")
TRANSFORMER_MODEL_PATH = Path("../models/spaCy/transformer_2/model-best")

In [58]:
def load_test_data(path):
    doc_bin = DocBin().from_disk(path)
    return list(doc_bin.get_docs(cnn_model.vocab))

test_docs = load_test_data(TEST_DATA_PATH)

In [64]:
# Load CRF model
crf_model = joblib.load(CRF_MODEL_PATH)

# Load SpaCy CNN model
cnn_model = spacy.load(CNN_MODEL_PATH)

# Load SpaCy Transformer model
transformer_model = spacy.load(TRANSFORMER_MODEL_PATH)

In [65]:
def word2features(sentence, i):
    word = sentence[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }

    if i > 0:
        prev_word = sentence[i - 1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sentence) - 1:
        next_word = sentence[i + 1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
    else:
        features['EOS'] = True

    return features

def sentence_to_features(sentence):
    return [word2features(sentence, i) for i in range(len(sentence))]

In [18]:
def flatten_predictions(gold_docs, cnn_model, transformer_model, crf_model):
    y_true = []
    y_pred_cnn = []
    y_pred_trans = []
    y_pred_crf = []

    for doc in gold_docs:
        # True entity spans
        true_ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}

        # CNN prediction
        pred_cnn = cnn_model(doc.text)
        pred_ents_cnn = {(ent.start_char, ent.end_char): ent.label_ for ent in pred_cnn.ents}

        # Transformer prediction
        pred_trans = transformer_model(doc.text)
        pred_ents_trans = {(ent.start_char, ent.end_char): ent.label_ for ent in pred_trans.ents}

        # CRF prediction — needs tokenised sentence with dummy tags
        tokens = [t.text for t in doc]
        dummy_sentence = [(token, "O") for token in tokens]
        features = sentence_to_features(dummy_sentence)
        pred_labels_crf = crf_model.predict_single(features)

        # Convert BIO tags to character spans
        pred_ents_crf = {}
        start = None
        current_label = None
        for i, tag in enumerate(pred_labels_crf):
            if tag.startswith("B-"):
                if start is not None:
                    end_idx = doc[i - 1].idx + len(doc[i - 1])
                    pred_ents_crf[(doc[start].idx, end_idx)] = current_label
                start = i
                current_label = tag[2:]
            elif tag.startswith("I-") and start is not None and tag[2:] == current_label:
                continue
            else:
                if start is not None:
                    end_idx = doc[i - 1].idx + len(doc[i - 1])
                    pred_ents_crf[(doc[start].idx, end_idx)] = current_label
                    start = None
                    current_label = None
        if start is not None:
            end_idx = doc[-1].idx + len(doc[-1])
            pred_ents_crf[(doc[start].idx, end_idx)] = current_label

        # Align and record predictions
        for span, true_label in true_ents.items():
            y_true.append(true_label)
            y_pred_cnn.append(pred_ents_cnn.get(span, "O"))
            y_pred_trans.append(pred_ents_trans.get(span, "O"))
            y_pred_crf.append(pred_ents_crf.get(span, "O"))

    return y_true, y_pred_cnn, y_pred_trans, y_pred_crf


In [None]:
y_true, y_cnn, y_trans, y_crf = flatten_predictions(test_docs, cnn_model, transformer_model, crf_model)


In [22]:
# Extract just the overall (weighted avg) precision, recall, f1 for each model
def get_overall_scores(y_true, y_pred):
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    return {
        "precision": round(report["weighted avg"]["precision"], 3),
        "recall": round(report["weighted avg"]["recall"], 3),
        "f1": round(report["weighted avg"]["f1-score"], 3),
    }

overall = pd.DataFrame({
    "crf": get_overall_scores(y_true, y_crf),
    "cnn": get_overall_scores(y_true, y_cnn),
    "transformer": get_overall_scores(y_true, y_trans)
})

print("Overall Weighted Scores")
print(overall)


Overall Weighted Scores
            crf   cnn  transformer
precision 1.000 0.999        0.998
recall    0.983 0.972        0.939
f1        0.991 0.985        0.967


In [23]:
# Extract per-label scores only (excluding O, avg rows)
def get_per_label_scores(y_true, y_pred, model_name):
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    rows = {}
    for label in report:
        if label in ["accuracy", "macro avg", "weighted avg", "O"]:
            continue
        rows[label] = {
            f"{model_name}_precision": round(report[label]["precision"], 3),
            f"{model_name}_recall": round(report[label]["recall"], 3),
            f"{model_name}_f1": round(report[label]["f1-score"], 3),
        }
    return pd.DataFrame.from_dict(rows, orient="index")

# Create and merge
df_crf = get_per_label_scores(y_true, y_crf, "crf")
df_cnn = get_per_label_scores(y_true, y_cnn, "cnn")
df_trans = get_per_label_scores(y_true, y_trans, "transformer")

per_label = pd.concat([df_crf, df_cnn, df_trans], axis=1)

print("\nPer-Label Scores")
print(per_label)



Per-Label Scores
             crf_precision  crf_recall  crf_f1  cnn_precision  cnn_recall  \
ENV_PROCESS          1.000       0.981   0.990          1.000       0.980   
HABITAT              1.000       0.990   0.995          1.000       0.989   
MEASUREMENT          1.000       0.980   0.990          1.000       0.974   
POLLUTANT            0.999       0.990   0.994          0.993       0.980   
TAXONOMY             0.998       0.973   0.986          0.999       0.932   

             cnn_f1  transformer_precision  transformer_recall  transformer_f1  
ENV_PROCESS   0.990                  0.999               0.951           0.974  
HABITAT       0.995                  0.999               0.979           0.989  
MEASUREMENT   0.987                  0.998               0.929           0.962  
POLLUTANT     0.986                  0.994               0.946           0.970  
TAXONOMY      0.964                  0.997               0.863           0.925  


In [66]:

# Sample input text
text = (
    "Fieldwork commenced just after dawn in the upper Wensum catchment, where volunteers recorded temperatures below 6 °C "
    "and detected elevated levels of nitrates and trace concentrations of microplastics near the riparian buffer zone. "
    "A cluster of Eurasian badgers was seen retreating into a sett bordering a damp woodland patch, while two red squirrels "
    "leapt between oak canopies. Near the estuary inlet, European shags were observed preening, with occasional calls echoing "
    "across the saltmarsh. Later in the morning, we encountered several common chiffchaffs and a grey heron standing motionless "
    "in the wetland shallows. Evidence of European rabbits digging under bramble thickets was noted alongside tracks left by roe deer. "
    "Air pressure dropped slightly as cloud cover thickened, consistent with an approaching Atlantic front. Beneath scattered birches, "
    "we documented a Eurasian coot nest, disturbed only briefly by a curious fox. Scat near a hollow log suggested recent pine marten activity, "
    "while the adjacent heathland yielded a slow worm and a smooth snake basking in a sunlit patch of leaf litter. "
    "Near the lower transect, yellowhammers foraged on dried seed heads, and a tawny owl fledgling was spotted perched under a beech branch. "
    "In a shaded brook, two common frogs swam toward the bank while a solitary common toad remained hidden under moss. "
    "As the team crossed an open grassland strip, a kestrel hovered overhead and long-tailed tits passed in a burst of movement. "
    "We documented vocalisations from Eurasian nuthatches and woodpeckers, and spotted a muntjac deer grazing under blackthorn. "
    "Bank vole tunnels were visible along the hedgerow margin. As dusk approached, the air cooled and pipistrelles emerged, silhouetted "
    "against the sky. Atmospheric conditions suggested early signs of climate change effects on the breeding rhythms of local amphibians."
)

# Get predictions from CNN and Transformer models
results = {}
entity_counts = {}

doc_cnn = cnn_model(text)
doc_transformer = transformer_model(text)

results["cnn"] = {(ent.text, ent.label_) for ent in doc_cnn.ents}
results["transformer"] = {(ent.text, ent.label_) for ent in doc_transformer.ents}
entity_counts["cnn"] = len(results["cnn"])
entity_counts["transformer"] = len(results["transformer"])

In [67]:
# Tokenise the input using SpaCy's English tokenizer
nlp_dummy = English()
tokens = [token.text for token in nlp_dummy(text)]

# Create dummy sentence and extract features
sentence = [(t, "O") for t in tokens]
X_test = sentence_to_features(sentence)
y_pred = crf_model.predict_single(X_test)

# Convert BIO tags to entity spans
spans = []
current = []
label = None

for token, tag in zip(tokens, y_pred):
    if tag == 'O':
        if current:
            spans.append((" ".join(current), label))
            current = []
            label = None
    elif tag.startswith('B-'):
        if current:
            spans.append((" ".join(current), label))
        current = [token]
        label = tag[2:]
    elif tag.startswith('I-') and label == tag[2:]:
        current.append(token)
    else:
        if current:
            spans.append((" ".join(current), label))
        current = []
        label = None

if current:
    spans.append((" ".join(current), label))

results["crf"] = set(spans)
entity_counts["crf"] = len(spans)


In [68]:
import pandas as pd

# Combine all entity predictions
all_entities = sorted(set.union(*results.values()), key=lambda x: text.find(x[0]))

# Construct table
rows = []
for span, label in all_entities:
    row = {
        "Entity": span,
        "Label": label,
        "crf": "✔" if (span, label) in results["crf"] else "",
        "cnn": "✔" if (span, label) in results["cnn"] else "",
        "transformer": "✔" if (span, label) in results["transformer"] else ""
    }
    rows.append(row)

# Add count summary row
count_row = {"Entity": "**Entity Count**", "Label": ""}
for name in entity_counts:
    count_row[name] = str(entity_counts[name])
rows.append(count_row)

# Display table
df = pd.DataFrame(rows)
pd.set_option("display.max_rows", None)
print(df.to_markdown(index=False))


| Entity              | Label       | crf   | cnn   | transformer   |
|:--------------------|:------------|:------|:------|:--------------|
| ° C                 | MEASUREMENT | ✔     |       |               |
| temperatures        | MEASUREMENT | ✔     | ✔     | ✔             |
| °C                  | MEASUREMENT |       | ✔     |               |
| nitrates            | POLLUTANT   | ✔     | ✔     | ✔             |
| microplastics       | POLLUTANT   | ✔     | ✔     |               |
| Eurasian badgers    | TAXONOMY    | ✔     | ✔     |               |
| badgers             | TAXONOMY    |       |       | ✔             |
| red squirrels       | TAXONOMY    | ✔     | ✔     | ✔             |
| estuary             | HABITAT     | ✔     | ✔     | ✔             |
| European shags      | TAXONOMY    | ✔     | ✔     |               |
| grey heron          | TAXONOMY    | ✔     | ✔     |               |
| heron               | TAXONOMY    |       |       | ✔             |
| wetland           