# Training an example model

In [None]:
# Imports used throughout the notebook; run this cell first.
import json
import random
from pathlib import Path

import spacy
from spacy.tokens import DocBin

# Directory layout, relative to the notebook's working directory.
OUTPUT_DIR = Path("..") / "data" / "json"
VOCAB_DIR = Path("..") / "vocabularies"
SPACY_DIR = Path("..") / "data" / "spacy_data"

# Create the data directories up front (including missing parents) so the
# later cells can write into them; existing directories are left untouched.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
SPACY_DIR.mkdir(parents=True, exist_ok=True)

# Source annotations and the two JSONL files produced by the train/dev split.
INPUT_FILE = OUTPUT_DIR / "training_data.jsonl"
TRAIN_JSONL = SPACY_DIR / "train.jsonl"
DEV_JSONL = SPACY_DIR / "dev.jsonl"

In [None]:

# Fixed seed so the train/dev split is identical on every notebook run.
SPLIT_SEED = 42
# Fraction of the examples assigned to the training set; the rest go to dev.
TRAIN_FRACTION = 0.9


def _write_jsonl(path, records):
    """Write *records* to *path* as UTF-8 JSON Lines, one object per line."""
    with open(path, "w", encoding="utf-8") as f:
        for record in records:
            json.dump(record, f, ensure_ascii=False)
            f.write("\n")


# Load every non-blank line of the input file as one JSON record.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    all_data = [json.loads(line) for line in f if line.strip()]

# Fail early with a clear message instead of writing two empty files that
# only break later during conversion or training.
if not all_data:
    raise ValueError(f"No examples found in {INPUT_FILE}")

# A dedicated Random instance keeps the shuffle deterministic without
# touching the global random state.
random.Random(SPLIT_SEED).shuffle(all_data)

split = int(len(all_data) * TRAIN_FRACTION)
train_data, dev_data = all_data[:split], all_data[split:]

_write_jsonl(TRAIN_JSONL, train_data)
_write_jsonl(DEV_JSONL, dev_data)

print(f"Saved {len(train_data)} train and {len(dev_data)} dev examples.")


In [None]:


def convert_to_docbin(input_path, output_path, nlp):
    """Convert a JSONL annotation file into a serialized spaCy ``DocBin``.

    Each non-blank line of *input_path* is parsed as a JSON object with a
    ``"text"`` string and a ``"label"`` list of ``(start, end, label)``
    character-offset triples. Every text is tokenized with ``nlp.make_doc``
    and the triples become the document's entities.

    Triples for which ``doc.char_span`` returns ``None`` are left out of
    the document; the number of dropped spans is printed after writing so
    annotation problems do not go unnoticed.

    Args:
        input_path: Path of the JSONL file to read.
        output_path: Destination passed to ``DocBin.to_disk``.
        nlp: spaCy pipeline whose ``make_doc`` is used for tokenization.
    """
    doc_bin = DocBin()
    skipped = 0
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines, matching how the input file is loaded
            # for the train/dev split.
            if not line.strip():
                continue
            item = json.loads(line)
            doc = nlp.make_doc(item["text"])
            ents = []
            for start, end, label in item["label"]:
                span = doc.char_span(start, end, label=label)
                if span is None:
                    skipped += 1
                else:
                    ents.append(span)
            doc.ents = ents
            doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    if skipped:
        print(f"Skipped {skipped} span(s) in {input_path} that could not be aligned to tokens.")

# Blank English pipeline: it is only used to tokenize texts via make_doc.
nlp = spacy.blank("en")

# Reuse the path constants defined in the setup cell so the converted files
# always sit next to the JSONL splits they were built from.
convert_to_docbin(TRAIN_JSONL, SPACY_DIR / "train.spacy", nlp)
convert_to_docbin(DEV_JSONL, SPACY_DIR / "dev.spacy", nlp)

print("✅ Converted to .spacy format")


In [None]:
# Generate a spaCy training config at ../config.cfg for an English pipeline
# containing only the `ner` component, using the accuracy-optimized preset.
!python -m spacy init config ../config.cfg --lang en --pipeline ner --optimize accuracy

In [None]:
!python -m spacy train config.cfg --output models/env_ner


In [None]:
# Load the trained pipeline from disk (adjust the path as needed).
model_dir = "../models/env_ner/model-best"
nlp = spacy.load(model_dir)

# Sample passage mixing common and scientific names with habitat, pollutant,
# measurement and environmental-process vocabulary.
text = """
Researchers in the Galápagos Islands have observed an unusual breeding pattern in Chelonoidis hoodensis, the Española giant tortoise, within fragmented dry forest patches. This habitat is increasingly threatened by rising nitrogen oxide levels, a pollutant linked to agricultural runoff.

Meanwhile, in the peat bogs of northern Scotland, the rare moss Sphagnum balticum is showing signs of resilience under fluctuating temperature gradients — a direct consequence of erratic jet stream shifts, an environmental process becoming more frequent due to Arctic warming.

An alarming spike in mercury concentration was recorded in wetland fish near the Mekong delta, where biodiversity is under pressure. The riverine habitat has also experienced abnormal algal blooms — a sign of nutrient loading, which is a key measurement in eutrophication studies.

In southern India, the tropical deciduous forests host a variety of mammals including the rusty-spotted cat (*Prionailurus rubiginosus*), one of the smallest wild felines, which now faces threats due to increasing particulate matter in the air. This pollutant, PM2.5, has exceeded safe thresholds in over 40% of sampled areas.

Glaciologists reported that meltwater runoff from the Vatnajökull glacier has doubled in the past decade — a clear signal of accelerated cryospheric change, an environmental process linked to anthropogenic CO₂ emissions. Concurrently, the lichen *Xanthoria parietina* has colonised newly exposed rock surfaces, suggesting rapid ecological succession in formerly ice-covered zones.

In a monitoring project across coral reef habitats in the Seychelles, elevated salinity and pH fluctuations were documented — measurements that correlate strongly with declining populations of *Symbiodinium*, a genus of symbiotic algae critical for coral health. Oil residues, another pollutant from passing tankers, were also found on reef sediments.

A surprising comeback of the great bustard (*Otis tarda*) in steppe habitats of eastern Spain was attributed to habitat restoration programmes and reduced pesticide use. The species had previously vanished due to soil degradation — a gradual environmental process driven by overgrazing and monoculture practices.
"""

# The passage is lower-cased before it is handed to the model.
# NOTE(review): presumably the training texts were lower-cased as well — confirm.
lowered_text = text.lower()
doc = nlp(lowered_text)

# One "<entity text> -> <label>" line per predicted entity.
for ent in doc.ents:
    print(ent.text, "->", ent.label_)


In [None]:
# Load the trained pipeline from disk (adjust the path as needed).
model_dir = "../models/env_ner/model-best"
nlp = spacy.load(model_dir)

# Long news-style test text about a glacier collapse above the Swiss village
# of Blatten, written as general prose.
text = """
The Swiss village of Blatten has been partially destroyed after a huge chunk of glacier crashed down into the valley.

Although the village had been evacuated some days ago because of fears the Birch glacier was disintegrating, one person has been reported missing, and many homes have been completely flattened.

Blatten's mayor, Matthias Bellwald, said "the unimaginable has happened" but promised the village still had a future.

Local authorities have requested support from the Swiss army's disaster relief unit and members of the Swiss government are on their way to the scene.

The disaster that has befallen Blatten is the worst nightmare for communities across the Alps.

The village's 300 inhabitants had to leave their homes on 19 May after geologists monitoring the area warned that the glacier appeared unstable. Now many of them may never be able to return.

Appearing to fight back tears, Bellwald said: "We have lost our village, but not our heart. We will support each other and console each other. After a long night, it will be morning again."

The Swiss government has already promised funding to make sure residents can stay, if not in the village itself, at least in the locality.

However, Raphaël Mayoraz, head of the regional office for Natural Hazards, warned that further evacuations in the areas close to Blatten might be necessary.

Climate change is causing the glaciers - frozen rivers of ice - to melt faster and faster, and the permafrost, often described as the glue that holds the high mountains together, is also thawing.

Drone footage showed a large section of the Birch glacier collapsing at about 15:30 (14:30 BST) on Wednesday. The avalanche of mud that swept over Blatten sounded like a deafening roar, as it swept down into the valley leaving an enormous cloud of dust.

Glaciologists monitoring the thaw have warned for years that some alpine towns and villages could be at risk, and Blatten is not even the first to be evacuated.

In eastern Switzerland, residents of the village of Brienz were evacuated two years ago because the mountainside above them was crumbling.

Since then, they have only been permitted to return for short periods.

In 2017, eight hikers were killed, and many homes destroyed, when the biggest landslide in over a century came down close to the village of Bondo.

The most recent report into the condition of Switzerland's glaciers suggested they could all be gone within a century, if global temperatures could not be kept within a rise of 1.5C above pre-industrial levels, agreed ten years ago by almost 200 countries under the Paris climate accord.

Many climate scientists suggest that target has already been missed, meaning the glacier thaw will continue to accelerate, increasing the risk of flooding and landslides, and threatening more communities like Blatten.
"""

# The article is lower-cased before it is handed to the model.
# NOTE(review): presumably the training texts were lower-cased as well — confirm.
lowered_text = text.lower()
doc = nlp(lowered_text)

# One "<entity text> -> <label>" line per predicted entity.
for ent in doc.ents:
    print(ent.text, "->", ent.label_)
