# Vocabulary Curation for Environmental Science NER

## 1. Scraping Domain Terms from GEMET
We used the GEMET vocabulary via EIONET to retrieve domain-specific terms for categories like pollution, habitats, and climate. The scraper iterates through each themed page, collecting distinct terms.


The `environmental_policy` theme was excluded as it did not align with the NER categories chosen for this project.


In [1]:
import requests
from bs4 import BeautifulSoup
import time
import os

# GEMET theme name -> numeric theme ID substituted into BASE_URL.
# NOTE(review): "environmental_policy" is still listed here (and is therefore
# scraped by the loop over `themes`), although the notes above describe it as
# excluded — confirm which is intended.
themes = {
    "pollution": 26,
    "biology": 4,
    "natural_areas": 23,
    "climate": 7,
    "environmental_policy": 11,
}

# Paginated concept listing; the placeholders are (theme ID, page number).
BASE_URL = "https://www.eionet.europa.eu/gemet/en/theme/{}/concepts/?page={}&letter=0"

# Directory that receives one <theme_name>.txt file per scraped theme.
OUTPUT_DIR = "gemet_terms"
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

def scrape_theme(theme_id, theme_name, timeout=10):
    """Scrape the concept terms listed under one GEMET theme and save them.

    Walks the paginated concept listing for ``theme_id`` starting at page 1.
    Paging stops at the first page that cannot be fetched, answers with a
    non-200 status, lists no terms, or lists only terms already seen. The
    distinct terms collected up to that point are written, sorted and one per
    line, to ``<OUTPUT_DIR>/<theme_name>.txt``.

    Args:
        theme_id: GEMET theme ID substituted into ``BASE_URL``.
        theme_name: Name used in progress messages and as the output file stem.
        timeout: Seconds to wait for each HTTP response before giving up.
    """
    print(f"Scraping theme: {theme_name}")
    page = 1
    terms = set()

    while True:
        url = BASE_URL.format(theme_id, page)
        print(f"  -> Page {page}")
        try:
            # Without a timeout a stalled server would block this call forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Stop paging but still save what was collected so far, instead of
            # losing every earlier page to an unhandled network error.
            print(f"  [!] Request failed ({exc}), stopping.")
            break
        if response.status_code != 200:
            # In the recorded run every theme ends on this branch, so this
            # appears to be how "past the last page" shows up.
            print("  [!] Failed to load page, stopping.")
            break

        soup = BeautifulSoup(response.content, "html.parser")
        term_elements = soup.select("ul.listing.columns.split-20 li a")
        if not term_elements:
            break

        new_terms = {el.get_text(strip=True) for el in term_elements}
        if new_terms.issubset(terms):  # page added nothing new, so stop
            break

        terms.update(new_terms)
        page += 1
        time.sleep(0.5)  # pause between page requests

    with open(f"{OUTPUT_DIR}/{theme_name}.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{term}\n" for term in sorted(terms))
    print(f"  ‚úÖ Saved {len(terms)} terms for {theme_name}.")

# Scrape each configured theme in turn (dict insertion order); every call
# writes that theme's term file before the next theme starts.
for name, theme_id in themes.items():
    scrape_theme(theme_id=theme_id, theme_name=name)

print("\nüéâ All themes scraped and saved in 'gemet_terms/' folder.")


Scraping theme: pollution
  -> Page 1
  -> Page 2
  -> Page 3
  -> Page 4
  -> Page 5
  -> Page 6
  -> Page 7
  -> Page 8
  -> Page 9
  -> Page 10
  -> Page 11
  -> Page 12
  -> Page 13
  -> Page 14
  -> Page 15
  -> Page 16
  [!] Failed to load page, stopping.
  ‚úÖ Saved 578 terms for pollution.
Scraping theme: biology
  -> Page 1
  -> Page 2
  -> Page 3
  -> Page 4
  -> Page 5
  -> Page 6
  -> Page 7
  -> Page 8
  -> Page 9
  -> Page 10
  -> Page 11
  -> Page 12
  -> Page 13
  -> Page 14
  -> Page 15
  -> Page 16
  -> Page 17
  -> Page 18
  -> Page 19
  [!] Failed to load page, stopping.
  ‚úÖ Saved 691 terms for biology.
Scraping theme: natural_areas
  -> Page 1
  -> Page 2
  -> Page 3
  -> Page 4
  -> Page 5
  -> Page 6
  -> Page 7
  -> Page 8
  -> Page 9
  -> Page 10
  -> Page 11
  -> Page 12
  -> Page 13
  [!] Failed to load page, stopping.
  ‚úÖ Saved 467 terms for natural_areas.
Scraping theme: climate
  -> Page 1
  -> Page 2
  -> Page 3
  -> Page 4
  -> Page 5
  [!] Failed to

## 2. Cleaning and Filtering Terms
Lemmatise all GEMET terms.

Remove:

* Words that are too generic (e.g. "animal", "life")

* Words that are < 3 characters

* Anything that doesn't fit your NER context.

## 3. Lemmatization Script (with spaCy)

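Note: two lemmatisation cells follow. The first processes every `*.txt` file in `gemet_terms/` and drops short or purely numeric terms; the second processes only `taxonomy.txt`. TODO: decide which of the two is correct and should be used.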

In [None]:
import spacy
from pathlib import Path

nlp = spacy.load("en_core_web_sm")
VOCAB_DIR = Path("gemet_terms")
CLEAN_DIR = Path("../vocabularies")
CLEAN_DIR.mkdir(exist_ok=True)

# Lemmatise every scraped term file and write the cleaned list, under the same
# file name, into CLEAN_DIR.
for vocab_file in VOCAB_DIR.glob("*.txt"):
    with open(vocab_file, "r", encoding="utf-8") as f:
        terms = {line.strip() for line in f if line.strip()}

    lemmatised = set()
    # nlp.pipe processes the terms as a stream instead of one call per term.
    for doc in nlp.pipe(terms):
        # Re-attach each token's own trailing whitespace instead of joining on
        # " ": the tokenizer splits punctuation into separate tokens, so a
        # blanket join turns e.g. "by-product" into "by - product", which can
        # never match running text.
        lemma = "".join(token.lemma_ + token.whitespace_ for token in doc)
        lemmatised.add(lemma.strip().lower())

    # Drop terms shorter than 3 characters and purely numeric terms.
    # (Generic words are NOT filtered here; that needs a separate stop list.)
    filtered = sorted(
        term for term in lemmatised if len(term) > 2 and not term.isnumeric()
    )

    out_path = CLEAN_DIR / vocab_file.name
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(f"{term}\n" for term in filtered)

    print(f"{vocab_file.name}: {len(filtered)} terms saved.")


In [None]:
import spacy
from pathlib import Path

# Paths
BASE_DIR = Path("..") / "data" / "raw_data"
OUTPUT_DIR = Path("..") / "data" / "processed_data"
VOCAB_DIR = Path("..") / "vocabularies"

input_path = VOCAB_DIR / "taxonomy.txt"
output_path = VOCAB_DIR / "taxonomy_lemmatized.txt"

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Read original taxonomy terms (lowercased, blank lines skipped)
with open(input_path, "r", encoding="utf-8") as f:
    terms = [line.strip().lower() for line in f if line.strip()]

lemmatised_terms = set()

# nlp.pipe processes the terms as a stream instead of one call per term.
for doc in nlp.pipe(terms):
    # Re-attach each token's own trailing whitespace instead of joining on " ":
    # the tokenizer splits punctuation into separate tokens, so a blanket join
    # would turn e.g. "by-product" into "by - product".
    lemma = "".join(token.lemma_ + token.whitespace_ for token in doc)
    lemmatised_terms.add(lemma.strip())

# Sort and save, one lemma per line
lemmatised_sorted = sorted(lemmatised_terms)

with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(f"{term}\n" for term in lemmatised_sorted)

print(f"Lemmatized {len(terms)} terms down to {len(lemmatised_terms)} unique ones.")
print(f"Saved to: {output_path}")


This step uses spaCy to lemmatise all terms to their base forms (e.g., "habitats" → "habitat"). Generic terms and words shorter than 3 characters were removed.


We now have cleaned and curated vocabulary lists for each NER category:

- `taxonomy.txt` (from BHL)
- `habitat.txt`
- `pollutant.txt`
- `env_process.txt`
- `measurement.txt`

These vocabulary files are stored in the `vocabularies/` directory and are ready to be used for Aho-Corasick-based annotation. If additional terms are discovered later, they can be appended and recompiled on the fly.


### TODO: move this section to the annotation notebook, and possibly note that FlashText was too slow compared to Aho-Corasick

In [4]:
# --- FlashText Annotation ---
def annotate_text_with_vocab(text, vocab_terms, label, processor=None):
    """Return ``[start, end, label]`` spans for every vocabulary hit in ``text``.

    Args:
        text: String to search.
        vocab_terms: Iterable of keyword strings. Only used when ``processor``
            is not supplied.
        label: Value stored as the third element of every returned span.
        processor: Optional ``KeywordProcessor`` already loaded with the
            vocabulary. Building one on every call is wasteful when the same
            vocabulary is applied to many texts, so callers that loop over
            texts should build it once and pass it in.

    Returns:
        A list of ``[start_idx, end_idx, label]`` lists, in the order in which
        ``extract_keywords`` reports the matches.
    """
    if processor is None:
        processor = KeywordProcessor()
        for term in vocab_terms:
            processor.add_keyword(term)

    # span_info=True makes each match a (keyword, start, end) triple.
    matches = processor.extract_keywords(text, span_info=True)
    return [[start_idx, end_idx, label] for _keyword, start_idx, end_idx in matches]


In [6]:
# --- Annotate and Save JSONL ---
theme_files = ["pollution.txt", "pollutants.txt", "climate.txt", "natural_areas.txt"]
# Vocabulary file stem -> entity label written into the annotations.
theme_labels = {
    "pollution": "POLLUTION",
    "pollutants": "POLLUTANT",
    "climate": "CLIMATE",
    "natural_areas": "HABITAT"
}

combined_annotations = []

for fname in theme_files:
    theme_name = fname.replace(".txt", "")
    label = theme_labels[theme_name]
    with open(VOCAB_DIR / fname, encoding="utf-8") as f:
        vocab_terms = [line.strip().lower() for line in f if line.strip()]

    # Build the keyword matcher once per vocabulary. Going through
    # annotate_text_with_vocab(text, vocab_terms, label) would rebuild the
    # identical matcher for every single text, which dominated the runtime.
    processor = KeywordProcessor()
    for term in vocab_terms:
        processor.add_keyword(term)

    # NOTE(review): a text that matches several themes is appended once per
    # theme, each record carrying only that theme's spans — confirm this is
    # intended rather than one merged record per text.
    # NOTE(review): offsets are computed on text.lower() but stored next to the
    # original text; this assumes lowercasing never changes the string length.
    for text in preprocessed_texts:
        matches = processor.extract_keywords(text.lower(), span_info=True)
        annotations = [[start_idx, end_idx, label] for _keyword, start_idx, end_idx in matches]
        if annotations:
            combined_annotations.append({"text": text, "label": annotations})

# Save the annotated data, one JSON object per line
annotated_path = OUTPUT_DIR / "training_data.jsonl"
with open(annotated_path, "w", encoding="utf-8") as f:
    for item in combined_annotations:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

print(f"Saved {len(combined_annotations)} annotated samples to {annotated_path}")


  ‚è≥ Processed 563200/564547 texts...
  ‚è≥ Processed 563300/564547 texts...
  ‚è≥ Processed 563400/564547 texts...
  ‚è≥ Processed 563500/564547 texts...
  ‚è≥ Processed 563600/564547 texts...
  ‚è≥ Processed 563700/564547 texts...
  ‚è≥ Processed 563800/564547 texts...
  ‚è≥ Processed 563900/564547 texts...
  ‚è≥ Processed 564000/564547 texts...
  ‚è≥ Processed 564100/564547 texts...
  ‚è≥ Processed 564200/564547 texts...
  ‚è≥ Processed 564300/564547 texts...
  ‚è≥ Processed 564400/564547 texts...
  ‚è≥ Processed 564500/564547 texts...
‚úÖ Finished POLLUTION: 30593 annotated samples saved to ..\training_data\training_data_pollution.jsonl

üîç Annotating category: POLLUTANT from pollutants.txt
  ‚è≥ Processed 100/564547 texts...
  ‚è≥ Processed 200/564547 texts...
  ‚è≥ Processed 300/564547 texts...
  ‚è≥ Processed 400/564547 texts...
  ‚è≥ Processed 500/564547 texts...
  ‚è≥ Processed 600/564547 texts...
  ‚è≥ Processed 700/564547 texts...
  ‚è≥ Processed 800/564547 texts...
  ‚è≥

KeyboardInterrupt: 

In [None]:
# --- Clean Overlapping Annotations ---
import spacy
from spacy.training.example import Example

def has_overlapping_entities(entities):
    """Report whether any two ``(start, end, label)`` spans overlap.

    The spans are ordered by start offset and each one is compared with its
    immediate successor: an overlap exists when a span ends after the next
    one begins. Spans that merely touch (one ends exactly where the next
    starts) are not counted as overlapping.
    """
    ordered = sorted(entities, key=lambda span: span[0])
    return any(
        left_end > right_start
        for (_, left_end, _), (right_start, _, _) in zip(ordered, ordered[1:])
    )

def resolve_overlaps(entities):
    """Greedily drop overlapping spans, preferring earlier and longer ones.

    Spans are visited by ascending start offset and, for equal starts, longest
    first. A span is kept only if it shares no character position with a span
    that was already kept.

    Args:
        entities: Iterable of ``(start, end, label)`` triples with an
            exclusive ``end``.

    Returns:
        The kept spans as ``[start, end, label]`` lists, ordered by start.
    """
    ordered = sorted(entities, key=lambda span: (span[0], -(span[1] - span[0])))
    resolved = []
    # Exclusive end of the right-most kept span. Because spans arrive ordered
    # by start, a new span collides with the kept ones exactly when it starts
    # before this offset, so no per-character bookkeeping is needed.
    covered_until = float("-inf")
    for start, end, label in ordered:
        covers_nothing = end <= start  # degenerate span: occupies no position
        if covers_nothing or start >= covered_until:
            resolved.append([start, end, label])
            if not covers_nothing:
                covered_until = end
    # Spans were appended in start order, so the result is already sorted.
    return resolved

# Blank English pipeline; it only supplies the Doc objects that the
# annotations are checked against below.
nlp = spacy.blank("en")
nlp.max_length = 5_000_000

# Load previously saved annotations (one JSON object per line)
raw_data = []
with open(annotated_path, "r", encoding="utf-8") as f:
    for line in f:
        raw_data.append(json.loads(line.strip()))

valid_data = []
invalid_data = []

for i, example in enumerate(raw_data):
    text, annotations = example["text"], example["label"]

    # Only rewrite the span list when it actually contains a collision.
    if has_overlapping_entities(annotations):
        annotations = resolve_overlaps(annotations)

    doc = nlp(text)
    try:
        # Constructing an Example is used purely as a validity check; the
        # resulting object is discarded.
        Example.from_dict(doc, {"entities": annotations})
    except Exception as e:
        # Keep the rejected record together with its position and the error.
        invalid_data.append(
            dict(index=i, error=str(e), text=text, label=annotations)
        )
    else:
        valid_data.append(dict(text=text, label=annotations))

# Save cleaned data
cleaned_path = OUTPUT_DIR / "cleaned_training_data.jsonl"
with open(cleaned_path, "w", encoding="utf-8") as f:
    for item in valid_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"‚úÖ Cleaned: {len(valid_data)} valid / {len(invalid_data)} invalid")
print(f"üìÅ Saved to: {cleaned_path}")
