# 1. Data Collection

This notebook focuses on collecting raw text data from a range of environmental science sources. The goal is to gather unprocessed content that will later be used for training and evaluating a Named Entity Recognition (NER) model tailored to the environmental domain.

For each source, the notebook extracts available text fields such as titles, descriptions, and full document content where possible. These are written line by line into a unified text file, without applying any preprocessing. This ensures the original structure and language of the data are preserved for later inspection, annotation, and model development.


## UKCEH Data Collection – Setup

This notebook is focused on collecting raw text data from the UKCEH data catalogue. The goal is to extract dataset titles, descriptions, and any supporting documents. At this stage, no preprocessing or annotation is applied.

The extracted content will be stored in a single text file (`data/raw_data/ukceh/ukceh_data.txt`) with one dataset per line.


In [1]:
from pathlib import Path
import os
import time
import requests
import tempfile
import zipfile
import re
import unicodedata
import contextlib
import io

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from docx import Document
from pdfminer.high_level import extract_text as extract_pdf_text
import filetype
import fitz
import pytesseract
from PIL import Image

# Folder that receives the raw UKCEH output; created up front (including
# missing parents) so later cells can append to OUTPUT_FILE without checks.
BASE_DIR = Path("..") / "data" / "raw_data" / "ukceh"
BASE_DIR.mkdir(parents=True, exist_ok=True)
# Text file that collects the scraped datasets.
OUTPUT_FILE = BASE_DIR / "ukceh_data.txt"

# Root of the UKCEH catalogue; listing-page and dataset URLs are built from it.
BASE_URL = "https://catalogue.ceh.ac.uk"

### Define page scraper

This function loads a single catalogue page using Selenium and returns all dataset links from it. Each dataset link corresponds to a document detail page that may include downloadable supporting documents.


In [31]:
def get_links_from_page(page_num):
    """Return the dataset detail-page URLs listed on one catalogue page.

    The page is rendered in a fresh Chrome session driven by Selenium and the
    resulting HTML is parsed with BeautifulSoup.

    Args:
        page_num: Value used for the ``page`` query parameter.

    Returns:
        A list of absolute URLs (``BASE_URL`` + href) for every link inside
        the ``results__list`` container whose href starts with
        ``/documents/``. An empty list when the container is missing.
    """
    url = f"{BASE_URL}/?page={page_num}"
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(url)
        page_html = driver.page_source
    finally:
        # Always shut the browser down, even when the page load fails;
        # otherwise every failed page would leave a Chrome process behind.
        driver.quit()

    soup = BeautifulSoup(page_html, "html.parser")
    results_list = soup.find("div", class_="results__list")
    if not results_list:
        return []

    return [
        BASE_URL + a["href"]
        for a in results_list.find_all("a", href=True)
        if a["href"].startswith("/documents/")
    ]

### Get dataset metadata

Given a dataset URL, this function extracts the title, description, and supporting document download link. These elements are scraped directly from the HTML structure of the dataset detail page.


In [32]:
def get_dataset_info(dataset_url, timeout=30):
    """Scrape title, description and supporting-documents link for a dataset.

    Args:
        dataset_url: Absolute URL of the dataset detail page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole crawl.

    Returns:
        A ``(title, description, supporting_link)`` tuple. ``title`` and
        ``description`` are empty strings when the corresponding element is
        not on the page; ``supporting_link`` is an absolute URL, or ``None``
        when the page has no ``btn btn-access`` link.

    Raises:
        requests.RequestException: On network errors, timeouts, or an HTTP
            error status (so error pages are not parsed as datasets).
    """
    from urllib.parse import urljoin

    response = requests.get(dataset_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else ""
    desc_div = soup.find("div", class_="description-text")
    description = desc_div.get_text(strip=True) if desc_div else ""

    supporting_tag = soup.find("a", class_="btn btn-access")
    supporting_link = None
    if supporting_tag and supporting_tag.get("href"):
        # Resolve relative hrefs against the page URL; absolute hrefs pass
        # through unchanged.
        supporting_link = urljoin(dataset_url, supporting_tag["href"])

    return title, description, supporting_link

### Read text from a single file

This function attempts to read content from a file of type PDF, DOCX, or TXT. For PDFs, it first uses a text-based parser and falls back to OCR if needed. Files that do not contain extractable text are skipped.


In [33]:
def read_file_text(file_path):
    """Return the text content of a PDF, DOCX or TXT file as a single line.

    The file type is taken from ``filetype.guess`` and, when that yields
    nothing, from the filename extension. PDFs are first read with pdfminer;
    if that produces no text, every page is rendered at 300 dpi and passed
    through Tesseract OCR.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text with newlines replaced by spaces, or ``""`` for
        unsupported file types, files without extractable text, and files
        that fail to process (a message is printed in that case).
    """
    try:
        kind = filetype.guess(file_path)
        if kind:
            ext = kind.extension
        else:
            # splitext keeps the leading dot (".txt"); drop it so the value
            # is comparable with the dot-less names tested below.
            ext = os.path.splitext(file_path)[-1].lower().lstrip(".")

        if ext == "pdf":
            # pdfminer's stderr output is discarded to keep the crawl log readable.
            with contextlib.redirect_stderr(io.StringIO()):
                text = extract_pdf_text(file_path).replace("\n", " ").strip()

            if text:
                return text

            # No text came back from pdfminer: fall back to OCR on rendered pages.
            text_parts = []
            doc = fitz.open(file_path)
            try:
                for page in doc:
                    pix = page.get_pixmap(dpi=300)
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    ocr_text = pytesseract.image_to_string(img).strip()
                    if ocr_text:
                        text_parts.append(ocr_text.replace("\n", " "))
            finally:
                # Release the file handle even if rendering or OCR fails.
                doc.close()
            return " ".join(text_parts).strip()

        if ext == "docx":
            doc = Document(file_path)
            return " ".join(p.text for p in doc.paragraphs).strip()

        if ext == "txt":
            with open(file_path, encoding="utf-8", errors="ignore") as f:
                return f.read().replace("\n", " ").strip()

        return ""

    except Exception as e:
        # Best effort: one unreadable file must not abort the dataset, but
        # the failure is reported instead of being silently discarded.
        print(f"Failed to read file {os.path.basename(file_path)}: {e}")
        return ""

### Download and extract ZIP content

This function downloads a ZIP file containing supporting documents for a dataset, extracts all readable files, and combines their content into a single string. If a file fails to process, it is ignored and an error is printed.


In [34]:
def extract_text_from_zip(zip_url, dataset_id, timeout=60):
    """Download a supporting-documents ZIP and return the text of its files.

    The archive is saved and unpacked inside a temporary directory that is
    removed when the function returns. Every extracted file is passed to
    ``read_file_text`` and the non-empty results are joined with spaces.

    Args:
        zip_url: URL of the ZIP archive.
        dataset_id: Identifier used to name the downloaded archive file.
        timeout: Seconds to wait for the server before giving up, so a
            stalled download cannot block the crawl indefinitely.

    Returns:
        The combined text, or ``""`` when the download is not successful,
        nothing readable is found, or any error occurs (errors are printed).
    """
    try:
        response = requests.get(zip_url, timeout=timeout)
        if not response.ok:
            return ""

        with tempfile.TemporaryDirectory() as tmpdir:
            zip_path = os.path.join(tmpdir, f"{dataset_id}.zip")
            with open(zip_path, "wb") as f:
                f.write(response.content)

            # Unpack into a dedicated sub-folder so the walk below only sees
            # archive members, not the downloaded .zip itself.
            extract_dir = os.path.join(tmpdir, "extracted")
            os.makedirs(extract_dir, exist_ok=True)
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(extract_dir)

            text_chunks = []
            for root, _, files in os.walk(extract_dir):
                for name in files:
                    file_path = os.path.join(root, name)
                    try:
                        text = read_file_text(file_path)
                        if text:
                            text_chunks.append(text)
                    except Exception as e:
                        print(f"Failed to read file {name}: {e}")

            return " ".join(text_chunks).strip()
    except Exception as e:
        print(f"Exception while processing zip: {e}")
        return ""


In [35]:
def clean_text(text):
    """Lower-case and normalise extracted document text.

    Applies, in order: lower-casing, NFKD Unicode normalisation, replacement
    of bullet/dash glyphs by spaces, collapsing of dot runs to ``...``,
    replacement of control characters by spaces, collapsing of space/tab
    runs, and collapsing of mixed hyphen/space runs to one space.

    Args:
        text: Text to normalise; any falsy value yields ``""``.

    Returns:
        The normalised text without leading or trailing whitespace.
    """
    if not text:
        return ""

    # Order matters: later patterns rely on the spaces produced by earlier ones.
    substitutions = (
        (r"[•▪●‣–—·]", " "),
        (r"\.{3,}", "..."),
        (r"[\x00-\x1F\x7F-\x9F]", " "),
        (r"[ \t]+", " "),
        (r"[- ]{2,}", " "),
    )

    normalised = unicodedata.normalize("NFKD", text.lower())
    for pattern, replacement in substitutions:
        normalised = re.sub(pattern, replacement, normalised)

    return normalised.strip()

### Write line to file

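This writes the dataset title, description, and extracted text into a single line in the output file. Each field is separated by a pipe (`|`). The extracted supporting-document text is normalised with `clean_text` (lower-cased, bullets and control characters replaced, whitespace collapsed) before it is written; no annotation is applied at this stage.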


In [36]:
def write_to_file(title, url, description, extracted_text, output_file):
    """Append one dataset record to the output file as a single line.

    The line has the form ``title | description | text``, where ``text`` is
    ``extracted_text`` after ``clean_text``.

    Args:
        title: Dataset title.
        url: Dataset URL. Accepted for call compatibility; not written.
        description: Dataset description.
        extracted_text: Raw text extracted from supporting documents.
        output_file: Path of the file to append to.
    """
    cleaned_text = clean_text(extracted_text)

    # Scraped titles/descriptions can contain line breaks; collapse all
    # whitespace runs so one dataset never spills over several lines.
    title = " ".join((title or "").split())
    description = " ".join((description or "").split())

    line = f"{title} | {description} | {cleaned_text}".strip()

    with open(output_file, "a", encoding="utf-8") as f:
        f.write(line + "\n")

### Loop through all catalogue pages

This loop scrapes dataset links page-by-page and processes each dataset by extracting its metadata and any available supporting document content. The output is written to `data/raw_data/ukceh/ukceh_data.txt`.

The process runs for 114 pages. If a page has no results, the loop stops early.


In [37]:
# Walk catalogue pages 1..114; the first page without results ends the crawl.
for page in range(1, 115):
    print(f"Scraping page {page}")
    urls = get_links_from_page(page)

    if not urls:
        print(f"No results on page {page}")
        break

    for url in urls:
        try:
            title, desc, zip_link = get_dataset_info(url)
            # The last path segment of the dataset URL serves as its identifier.
            dataset_id = url.rsplit("/", 1)[-1]
            # Datasets without a supporting-documents link get an empty text field.
            extracted = extract_text_from_zip(zip_link, dataset_id) if zip_link else ""

            write_to_file(title, url, desc, extracted, OUTPUT_FILE)
        except Exception as e:
            # A failing dataset is reported and skipped; the crawl goes on.
            print(f"Error on dataset: {e}")

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Scraping page 24
Scraping page 25
Scraping page 26
Scraping page 27
Scraping page 28
Scraping page 29
Scraping page 30
Scraping page 31
Scraping page 32
Scraping page 33
Scraping page 34
Scraping page 35
Scraping page 36
Exception while processing zip: Response ended prematurely
Scraping page 37
Scraping page 38
Scraping page 39
Scraping page 40
Scraping page 41
Scraping page 42
Scraping page 43
Scraping page 44
Scraping page 45
Scraping page 46
Scraping page 47
Scraping page 48
Scraping page 49
Scraping page 50
Scraping page 51
Scraping page 52
Scraping page 53
Scraping page 54
Scraping page 55
Scraping page 5

## 2. Preprocessing Data

This section performs lightweight preprocessing on the collected data. The goal is to remove unwanted noise, broken characters, extra whitespace, and junk lines (e.g. "Table 1", bullets, control characters). This improves model input quality without aggressively altering the raw data.

For PubMed abstracts, a special step merges broken lines within each abstract using double newlines as separators. This ensures that full abstract texts are preserved and not split mid-sentence due to export formatting.

### Impact on Sources

- **PubMed**: Very positive. This merging step reconstructs the abstract into full coherent text blocks, critical for effective sentence segmentation.


In [66]:
import re
import unicodedata
from pathlib import Path


# Input root (one sub-folder per source) and the mirrored output root for
# the cleaned files.
RAW_BASE = Path("..") / "data" / "raw_data"
CLEAN_BASE = Path("..") / "data" / "processed"

def clean_line(line):
    """Lightly clean one line of text, or reject it as junk.

    Args:
        line: Raw line (a trailing newline is fine).

    Returns:
        The cleaned line, or ``None`` when the line is empty or has fewer
        than three whitespace-separated tokens after cleaning (headings
        such as "Table 1" or "Appendix").
    """
    line = line.strip()
    if not line:
        return None  # skip empty lines

    # Compatibility decomposition: expands ligatures and other compatibility
    # characters into their plain equivalents.
    line = unicodedata.normalize("NFKD", line)
    line = re.sub(r"[•▪●‣–—·]", " ", line)  # bullet points and long dashes
    line = re.sub(r"\.{3,}", "...", line)  # collapse dot chains
    line = re.sub(r"https?://\S+", "", line)  # remove URLs
    # Turn tabs and other whitespace into single spaces BEFORE deleting
    # control characters; deleting a tab outright would glue the two
    # neighbouring tokens into one word.
    line = re.sub(r"\s+", " ", line)
    line = re.sub(r"[\x00-\x1F\x7F-\x9F]", "", line)  # remaining control chars
    # Removals above can leave doubled or trailing spaces behind.
    line = re.sub(r" {2,}", " ", line).strip()

    if len(line.split()) < 3:  # junk (like "Table 1", "Appendix", etc)
        return None

    return line


In [63]:
def merge_pubmed_lines(input_path):
    """Rebuild abstracts from a file in which blank lines separate them.

    Each run of consecutive non-blank lines (after stripping) is joined with
    single spaces into one string.

    Args:
        input_path: Path of the UTF-8 text file to read.

    Returns:
        A list with one merged string per run, in file order.
    """
    merged = []
    pending = []

    with open(input_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            piece = raw.strip()
            if piece:
                pending.append(piece)
                continue
            # Blank line: close the abstract collected so far, if any.
            if pending:
                merged.append(" ".join(pending))
                pending = []

    # The file may end without a trailing blank line.
    if pending:
        merged.append(" ".join(pending))

    return merged


In [65]:
def clean_file(input_path, output_path, is_pubmed=False):
    """Clean one raw text file and write the surviving lines to a new file.

    Args:
        input_path: Raw UTF-8 text file to read.
        output_path: Destination ``Path``; its parent folders are created.
        is_pubmed: When true, lines are first merged into abstracts with
            ``merge_pubmed_lines``; otherwise the file is read line by line.

    Every candidate goes through ``clean_line``; candidates for which it
    returns a falsy value are dropped. A summary line is printed at the end.
    """
    if is_pubmed:
        candidates = merge_pubmed_lines(input_path)
    else:
        with open(input_path, "r", encoding="utf-8") as source:
            candidates = source.readlines()

    kept = [result for result in map(clean_line, candidates) if result]

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as sink:
        sink.writelines(f"{entry}\n" for entry in kept)

    print(f"Saved cleaned file: {output_path.name} ({len(kept)} lines)")


In [67]:
# Clean every raw .txt file of each source into the mirrored processed folder.
for source in ["ukceh", "environment_news", "pubmed"]:
    raw_dir = RAW_BASE / source
    clean_dir = CLEAN_BASE / source
    clean_dir.mkdir(parents=True, exist_ok=True)

    # Only the PubMed files get their lines merged into abstracts first.
    is_pubmed = source == "pubmed"

    for txt_file in raw_dir.glob("*.txt"):
        clean_file(txt_file, clean_dir / txt_file.name, is_pubmed=is_pubmed)


Saved cleaned file: ukceh_data.txt (3948 lines)
Saved cleaned file: env_news_data.txt (28669 lines)
Saved cleaned file: abstract-habitat.txt (136427 lines)
Saved cleaned file: abstract-env_process.txt (66957 lines)
Saved cleaned file: abstract-environment.txt (66376 lines)
Saved cleaned file: abstract-taxonomy.txt (45741 lines)
Saved cleaned file: abstract-pollutants.txt (61937 lines)
Saved cleaned file: abstract-measurement.txt (58694 lines)


## 3. Sentence segmentation

After cleaning the raw documents, the next step involves sentence segmentation. This is necessary for two main reasons:

1. Named Entity Recognition (NER) models work best when input is divided into grammatically coherent units (sentences).
2. Many downstream annotation and evaluation tools expect one sentence per line.

We use `spaCy` for this step, which applies robust rule-based and statistical models for accurate sentence boundary detection. The segmented sentences are stored under `../data/sentences/`, preserving the original filenames.


In [69]:
import spacy
from pathlib import Path

# Load the small English pipeline once; it is shared by every call below.
# max_length is the upper bound on the number of characters spaCy accepts in
# a single text, raised here to 2,000,000 so that long input lines are accepted.
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2_000_000  # You can increase more if needed

# Cleaned files are read from INPUT_DIR; segmented files go to OUTPUT_DIR.
INPUT_DIR = Path("../data/processed")
OUTPUT_DIR = Path("../data/sentences")

def _split_to_max_length(text, limit):
    """Yield consecutive pieces of ``text`` that are at most ``limit`` long.

    Cuts are made at the last space inside the limit when there is one, and
    at exactly ``limit`` characters otherwise. Text that already fits is
    yielded unchanged.
    """
    while len(text) > limit:
        cut = text.rfind(" ", 0, limit + 1)
        if cut <= 0:
            cut = limit  # no usable space: hard cut so progress is guaranteed
        yield text[:cut]
        text = text[cut:].lstrip()
    if text:
        yield text


def segment_sentences_streaming(input_path, output_path):
    """Split each line of a cleaned file into sentences, one per output line.

    The input is processed line by line so the whole file is never held in
    memory. Lines longer than ``nlp.max_length`` are processed in pieces,
    because spaCy raises an error for over-long texts and one such line
    would otherwise abort the entire run.

    Args:
        input_path: Cleaned UTF-8 text file to read.
        output_path: Destination ``Path``; its parent folders are created.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(input_path, "r", encoding="utf-8") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:
        for block in infile:
            block = block.strip()
            if not block:
                continue

            for piece in _split_to_max_length(block, nlp.max_length):
                for sent in nlp(piece).sents:
                    sentence = sent.text.strip()
                    if sentence:
                        outfile.write(sentence + "\n")

    print(f"Done: {output_path.name}")

# Segment every cleaned file, keeping the source/filename layout under OUTPUT_DIR.
for source in ["ukceh", "environment_news", "pubmed"]:
    input_folder = INPUT_DIR / source
    output_folder = OUTPUT_DIR / source
    output_folder.mkdir(parents=True, exist_ok=True)

    for file_path in input_folder.glob("*.txt"):
        output_path = output_folder.joinpath(file_path.name)
        segment_sentences_streaming(file_path, output_path)


Done: ukceh_data.txt
Done: env_news_data.txt
Done: abstract-habitat.txt
Done: abstract-env_process.txt
Done: abstract-environment.txt
Done: abstract-taxonomy.txt
Done: abstract-pollutants.txt
Done: abstract-measurement.txt


# News data processing (standalone cell, still to be integrated into the code above)

In [None]:
import pandas as pd
from pathlib import Path

# Load CSV; the python engine with on_bad_lines="skip" drops malformed rows
# instead of failing on them.
csv_path = Path("data.csv")
df = pd.read_csv(csv_path, engine="python", on_bad_lines="skip")

# Columns written to the output, in this order. Rows missing any of them are
# dropped, and the remaining values are converted to strings.
text_columns = ["Title", "Intro Text", "Article Text"]
complete_rows = df[text_columns].dropna().astype(str)

# Flatten embedded line breaks so each article occupies exactly one output
# line. Column-wise string operations always yield a Series, so an input
# without any complete row simply produces an empty file.
flattened = [
    complete_rows[column]
    .str.replace("\n", " ", regex=False)
    .str.replace("\r", " ", regex=False)
    .str.strip()
    for column in text_columns
]
combined_lines = flattened[0].str.cat(flattened[1:], sep=" | ")

# Written next to the CSV, one article per line.
output_path = csv_path.parent / "data.txt"
with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in combined_lines)
