In [7]:
!pip install beautifulsoup4 requests pandas --quiet



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\User\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [8]:
import sys
!{sys.executable} -m pip install beautifulsoup4 requests pandas





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
from pathlib import Path
import re, time
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

# --- Scraping target -------------------------------------------------------
BASE = "https://www.neso.energy"
# Listing URL template; `page` is filled in per request.
LIST_TPL = BASE + "/data-portal/historic-demand-data?page={page}"
# Sent with every HTTP request so the client identifies itself.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; edu-project/1.0)"}

# --- Inclusive range of years to keep --------------------------------------
YEAR_MIN = 2001
YEAR_MAX = 2025

# --- Local storage ----------------------------------------------------------
RAW_DIR = Path("data/electricity/raw")           # downloaded CSVs, untouched
INT_DIR = Path("data/electricity/intermediate")  # derived / combined outputs
for data_dir in (RAW_DIR, INT_DIR):
    # Create the whole directory chain; a no-op when it already exists.
    data_dir.mkdir(parents=True, exist_ok=True)

In [10]:
# A four-digit 20xx year that is not part of a longer digit run. Digit
# lookarounds are used instead of \b because "_" is a word character, so \b
# never matches inside names such as "demanddata_2009.csv".
_YEAR_RE = re.compile(r"(?<!\d)(20\d{2})(?!\d)")


def _extract_year(*texts):
    """Return the first year within [YEAR_MIN, YEAR_MAX] found in `texts`, else None."""
    for text in texts:
        for match in _YEAR_RE.findall(text):
            year = int(match)
            if YEAR_MIN <= year <= YEAR_MAX:
                return year
    return None


def parse_page(page: int):
    """Scrape one listing page and return its CSV download links.

    Parameters
    ----------
    page : int
        Page number substituted into ``LIST_TPL``.

    Returns
    -------
    list of dict
        One dict per usable table row with keys ``"year"`` (int), ``"href"``
        (absolute URL) and ``"filename"`` (last path segment of the URL, or
        ``neso_<year>.csv`` when that segment does not end in ``.csv``).
        An empty list is returned when the request fails, the response status
        is not 200, or no row qualifies.
    """
    url = LIST_TPL.format(page=page)
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        # Treat a network failure like a non-200 page (best effort) instead of
        # aborting the whole crawl, but make it visible.
        print(f"Request failed for {url}: {exc}")
        return []
    if r.status_code != 200:
        return []
    soup = BeautifulSoup(r.text, "html.parser")

    rows = []
    for tr in soup.select("table tbody tr"):
        title_td = tr.select_one("td.views-field-title")
        # Download column: exact class first, then any class containing the name.
        download_td = tr.select_one("td.views-field-download") or tr.find(
            "td", class_=lambda c: c and "views-field-download" in c
        )
        if not download_td:
            continue
        a = download_td.find("a", href=True)
        if not a:
            continue
        # Resolve against the listing URL so root-relative ("/x"),
        # protocol-relative ("//host/x") and plain relative hrefs all become
        # absolute; already-absolute hrefs pass through unchanged.
        href = urljoin(url, a["href"].strip())
        if ".csv" not in href.lower():
            continue

        # Last path segment of the URL, without query string or fragment.
        basename = urlsplit(href).path.rsplit("/", 1)[-1]

        # Prefer the year in the row title, then the file name, then anywhere
        # in the URL.
        title = title_td.get_text(strip=True) if title_td else ""
        year = _extract_year(title, basename, href)
        if year is None:
            continue

        # Keep the server's file name when it is a CSV (any letter case);
        # otherwise fall back to a synthetic name derived from the year.
        filename = basename if basename.lower().endswith(".csv") else f"neso_{year}.csv"
        rows.append({"year": year, "href": href, "filename": filename})
    return rows


In [11]:
MAX_PAGES = 200  # hard upper bound on the number of listing pages requested
seen = set()     # (year, href) pairs already collected
links = []       # unique link records, in discovery order
empty_streak = 0 # consecutive pages that contributed nothing new

for page in range(MAX_PAGES):
    fresh = []
    for row in parse_page(page):
        key = (row["year"], row["href"])
        if key in seen:
            continue
        seen.add(key)
        fresh.append(row)
    links.extend(fresh)

    # Reset the streak on any new link, otherwise extend it.
    empty_streak = 0 if fresh else empty_streak + 1
    if empty_streak >= 3:
        # Several unproductive pages in a row: assume the listing is exhausted.
        break
    time.sleep(0.2)  # short pause between requests

# Restrict to the configured year range, then order by year and file name.
links = sorted(
    (row for row in links if YEAR_MIN <= row["year"] <= YEAR_MAX),
    key=lambda d: (d["year"], d["filename"]),
)
print(f"Found {len(links)} CSVs across pages.")


Found 25 CSVs across pages.


In [12]:
# Download every discovered CSV into RAW_DIR, skipping files already present.
for r in tqdm(links, desc="Downloading"):
    dest = RAW_DIR / f"{r['year']}_{r['filename']}"
    if dest.exists() and dest.stat().st_size > 0:
        continue  # already downloaded on a previous run

    # Stream into a sibling ".part" file and rename only after the transfer
    # finished. Writing straight to `dest` would leave a non-empty partial file
    # behind after an interruption, which the check above would then treat as
    # a complete download on every later run.
    tmp = dest.with_name(dest.name + ".part")
    try:
        with requests.get(r["href"], headers=HEADERS, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in resp.iter_content(1024 * 256):
                    if chunk:
                        f.write(chunk)
        tmp.replace(dest)  # atomic move into place
    finally:
        # Remove the partial file after a failure; no-op after a successful rename.
        tmp.unlink(missing_ok=True)

print("Raw files stored in:", RAW_DIR)

Downloading: 100%|██████████| 25/25 [00:00<00:00, 12483.05it/s]

Raw files stored in: data\electricity\raw





In [13]:
# Merge all raw CSVs into one file with a shared column layout.
files = sorted(RAW_DIR.glob("*.csv"))
assert files, f"No CSVs in {RAW_DIR}"

# Pass 1: read only the header row of each file to build the union of columns.
all_cols = set()
for f in files:
    try:
        cols = pd.read_csv(f, nrows=0).columns.tolist()
        all_cols.update(cols)
    except Exception as e:
        print(f"Header read error {f.name}: {e}")
all_cols = sorted(all_cols)

combined_csv = INT_DIR / "electricity_all.csv"
# Chunks are appended below, so remove any output from a previous run first;
# otherwise re-running this cell would duplicate every row and write a second
# header line in the middle of the file.
combined_csv.unlink(missing_ok=True)

# Pass 2: stream each file in chunks, align it to the shared columns and append.
header_written = False
for f in tqdm(files, desc="Combining"):
    try:
        for chunk in pd.read_csv(f, chunksize=200_000, low_memory=False):
            # Columns absent from this file are added as empty (NaN) columns.
            chunk = chunk.reindex(columns=all_cols)
            chunk["source_file"] = f.name  # record which raw file each row came from
            chunk.to_csv(combined_csv, mode="a", index=False, header=not header_written)
            header_written = True
    except Exception as e:
        print(f"Error combining {f.name}: {e}")

print("Combined CSV:", combined_csv)


Combining: 100%|██████████| 25/25 [00:03<00:00,  7.89it/s]

Combined CSV: data\electricity\intermediate\electricity_all.csv



