## Scraping Unstructured Policy Data
From European Alternative Fuels Agency

In [33]:
%pip install pyyaml

Note: you may need to restart the kernel to use updated packages.


In [34]:
import pyyaml
from pathlib import Path
import os
import time
import random
import re
import json
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

import requests # for making HTTP requests to web pages
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup # enables HTML parsing
import pandas as pd

ModuleNotFoundError: No module named 'pyyaml'

In [32]:
config_file = "/Users/katehodges/Desktop/Applications/Portfolio/Electric-Europe/scripts/config.yml"
config = configparser.ConfigParser()
config.read(config_file)

MissingSectionHeaderError: File contains no section headers.
file: '/Users/katehodges/Desktop/Applications/Portfolio/Electric-Europe/scripts/config.yml', line: 1
'default: \n'

In [16]:
# define target countries for scraping
target_countries = [
    "Austria","Belgium","Bulgaria","Croatia","Cyprus","Czech Republic","Denmark",
    "Estonia","Finland","France","Germany","Greece","Hungary","Iceland","Ireland",
    "Italy","Latvia","Liechtenstein","Lithuania","Luxembourg","Malta","Netherlands",
    "Norway","Poland","Portugal","Romania","Slovakia","Slovenia","Spain","Sweden",
    "Switzerland","Turkey","United Kingdom"
]


loop through country list to get URLs for each country's policy page

In [17]:

# define empty list to store
country_urls = []

for country in target_countries:
    slug = country.lower().replace(" ", "-") # all chars lower case, replace white space with url friendly dash
    url = f"https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/{slug}/incentives-legislations"
    country_urls.append(url)
    #print(country, '->', url) #print to confirm

print(country_urls)

['https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/austria/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/belgium/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/bulgaria/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/croatia/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/cyprus/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/czech-republic/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/denmark/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/estonia/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/finland/incentives-legislations', 'https://alternative-fuels-observatory

Define function to scrape country policy text data

In [23]:
def scrape_country_policy(url, country_name):
    
    # request page HTML
    headers = {
    "User-Agent": "Mozilla/5.0"
    }
    resp = requests.get(url, headers=headers, timeout=10)
    # raise exception for exception to see what failed
    resp.raise_for_status()

    # parse HTML to BeautifulSoup DOM (tree like structure that enables searching & manipulation down the line)
    soup = BeautifulSoup(resp.text, 'html.parser')
    # try to find main content area; fallback to body or whole soup if not found
    main_section = soup.find('main') or soup.find("div", {"role":"main"}) or soup.body or soup

    # collect text segments from elements containing policy text
        # headings: (h2/h3), paragraphs (p), list items (li)
    segments = []

    # find extract contents of main section by looping through elements
    for elem in main_section.find_all(["h2", "h3", "p", "li"]):
        text = elem.get_text(" ", strip=True) # join 'children''with spaces and strip leading/ trailing whitespace
        if text:
            segments.append(text)
    
    # join segments into one large string (two spaces between segments to keep readability)
    policy_text = "  ".join(segments)

    # return simple dict (later convert to pandas df)
    return {
        "country": country_name,
        "url": url,
        "policy_text": policy_text
    }


In [24]:
### test scraper on single country

try:
    assert len(target_countries) == len(country_urls)
except Exception as e:
    raise RuntimeError("Make sure 'target_countries' and 'country_urls' are defined and the same length") from e

# Pick one country to test (change index if you want another)
test_idx = 0  # 0 => the first country in your lists
test_country = target_countries[test_idx]
test_url = country_urls[test_idx]

print(">>> TEST SCRAPE (single country):", test_country, test_url)

# Run the single-country scraper
try:
    test_result = scrape_country_policy(test_url, test_country)
except Exception as e:
    print("Error scraping the test page:", e)
else:
    # Show what the function returned (data structure explanation)
    # test_result is a Python dict: {'country': str, 'url': str, 'policy_text': str}
    print("Scraped keys:", test_result.keys())
    print("Country:", test_result["country"])
    print("URL:", test_result["url"])
    # show a short preview of the scraped text:
    print("\nPolicy text preview (first 700 chars):\n")
    print(test_result["policy_text"][:700])  # preview first 700 chars
    # Optionally store the single test result to disk for manual inspection
    pd.DataFrame([test_result]).to_csv("raw_data/test_single_country.csv", index=False)

>>> TEST SCRAPE (single country): Austria https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/austria/incentives-legislations
Scraped keys: dict_keys(['country', 'url', 'policy_text'])
Country: Austria
URL: https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/austria/incentives-legislations

Policy text preview (first 700 chars):

Summary  Target tracker  Vehicles and fleet  Infrastructure  Incentives & legislation  Useful information  Incentives and Legislation The incentives and legislations section is updated for 2025, published on 18th April 2025, representing the situation as of that date. Major changes of incentives and polices are updated on a rolling basis from that date onwards. Incentives and legislation that aim to increase uptake of alternative fuels vehicles and infrastructure. If you know of other national or local incentives that should be included in this section, please send us an email , or use the button on the right, and let us k

--------------------------------------------------------------------

In [18]:

# declare function that  returns policy text data as dictionary for every country - takes URL & country name as inputs
def scrape_country_policy(url, country_name):
    """Scrape incentives/legislation text for one country."""

    # send HTTP request (storing result in requests.Response object)
    response = requests.get(url)
    # parses HTML string into parsable object
    soup = BeautifulSoup(response.text, "html.parser")
    
    # attempt to find main section of site (which should contain policy text)
    main_section = soup.find("main")

    # create empty list to collect strings
    policy_text = []
    
    # pull headings, paragraphs, bullet points
    for elem in main_section.find_all(["h2", "h3", "p", "li"]):
        # strip leading/ trailing whitespace
        text = elem.get_text(strip=True)
        # append to policy text string just defined
        if text:
            policy_text.append(text)
    
    return {
        "country": country_name,
        "policy_text": " ".join(policy_text)
    }

Define site URLs

In [19]:
SITE_URL = "https://alternative-fuels-observatory.ec.europa.eu"
base_url = SITE_URL + "/transport-mode/road"

Create loop to store country specific URLs

In [20]:
# parse URLs as beautiful soup object
res = requests.get(base_url)
soup = BeautifulSoup(res.text, "html.parser")

# find all <a> tags with "/road/{country}/incentives-legislations"
links = soup.find_all("a", href=True)
# empty list to store country specific URLs
country_links = []
# looping over country pages
for a in links:
    href = a["href"]
    if href.startswith("/transport-mode/road/") and href.endswith("incentives-legislations"):
        country_name = a.get_text(strip=True)
        full_url = SITE_URL + href
        country_links.append((country_name, full_url))

# print how many country pages found
print(f"Found {len(country_links)} country pages")

Found 0 country pages


In [21]:
# Loop over countries
all_data = []
for name, url in country_links:
    try:
        print(f"Scraping {name}...")
        data = scrape_country_policy(url, name)
        all_data.append(data)
        time.sleep(1)  # polite delay
    except Exception as e:
        print(f"❌ Failed for {name}: {e}")

In [22]:
# convert list of dictionaries to pandas dataframe
df = pd.DataFrame(all_data)
df.to_csv("raw_data/policy_incentives.csv", index=False)

print("✅ Saved to raw_data/policy_incentives.csv")

✅ Saved to raw_data/policy_incentives.csv
