## Scraping Unstructured Policy Data
From European Alternative Fuels Agency

In [2]:
%pip install pyyaml

Note: you may need to restart the kernel to use updated packages.


### 0. Setup & Import Libraries

In [4]:

from pathlib import Path
import os
import time
import random
import re
import json
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

import requests # for making HTTP requests to web pages
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup # enables HTML parsing
import pandas as pd

### 1. Define Target Countries and URLs

In [6]:
# define target countries for scraping
target_countries = [
    "Austria","Belgium","Bulgaria","Croatia","Cyprus","Czech Republic","Denmark",
    "Estonia","Finland","France","Germany","Greece","Hungary","Iceland","Ireland",
    "Italy","Latvia","Liechtenstein","Lithuania","Luxembourg","Malta","Netherlands",
    "Norway","Poland","Portugal","Romania","Slovakia","Slovenia","Spain","Sweden",
    "Switzerland","Turkey","United Kingdom"
]


In [7]:

# define empty list to store
country_urls = []

for country in target_countries:
    slug = country.lower().replace(" ", "-") # all chars lower case, replace white space with url friendly dash
    url = f"https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/{slug}/incentives-legislations"
    country_urls.append(url)
    #print(country, '->', url) #print to confirm

print(country_urls)

['https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/austria/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/belgium/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/bulgaria/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/croatia/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/cyprus/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/czech-republic/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/denmark/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/estonia/incentives-legislations', 'https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/finland/incentives-legislations', 'https://alternative-fuels-observatory

### 2. Understanding Web Page Structure
Since the goal was to scrape policy sub-headings and text on each web page, it was first essential to understand the page structure, and where the relevant information was stored.

Using Finland as single page example here - also inspected page on internet browser but requesting 'main section' here is easier to inspect at once.

In [33]:
## scrape but split into sub-sections for storage

#USING FINLAND AS TEST PAGE
url = "https://alternative-fuels-observatory.ec.europa.eu/transport-mode/road/finland/incentives-legislations"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# find the <main> section which contains all the content
main_section = soup.find("main")
print(main_section)


<main class="ecl-u-pb-xl" id="main-content">
<div class="ecl-container">
<div class="ecl-row">
<div class="ecl-col-s-12">
<div>
<div class="hidden" data-drupal-messages-fallback=""></div><div class="ecl-u-mb-l" id="block-countrymenublock">
<nav class="ecl-u-border-bottom ecl-u-border-color-grey-20">
<ul class="ecl-unordered-list ecl-unordered-list--no-bullet ecl-u-d-flex ecl-u-justify-content-lg-between ecl-u-flex-wrap eafo-menu">
<li>
<a class="ecl-u-mr-m eafo-menu__item" href="/transport-mode/road/finland" title="Summary">
          Summary
        </a>
</li>
<li>
<a class="ecl-u-mr-m eafo-menu__item" href="/transport-mode/road/finland/target-tracker" title="Target tracker">
          Target tracker
        </a>
</li>
<li>
<a class="ecl-u-mr-m eafo-menu__item" href="/transport-mode/road/finland/vehicles-and-fleet" title="Vehicles and fleet">
          Vehicles and fleet
        </a>
</li>
<li>
<a class="ecl-u-mr-m eafo-menu__item" href="/transport-mode/road/finland/infrastructure" ti

In [None]:
## better understanding how the page is stored, therefore how to target the desired data
# elements = main_section.find_all(["dt", "p", "li"]) if main_section else []
# print(len(elements))
# for e in elements[:10]:
#     print(e.name, e.get_text(strip=True))

* number of sub-headings on each country is variable, depending on the policies in place or leve of detail offered. 
* taking the approach of storing each subheading as its own row - so multiple rows refer to the same country

### 3. Define Scraping Function


In [30]:
# define full scraping function
def scrape_country_policy_sections(url, country):
    # fetch the page
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    main_section = soup.find("main")

    # find the <dl> containing all the incentives
    dl = main_section.find("dl", class_="ecl-description-list")
    sections = [] 

    if dl:
        for dt, dd in zip(dl.find_all("dt"), dl.find_all("dd")):
            section_title = dt.get_text(strip=True)

            # dd can have multiple p - join them
            section_text = " ".join(p.get_text(strip=True) for p in dd.find_all("p"))

            # skip introductiry/ boilerplate text if present 
            skip_phrases = [
                "The incentives and legislations section is updated",
                "representing the situation as of that date",
                "Major changes of incentives"
            ]
            if any(phrase in section_text for phrase in skip_phrases):
                continue


            sections.append({"country": country,        # hardcoded for now
                            "section": section_title,  # e.g. "Purchase subsidies"
                            "text": section_text                 # actual description
            
        })
    return sections


### 4. Loop Function over Country Pages & Store Results

In [31]:
all_results = []
# loop through all countries 
for country, url in zip(target_countries, country_urls):
    print("Scraping:", country)
    try:
        items = scrape_country_policy_sections(url, country)
        all_results.extend(items)
    except Exception as e:
        print('Failed for', country, ':', e)
    time.sleep(1.0) # polite pause between requests

# convert to df
df = pd.DataFrame(all_results)
print("Scraped rows:", df.shape[0])
print(df.head(2))

Scraping: Austria
Scraping: Belgium
Scraping: Bulgaria
Scraping: Croatia
Scraping: Cyprus
Scraping: Czech Republic
Scraping: Denmark
Scraping: Estonia
Scraping: Finland
Scraping: France
Scraping: Germany
Scraping: Greece
Scraping: Hungary
Scraping: Iceland
Scraping: Ireland
Scraping: Italy
Scraping: Latvia
Scraping: Liechtenstein
Scraping: Lithuania
Scraping: Luxembourg
Scraping: Malta
Scraping: Netherlands
Scraping: Norway
Scraping: Poland
Scraping: Portugal
Scraping: Romania
Scraping: Slovakia
Scraping: Slovenia
Scraping: Spain
Scraping: Sweden
Scraping: Switzerland
Scraping: Turkey
Scraping: United Kingdom
Scraped rows: 318
   country                    section  \
0  Austria         Purchase subsidies   
1  Austria  Registration tax benefits   

                                                text  
0  The federal purchase subsidy scheme “E-Mobilit...  
1  BEVs are fully exempt from the NoVA registrati...  


In [32]:
# save out csv
df.to_csv("/Users/katehodges/Desktop/Applications/Portfolio/Electric-Europe/data/02 interim/policy_incentives_subsectioned.csv", 
          index=False, encoding="utf-8")
print("Saved to 02 interim/policy_incentives_subsectioned.csv")

Saved to 02 interim/policy_incentives_clean.csv


In [34]:
amounts = []

for i, row in df.iterrows():
    country = row["country"]
    section = row["section"]
    text = row["text"]

    # regex to capture numbers with optional euro symbol, commoas decimals
    number_matches = re.findall(r"(€[\d,]+|\d{1,3}(?:,\d{3})*(?:\.\d+)?|\d+)", text)
    
    for match in number_matches:
        # Clean number: remove € and commas, convert to float
        num_clean = float(match.replace("€","").replace(",",""))
        
        # Determine type: flat subsidy vs threshold price (simple heuristic)
        if "subsidy" in text.lower() or "grant" in text.lower():
            amount_type = "flat_subsidy"
            amount_unit = "per_vehicle"
            confidence = 0.95
        elif "below" in text.lower() or "above" in text.lower():
            amount_type = "threshold_price"
            amount_unit = "per_vehicle"
            confidence = 0.8
        else:
            amount_type = "unknown"
            amount_unit = None
            confidence = 0.5
        
        # Extract simple condition if mentioned (e.g., price threshold)
        condition = None
        condition_normalized = {}
        cond_match = re.search(r"priced below €([\d,]+)", text)
        if cond_match:
            condition = f"for fully electric vehicles priced below €{cond_match.group(1)}"
            condition_normalized = {"price_below": float(cond_match.group(1).replace(",",""))}
        
        # Extract year if mentioned
        effective_to_match = re.search(r"ended in (\d{4})", text)
        effective_to = int(effective_to_match.group(1)) if effective_to_match else None
        
        # Store row
        amounts.append({
            "country": country,
            "section": section,
            "amount_raw": match,
            "amount_min_eur": num_clean,
            "amount_max_eur": num_clean,  # simple assumption for now
            "amount_type": amount_type,
            "amount_unit": amount_unit,
            "condition": condition,
            "condition_normalized": condition_normalized,
            "effective_to": effective_to,
            "confidence": confidence,
            "raw_text": text
        })


df_amounts = pd.DataFrame(amounts)
print(df_amounts.head())

   country                               section amount_raw  amount_min_eur  \
0  Austria                    Purchase subsidies        202           202.0   
1  Austria                    Purchase subsidies          4             4.0   
2  Austria                    Purchase subsidies        202           202.0   
3  Austria                    Purchase subsidies          5             5.0   
4  Austria  Ownership / Circulation Tax Benefits          1             1.0   

   amount_max_eur   amount_type  amount_unit condition condition_normalized  \
0           202.0  flat_subsidy  per_vehicle      None                   {}   
1             4.0  flat_subsidy  per_vehicle      None                   {}   
2           202.0  flat_subsidy  per_vehicle      None                   {}   
3             5.0  flat_subsidy  per_vehicle      None                   {}   
4             1.0       unknown         None      None                   {}   

   effective_to  confidence                       

In [35]:
df_amounts.to_csv("/Users/katehodges/Desktop/Applications/Portfolio/Electric-Europe/data/02 interim/policy_amounts.csv",
                  index=False, encoding="utf-8")