## Scraping Unstructured Policy Data

import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Site root; prepended to the relative hrefs found on the listing page to
# build absolute country-page URLs.
BASE_URL = "https://alternative-fuels-observatory.ec.europa.eu"

def scrape_country_policy(url, country_name, timeout=30):
    """Scrape the incentives/legislation text for one country page.

    Downloads ``url``, locates the page's ``<main>`` element and collects the
    text of every ``h2``, ``h3``, ``p`` and ``li`` inside it, in document
    order.

    Args:
        url: URL of the country's page.
        country_name: Label stored alongside the text in the result.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with keys ``"country"`` (``country_name`` unchanged) and
        ``"policy_text"`` (all collected fragments joined by single spaces).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no ``<main>`` element.
    """
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of storing an error page as policy text.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    main_section = soup.find("main")
    if main_section is None:
        raise ValueError(f"No <main> element found at {url}")

    policy_text = []

    # Grab headings, paragraphs, bullet points.
    # NOTE(review): find_all is recursive, so a <p> nested inside an <li>
    # (or an <li> inside another <li>) is collected both on its own and as
    # part of its ancestor's text. Confirm whether that duplication matters.
    for elem in main_section.find_all(["h2", "h3", "p", "li"]):
        # get_text(strip=True) would glue the pieces around inline tags
        # together ("The <a>tax</a> break" -> "Thetaxbreak"); take the raw
        # text and collapse whitespace runs instead.
        text = " ".join(elem.get_text().split())
        if text:
            policy_text.append(text)

    return {
        "country": country_name,
        "policy_text": " ".join(policy_text),
    }

# --- Step 1: Scrape mother page for country links
mother_url = BASE_URL + "/transport-mode/road"
# Explicit timeout so a stalled connection cannot hang the script, and an
# early failure on HTTP errors: without this page there is nothing to scrape.
res = requests.get(mother_url, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.text, "html.parser")

# find all <a> tags with "/road/{country}/incentives-legislations"
links = soup.find_all("a", href=True)
# url -> link text; a dict keeps first-seen order and drops repeated links so
# each country page is scraped only once.
names_by_url = {}
for a in links:
    href = a["href"]
    if not (
        href.startswith("/transport-mode/road/")
        and href.endswith("incentives-legislations")
    ):
        continue
    country_name = a.get_text(strip=True)
    full_url = BASE_URL + href
    # Keep the first non-empty label seen for a URL (an earlier occurrence
    # of the same link may carry no text of its own).
    if not names_by_url.get(full_url):
        names_by_url[full_url] = country_name

country_links = [(label, page_url) for page_url, label in names_by_url.items()]

print(f"Found {len(country_links)} country pages")

# --- Step 2: Loop over countries
all_data = []
for name, url in country_links:
    print(f"Scraping {name}...")
    try:
        data = scrape_country_policy(url, name)
    except Exception as e:
        # Best-effort run: report the failing country and carry on with the
        # remaining ones rather than aborting the whole scrape.
        print(f"❌ Failed for {name}: {e}")
    else:
        all_data.append(data)
    finally:
        # Polite delay after every request, including failed ones, so a run
        # of errors does not turn into back-to-back requests.
        time.sleep(1)

# --- Step 3: Store results
output_path = Path("raw_data/policy_incentives.csv")
# Create the target directory first; otherwise to_csv raises OSError and the
# whole scrape is lost when raw_data/ does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit columns keep the CSV header present even when nothing was scraped.
df = pd.DataFrame(all_data, columns=["country", "policy_text"])
df.to_csv(output_path, index=False)

print(f"✅ Saved to {output_path.as_posix()}")