## Scrape country-level data on EV market shares

In [26]:
import configparser
from pathlib import Path

import pandas as pd
import numpy as np
import requests
from io import StringIO
import os
import time
import random


In [21]:
# define filepaths: directory locations are read from config.ini in the working directory
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns only the
# names it parsed, so check the result and fail with a clear message instead of
# an opaque KeyError on config["default"] below.
if not config.read("config.ini"):
    raise FileNotFoundError(f"Could not read config.ini from {Path.cwd()}")

# access values (all three keys are expected in the [default] section)
raw_path = Path(config["default"]["raw_path"])
interim_path = Path(config["default"]["interim_path"])
processed_path = Path(config["default"]["processed_path"])

In [22]:
# Countries to scrape, in alphabetical order. Each name is later turned into a
# URL slug (lower-cased, spaces replaced by hyphens) by the scraper.
target_countries = [
    "Austria",
    "Belgium",
    "Bulgaria",
    "Croatia",
    "Cyprus",
    "Czech Republic",
    "Denmark",
    "Estonia",
    "Finland",
    "France",
    "Germany",
    "Greece",
    "Hungary",
    "Iceland",
    "Ireland",
    "Italy",
    "Latvia",
    "Liechtenstein",
    "Lithuania",
    "Luxembourg",
    "Malta",
    "Netherlands",
    "Norway",
    "Poland",
    "Portugal",
    "Romania",
    "Slovakia",
    "Slovenia",
    "Spain",
    "Sweden",
    "Switzerland",
    "Turkey",
    "United Kingdom",
]


Define a scraper function that fetches market share data for each country and logs any failed requests (e.g. rejected because of too many requests) so they can be retried.
* The scraped data is saved as a CSV.

In [27]:
def fetch_data(target_countries, output_csv=interim_path/"bev_market_share_data.csv", failed_csv=interim_path/"failed_countries.csv", delay_range=(4,10), timeout=30):
    """
    Scrape BEV/PHEV yearly market share data from country pages from the European Alternative Fuels Observatory.

    Rows for every successfully fetched country are appended to ``output_csv``;
    the header is written only when that file does not exist yet, so calling
    this again for the same country appends duplicate rows. Countries whose
    request failed (non-200 status or a network error) are written to
    ``failed_csv`` and returned so the caller can retry them.

    Args:
        target_countries (iterable of str): Target country names to fetch data for
        output_csv (str or Path): File path to save successfully scraped data
        failed_csv (str or Path): File path to store list of countries failed to fetch
        delay_range (tuple): Min & max seconds to wait between requests (randomised to reduce risk of being rate limited)
        timeout (float): Seconds to wait for the server before a request is treated as failed

    Returns:
        list: Countries that failed to scrape
    """

    # the same headers are sent with every request, so build them once
    headers = {"User-Agent": "Mozilla/5.0"}

    # per-country frames, concatenated once after the loop
    all_data = []

    # countries to hand back to the caller for a later retry
    failed_countries = []

    # define scraper loop
    for index, country in enumerate(target_countries):
        # randomised pause *between* requests to reduce risk of 429 rate limiting;
        # nothing to wait for before the first one
        if index > 0:
            time.sleep(random.uniform(*delay_range)) # unpack tuple

        slug = country.lower().replace(" ", "-")
        url = f"https://alternative-fuels-observatory.ec.europa.eu/sites/default/files/csv/{slug}/market_share_new_registrations_m1.csv"

        try:
            # without a timeout a stalled connection would hang the whole run
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as error:
            # connection errors / timeouts are retryable failures, not a reason
            # to abort and lose the countries already scraped
            print(f"Failed for {country} ({error})")
            failed_countries.append(country)
            continue

        if response.status_code == 200:
            # read csv as pandas df directly from memory
            df = pd.read_csv(StringIO(response.text))
            print(f'Successfully loaded CSV for {country}')

            df_clean = df[['YEAR', 'BEV', 'PHEV']].copy()
            df_clean['Country'] = country
            df_clean['BEV+PHEV'] = df_clean['BEV'] + df_clean['PHEV']
            all_data.append(df_clean)
        else:
            print(f"Failed for {country} ({response.status_code})")
            failed_countries.append(country)

    # if any countries were scraped successfully...
    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)

        # append to an existing file without repeating the header; a write error
        # is allowed to propagate rather than being hidden by a second write
        # that would overwrite earlier results
        write_header = not os.path.exists(output_csv)
        final_df.to_csv(output_csv, mode="a", index=False, header=write_header)

        num_countries = final_df['Country'].nunique() # count how many countries data successfully scraped for
        print(f"Saved successfully scraped data for {num_countries} countries. ") # success message

    # if anything was stored in failed countries...
    if failed_countries:
        # write to the path the caller asked for (and that the message reports)
        pd.Series(failed_countries, name='country').to_csv(failed_csv, index=False)
        print(f"Saved {len(failed_countries)} to '{failed_csv}'.")
    else:
        print("All countries fetched successfully.")

    return failed_countries
    

#### Loop the scraper until all countries are fetched successfully

In [28]:
# Retry the scrape, each pass covering only the countries that failed on the
# previous one. The number of passes is capped so that a country that can never
# succeed (e.g. a slug with no CSV behind it) cannot keep this cell running forever.
MAX_ATTEMPTS = 5
RETRY_WAIT_SECONDS = 60  # pause between passes to let any rate limit cool down

# initially assign entire target countries list as target 'to fetch'
# (copied so the master list is never aliased by the working list)
to_fetch = list(target_countries)
attempt = 1

while to_fetch and attempt <= MAX_ATTEMPTS:
    print(f"Attempt {attempt}: scraping {len(to_fetch)} countries")
    # assign failed countries to list
    failed = fetch_data(to_fetch, failed_csv=f"failed_attempt{attempt}.csv")

    if not failed:
        print("All countries successfully scraped")

    # update 'to fetch' list with remaining failed countries
    to_fetch = failed
    attempt += 1

    # only wait when another pass is actually going to run
    if to_fetch and attempt <= MAX_ATTEMPTS:
        time.sleep(RETRY_WAIT_SECONDS)

# report anything still missing instead of ending silently
if to_fetch:
    print(f"Giving up after {MAX_ATTEMPTS} attempts; still missing: {', '.join(to_fetch)}")

Attempt 1: scraping 33 countries
Successfully loaded CSV for Austria
Successfully loaded CSV for Belgium
Successfully loaded CSV for Bulgaria
Successfully loaded CSV for Croatia
Successfully loaded CSV for Cyprus
Successfully loaded CSV for Czech Republic
Successfully loaded CSV for Denmark
Successfully loaded CSV for Estonia
Successfully loaded CSV for Finland
Successfully loaded CSV for France
Successfully loaded CSV for Germany
Successfully loaded CSV for Greece
Successfully loaded CSV for Hungary
Successfully loaded CSV for Iceland
Successfully loaded CSV for Ireland
Successfully loaded CSV for Italy
Successfully loaded CSV for Latvia
Successfully loaded CSV for Liechtenstein
Successfully loaded CSV for Lithuania
Successfully loaded CSV for Luxembourg
Successfully loaded CSV for Malta
Successfully loaded CSV for Netherlands
Successfully loaded CSV for Norway
Successfully loaded CSV for Poland
Successfully loaded CSV for Portugal
Successfully loaded CSV for Romania
Successfully load