In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Upper bound on the number of listing pages to request, and the list that
# accumulates one dict per scraped project card.
MAX_PAGES = 200
projects = []

# Chrome runs headless (in the background); the remaining switches are passed
# to the browser as-is.
chrome_options = Options()
for _flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(_flag)

# The value returned by ChromeDriverManager().install() is handed to Service.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

def _card_text(card, selector, default="N/A"):
    """Return the stripped text of the first element matching `selector` in `card`.

    Args:
        card: element exposing `find_element(by, value)` (a project card).
        selector: CSS selector evaluated relative to `card`.
        default: value returned when the lookup or text access fails.

    Returns:
        The element's text with surrounding whitespace removed, or `default`.
    """
    try:
        return card.find_element(By.CSS_SELECTOR, selector).text.strip()
    except Exception:
        # Best-effort field: any lookup failure yields the placeholder.
        # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
        return default


def _card_link(card):
    """Return `(title, href)` taken from the card's `a.fr-card__link` element.

    Returns `("N/A", "")` when the link cannot be read.
    """
    try:
        link = card.find_element(By.CSS_SELECTOR, "a.fr-card__link")
        return link.text.strip(), link.get_attribute("href")
    except Exception:
        return "N/A", ""


# Walk the paginated project list, appending one dict per card to `projects`.
# The browser is always closed, even if scraping fails part-way through.
try:
    for page in range(1, MAX_PAGES + 1):
        url = f"https://www.afd.fr/en/projects/list?page={page}"
        print(f"Scraping page {page}: {url}")
        driver.get(url)
        try:
            # Wait for project cards to load (adjust timeout as needed)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".fr-card"))
            )
        except Exception:
            # Any failure while waiting is treated as "no more pages".
            print(f"No projects found on page {page}. Stopping.")
            break

        cards = driver.find_elements(By.CSS_SELECTOR, ".fr-card")
        if not cards:
            print(f"No cards found on page {page}. Stopping.")
            break

        for card in cards:
            status = _card_text(card, "p.fr-badge")
            title, url_proj = _card_link(card)
            duration = _card_text(card, ".fr-card__detail")
            desc = _card_text(card, ".fr-card__desc")
            projects.append({
                "Status": status,
                "Title": title,
                "URL": url_proj,
                "Duration": duration,
                "Description": desc
            })
        time.sleep(1)  # Be polite, avoid being blocked

finally:
    driver.quit()

# Save to Excel. The columns are listed explicitly so the workbook always has
# its header row -- including "URL" -- even when `projects` is empty; without
# them an empty scrape produces a column-less sheet and a later df['URL']
# lookup on the re-read file fails with KeyError.
df = pd.DataFrame(
    projects,
    columns=["Status", "Title", "URL", "Duration", "Description"],
)
df.to_excel("afd_projects_all.xlsx", index=False)
print(f"Scraped {len(df)} projects. Data saved to afd_projects_all.xlsx")


Scraping page 1: https://www.afd.fr/en/projects/list?page=1
Scraping page 2: https://www.afd.fr/en/projects/list?page=2
Scraping page 3: https://www.afd.fr/en/projects/list?page=3
Scraping page 4: https://www.afd.fr/en/projects/list?page=4
Scraping page 5: https://www.afd.fr/en/projects/list?page=5
Scraping page 6: https://www.afd.fr/en/projects/list?page=6
Scraping page 7: https://www.afd.fr/en/projects/list?page=7
Scraping page 8: https://www.afd.fr/en/projects/list?page=8
Scraping page 9: https://www.afd.fr/en/projects/list?page=9
Scraping page 10: https://www.afd.fr/en/projects/list?page=10
Scraping page 11: https://www.afd.fr/en/projects/list?page=11
Scraping page 12: https://www.afd.fr/en/projects/list?page=12
Scraping page 13: https://www.afd.fr/en/projects/list?page=13
Scraping page 14: https://www.afd.fr/en/projects/list?page=14
Scraping page 15: https://www.afd.fr/en/projects/list?page=15
Scraping page 16: https://www.afd.fr/en/projects/list?page=16
Scraping page 17: https://

In [6]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

# 1. Load the listing workbook and keep the non-missing values of its URL column
df = pd.read_excel('afd_projects_all.xlsx')
url_column = df['URL']
urls = url_column.dropna().tolist()

# 2. Start a headless Chrome session through Selenium
chrome_options = Options()
for _flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(_flag)
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

def extract_afd_fields(html):
    """Parse a project page's HTML and return its fields as a flat dict.

    Args:
        html: page markup as a string.

    Returns:
        dict with:
          - 'Project Name': text of the first <h1>, or '' if there is none.
          - one entry per field block (elements matching
            ``div.md:flex.my-3.place-items-start``), keyed by the label text
            found in the block's ``<dt><div>``. The value is the
            comma-joined link texts when the ``<dd>`` contains links,
            otherwise the ``<dd>`` text with <p>/<span> content removed.
          - 'Project Start Date' and 'Status' instead of a single entry when
            the label starts with "project start date".

    Blocks lacking a <dt>, a label, or a <dd> are skipped rather than raising.
    """
    soup = BeautifulSoup(html, 'html.parser')
    result = {}

    # Project Name (from h1)
    h1 = soup.find('h1')
    result['Project Name'] = h1.get_text(strip=True) if h1 else ''

    # Find all field blocks
    for block in soup.select('div.md\\:flex.my-3.place-items-start'):
        # find() returns None when the tag is absent; guard before chaining so
        # one malformed block cannot abort the whole page with AttributeError.
        dt = block.find('dt')
        label_div = dt.find('div') if dt is not None else None
        label = label_div.get_text(strip=True) if label_div is not None else None
        value_dd = block.find('dd')

        if not label or value_dd is None:
            continue

        # Project start date and status are together
        if label.lower().startswith('project start date'):
            date_tag = value_dd.find('time')
            result['Project Start Date'] = date_tag.get_text(strip=True) if date_tag else ''
            status_tag = value_dd.find('p', class_='fr-badge')
            result['Status'] = status_tag.get_text(strip=True) if status_tag else ''
        else:
            # For regular fields, get text or links
            links = value_dd.find_all('a')
            if links:
                value = ', '.join(a.get_text(strip=True) for a in links)
            else:
                # Drop nested <p>/<span> elements (and their text) before
                # reading the remaining text of the <dd>.
                for tag in value_dd.find_all(['p', 'span']):
                    tag.decompose()
                value = value_dd.get_text(separator=' ', strip=True)
            result[label] = value

    return result

# 3. Visit each project page and collect its parsed fields.
# The loop is wrapped in try/finally so the headless browser is shut down even
# if a page load or the parser raises; otherwise the Chrome process is leaked.
results = []
try:
    for url in urls:
        print(f"Scraping: {url}")
        driver.get(url)
        time.sleep(3)  # Wait for JS to load. Increase if needed.
        fields = extract_afd_fields(driver.page_source)
        fields['URL'] = url
        results.append(fields)
finally:
    driver.quit()

# 4. Save to Excel
df_out = pd.DataFrame(results)
df_out.to_excel('afd_project_details.xlsx', index=False)
print("Scraping complete. Data saved to afd_project_details.xlsx")


Scraping: https://www.afd.fr/en/projects/promoting-private-sector-investment-menstrual-health-and-hygiene-ethiopia
Scraping: https://www.afd.fr/en/projects/ethiopia-scientific-cooperation-preserve-ethiopias-cultural-heritage-and-tourism
Scraping: https://www.afd.fr/en/carte-des-projets/ethiopia-support-maze-park
Scraping: https://www.afd.fr/en/projects/ethiopia-establishing-civil-society-innovation-fund-csif-finance-diversification-ethiopian
Scraping: https://www.afd.fr/en/projects/ethiopia-reviving-agriculture-post-conflict
Scraping: https://www.afd.fr/en/projects/ethiopia-catalyzing-market-menstrual-health-products-ethiopia
Scraping: https://www.afd.fr/en/projects/ethiopia-mapping-ground-water-resources-regions-affected-recurrent-droughts
Scraping: https://www.afd.fr/en/carte-des-projets/ethiopia-supporting-technical-sustainability-electric-power
Scraping: https://www.afd.fr/en/projects/ethiopia-supporting-access-drinking-water-43-small-and-medium-towns-ethiopia
Scraping: https://www