In [2]:
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait, Select

# --- CONFIGURATION ---
# Value passed to the "show-entries" dropdown; kept as a string because it is
# matched against the option's value ("100" or "500").
RESULTS_PER_PAGE = "100"
# Pause, in seconds, after each click on the "next" pagination button.
WAIT_BETWEEN_PAGES = 2
# Safety cap on the number of pages visited; the loop is expected to stop
# by itself once the last page is reached.
MAX_PAGES = 200

# Listing page the scrape starts from.
BASE_URL = "https://www.eib.org/en/projects/all/index"

# --- BROWSER START-UP ---
chrome_options = Options()
for _chrome_flag in (
    "--headless",  # drop this flag to watch the browser window
    "--window-size=1920,1080",
):
    chrome_options.add_argument(_chrome_flag)

driver = webdriver.Chrome(options=chrome_options)
driver.get(BASE_URL)
time.sleep(2)  # fixed pause after requesting the start page

# --- REMOVE COOKIE OVERLAY ---
def remove_cookie_overlay(driver):
    """Dismiss the cookie-consent overlay on the current page.

    The function first tries to click an accept/close button matching one of
    several candidate CSS selectors. If locating or clicking the button fails
    with a WebDriver error, it falls back to deleting the overlay and pop-up
    elements from the DOM with JavaScript. Either way it pauses for one
    second afterwards.

    Only ``WebDriverException`` (the base class of Selenium's errors) triggers
    the fallback, so unrelated programming errors are no longer hidden.

    Args:
        driver: Selenium WebDriver whose current page may show the overlay.
    """
    # Candidate selectors for the accept button (adjust if the site changes).
    accept_selector = (
        ".cookies-home_accept, .cookie-consent-accept, "
        "button[title='Accept'], button#cookie-accept"
    )
    try:
        btn = driver.find_element(By.CSS_SELECTOR, accept_selector)
        btn.click()
    except WebDriverException:
        # Button missing or not clickable: force-remove the overlay via JS.
        driver.execute_script("""
            var el = document.querySelector('.cookies-home_overlay');
            if(el) el.remove();
            var popup = document.querySelector('.cookies-home');
            if(popup) popup.remove();
        """)
        time.sleep(1)
    else:
        print("Cookie overlay closed via button")
        time.sleep(1)

# Clear the cookie overlay before interacting with the page controls.
remove_cookie_overlay(driver)

# --- SET RESULTS PER PAGE ---
# Wait (up to 15 s) for the page-size dropdown to be present, then pick the
# configured option by its value.
_entries_locator = (By.ID, "show-entries")
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located(_entries_locator)
)
select = Select(driver.find_element(*_entries_locator))
select.select_by_value(RESULTS_PER_PAGE)
time.sleep(3)  # fixed pause while the listing reloads

# The overlay sometimes reappears after changing the dropdown.
remove_cookie_overlay(driver)

# --- SCRAPING LOOP ---
# Walks the paginated listing, collecting one row per project into `all_rows`
# (de-duplicated by project URL) until the last page, MAX_PAGES, or an
# unrecoverable pagination error. The browser is always closed afterwards.

_SITE_ROOT = "https://www.eib.org"
_ARTICLE_SELECTOR = "article.col-xs-12.col-sm-12.col-md-12"
_COLUMN_SELECTOR = "div.col-md-2.col-xs-6, div.col-md-1.col-xs-6"
# Row keys filled, in order, from the column <div>s found in an article.
_COLUMN_FIELDS = (
    "Countries",
    "Sectors",
    "Signed Amount",
    "Current Status",
    "Last Status Change",
)


def _parse_article(art):
    """Turn one listing <article> into a row dict, or None if it has no link.

    Args:
        art: BeautifulSoup tag for a single project article.

    Returns:
        A dict with "Title", the five column fields and "Project URL", or
        None when the article has no title link or the link has no href.
    """
    title_tag = art.select_one('h3.row-title a')
    if title_tag is None:
        return None
    href = title_tag.get('href')
    if not href:
        return None

    # Missing trailing columns are stored as empty strings.
    values = [col.text.strip() for col in art.select(_COLUMN_SELECTOR)]
    values = values[:len(_COLUMN_FIELDS)]
    values += [''] * (len(_COLUMN_FIELDS) - len(values))

    row = {"Title": title_tag.text.strip()}
    row.update(zip(_COLUMN_FIELDS, values))
    # urljoin handles relative hrefs (with or without a leading slash) and
    # leaves absolute hrefs untouched.
    row["Project URL"] = urljoin(_SITE_ROOT, href)
    return row


all_rows = []
collected_urls = set()
page = 1

try:
    while page <= MAX_PAGES:
        print(f"Scraping page {page}")
        try:
            # Wait for articles to appear
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, _ARTICLE_SELECTOR))
            )
        except Exception as e:
            print(f"Timeout or error waiting for projects: {e}")
            break

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for art in soup.select(_ARTICLE_SELECTOR):
            row = _parse_article(art)
            if row is None or row["Project URL"] in collected_urls:
                continue
            collected_urls.add(row["Project URL"])
            all_rows.append(row)

        # --- PAGINATION: Next Page ---
        clicked = False
        last_page = False
        for _ in range(3):  # up to three attempts at clicking "next"
            try:
                remove_cookie_overlay(driver)
                next_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, ".eib-search-pagination-next"))
                )
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_btn)
                time.sleep(1)
                # A "disabled" class on the button marks the last page.
                # get_attribute may return None when there is no class.
                if 'disabled' in (next_btn.get_attribute("class") or ""):
                    print("Reached last page.")
                    last_page = True
                    break
                next_btn.click()
                print("Clicked next")
                clicked = True
                # NOTE(review): assumes this fixed pause is long enough for
                # the next page to render - confirm, or wait on a DOM change.
                time.sleep(WAIT_BETWEEN_PAGES)
                break
            except Exception as e:
                print("Retrying click due to error:", e)
                remove_cookie_overlay(driver)
                time.sleep(2)

        if last_page:
            # Normal termination; not a click failure.
            break
        if not clicked:
            print("Failed to click next, stopping loop.")
            break

        page += 1
finally:
    # Always release the browser, even if scraping raised.
    driver.quit()

# --- FINAL DEDUPLICATION & SAVE ---
# Build the frame with an explicit column list so that an empty scrape still
# yields a frame containing 'Project URL'; without it, drop_duplicates raises
# KeyError on a column-less frame.
_OUTPUT_COLUMNS = [
    "Title",
    "Countries",
    "Sectors",
    "Signed Amount",
    "Current Status",
    "Last Status Change",
    "Project URL",
]
_OUTPUT_FILE = "eib_projects_full_final.xlsx"

df = pd.DataFrame(all_rows, columns=_OUTPUT_COLUMNS)
df = df.drop_duplicates(subset=['Project URL'])
if df.empty:
    print("Warning: no projects were collected; writing an empty sheet.")
df.to_excel(_OUTPUT_FILE, index=False)
print(f"Done! Saved as {_OUTPUT_FILE}")


Scraping page 1
Clicked next
Scraping page 2
Clicked next
Scraping page 3
Clicked next
Scraping page 4
Clicked next
Scraping page 5
Clicked next
Scraping page 6
Clicked next
Scraping page 7
Clicked next
Scraping page 8
Clicked next
Scraping page 9
Clicked next
Scraping page 10
Clicked next
Scraping page 11
Clicked next
Scraping page 12
Clicked next
Scraping page 13
Clicked next
Scraping page 14
Clicked next
Scraping page 15
Clicked next
Scraping page 16
Clicked next
Scraping page 17
Clicked next
Scraping page 18
Clicked next
Scraping page 19
Clicked next
Scraping page 20
Clicked next
Scraping page 21
Clicked next
Scraping page 22
Clicked next
Scraping page 23
Clicked next
Scraping page 24
Clicked next
Scraping page 25
Clicked next
Scraping page 26
Clicked next
Scraping page 27
Clicked next
Scraping page 28
Clicked next
Scraping page 29
Clicked next
Scraping page 30
Clicked next
Scraping page 31
Clicked next
Scraping page 32
Clicked next
Scraping page 33
Clicked next
Scraping page 34
Cl