In [None]:
import concurrent.futures
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    NoSuchElementException,
    TimeoutException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager



# EUR-Lex advanced-search results URL; a page index is appended to it per request
base_url = "https://eur-lex.europa.eu/search.html?DTC=false&SUBDOM_INIT=PRE_ACTS&DB_AUTHOR=commission&DTS_SUBDOM=PRE_ACTS&DB_INTER_CODE_TYPE=OLP&DTS_DOM=EU_LAW&lang=en&type=advanced&date0=ALL%3A01012000%7C28082024&qid=1724861301256"


def generate_page_url(page_number):
    """Return the search-results URL for one page of results.

    The page number is appended to the module-level ``base_url`` as an
    extra ``page`` query parameter.
    """
    return "{}&page={}".format(base_url, page_number)

In [None]:
# Function to get the total number of pages
def get_total_pages(base_url):
    """Return the page number shown after clicking the double-right-arrow control.

    Opens ``base_url`` in a fresh Chrome session, clicks the
    ``.fa.fa-angle-double-right`` icon (presumably the "jump to last page"
    link -- confirm against the site) and reads the integer held in the
    ``pagingInput1`` field. The browser is always closed before returning.

    Args:
        base_url: URL of the search-results page to inspect.

    Returns:
        int: the value of the ``pagingInput1`` input after the click.

    Raises:
        TimeoutException: if the arrow or the paging input does not show up
            within 40 seconds.
        ValueError, TypeError: if the paging input holds no integer value.
    """
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    try:
        driver.get(base_url)
        wait = WebDriverWait(driver, 40)
        double_right_arrow = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, ".fa.fa-angle-double-right"))
        )
        # Bring the control into the viewport before clicking it
        driver.execute_script("arguments[0].scrollIntoView(true);", double_right_arrow)
        wait.until(EC.visibility_of(double_right_arrow))

        try:
            double_right_arrow.click()
        except ElementClickInterceptedException:
            # Another element is covering the arrow: fall back to a JavaScript click
            driver.execute_script("arguments[0].click();", double_right_arrow)

        # The wait hands back the located element, so no second lookup is needed
        paging_input = wait.until(
            EC.presence_of_element_located((By.ID, 'pagingInput1'))
        )
        total_pages = int(paging_input.get_attribute('value'))

    finally:
        driver.quit()

    return total_pages

In [None]:
# XPath (relative to one search result) of the <dd> value behind each exported column.
# Insertion order is the column order of the resulting records.
_RESULT_FIELD_XPATHS = {
    'CELEX': ".//dt[normalize-space(text())='CELEX number:']/following-sibling::dd[1]",
    'Form': ".//dt[contains(text(), 'Form')]/following-sibling::dd[1]",
    'Author': ".//dt[contains(text(), 'Author')]/following-sibling::dd[1]",
    'Date': ".//dt[contains(text(), 'Date of document')]/following-sibling::dd[1]",
    'Pages': ".//dt[contains(text(), 'Number of pages')]/following-sibling::dd[1]",
}


def _text_or_none(parent, xpath):
    """Return the text of the first element matching ``xpath`` under ``parent``, or None."""
    try:
        return parent.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return None


def _extract_result(driver, result):
    """Build the record for one search result; return None when the result is skipped."""
    try:
        # A result without a "ViewMoreInfo" button is skipped entirely
        result.find_element(By.XPATH, ".//button[contains(@class, 'ViewMoreInfo')]")
        for child in result.find_elements(By.XPATH, ".//div[contains(@class, 'SearchResultData') and contains(@class, 'collapse') and contains(@class, 'in')]"):
            # Drop the 'collapse' class on the detail panels before their text is read
            driver.execute_script("arguments[0].classList.remove('collapse')", child)
    except NoSuchElementException:
        return None

    data = {}
    try:
        # The first anchor inside the result supplies both the title and the link
        title_element = result.find_element(By.TAG_NAME, 'a')
        data['Title'] = title_element.text
        data['Link'] = title_element.get_attribute('href')
    except NoSuchElementException:
        data['Title'] = None
        data['Link'] = None

    for column, xpath in _RESULT_FIELD_XPATHS.items():
        data[column] = _text_or_none(result, xpath)

    return data


# Function to process a single page and extract data with retry mechanism
def process_page(page_number, retries=3):
    """Scrape one page of search results, retrying on Selenium timeouts.

    Each attempt starts its own Chrome session, loads the page URL, waits up
    to 40 seconds for ``SearchResult`` elements and turns every result into a
    dict with the keys ``Title``, ``Link``, ``CELEX``, ``Form``, ``Author``,
    ``Date`` and ``Pages`` (``None`` where a field is missing). Results
    without a ``ViewMoreInfo`` button are skipped.

    Args:
        page_number: page index passed to ``generate_page_url``.
        retries: maximum number of attempts; only ``TimeoutException``
            triggers another attempt, any other exception propagates.

    Returns:
        tuple: ``(page_number, data_list)``; ``data_list`` is empty when every
        attempt timed out.
    """
    data_list = []
    page_url = generate_page_url(page_number)

    for attempt in range(retries):
        # Start every attempt from a clean slate: no rows left over from a
        # failed attempt, and no handle to a driver that was already quit.
        data_list = []
        driver = None
        try:
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
            driver.set_window_size(1920, 1080)
            driver.get(page_url)

            wait = WebDriverWait(driver, 40)  # Increased timeout to 40 seconds
            # The wait returns the located elements, so no second lookup is needed
            search_results = wait.until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'SearchResult'))
            )

            # Print found results
            print(f"Found {len(search_results)} results on page {page_number}")

            for result in search_results:
                data = _extract_result(driver, result)
                if data is not None:
                    data_list.append(data)

            break  # Exit loop if successful

        except TimeoutException as e:
            print(f"TimeoutException on page {page_number}, attempt {attempt + 1}: {str(e)}")
            if attempt == retries - 1:
                print(f"Giving up on page {page_number} after {retries} attempts")

        finally:
            # Always release the browser, whether the attempt succeeded or not
            if driver:
                driver.quit()

    return page_number, data_list

In [None]:
# Function to process multiple pages in parallel
def process_pages_in_parallel(start_page, end_page, max_workers=4, page_processor=None):
    """Scrape a range of pages concurrently and return the records in page order.

    Args:
        start_page: first page number (inclusive).
        end_page: last page number (inclusive); an empty range yields ``[]``.
        max_workers: size of the thread pool (defaults to 4).
        page_processor: callable taking a page number and returning
            ``(page_number, records)``; defaults to ``process_page``.

    Returns:
        list: the records of all successfully processed pages, flattened and
        ordered by page number. A page whose processor raised is reported on
        stdout and left out of the result.
    """
    if page_processor is None:
        page_processor = process_page

    all_data = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_page = {
            executor.submit(page_processor, page): page
            for page in range(start_page, end_page + 1)
        }
        for future in concurrent.futures.as_completed(future_to_page):
            page = future_to_page[future]
            try:
                page_number, page_data = future.result()
            except Exception as e:
                # Best effort: report which page failed and carry on with the rest
                print(f"Exception during processing page {page}: {str(e)}")
            else:
                all_data.append((page_number, page_data))

    # Futures complete in arbitrary order; restore page order before flattening
    all_data.sort(key=lambda x: x[0])
    ordered_data = [item for _, sublist in all_data for item in sublist]
    return ordered_data

In [None]:
# Get the total number of pages (get_total_pages opens a Chrome session to read it)
total_pages = get_total_pages(base_url)
# For a quick trial run, override the count with a small value instead:
#total_pages = 10
print(f"Total number of pages: {total_pages}")

In [None]:
# Scrape every results page, from page 1 through total_pages, using the thread pool
data_list = process_pages_in_parallel(1, total_pages)

# Convert the list of per-result dictionaries to a pandas DataFrame (one row per result)
df = pd.DataFrame(data_list)


In [None]:
# Keep the scraped results under a descriptive name (an alias of `df`, not a copy)
eurlex_data_scrapped = df

# Save the DataFrame to a CSV file in the ../Data directory, creating the
# directory first so the scrape is not lost to a missing folder
output_path = Path('../Data/eurlex-data-scrapped.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
eurlex_data_scrapped.to_csv(output_path, index=False)