In [None]:
from sys import path
path.append('..')  # Add the parent directory to the path
from _utils import load_jobs, write_json, load_json, flatten_list
from _helpers_parsing import driver_setup

In [None]:
import json
import math
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Load the ESCO occupations that drive the scrape. The scraper below reads the
# "uri" and "jobtitle" keys of every entry (presumably a list of dicts — confirm
# against `_utils.load_jobs`).
esco_jobs = load_jobs()

In [8]:
def get_eures_overview(esco_jobs, web_driver, output_file, wait_time):
    """
    Scrape job-advertisement listings from the EURES portal for a list of ESCO occupations.

    For every occupation the German-language search (50 results per page, most
    recent first) is opened, the number of result pages is determined and every
    page is scraped. The accumulated results are written to ``output_file`` after
    each occupation that changed them, so an interrupted run keeps its progress.

    Parameters:
        esco_jobs (iterable): Dictionaries with at least the keys ``"uri"`` (the ESCO
            code is taken from the last path segment) and ``"jobtitle"``.
        web_driver (webdriver): Selenium WebDriver instance used to navigate the portal.
        output_file (str): Path of the JSON file the collected data is written to.
        wait_time (int): Maximum wait time (in seconds) for elements to load on a page.

    Returns:
        list: One dictionary per scraped advertisement. Occupations whose result
        count could not be read contribute a placeholder that only contains
        ``"searched_esco_job"``; occupations without results contribute nothing.
    """
    # Single source of truth for the search URL; only page and ESCO code vary.
    url_template = (
        "https://europa.eu/eures/portal/jv-se/search?page={page}&resultsPerPage=50"
        "&orderBy=MOST_RECENT&availableLanguages=de&escoOccupation={esco_code}&lang=de"
    )
    job_data = []

    for esco_job in tqdm(esco_jobs):
        esco_code = esco_job["uri"].split("/")[-1]
        web_driver.get(url_template.format(page=1, esco_code=esco_code))

        # Nothing to collect (and nothing new to save) when the portal reports no hits.
        if _check_no_results(web_driver):
            continue

        total_pages = _get_number_of_pages(web_driver, wait_time)
        if total_pages is None:
            # Result count unreadable: keep a marker so the occupation can be retried later.
            job_data.append({"searched_esco_job": esco_job["jobtitle"]})
        else:
            for page_number in range(1, total_pages + 1):
                # Page 1 is already loaded from the initial search; only navigate
                # for the following pages instead of requesting it a second time.
                if page_number > 1:
                    web_driver.get(url_template.format(page=page_number, esco_code=esco_code))
                job_data.extend(
                    _scrape_jobs_from_page(web_driver, wait_time, esco_job["jobtitle"])
                )

        # Checkpoint after every occupation that changed the results, including
        # the placeholder case, so no record is lost if a later occupation fails.
        write_json(output_file, job_data)

    return job_data


def _check_no_results(web_driver, timeout=3):
    """
    Check whether the currently loaded search page shows the "no results" marker.

    Parameters:
        web_driver (webdriver): Selenium WebDriver positioned on a search page.
        timeout (int | float): Seconds to wait for the element with id
            ``jv-no-result`` to appear. Defaults to 3.

    Returns:
        bool: True if the marker appeared within ``timeout`` seconds, otherwise False.
    """
    try:
        WebDriverWait(web_driver, timeout).until(
            EC.presence_of_element_located((By.ID, "jv-no-result"))
        )
    except Exception:
        # Usually a TimeoutException (marker absent, i.e. there are results). Other
        # driver errors are also treated as "not confirmed empty", as before, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return False
    return True


def _get_number_of_pages(web_driver, wait_time, results_per_page=50, max_pages=6):
    """
    Determine how many result pages should be scraped for the current search.

    Parameters:
        web_driver (webdriver): Selenium WebDriver positioned on a search page.
        wait_time (int): Maximum seconds to wait for the results heading.
        results_per_page (int): Page size used by the search URL. Defaults to 50.
        max_pages (int): Upper bound on the pages to scrape. The default of 6
            limits the scrape to the top 300 results.

    Returns:
        int | None: Number of pages (capped at ``max_pages``), or None if the
        heading did not appear or the result count could not be parsed.
    """
    try:
        WebDriverWait(web_driver, wait_time).until(
            EC.presence_of_element_located((By.CLASS_NAME, "ecl-u-type-heading-2"))
        )
        # Short grace period after the heading element is present before reading the page.
        time.sleep(1)
        soup = BeautifulSoup(web_driver.page_source, "html.parser")
        # The third space-separated token of the heading is taken as the result
        # count (assumes the portal's current heading wording — TODO confirm).
        results_text = soup.find(class_="ecl-u-type-heading-2").text.split(" ")[2]
        # Keep digits only so thousands separators do not break the conversion.
        total_results = int("".join(char for char in results_text if char.isdigit()))
    except Exception:
        # Timeout, missing heading or unparsable count: signal "unknown" to the
        # caller. KeyboardInterrupt / SystemExit are deliberately not caught.
        return None

    return min(math.ceil(total_results / results_per_page), max_pages)


def _scrape_jobs_from_page(web_driver, wait_time, searched_job_title):
    """
    Scrape all job advertisements listed on the currently loaded result page.

    Parameters:
        web_driver (webdriver): Selenium WebDriver positioned on a result page.
        wait_time (int): Maximum seconds to wait for the job listings to appear.
        searched_job_title (str): ESCO job title that was searched; stored with
            every record as ``"searched_esco_job"``.

    Returns:
        list: One dictionary per advertisement with the keys ``"searched_esco_job"``,
        ``"title"``, ``"url"``, ``"esco_jobs"`` and ``"publication_date"``. Empty if
        the listings did not appear within ``wait_time`` seconds.
    """
    try:
        WebDriverWait(web_driver, wait_time).until(
            EC.presence_of_element_located((By.CLASS_NAME, "jv-result-job-category"))
        )
    except Exception:
        # No listings showed up in time (or the driver failed): nothing to scrape.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        return []

    # Short grace period after the first listing is present before reading the page.
    time.sleep(1)
    soup = BeautifulSoup(web_driver.page_source, "html.parser")

    jobs = []
    for job_post in soup.find_all("article"):
        link = job_post.find(href=True)
        if link is None:
            # An <article> without a link cannot be identified; skip it instead of
            # letting one malformed entry abort the whole scrape.
            continue

        date_tag = job_post.find("em")
        # Normalise e.g. "01/02/2024" to "01.02.2024"; None when no date is shown.
        publication_date = (
            date_tag.text.replace(" ", "").replace("/", ".") if date_tag is not None else None
        )
        esco_categories = [
            category.text.strip(", ")
            for category in job_post.find_all("span", {"class": "jv-result-job-category"})
        ]
        jobs.append(
            {
                "searched_esco_job": searched_job_title,
                "title": link.text,
                "url": "https://europa.eu" + link["href"],
                "esco_jobs": esco_categories,
                "publication_date": publication_date,
            }
        )

    return jobs

In [None]:
# multithreading the parsing, to speed up the process
def multithread_eures_overview(esco_joblist, headless=True, patience=10, n_workers=4):
    """
    Scrape the EURES portal with several browser instances in parallel.

    The ESCO job list is split into ``n_workers`` chunks. Each chunk is processed
    by ``get_eures_overview`` in its own thread with its own WebDriver and its own
    checkpoint file ``eures_overview/<timestamp>_eures_overview_<i>.json``. The
    combined results are additionally written to ``eures_overview_total.json``.

    Parameters:
        esco_joblist (list): A list of dictionaries containing ESCO job codes and job titles.
        headless (bool): Whether to run the web drivers in headless mode.
        patience (int): Maximum wait time (in seconds) for elements to load on the page.
        n_workers (int): Number of parallel browser instances / threads. Defaults to 4.

    Returns:
        list: A list of dictionaries containing job advertisement details.
    """
    # Timestamp reduced to its digits (YYYYMMDDHHMMSS) to tag this run's files.
    currently = "".join(c for c in str(datetime.now()).split('.')[0] if c.isdigit())
    # Make sure the checkpoint directory exists before the workers write to it.
    os.makedirs("eures_overview", exist_ok=True)
    filenames = [
        f"eures_overview/{currently}_eures_overview_{i}.json" for i in range(1, n_workers + 1)
    ]

    # Split the ESCO job list into chunks for parallel processing
    chunks = np.array_split(esco_joblist, n_workers)

    drivers = []
    try:
        # Created inside the try block so already-started browsers are closed
        # even if a later driver fails to start.
        for _ in range(n_workers):
            drivers.append(driver_setup(headless))

        with ThreadPoolExecutor(max_workers=n_workers) as executor:
            bucket = executor.map(
                get_eures_overview, chunks, drivers, filenames, [patience] * n_workers
            )
            # Iterating the map result re-raises any exception from a worker.
            results = [item for block in bucket for item in block]

        # Save the combined results to a single JSON file
        write_json("eures_overview_total.json", results)
    finally:
        # Always release the browsers, also when a worker raised.
        for driver in drivers:
            try:
                driver.quit()
            except Exception:
                # Best-effort cleanup: one failing quit must not keep the
                # remaining browsers open or mask the original error.
                pass

    return results

In [None]:
# Run the full scrape: headless=False is forwarded to driver_setup and
# patience=10 is the per-element wait in seconds. Long-running — every ESCO
# occupation is visited; the combined result is also saved by the function.
x = multithread_eures_overview(esco_jobs, headless=False,patience=10)

In [None]:
import pandas as pd

In [None]:
import os
# Combine all per-thread checkpoint files into one DataFrame. Only *.json files
# are read (the directory may also contain e.g. .ipynb_checkpoints), in sorted
# order so the row order is reproducible between runs.
parsed_ads = pd.DataFrame(
    flatten_list(
        [
            load_json(os.path.join("eures_overview", file_name))
            for file_name in sorted(os.listdir("eures_overview"))
            if file_name.endswith(".json")
        ]
    )
)

In [None]:
# Number of ads that remain when rows sharing the same URL are collapsed.
len(parsed_ads.drop_duplicates("url"))

In [None]:
# Total number of scraped rows, before any cleaning.
print(len(parsed_ads))
# De-duplicate by URL AND drop incomplete rows (e.g. the placeholder records that
# only carry "searched_esco_job"). Chained so the second step no longer discards
# the de-duplication by starting again from the raw frame.
parsedads = parsed_ads.drop_duplicates("url").dropna()
# Collect every ESCO category label from the cleaned ads; the raw frame may hold
# NaN in "esco_jobs" for placeholder rows.
esco_jobs_parsed = flatten_list([x["esco_jobs"] for x in tqdm(parsedads.to_dict("records"))])

In [None]:
# Number of distinct ESCO category labels that occur in the collected ads.
len(set(esco_jobs_parsed))

In [None]:
# Count the ESCO categories attached to each ad. Placeholder rows have no
# "esco_jobs" list (NaN after loading) and are counted as 0 instead of raising
# a TypeError from len().
parsed_ads["no_escojobs"] = parsed_ads["esco_jobs"].apply(
    lambda categories: len(categories) if isinstance(categories, (list, tuple)) else 0
)
# Keep only the categories of ads that are tagged with exactly one ESCO occupation.
esco_jobs_parsed = flatten_list(
    [
        record["esco_jobs"]
        for record in tqdm(parsed_ads[parsed_ads["no_escojobs"] == 1].to_dict("records"))
    ]
)

# Output

In [None]:
import pandas as pd
# NOTE(review): the first cell imports its helpers from `_utils`; confirm that
# `_util` is a distinct module and not a typo. The star import also hides which
# names (e.g. load_json) this section relies on — TODO replace with explicit names.
from _util import *

In [None]:
# Load the combined scrape results into a DataFrame.
# NOTE(review): the scraper writes "eures_overview_total.json" to the working
# directory; presumably the file was moved to ../00_data/EURES/ — confirm.
results = pd.DataFrame(load_json("../00_data/EURES/eures_overview_total.json"))

In [None]:
# Preview a single row to check which columns are available.
results.head(1)

In [None]:
# Report the number of parsed links before and after removing rows that share
# both title and URL.
# NOTE: `results` is overwritten here, so re-running this cell alone reports the
# already de-duplicated count as the "total".
print(f"A total of {len(results)} links to job advertisements were parsed.")
results = results.drop_duplicates(["title", "url"])
print(f"Without duplicates there are {len(results)} job ad links available.")