# Imports

In [None]:
from _util import * 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from _helpers_parsing import driver_setup
from concurrent.futures import ThreadPoolExecutor

In [None]:
import pandas as pd
import time
from bs4 import BeautifulSoup
import numpy as np
from datetime import datetime
import random
from collections import Counter
import os
tqdm.pandas()
random.seed(42)

# Functions

In [None]:
def _extract_description_eres(soup):
    """Return the text of the job-description element, or None if it cannot be read.

    The direct children of the element with id ``jv-details-job-description``
    are concatenated (non-breaking spaces removed); two consecutive ``<br/>``
    children are turned into a `` \\n `` paragraph separator.
    """
    try:
        nodes = soup.find(id="jv-details-job-description").contents
        last_index = len(nodes) - 1
        parts = []
        for i, node in enumerate(nodes):
            text = node.text
            if text != "":
                parts.append(text.replace("\xa0", ""))
            # A double line break marks a paragraph boundary.
            if i < last_index and str(node) == "<br/>" and str(nodes[i + 1]) == "<br/>":
                parts.append(" \n ")
        return "".join(parts)
    except Exception:
        return None


def get_jobdescription_eres(ads, driver, filename):
    """Open every ad URL in the browser and collect title, ESCO jobs and description.

    Args:
        ads: Iterable of dicts; each needs a ``"url"`` key. Successfully parsed
            dicts are updated in place with ``"parsed_title"``, ``"ESCOJOB"``
            and ``"description"``.
        driver: Selenium-style driver (``get``, ``page_source``, ``quit``).
            It is always quit before this function returns or raises.
        filename: Path of the JSON file the results are written to. Ads whose
            page shows the error element are written to ``filename + "_deadlinks"``.

    Returns:
        The list of ads that could be parsed.
    """
    results = []
    dead_links = []
    try:
        for ad in tqdm(ads):
            driver.get(ad["url"] + "?jvDisplayLanguage=de&lang=de")

            # A page that shows the error element within 2 s is a dead link.
            try:
                WebDriverWait(driver, 2).until(
                    EC.presence_of_element_located((By.ID, "error-message-jv-detail"))
                )
            except Exception:
                pass  # No error element showed up: treat the ad as alive.
            else:
                dead_links.append(ad)
                # Best effort: a failed write must not abort the crawl, but it
                # should be visible instead of being swallowed silently.
                try:
                    with open(filename + "_deadlinks", "w", encoding="utf-8") as fp:
                        json.dump(dead_links, fp, indent=2, ensure_ascii=False)
                except (OSError, TypeError, ValueError) as err:
                    print(f"Could not write dead-link file: {err}")
                continue

            # Wait for the two sections we read; skip the ad if they never appear.
            try:
                WebDriverWait(driver, 8).until(
                    EC.presence_of_element_located((By.ID, "jv-details-job-description"))
                )
                WebDriverWait(driver, 8).until(
                    EC.presence_of_element_located((By.ID, "jv-job-categories-codes"))
                )
                time.sleep(0.5)
                soup = BeautifulSoup(driver.page_source, "html.parser")
            except Exception:
                continue

            heading = soup.find("h1")
            jobtitle = heading.text if heading is not None else None
            description = _extract_description_eres(soup)

            # Errors here are deliberately not caught (the container was
            # waited for above), so an unexpected page layout stays visible.
            jobs_esco_container = soup.find(id="jv-job-categories-codes")
            jobs_esco_list = jobs_esco_container.find_all(class_="ecl-u-ml-2xs ng-star-inserted")
            ESCO_JOBS = [job.text.replace("  -", "").strip() for job in jobs_esco_list]

            ad.update({"parsed_title": jobtitle, "ESCOJOB": ESCO_JOBS, "description": description})
            results.append(ad)

            # Checkpoint every 50 parsed ads.
            if len(results) % 50 == 0:
                with open(filename, "w", encoding="utf-8") as fp:
                    json.dump(results, fp, indent=2, ensure_ascii=False)

        with open(filename, "w", encoding="utf-8") as fp:
            json.dump(results, fp, indent=2, ensure_ascii=False)
    finally:
        # Release the browser even when parsing or writing fails.
        driver.quit()
    return results

In [None]:
def parse_multithreading_eres(df, urls_parsed, headless=True, n_workers=4):
    """Parse all not-yet-parsed ads of ``df`` with several browsers in parallel.

    Args:
        df: DataFrame with a ``"url"`` column; one row per ad.
        urls_parsed: URLs that are already parsed and must be skipped.
        headless: Passed on to ``driver_setup`` for every browser.
        n_workers: Number of browsers / threads / result files (default 4).

    Returns:
        The combined list of parsed ads from all workers. The same list is
        written to ``<timestamp>_eures_TESTADS_total.json``.
    """
    currently = "".join(c for c in str(datetime.now()).split(".")[0] if c.isdigit())
    # NOTE(review): other cells in this notebook read from "../00_data/..." —
    # confirm this relative path matches the working directory in use.
    out_dir = "00_data/03_output/eures_results"
    filenames = [
        f"{out_dir}/{currently}_eures_TESTADS{i}.json" for i in range(1, n_workers + 1)
    ]

    print("Sorting out parsed ads.")
    urllist = df[~df["url"].isin(urls_parsed)].to_dict("records")
    print(f"Parsing {len(urllist)} ads.")
    chunks = np.array_split(urllist, n_workers)

    # Start the browsers only after the cheap preparation succeeded, so a
    # failure above (e.g. a missing "url" column) cannot leave them running.
    drivers = [driver_setup(headless) for _ in range(n_workers)]
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        bucket = executor.map(get_jobdescription_eres, chunks, drivers, filenames)
        results = [item for block in bucket for item in block]

    write_json(f"{out_dir}/{currently}_eures_TESTADS_total.json", results)
    return results

In [None]:
def choose_random_ads(df, k):
    """Pick at most ``k`` ads per ESCO id.

    Args:
        df: DataFrame with an ``"esco_id"`` column.
        k: Maximum number of ads to keep per id.

    Returns:
        A list of row dicts. Ids are visited in order of first appearance;
        groups with at most ``k`` rows are kept whole, larger ones are reduced
        with ``random.sample`` (so the module-level random seed applies).
    """
    random_choices = []
    # One groupby pass instead of re-filtering the whole frame for every id;
    # sort=False keeps the order of first appearance.
    for _, group in tqdm(df.groupby("esco_id", sort=False)):
        records = group.to_dict("records")
        if len(records) <= k:
            random_choices += records
        else:
            random_choices += random.sample(records, k)
    return random_choices

# Main

## load overview

In [None]:
# Load the EURES overview and print the row count after each cleaning step.
overview = pd.DataFrame(load_json("../00_data/EURES/eures_overview_total.json"))
print(len(overview))
# Remove ads that repeat the same title/url pair.
overview = overview.drop_duplicates(subset=["title", "url"])
print(len(overview))
# Keep only the ads that have a title.
overview = overview.dropna(subset=["title"])
print(len(overview))

## add esco ids

In [None]:
# Lookup table used by extend_jobs, which indexes it with lower-cased job labels.
esco_lookup_dict = load_json(r"../00_data/ESCO/esco_lookup.json")

In [None]:
def extend_jobs(ad):
    """Expand one ad into one record per ESCO job that has a usable id.

    Args:
        ad: Mapping-like ad (anything ``dict()`` accepts) with an
            ``"esco_jobs"`` entry holding the job labels.

    Returns:
        A list of copies of ``ad``, each extended with ``"esco_job"`` and
        ``"esco_id"``. Jobs that are missing from ``esco_lookup_dict`` (or are
        not strings) and ids without a ``"."`` are skipped.
    """
    extended = []
    for job in ad["esco_jobs"]:
        try:
            esco_id = esco_lookup_dict[job.lower()]
        except (KeyError, AttributeError):
            # Unknown label or non-string entry: no id, nothing to emit.
            continue
        if "." in esco_id:
            job_ext = dict(ad)
            job_ext["esco_job"] = job
            job_ext["esco_id"] = esco_id
            extended.append(job_ext)
    return extended

In [None]:
# Build one row per (ad, ESCO job) pair, then keep only rows that have an ESCO id.
ads_extended = pd.DataFrame(
    flatten_list(overview.progress_apply(extend_jobs, axis=1))
)
print(len(ads_extended))
ads_extended = ads_extended.dropna(subset=["esco_id"])
print(len(ads_extended))

In [None]:
# Distinct ESCO ids among the extended ads, taken before parsed ads and dead links are removed.
unique_ids_unparsed = list(ads_extended["esco_id"].unique())

## load parsed ads

In [None]:
# Load the ads already parsed for the embedding dataset and the known dead
# links, so that both can be excluded from the ads still to be parsed.
# NOTE(review): deadlinks is later passed to Series.isin on the "url" column,
# so it presumably holds URL strings, while get_jobdescription_eres dumps
# whole ad dicts to its "_deadlinks" file — verify the format of this file.
parsed_ads = pd.DataFrame(load_json("../00_data/EURES/0_pars_short_ads_final"))
deadlinks = load_json("../00_data/EURES/deadlinks_final.json")

In [None]:
# Distinct URLs of the ads that were already parsed.
parsed_urls = list(parsed_ads["url"].unique())

In [None]:
# Drop every ad whose URL was already parsed or is a known dead link.
print(len(ads_extended))
already_parsed = ads_extended["url"].isin(parsed_urls)
known_dead = ads_extended["url"].isin(deadlinks)
unparsed_ads_filtered = ads_extended[~(already_parsed | known_dead)]
len(unparsed_ads_filtered)

In [None]:
# Number of distinct ESCO ids left after removing parsed ads and dead links.
len(unparsed_ads_filtered["esco_id"].unique())

In [None]:
# Count how many ESCO jobs each remaining ad lists. assign() returns a new
# frame instead of writing a column into the filtered slice, which avoids
# pandas' SettingWithCopyWarning.
unparsed_ads_filtered = unparsed_ads_filtered.assign(
    no_esco_jobs=unparsed_ads_filtered["esco_jobs"].apply(len)
)

## RESULTS

In [5]:
from _util import *
import pandas as pd

In [6]:
# Load the final parsed test ads and show how many there are.
parsed_test_ads = pd.DataFrame(load_json("../00_data/EURES/eures_testads_final.json"))
len(parsed_test_ads)

2250