# Creation of Embedding Set for Job Advertisements
This notebook processes job advertisements scraped from the EURES portal to create an embedding dataset. It includes steps for cleaning, extending, and merging job data with ESCO job IDs. The final output is a structured dataset ready for embedding generation and further analysis.

In [4]:
# In this notebook, the job advertisements for the Embedding Set are created

# Imports
### This section imports all necessary libraries and modules.

In [None]:
# Import custom project utilities from the parent directory
import sys
sys.path.append('..')  # Adjust path to include parent directory
from _utils import load_json, flatten_list

In [None]:
# Import standard libraries and third-party modules
import random
from tqdm import tqdm
# Register tqdm's pandas integration; the later `progress_apply` call relies on it.
tqdm.pandas()
# Seed the stdlib RNG for reproducibility.
# NOTE(review): `random` is not used in any other visible cell — confirm the seed is still needed.
random.seed(42)

# Main
#### This section contains the main logic for processing job advertisements.

In [3]:
from _helpers_parsing import parse_multithreading_eures, extend_jobs

### load overview

In [None]:
# Import pandas for data manipulation
import pandas as pd

In [None]:
# Read the scraped EURES overview and report the row count after each cleaning step
overview = pd.DataFrame(load_json("../00_data/EURES/eures_overview_total.json"))
print(len(overview))
# Keep only the first occurrence of every (title, url) pair
overview = overview.drop_duplicates(subset=["title", "url"])
print(len(overview))
# Discard entries that have no title
overview = overview.loc[overview["title"].notna()]
print(len(overview))
overview.head(2)





## Add ESCO IDs
### Extend job advertisements with ESCO job IDs.

In [None]:
# Load ESCO lookup dictionary
# NOTE(review): `esco_lookup_dict` is not referenced by any other visible cell — confirm it is still needed.
esco_lookup_dict = load_json("../00_data/ESCO/esco_lookup.json")

In [None]:
# Apply `extend_jobs` to every overview row (with a progress bar), pass the
# results through `flatten_list`, and collect them in a single DataFrame
extended_records = flatten_list(overview.progress_apply(extend_jobs, axis=1))
ads_extended = pd.DataFrame(extended_records)
print(len(ads_extended))
# Keep only rows that carry an ESCO ID
ads_extended = ads_extended.loc[ads_extended["esco_id"].notna()]
print(len(ads_extended))







In [None]:
# Distinct ESCO IDs found in the extended overview, before any description is attached
unique_ids_unparsed = list(pd.unique(ads_extended["esco_id"]))
len(unique_ids_unparsed)



## Load Parsed Ads
### Load already parsed job advertisements.

In [None]:
# Load the previously parsed job advertisements
parsed_jobads = pd.DataFrame(load_json("../00_data/EURES/parsed_ads_final.json"))
# Only a cell's last expression is displayed, so this count has to be printed explicitly
print(len(parsed_jobads))
# URL/description lookup with exact duplicate pairs removed.
# NOTE(review): a URL that appears with several distinct descriptions keeps all of
# them, which multiplies rows in the later merge on "url" — confirm that is intended.
desc_lookup = parsed_jobads[["url", "description"]].drop_duplicates(["url", "description"])
len(desc_lookup)



In [None]:
# Preview the first rows of the parsed job advertisements
parsed_jobads.head()



### Quality Check of Descriptions


In [None]:
# Check if description contains alphabetic characters
def alphabetic_char(description):
    """Return True if ``description`` contains at least one alphabetic character.

    Args:
        description: The description text to inspect. Values that cannot be
            iterated (for example ``None`` or a float ``NaN`` standing in for
            a missing description) are treated as containing no letters.

    Returns:
        bool: True when any character satisfies ``str.isalpha``, otherwise False.
    """
    try:
        characters = iter(description)
    except TypeError:
        # Missing values are not iterable; report "no letters" instead of raising.
        return False
    return any(char.isalpha() for char in characters)

In [None]:
# Annotate every description with simple quality indicators:
# whether it contains any letter, and how many characters it has
desc_lookup["has_alpha"] = desc_lookup["description"].map(alphabetic_char)
desc_lookup["length"] = desc_lookup["description"].map(len)
# Spot-check one of the shortest descriptions (sixth row when ordered by length)
desc_lookup.sort_values(by="length")["description"].iloc[5]



#### Merging Overview with Already Parsed Ads

In [None]:
# Attach the parsed descriptions to the extended ads; the left join keeps every ad
parsed_jobads = ads_extended.merge(desc_lookup, how="left", on="url")
print(len(parsed_jobads))
# Ads that ended up without a description (e.g. parsing errors) are dropped
parsed_jobads = parsed_jobads.dropna(subset=["description"])
print(len(parsed_jobads))
# Ads without an ESCO ID are dropped as well
parsed_jobads = parsed_jobads.dropna(subset=["esco_id"])
print(len(parsed_jobads))
# Distinct ESCO IDs that have at least one ad with a description
unique_ids_parsed = list(parsed_jobads["esco_id"].unique())
print(f"({len(unique_ids_parsed)})")



In [None]:
# Check for missing ESCO IDs
if len(unique_ids_unparsed) != len(unique_ids_parsed):
    print(f"{len(unique_ids_unparsed)}/{len(unique_ids_parsed)}")
# ESCO IDs that occur in the overview but have no parsed ad at all.
set_missing_ids = set(unique_ids_unparsed) - set(unique_ids_parsed)
# Build the frame unconditionally so `missing_ids` is always defined: it is
# displayed below and concatenated onto `in_need` in a later cell, both of which
# would raise NameError if it only existed when the ID counts differ.
# Each missing ID has zero parsed ads so far and needs the full 100.
missing_ids = pd.DataFrame(
    [{"esco_id": esco_id, "count": 0, "need": 100} for esco_id in set_missing_ids],
    columns=["esco_id", "count", "need"],
).astype({"count": "int64", "need": "int64"})  # keep integer dtypes even when empty
missing_ids





In [None]:
# Count the parsed ads per ESCO ID and attach that count to every ad
counts = (
    parsed_jobads["esco_id"]
    .value_counts()
    .rename_axis("esco_id")
    .reset_index(name="count")
)
# Drop a "count" column left over from an earlier run of this cell before merging;
# otherwise a re-run would produce "count_x"/"count_y" instead of a single "count".
# NOTE(review): assumes the ads carry no unrelated "count" column of their own — confirm.
parsed_jobads = parsed_jobads.drop(columns="count", errors="ignore").merge(counts, on="esco_id")

#### Create Overview of Job Ads Which Are Still Missing in the Dataset

In [None]:
# ESCO IDs with fewer than 100 parsed ads, together with how many more each one needs
in_need = counts.loc[counts["count"] < 100].assign(need=lambda frame: 100 - frame["count"])
print(len(in_need))
# Append the ESCO IDs that have no parsed ads at all
in_need = pd.concat([in_need, missing_ids])
print(len(in_need))



In [None]:
# URLs of the ads that already have a description in the merged data
parsed_urls = set(parsed_jobads["url"])
# URLs listed in the dead-links file
deadlinks = set(load_json("../00_data/EURES/deadlinks_final.json"))
len(deadlinks)



## Filter Overview to Unparsed URLs

In [None]:
# Narrow the extended ads down to URLs that still have to be parsed,
# printing the remaining row count after each exclusion
print(len(ads_extended))
# Exclude URLs that already have a parsed description
unparsed_url_df = ads_extended.loc[lambda ads: ~ads["url"].isin(parsed_urls)]
print(len(unparsed_url_df))
# Exclude URLs listed as dead links
unparsed_url_df = unparsed_url_df.loc[lambda ads: ~ads["url"].isin(deadlinks)]
print(len(unparsed_url_df))



### Create a DataFrame of URLs to parse based on needs.

In [None]:
# For every ESCO ID that is short of ads, queue at most `need` unparsed ads for it
jobads_to_parse = []
for requirement in tqdm(in_need.to_dict("records")):
    matches_id = unparsed_url_df["esco_id"] == requirement["esco_id"]
    candidates = unparsed_url_df.loc[matches_id].iloc[: requirement["need"]]
    # Extending with an empty list is a no-op, so no emptiness check is needed
    jobads_to_parse.extend(candidates.to_dict("records"))



In [None]:
# Collect the queued ad records into a DataFrame, one row per ad still to be parsed
to_parse_df = pd.DataFrame(jobads_to_parse)
to_parse_df

In [None]:
# Parse job advertisements using multithreading
# NOTE(review): the meaning of the empty-list argument and of `headless=False`
# (presumably visible browser windows) is defined in `_helpers_parsing` — confirm.
# NOTE(review): `results` is not used by any later visible cell.
results = parse_multithreading_eures(to_parse_df,[], headless=False)

# Results

In [None]:
# Reload the parsed-ads file and show how many entries it contains
parsed_ads = load_json("../00_data/EURES/parsed_ads_final.json")
len(parsed_ads)

