In [None]:
# Import necessary libraries and modules
import sys
sys.path.append("..")  # Add parent directory to the path
from _utils import load_json
import pandas as pd
import random
import srsly
random.seed(42)  # Set random seed for reproducibility
from tqdm import tqdm

In [None]:
# Define helper functions
def get_combined_text(job_ad):
    """
    Build the annotation text for one job advertisement.

    The parsed title is cut off at the first " bei " separator (everything
    from the separator onwards is dropped) and the remaining title is joined
    with the description by a single space.

    Args:
        job_ad (dict): Mapping-like job advertisement record providing the
            string fields "title_parsed" and "description".

    Returns:
        str: The truncated title, one space, then the description.
    """
    # partition() yields the text before the first separator, or the whole
    # title when the separator does not occur.
    title_head, _, _ = job_ad["title_parsed"].partition(" bei ")
    description = job_ad["description"]
    return " ".join((title_head, description))

def initialize_empty_label(job_ad):
    """
    Produce the initial (empty) label container for a job advertisement.

    The argument is accepted so the function can be used as a row-wise
    callback, but it is not inspected.

    Args:
        job_ad (dict): A job advertisement record (unused).

    Returns:
        list: A newly created empty list; every call returns a distinct object.
    """
    fresh_label = list()
    return fresh_label

In [None]:
def get_shortened_esco_id(esco_id, length):
    """
    Return the leading part of an ESCO ID.

    Args:
        esco_id (str): The full ESCO ID.
        length (int): Number of leading characters to keep.

    Returns:
        str: The prefix of ``esco_id`` selected by ``esco_id[:length]``; IDs
            shorter than ``length`` are returned unchanged.
    """
    prefix_window = slice(length)  # same as the slice [:length]
    return esco_id[prefix_window]

In [None]:
# Read the classifier-training job advertisements from JSON into a DataFrame
job_ads_df = pd.DataFrame(
    data=load_json("../00_data/jobads_for_classifiertraining.json"),
)

In [None]:
# Prepare the job advertisements for annotation: build the text field, attach
# empty labels, and draw a small random sample per shortened ESCO ID.
ESCO_SHORT_ID_LENGTH = 4  # leading characters of the ESCO ID used as grouping key
MAX_ADS_PER_ESCO_ID = 5  # upper bound on sampled ads per shortened ESCO ID

# Work on a copy so the loaded frame stays untouched and this cell can be re-run
job_ads_df_copy = job_ads_df.copy()

# Combine job title and description into a single text field
job_ads_df_copy["text"] = job_ads_df_copy.apply(get_combined_text, axis=1)

# Initialize empty labels for each job advertisement
job_ads_df_copy["label"] = job_ads_df_copy.apply(initialize_empty_label, axis=1)

# Shorten ESCO IDs to ESCO_SHORT_ID_LENGTH characters for grouping
job_ads_df_copy["esco_short_id"] = job_ads_df_copy["esco_id"].apply(
    get_shortened_esco_id, args=(ESCO_SHORT_ID_LENGTH,)
)

# Group job advertisements by shortened ESCO ID and count occurrences.
# size() only counts rows per group; the previous sum() aggregated every
# column (concatenating all texts and label lists) just to list the keys and
# fails on columns that cannot be summed.
ads_by_esco_id = job_ads_df_copy.groupby("esco_short_id")
grouped_ads_df = ads_by_esco_id.size().reset_index(name="n_ads")
esco_ids = list(grouped_ads_df["esco_short_id"])  # sorted group keys

# Select up to MAX_ADS_PER_ESCO_ID random job advertisements per ESCO ID.
# Groups are visited in sorted key order and rows keep their original order,
# so the seeded random.sample draws stay reproducible.
selected_ads = []
for esco_id in tqdm(esco_ids):
    # get_group reuses the grouping instead of re-scanning the whole frame
    candidate_ads = ads_by_esco_id.get_group(esco_id)[["text", "label"]].to_dict("records")
    if len(candidate_ads) <= MAX_ADS_PER_ESCO_ID:
        selected_ads += candidate_ads
    else:
        selected_ads += random.sample(candidate_ads, MAX_ADS_PER_ESCO_ID)

100%|██████████| 256/256 [00:02<00:00, 108.10it/s]


In [None]:
# Write the sampled job advertisements, one JSON record per line, to the
# JSONL file that serves as input for annotation
srsly.write_jsonl(path="jsonl_annot_raw.jsonl", lines=selected_ads)

# Further Processing
This section describes how the exported JSONL file was processed further.

1. The JSONL file was further processed with [doccano](https://github.com/doccano/doccano).
2. A project for Named Entity Recognition (NER) was created.
3. The relevant paragraphs were marked as "relevant".