# Imports
This section imports all necessary libraries and modules for processing job advertisements.

In [None]:
# Import custom utilities and helper functions
import sys
sys.path.append('..')
from _utils import load_json, write_json
from _classification_helpers import setup_classification_pipeline

In [None]:
# Import standard libraries and third-party modules
from nltk import word_tokenize
import math
from datetime import datetime
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from sentence_transformers import util
from concurrent.futures import ThreadPoolExecutor
import os

In [None]:
# Seed Python's built-in `random` module so that code sampling through it
# (e.g. `random.sample` in `select_random_ads`) gives reproducible results.
# Note: this only affects the standard-library generator, not NumPy's.
import random
random.seed(42)

In [None]:
# Function to shorten job advertisements by extracting relevant paragraphs
def shorten_job_ads(job_ads, classification_pipeline, output_file):
    """
    Reduce each job advertisement to the parts the classifier labels "LABEL_1".

    Every ad is split on newlines, dropping blank lines. An ad that yields exactly one
    paragraph is instead tokenized with NLTK (German) and cut into pieces of at most
    50 tokens. Each piece is classified; if that raises, classification is retried on the
    first 250 characters of the piece. The pieces labeled "LABEL_1" are joined with
    single spaces to form the shortened ad.

    The accumulated results are written to ``output_file`` after every 50 processed ads
    and once more when all ads are done (each write replaces the file).

    Args:
        job_ads (list): Job advertisement texts to be processed.
        classification_pipeline (callable): Called with a text; the "label" entry of the
            first element of its result is used.
        output_file (str): Path of the JSON file the results are written to.

    Returns:
        list: One dictionary per ad with the keys "original" and "shortened".
    """
    results = []

    def save_snapshot():
        # Overwrite the output file with everything collected so far.
        with open(output_file, 'w', encoding="utf-8") as handle:
            json.dump(results, handle, indent=2, ensure_ascii=False)

    for ad_text in tqdm(job_ads, desc="Processing job ads"):
        # Non-blank lines act as paragraphs.
        segments = [line for line in ad_text.split("\n") if line.strip()]

        # A single-paragraph ad has no usable structure: fall back to token chunks.
        if len(segments) == 1:
            words = word_tokenize(ad_text, language="german")
            chunk_count = math.ceil(len(words) / 50)
            segments = [" ".join(part) for part in np.array_split(words, chunk_count)]

        relevant_segments = []
        for segment in segments:
            try:
                predicted = classification_pipeline(segment)[0]["label"]
            except Exception:
                # Retry on a truncated text (fallback for long paragraphs).
                predicted = classification_pipeline(segment[:250])[0]["label"]
            if predicted == "LABEL_1":
                relevant_segments.append(segment)

        results.append({"original": ad_text, "shortened": " ".join(relevant_segments)})

        # Periodic checkpoint so a long run does not lose all progress.
        if len(results) % 50 == 0:
            save_snapshot()

    # Final write with the complete result list.
    save_snapshot()

    return results

In [None]:
# Function to shorten job advertisements using multithreading
def multithread_shortening(job_ads, num_workers=4, output_dir="../00_data/EURES"):
    """
    Shortens job advertisements using multiple threads for faster processing.

    The ads are split into ``num_workers`` contiguous, nearly equal chunks. Each chunk is
    handed to ``shorten_job_ads`` in its own thread, together with its own classification
    pipeline and its own intermediate output file. The per-chunk results are concatenated
    in input order and additionally written to one combined JSON file.

    Args:
        job_ads (iterable): Job advertisement texts to be processed.
        num_workers (int): Number of threads, pipelines and chunks. Defaults to 4.
        output_dir (str): Directory receiving the per-chunk files
            ``<timestamp>_shortened_ads_<i>.json`` and the combined file
            ``<timestamp>_shortened_ads_total.json``. Created if it does not exist.

    Returns:
        list: A list of dictionaries containing the original job ad and its shortened version.

    Raises:
        ValueError: If ``num_workers`` is smaller than 1.
    """
    if num_workers < 1:
        raise ValueError("num_workers must be at least 1")

    # Slice a plain list instead of calling np.array_split: turning a list of strings into
    # a NumPy array yields a fixed-width unicode array in which every element reserves room
    # for the longest ad, and the workers would receive np.str_ objects instead of str.
    job_ads = list(job_ads)
    base_size, remainder = divmod(len(job_ads), num_workers)
    job_ad_chunks = []
    start = 0
    for worker_index in range(num_workers):
        # The first `remainder` chunks take one extra item (same sizes as np.array_split).
        end = start + base_size + (1 if worker_index < remainder else 0)
        job_ad_chunks.append(job_ads[start:end])
        start = end

    # One pipeline per worker so the threads do not share a pipeline object.
    classification_pipelines = [setup_classification_pipeline() for _ in range(num_workers)]

    # Digits-only timestamp (YYYYMMDDHHMMSS) shared by all files of this run.
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

    # Create the target directory up front so workers do not fail at their first checkpoint.
    os.makedirs(output_dir, exist_ok=True)
    output_files = [
        f"{output_dir}/{timestamp}_shortened_ads_{i}.json" for i in range(1, num_workers + 1)
    ]

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # executor.map yields results in submission order, so the input order is preserved.
        results = executor.map(shorten_job_ads, job_ad_chunks, classification_pipelines, output_files)
        combined_results = [item for result in results for item in result]

    # Save the combined results to a single JSON file
    with open(f"{output_dir}/{timestamp}_shortened_ads_total.json", 'w', encoding="utf-8") as file:
        json.dump(combined_results, file, indent=2, ensure_ascii=False)

    return combined_results

In [None]:
# Function to select random job advertisements for testing
def select_random_ads(dataframe, max_per_esco_id):
    """
    Selects random job advertisements for each ESCO ID.

    For every distinct value in the "esco_id" column, all rows are kept when there are at
    most ``max_per_esco_id`` of them; otherwise ``max_per_esco_id`` rows are drawn without
    replacement using the module-level ``random`` generator (so results depend on its seed).

    Args:
        dataframe (pd.DataFrame): DataFrame containing job advertisements; must have an
            "esco_id" column.
        max_per_esco_id (int): Maximum number of job ads to select per ESCO ID.

    Returns:
        list: A list of randomly selected job advertisements, each as a record dictionary,
        grouped by ESCO ID in order of first appearance.
    """
    selected_ads = []

    # A single groupby pass replaces one full boolean-mask scan per ESCO ID.
    # sort=False keeps the groups in order of first appearance (like Series.unique()),
    # so the sequence of random.sample calls is unchanged.
    groups_by_esco_id = dataframe.groupby("esco_id", sort=False)

    for _, group in tqdm(groups_by_esco_id, desc="Selecting random ads"):
        records = group.to_dict("records")
        if len(records) > max_per_esco_id:
            records = random.sample(records, max_per_esco_id)
        selected_ads.extend(records)

    return selected_ads

# Load Data
This section loads the job advertisements data for processing.

In [None]:
# Load parsed job advertisements from JSON file using the project helper `load_json`
# (imported from `_utils`). The result is later passed to `pd.DataFrame`.
job_ads = load_json(r"../00_data/EURES/parsed_ads_final.json")
# Notebook display: size of the loaded collection
len(job_ads)

In [None]:
# Build the job-ad DataFrame without the "count" column, then record
# the length of each row's "esco_jobs" entry in "num_esco_jobs"
job_ads_df = pd.DataFrame(job_ads).drop(columns=["count"])
job_ads_df["num_esco_jobs"] = job_ads_df["esco_jobs"].apply(len)

In [None]:
# Get unique job descriptions: the distinct values of the "description" column
# (pandas `Series.unique` returns them in order of first appearance)
unique_descriptions = job_ads_df["description"].unique()
# Notebook display: number of distinct descriptions
len(unique_descriptions)

In [None]:
# Filter unique descriptions: drop repeated descriptions, keeping first-seen order.
# The collection must not be tested for membership against itself
# (`desc not in unique_descriptions`): every element is trivially contained, so that
# filter always yields an empty list and costs a linear scan per element.
# NOTE(review): if the intent was to exclude descriptions found in a *different*
# collection (e.g. already processed ads), build a set from that collection and filter
# against it instead — confirm.
unique_descriptions = list(dict.fromkeys(unique_descriptions))
len(unique_descriptions)

In [None]:
# Load test job advertisements via the project helper `load_json`
test_ads = load_json("../00_data/EURES/eures_testads_final.json")

# Convert test ads to DataFrame and get unique descriptions.
# Note: this rebinds `unique_descriptions`, replacing whatever an earlier cell stored
# under that name, so the shortening step below runs on the test ads.
test_ads_df = pd.DataFrame(test_ads)
unique_descriptions = list(test_ads_df["description"].unique())
# Notebook display: number of distinct test-ad descriptions
len(unique_descriptions)

In [None]:
# Shorten job descriptions using multithreading.
# `multithread_shortening` returns one {"original", "shortened"} dictionary per input
# text and, as a side effect, writes timestamped JSON files under ../00_data/EURES/.
shortened_descriptions = multithread_shortening(unique_descriptions)

In [None]:
# Add shortened descriptions to the DataFrame.
# `shortened_descriptions` holds one {"original", "shortened"} record per *unique*
# description, while `job_ads_df` has one row per ad. Assigning the list positionally
# raises on a length mismatch or silently misaligns rows, so each row is matched to its
# record through the description text instead. When every row has a distinct description
# and the lengths agree, this produces the same column as a positional assignment.
# NOTE(review): in the cell order shown, the shortened texts were computed from the test
# ads' descriptions; rows of `job_ads_df` without a matching description get None —
# confirm which DataFrame is meant to receive this column.
shortened_by_description = {record["original"]: record for record in shortened_descriptions}
job_ads_df["shortened_texts"] = [
    shortened_by_description.get(description) for description in job_ads_df["description"]
]

In [None]:
# Save the updated job advertisements with shortened descriptions to JSON.
# The DataFrame is converted to a list of per-row dictionaries and handed to the project
# helper `write_json` (imported from `_utils`), called here as (path, data).
write_json("../00_data/EURES/parsed_shortened_ads_final.json", job_ads_df.to_dict("records"))