# Imports

In [None]:
from _util import *
import pandas as pd
import numpy as np
tqdm.pandas()
from sentence_transformers import util
import os
from concurrent.futures import ThreadPoolExecutor

In [None]:
from nltk import word_tokenize
import math
from datetime import datetime

In [None]:
import random
random.seed(42)

In [None]:
def multithread_shortening(ads, n_workers=4):
    """Shorten job ads in parallel threads and store the combined result.

    ``ads`` is split into ``n_workers`` chunks. Every chunk is handed to
    ``shorten_jobads`` together with its own classifier (created by
    ``setup_classifier``) and its own checkpoint file, so no two calls share
    a classifier or an output path.

    Args:
        ads: Sequence of job-ad texts.
        n_workers: Number of chunks / threads / classifiers to use.
            Defaults to 4, the value that used to be hard-coded.

    Returns:
        The flattened list of per-ad results, in the order of ``ads``
        (chunk order is preserved by ``executor.map``).

    Raises:
        ValueError: If ``n_workers`` is smaller than 1.
    """
    if n_workers < 1:
        raise ValueError("n_workers must be at least 1")

    pipes = [setup_classifier() for _ in range(n_workers)]
    chunks = np.array_split(ads, n_workers)
    # Digits-only timestamp with second resolution, e.g. 20240102030405.
    currently = datetime.now().strftime("%Y%m%d%H%M%S")
    filenames = [
        f"../00_data/EURES/{currently}_eures_ads_shortened{i}.json"
        for i in range(1, n_workers + 1)
    ]
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        bucket = executor.map(shorten_jobads, chunks, pipes, filenames)
        # Consuming the iterator here waits for all workers and re-raises
        # any exception that occurred inside one of them.
        results = [item for block in bucket for item in block]
    with open(f"../00_data/EURES/{currently}_total_ads_shortend.json", 'w', encoding="utf-8") as fp:
        json.dump(results, fp, indent=2, ensure_ascii=False)
    return results

In [None]:
# Classifier application
def _dump_json(payload, filename):
    """Write ``payload`` to ``filename`` as indented UTF-8 JSON."""
    with open(filename, 'w', encoding="utf-8") as fp:
        json.dump(payload, fp, indent=2, ensure_ascii=False)


def shorten_jobads(ads, pipe, filename):
    """Reduce every job ad to the paragraphs classified as ``LABEL_1``.

    Each ad is split into paragraphs at newlines. If that yields exactly one
    paragraph, the ad is tokenized instead and cut into near-equal chunks of
    at most 50 tokens. Every paragraph is classified with ``pipe`` and the
    paragraphs labelled ``LABEL_1`` are joined with single spaces.

    The accumulated results are written to ``filename`` after every 50th ad
    and once more when all ads are done.

    Args:
        ads: Iterable of job-ad texts (strings).
        pipe: Callable classifier; ``pipe(text)[0]["label"]`` must yield the
            predicted label.
        filename: Path of the JSON file receiving checkpoints and the final
            result.

    Returns:
        A list with one ``{ad: shortened_text}`` dict per input ad, in input
        order. Ads without any usable paragraph map to an empty string.
    """
    shortened_texts = []
    for ad in tqdm(ads):
        # Drop empty lines and stray " ," fragments. (The former condition
        # joined the two comparisons with `or`, which is true for every
        # string, so nothing was ever filtered out.)
        splitted_ad = [x for x in ad.split("\n") if x not in ("", " ,")]
        # If the ad has no usable line structure, tokenize it and split it
        # into chunks of at most 50 tokens.
        if len(splitted_ad) == 1:
            tokenized = word_tokenize(ad, language="german")
            no_chunks = math.ceil(len(tokenized) / 50)
            if no_chunks:
                token_chunks = np.array_split(tokenized, no_chunks)
                splitted_ad = [" ".join(chunk) for chunk in token_chunks]
            else:
                # No tokens at all; np.array_split would reject 0 sections.
                splitted_ad = []

        annots_jobad = []
        for paragraph in splitted_ad:
            try:
                res = pipe(paragraph)[0]["label"]
            except Exception:
                # Retry on the first 250 characters only -- presumably the
                # classifier rejected an over-long input (TODO confirm).
                res = pipe(paragraph[:250])[0]["label"]
            annots_jobad.append({"text": paragraph, "label": res})

        text_short = " ".join(
            x["text"] for x in annots_jobad if x["label"] == "LABEL_1"
        )
        shortened_texts.append({ad: text_short})
        # Save intermediate results to JSON after every 50th ad.
        if len(shortened_texts) % 50 == 0:
            _dump_json(shortened_texts, filename)
    # Save final results.
    _dump_json(shortened_texts, filename)
    return shortened_texts

In [None]:
def choose_random_ads(df, k):
    """Pick at most ``k`` random rows for every distinct ``esco_id``.

    Args:
        df: DataFrame containing an ``esco_id`` column.
        k: Maximum number of rows to keep per ``esco_id``.

    Returns:
        A list of row dicts (``DataFrame.to_dict("records")`` format).
        Groups appear in order of first occurrence in ``df``. A group with
        at most ``k`` rows is kept completely and in its original order;
        larger groups are reduced with ``random.sample``.
    """
    random_choices = []
    # A single groupby pass replaces re-filtering the whole frame once per
    # id. sort=False keeps the first-appearance order that unique() gave,
    # and groupby skips missing ids, which the former `== id` filter never
    # matched either.
    for _, group in tqdm(df.groupby("esco_id", sort=False)):
        records = group.to_dict("records")
        if len(records) > k:
            records = random.sample(records, k)
        random_choices.extend(records)
    return random_choices

# Load Data

In [None]:
# Load the parsed EURES ads from JSON (load_json is not defined in this
# notebook; presumably provided by `from _util import *`) and check how many
# entries were read.
ads = load_json(r"../00_data/EURES/parsed_ads_final.json")
len(ads)

In [None]:
# Tabulate the ads without their "count" column, then record for every ad
# how many entries its "esco_jobs" value holds.
ads_df = pd.DataFrame(ads).drop(columns=["count"])
ads_df["count_esco_jobs"] = ads_df["esco_jobs"].apply(len)

In [None]:
# Collect the distinct ad descriptions (order of first appearance) and check
# how many there are.
unique_descriptions = ads_df["description"].unique()
len(unique_descriptions)

In [None]:
# Keep only the descriptions that have not been processed yet.
# NOTE(review): processed_descriptions is not defined in the visible cells --
# confirm where it comes from. If it is a list, each `not in` test is a linear
# scan; converting it to a set once would make this filter much faster.
unique_descriptions = [x for x in tqdm(unique_descriptions) if x not in processed_descriptions]
len(unique_descriptions)

In [None]:
# Load the test ads and take their distinct descriptions as a plain list.
# Note that this rebinds unique_descriptions, replacing any value assigned to
# that name earlier in the notebook.
test_ads = load_json("../00_data/EURES/eures_testads_final.json")
selected_ads = pd.DataFrame(test_ads)
unique_descriptions = list(selected_ads["description"].unique())
len(unique_descriptions)

In [None]:
# Run the multithreaded shortening over the selected descriptions; the call
# also writes its results to JSON files under ../00_data/EURES/.
shortened_desc = multithread_shortening(unique_descriptions)

In [None]:
# Compare the number of processed ads with the number of loaded ads.
# NOTE(review): processed_ads is not defined in the visible cells --
# presumably the shortened results (e.g. shortened_desc or a reloaded
# checkpoint file); confirm. In a notebook only the last expression of a cell
# is displayed, so the first len() call produces no visible output.
len(processed_ads)
len(ads)

In [None]:
# Rebuild the DataFrame from the raw ads; this rebinds ads_df to a fresh
# frame created directly from `ads`.
ads_df = pd.DataFrame(ads)

In [None]:
# Merge all entries of processed_ads into one lookup table; when a key occurs
# more than once, the entry seen last wins.
replace_dict = dict()
for shortened_entry in processed_ads:
    replace_dict.update(shortened_entry)

In [None]:
# Look up every description in replace_dict and store the result in a new
# column; descriptions without an entry become NaN (Series.map semantics).
ads_df["short_texts"] = ads_df["description"].map(replace_dict)

In [None]:
# Persist the ads, now including the short_texts column, as a list of row
# dicts (write_json is not defined in this notebook; presumably provided by
# `from _util import *`).
write_json("../00_data/EURES/0_pars_short_ads_final.json", ads_df.to_dict("records"))