# Imports

In [8]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/TRAINING

In [9]:
# %%capture
# !pip install sentence_transformers

In [1]:
from sentence_transformers import SentenceTransformer, InputExample,evaluation,models, losses, util
from torch.utils.data import DataLoader
from tqdm import tqdm
import random
import numpy as np
import json
import pandas as pd
import nltk
nltk.download('punkt')
from collections import Counter
import random
random.seed(42)
from helpers import *
import os
from datetime import datetime
import torch

[nltk_data] Downloading package punkt to /home/roj14702/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Helpers

In [2]:
# Adapted from the sentence-transformers (SBERT) library: the original function has a bug and the fix has not been merged upstream. Fix: https://github.com/UKPLab/sentence-transformers/commit/d8982c9f0d44f8a3c41579fa64c603eca029649b
def community_detection(embeddings, threshold=0.75, min_community_size=10, batch_size=1024):
    """
    Fast community detection on a tensor of embeddings (one embedding per row).

    A row seeds a candidate community when its ``min_community_size``-th highest
    cosine similarity (computed against all rows, the row itself included) is at
    least ``threshold``. The candidate collects the seed's most similar rows in
    decreasing similarity until a score drops below ``threshold``.

    Candidates are then processed from largest to smallest: indices already
    claimed by an earlier community are removed, and what remains is kept only
    if it still has at least ``min_community_size`` members.

    Args:
        embeddings: torch tensor of embeddings; its ``device`` is reused for the threshold.
        threshold: minimum cosine similarity for two rows to count as close.
        min_community_size: minimum number of members of a community
            (capped at the number of embeddings).
        batch_size: number of rows whose similarity scores are computed at once.

    Returns:
        A list of communities ordered by decreasing size. Each community is a
        list of row indices in ascending order; no index occurs in more than
        one community.
    """
    n_embeddings = len(embeddings)
    threshold = torch.tensor(threshold, device=embeddings.device)

    # A community can never be larger than the data set itself.
    min_community_size = min(min_community_size, n_embeddings)
    # Initial number of neighbours inspected per seed; doubled on demand below
    # and deliberately carried over from one seed to the next.
    sort_max_size = min(max(2 * min_community_size, 50), n_embeddings)

    # Step 1) Collect one candidate community per qualifying seed row.
    candidates = []
    for batch_start in range(0, n_embeddings, batch_size):
        cos_scores = util.cos_sim(embeddings[batch_start:batch_start + batch_size], embeddings)
        top_k_values, _ = cos_scores.topk(k=min_community_size, largest=True)

        for row, row_top_k in enumerate(top_k_values):
            # The k-th best score must reach the threshold, otherwise the row
            # cannot seed a community of the required size.
            if not (row_top_k[-1] >= threshold):
                continue

            top_val_large, top_idx_large = cos_scores[row].topk(k=sort_max_size, largest=True)

            # Widen the window while even its last entry is still above the threshold.
            while top_val_large[-1] > threshold and sort_max_size < n_embeddings:
                sort_max_size = min(2 * sort_max_size, n_embeddings)
                top_val_large, top_idx_large = cos_scores[row].topk(k=sort_max_size, largest=True)

            members = []
            for member_idx, score in zip(top_idx_large.tolist(), top_val_large):
                if score < threshold:
                    break
                members.append(member_idx)
            candidates.append(members)

        del cos_scores

    # Largest candidate first, so bigger communities claim their members first.
    candidates.sort(key=len, reverse=True)

    # Step 2) Remove overlapping communities.
    unique_communities = []
    claimed = set()
    for candidate in candidates:
        remaining = [idx for idx in sorted(candidate) if idx not in claimed]
        if len(remaining) >= min_community_size:
            unique_communities.append(remaining)
            claimed.update(remaining)

    unique_communities.sort(key=len, reverse=True)

    return unique_communities

In [3]:
def filter_communities_total(df,columnname_to_encode, model, min_community_size=10, threshold=0.75):
    """
    Embed one text column and label, per ``esco_id``, which community each row belongs to.

    Every distinct text in ``df[columnname_to_encode]`` is encoded once with
    ``model.encode``. The rows are then grouped by ``esco_id`` and
    ``community_detection`` is run on the embeddings of each group.

    The caller's ``df`` is not modified; all new columns live on the returned frame.

    Args:
        df: DataFrame with an ``esco_id`` column and the column named by
            ``columnname_to_encode``.
        columnname_to_encode: name of the text column to embed.
        model: object exposing ``encode(texts, show_progress_bar=..., convert_to_tensor=...,
            batch_size=...)`` whose result supports ``.tolist()``.
        min_community_size: forwarded to ``community_detection``; groups with
            fewer rows than this are not clustered.
        threshold: cosine-similarity threshold forwarded to ``community_detection``.

    Returns:
        A new DataFrame with the rows of ``df`` ordered by first appearance of
        their ``esco_id`` (rows whose ``esco_id`` is missing are not included),
        plus two columns:

        * ``embeddings`` - the embedding of the row's text as a list of floats.
        * ``community`` - ``0`` for every row of a group that is too small or in
          which no community was found; otherwise the position of the row's
          community in the result of ``community_detection`` (``0`` is the
          largest). Rows outside every community keep the initial ``None``
          (NOTE(review): pandas may show this as NaN after the frame is rebuilt).
    """
    # Encode each distinct text only once. dict.fromkeys keeps first-seen order,
    # so the input order handed to the model is reproducible (a set is not).
    unique_texts = list(dict.fromkeys(df[columnname_to_encode]))
    print(f"{len(unique_texts)} ads to encode.")
    embeddings = model.encode(unique_texts,show_progress_bar=True,convert_to_tensor=True, batch_size=64)
    print("Finished encoding.")
    embedding_map = dict(zip(unique_texts, embeddings.tolist()))

    # assign() builds a new frame, leaving the caller's DataFrame untouched.
    labelled = df.assign(
        embeddings=df[columnname_to_encode].map(embedding_map),
        community=None,
    )

    print("Adding Communities.")
    records = []
    # A single groupby pass replaces one full boolean scan of the frame per id.
    # sort=False keeps the groups in order of first appearance.
    for _, job_ads in tqdm(labelled.groupby("esco_id", sort=False)):
        # Positional index: community_detection returns row positions, used with .loc below.
        job_ads = job_ads.reset_index(drop=True)
        if len(job_ads) < min_community_size:
            job_ads["community"] = 0
        else:
            vectors = torch.stack([torch.FloatTensor(x) for x in job_ads["embeddings"]])
            communities = community_detection(vectors,threshold=threshold,min_community_size=min_community_size)
            if communities:
                for index, community in enumerate(communities):
                    job_ads.loc[community, "community"] = index
            else:
                job_ads["community"] = 0
        records += job_ads.to_dict("records")

    return pd.DataFrame(records)

# Model Loading

# Load Data

In [4]:
# Read the ads JSON into a DataFrame and show which columns it provides.
ads_df = pd.DataFrame(data=load_json(r"../00_data/EURES/0_pars_short_ads_final.json"))
ads_df.columns

Index(['searched_esco_job', 'title', 'url', 'esco_jobs', 'publication_date',
       'esco_job', 'esco_id', 'description', 'has_alpha', 'length', 'count',
       'short_texts'],
      dtype='object')

In [5]:
def concat_short(ad):
  """Return the ad's ``title`` immediately followed by its ``short_texts`` (no separator is inserted)."""
  title, short_texts = ad["title"], ad["short_texts"]
  return title + short_texts

In [6]:
# Row-wise apply: every ad gets its concatenated title and short texts in "short+title".
ads_df["short+title"] = ads_df.apply(concat_short, axis="columns")

In [7]:
# Number of distinct ESCO ids present in the ads.
ads_df["esco_id"].unique().size

1700

In [8]:
# Look-up table from ESCO job id ("jobid_esco") to job title ("jobtitle").
jobs = load_json("../00_data/ESCO/ESCO_JOBS_ALL.json")
esco_lookup = {job["jobid_esco"]: job["jobtitle"] for job in jobs}


# Encoding

In [9]:

# Model directories processed by the loops below. Every entry ends with "/"
# because the loops build file names by appending directly to it
# (e.g. f"{model_path}embeddings.pkl").
# Commented-out entries are other model variants that are currently disabled.
# NOTE(review): the disabled entries lack the "models/" sub-directory that the
# active entries use - verify the location before re-enabling any of them.
paths = [
    #"../00_data/SBERT_Models/gbert_batch16_wTSDAE_2e-05_f10/",
    #"../00_data/SBERT_Models/gbert_batch32_wTSDAE_2e-05_f10/",
    #"../00_data/SBERT_Models/gbert_batch64_wTSDAE_2e-05_f10/",
    #"../00_data/SBERT_Models/gbert_batch16_woTSDAE_2e-05_f10/",
    "../00_data/SBERT_Models/models/gbert_batch32_woTSDAE_2e-05_f10/",
    #"../00_data/SBERT_Models/gbert_batch64_woTSDAE_2e-05_f10/",
    #"../00_data/SBERT_Models/jobgbert_batch16_wTSDAE_2e-05_f10/",
    #"../00_data/SBERT_Models/jobgbert_batch32_wTSDAE_2e-05_f10/",
    #"../00_data/SBERT_Models/jobgbert_batch64_wTSDAE_2e-05_f10/",
    #"../00_data/SBERT_Models/jobgbert_batch16_woTSDAE_2e-05_f10/",
    "../00_data/SBERT_Models/models/jobgbert_batch32_woTSDAE_2e-05_f10/",
    #"../00_data/SBERT_Models/jobgbert_batch64_woTSDAE_2e-05_f10/", 
    "../00_data/SBERT_Models/models/consultantbert_multilingual_best/"
        ]

In [10]:
# Load each selected model, run the encode_jobs helper on it and pickle the
# result as embeddings.pkl inside the model directory.
# NOTE(review): encode_jobs comes from helpers; judging by the keys printed in a
# later cell it returns a dict with 'skillsets', 'desc' and 'jobtitle' - confirm.
for model_path in paths:
    model = SentenceTransformer(model_path)
    embeddings = encode_jobs(model)
    write_pickle(f"{model_path}embeddings.pkl", embeddings)

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Pickle saved.


Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Pickle saved.


Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

No sentence-transformers model found with name ../00_data/SBERT_Models/models/openai-ada/. Creating a new one with mean pooling.


Pickle saved.


ValueError: Unrecognized model in ../00_data/SBERT_Models/models/openai-ada/. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, graphormer, grounding-dino, groupvit, hiera, hubert, ibert, idefics, idefics2, idefics3, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, 
phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, time_series_transformer, timesformer, timm_backbone, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zoedepth

In [24]:
# For every model: embed the ads, label their communities and store two
# centroids per ESCO id next to the model:
#   * centroids_unfiltered.pkl - mean embedding of all ads of the id
#   * centroids_filtered.pkl   - mean embedding of the ads labelled community 0
for model_path in paths:
    print(f"Creating Encodings for {model_path}")
    model = SentenceTransformer(model_path)
    df_new = filter_communities_total(ads_df,"short+title", model, min_community_size=100,threshold=0.5)
    unfiltered_centroids_dict = {}
    filtered_centroids_dict = {}
    print("Creating Centroids.")
    # One groupby pass instead of two full boolean scans of df_new per id.
    # sort=False keeps the ids in order of first appearance, so the key order of
    # the pickled dicts is reproducible (iterating a set is not).
    for esco_id, job_ads in tqdm(df_new.groupby("esco_id", sort=False)):
        # Unfiltered centroid: mean over every ad of this ESCO id.
        unfiltered_centroids_dict[esco_id] = np.stack(list(job_ads["embeddings"])).mean(axis=0, dtype="float32")

        # Filtered centroid: mean over the ads labelled community 0 only.
        core_ads = job_ads[job_ads["community"] == 0]
        filtered_centroids_dict[esco_id] = np.stack(list(core_ads["embeddings"])).mean(axis=0, dtype="float32")

    write_pickle(f"{model_path}centroids_unfiltered.pkl",unfiltered_centroids_dict)
    write_pickle(f"{model_path}centroids_filtered.pkl",filtered_centroids_dict)

Creating Encodings for ../00_data/SBERT_Models/models/gbert_batch32_woTSDAE_2e-05_f10/
84273 ads to encode.


Batches:   0%|          | 0/1317 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [20]:
# Merge the ad centroids into each model's embeddings.pkl and derive a combined
# "job_centroid" entry (one averaged vector per ESCO id).
for model_path in paths:
    embeddings = load_pickle(f"{model_path}embeddings.pkl")
    print(embeddings.keys())

    # Store both centroid pickles as {"jobtitle", "esco_id", "embeddings"} lists
    # under "adcentroid_filtered" / "adcentroid_unfiltered".
    for centroid_path in ["centroids_filtered.pkl","centroids_unfiltered.pkl"]:
        centroids = load_pickle(f"{model_path}{centroid_path}")
        centroids_ids = list(centroids.keys())
        centroid_kind = "adcentroid_"+centroid_path.split("_")[1].split(".")[0]
        embeddings[centroid_kind] = {
            "jobtitle": [esco_lookup[x] for x in centroids_ids],
            "esco_id": centroids_ids,
            "embeddings": list(centroids.values()),
        }

    # Pool the 'desc' embeddings and the unfiltered ad centroids, one record per vector.
    job_centroid = [
        {"esco_id": esco_id, "jobtitle": jobtitle, "embeddings": vector}
        for k in ['desc', 'adcentroid_unfiltered']
        for esco_id, jobtitle, vector in zip(embeddings[k]["esco_id"], embeddings[k]["jobtitle"], embeddings[k]["embeddings"])
    ]
    centroid_df = pd.DataFrame(job_centroid)

    # Average all pooled vectors that share an ESCO id into one float32 vector;
    # the job title is taken from the first record of that id.
    job_embeddings_jobtitle, job_embeddings_esco_ids, job_embeddings = [], [], []
    for esco_id in centroid_df["esco_id"].unique():
        same_job = centroid_df[centroid_df["esco_id"] == esco_id]
        job_embeddings.append(np.stack(list(same_job["embeddings"])).mean(axis=0, dtype="float32"))
        job_embeddings_esco_ids.append(esco_id)
        job_embeddings_jobtitle.append(same_job["jobtitle"].iloc[0])

    embeddings["job_centroid"] = {
        "jobtitle": job_embeddings_jobtitle,
        "esco_id": job_embeddings_esco_ids,
        "embeddings": job_embeddings,
    }
    write_pickle(f"{model_path}embeddings.pkl", embeddings)

dict_keys(['skillsets', 'desc', 'jobtitle'])
Pickle saved.
