In [None]:
# Mount Google Drive (if using Google Colab)
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/TRAINING

In [None]:
# Install necessary libraries (if using Google Colab)
# %%capture
# !pip install sentence_transformers
# !pip install pypdf
# !pip install xlsxwriter

In [None]:
import sys
sys.path.append("..")
from _utils import load_json,load_pickle
from training_helpers import encode_jobs

In [None]:
import pandas as pd
import os
from datetime import datetime
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import numpy as np
import xlsxwriter

# Functions

# Load Data

In [None]:
# Directory whose entries are listed as the available models further down.
# The first evaluation cell loads its local models from this same location.
prefix_path = "../00_data/SBERT_Models/models/"

In [None]:
# Read the EURES test ads and put them into a DataFrame. The cells below read
# the "short_texts" column (ad text) and the "esco_id" column (gold label).
testads_records = load_json(r"../00_data/EURES/eures_testads_final_short.json")
testads = pd.DataFrame(testads_records)

# Load Model

In [None]:
# Show which model folders exist under `prefix_path`.
# The loop variable is deliberately not called `model`: that name is bound to
# the loaded SentenceTransformer in the evaluation cells. Sorting makes the
# listing independent of the filesystem's directory order.
for model_dir in sorted(os.listdir(prefix_path)):
  print(f"Available model: {model_dir}")

In [None]:
# Models to evaluate; comment entries in or out to change the selection.
# "agne/jobGBERT" and "deepset/gbert-base" are passed to SentenceTransformer
# unchanged by the evaluation cells; every other entry is treated as a folder
# inside the local model directory. Local entries end with "/" because the
# evaluation cells append "embeddings.pkl" directly to the entry.
paths = [#"deepset/gbert-base",
          #"agne/jobGBERT",
          #"jobgbert_TSDAE_epochs5/",
          #"gbert_TSDAE_epochs5/",
          #"jobgbert_batch16_woTSDAE_2e-05_f10/",
          #"jobgbert_batch16_wTSDAE_2e-05_f10/",
          "jobgbert_batch32_woTSDAE_2e-05_f10/",
          #"jobgbert_batch32_wTSDAE_2e-05_f10/",
          #"jobgbert_batch64_woTSDAE_2e-05_f10/",
          #"jobgbert_batch64_wTSDAE_2e-05_f10/",
          #"gbert_batch16_woTSDAE_2e-05_f10/",
          #"gbert_batch16_wTSDAE_2e-05_f10/",
          "gbert_batch32_woTSDAE_2e-05_f10/",
         # "gbert_batch32_wTSDAE_2e-05_f10/",
          #"gbert_batch64_woTSDAE_2e-05_f10/",
          #"gbert_batch64_wTSDAE_2e-05_f10/",
          "consultantbert_multilingual_best/"
          ]
          

# Evaluate with Test Ads

In [None]:
# Evaluate every model in `paths` on the test ads.
# For each ad, all job embeddings are ranked by cosine similarity and the rank
# of the ad's own esco_id is turned into a reciprocal rank. Ranks beyond MRR_AT
# count as 0 and are reported as "missing".
MRR = []
MRR_AT = 100
# Timestamp (YYYYmmddHHMMSS) that gives this run its own result file.
currently = datetime.now().strftime("%Y%m%d%H%M%S")
# Model ids that are passed to SentenceTransformer as-is instead of being
# looked up in the local model directory.
hub_models = ["agne/jobGBERT", "deepset/gbert-base"]
# Create the output folder up front so a missing directory does not abort the
# run only after the expensive encoding step.
evaluation_dir = "../00_data/SBERT_Models/evaluation"
os.makedirs(evaluation_dir, exist_ok=True)

for model_path in paths:
  print(f"Loading Model {model_path}")
  if model_path in hub_models:
    model = SentenceTransformer(model_path)
    embeddings = encode_jobs(model)
  else:
    model = SentenceTransformer(f"{prefix_path}{model_path}")
    embeddings = load_pickle(f"{prefix_path}{model_path}embeddings.pkl")
  # Last path component, with or without a trailing slash. The previous
  # split("/")[-2] labelled hub ids such as "agne/jobGBERT" as "agne".
  model_name = model_path.rstrip("/").split("/")[-1]
  print(embeddings.keys())
  print("Creating Embeddings.")
  encodings_short = model.encode(list(testads["short_texts"]), show_progress_bar=True)
  testads["embeddings_short"] = encodings_short.tolist()

  print("Finished creating Embeddings. Evaluating.")

  # Only the short ad texts are evaluated. To compare the long descriptions,
  # encode testads["description"] into an "embeddings_long" column and add
  # that column name to this list.
  for textkind in ["embeddings_short"]:
    similarities = {}
    for k in ["job_centroid"]:
      similarities[k] = util.cos_sim(testads[textkind], embeddings[k]["embeddings"])

    for k in similarities:
      ranks = []
      missing = 0
      # util.cos_sim returns a torch tensor; reduce it in one call and store a
      # plain float so the result table holds numbers, not tensor objects.
      max_similarity = similarities[k].max().item()
      simdf = pd.DataFrame(similarities[k], columns=embeddings[k]["esco_id"], index=testads["esco_id"])
      for row_pos in tqdm(range(len(simdf))):
        ad_row = simdf.iloc[row_pos]
        true_id = ad_row.name  # gold esco_id of this ad (row label)
        ranked = ad_row.sort_values(ascending=False).reset_index()
        # Position of the gold id in the descending similarity ranking (1-based).
        rank = ranked[ranked["index"] == true_id].index.item() + 1
        if rank > MRR_AT:
          missing += 1
          ranks.append(0)
        else:
          ranks.append(1 / rank)
      missing = missing / len(simdf)
      MRR.append({"model": model_name, "textkind": textkind, "embedding_kind": k,
                  "MRR": np.mean(ranks), "missing": missing, "max_similarity": max_similarity, "MRR@": MRR_AT})
      # Re-display and re-save after every model so partial results survive an
      # interrupted run.
      df = pd.DataFrame(MRR).sort_values(by=["MRR"], ascending=[False]).reset_index(drop=True)
      display(df)
      df.to_excel(f"{evaluation_dir}/{currently}_evaluation.xlsx")

In [None]:
# df = pd.read_excel(f"/content/drive/MyDrive/TRAINING/_COSINESIM/20230403173019_evaluation.xlsx")

# Test with TFIDF 

In [None]:
def calculate_tfidf_similarity(inputtext):
  """Return the TF-IDF cosine similarities between input texts and the stored job matrix.

  Args:
    inputtext: A single string or an iterable of strings. A single string is
      wrapped into a one-element list before vectorizing.

  Returns:
    The result of ``cosine_similarity(inputtfidf, tfidf_matrix)``; presumably
    one row per input text and one column per row of the pickled TF-IDF
    matrix (TODO confirm against the pickled artefacts).
  """
  if isinstance(inputtext, str):
    inputtext = [inputtext]
  # The trailing separator is required: the file names are appended directly.
  # Without it the code looked for "../00_data/TF-IDF_Vectorizer1_vectorizer.pkl".
  path = "../00_data/TF-IDF_Vectorizer/"
  vectorizer = load_pickle(path + "1_vectorizer.pkl")
  tfidf_matrix = load_pickle(path + "1_tfidf_matrix.pkl")
  inputtfidf = vectorizer.transform(inputtext)
  # NOTE(review): `cosine_similarity` is not imported in any visible cell, so
  # this line raises NameError on a fresh kernel. It looks like
  # sklearn.metrics.pairwise.cosine_similarity is intended; confirm and add
  # the import to the import cell.
  cosine_sim = cosine_similarity(inputtfidf, tfidf_matrix)
  return cosine_sim

In [None]:
# Quick check that the TF-IDF similarity runs on the short ad texts; the
# resulting matrix is shown as the cell output.
short_ad_texts = testads["short_texts"]
calculate_tfidf_similarity(short_ad_texts)

In [None]:
# Evaluate every model in `paths` twice: with the plain SBERT cosine
# similarities ("_woTFIDF") and with a TF-IDF similarity added on top
# ("_wTFIDF"). Ranking and reciprocal-rank scoring follow the same scheme as
# the SBERT-only evaluation: ranks beyond MRR_AT count as 0 and as "missing".
MRR = []
MRR_AT = 100
# The TF-IDF term is rescaled so that its largest value equals this share of
# the largest SBERT similarity of the current model.
TFIDF_MAX_SHARE = 0.2
# Timestamp (YYYYmmddHHMMSS) that gives this run its own result file.
currently = datetime.now().strftime("%Y%m%d%H%M%S")
# Model ids that are passed to SentenceTransformer as-is instead of being
# looked up in the local model directory.
hub_models = ["agne/jobGBERT", "deepset/gbert-base"]
# Same (lowercase) folder as the SBERT-only evaluation cell; the previous
# "Evaluation" spelling points to a different directory on case-sensitive
# filesystems. Created up front so a missing folder fails fast.
evaluation_dir = "../00_data/SBERT_Models/evaluation"
os.makedirs(evaluation_dir, exist_ok=True)

# The TF-IDF similarities depend only on the ad texts, not on the SBERT model,
# so they are computed once instead of once per model.
tfidf_raw = calculate_tfidf_similarity(testads["short_texts"])
tfidf_peak = tfidf_raw.max()

for model_path in paths:
  print(f"Loading Model {model_path}")
  if model_path in hub_models:
    model = SentenceTransformer(model_path)
    embeddings = encode_jobs(model)
  else:
    # Load from the notebook's relative model directory instead of the stale
    # absolute Google Drive location.
    model = SentenceTransformer(f"{prefix_path}{model_path}")
    embeddings = load_pickle(f"{prefix_path}{model_path}embeddings.pkl")
  # Last path component, with or without a trailing slash. The previous
  # split("/")[-2] labelled hub ids such as "agne/jobGBERT" as "agne".
  model_name = model_path.rstrip("/").split("/")[-1]
  print(embeddings.keys())
  print("Creating Embeddings.")
  encodings_short = model.encode(list(testads["short_texts"]), show_progress_bar=True)
  testads["embeddings_short"] = encodings_short.tolist()

  print("Finished creating Embeddings. Evaluating.")

  for textkind in ["embeddings_short"]:
    similarities = {}
    for k in ["job_centroid"]:
      similarities_bert = util.cos_sim(testads[textkind], embeddings[k]["embeddings"])
      similarities[k + "_woTFIDF"] = similarities_bert

      # util.cos_sim returns a torch tensor; take its maximum in one call.
      max_influence_tfidf = similarities_bert.max().item() * TFIDF_MAX_SHARE
      if tfidf_peak > 0:
        # Out-of-place scaling keeps `tfidf_raw` reusable for the next model.
        similarities_tfidf = tfidf_raw * (max_influence_tfidf / tfidf_peak)
      else:
        # All TF-IDF similarities are zero: nothing to scale, and dividing by
        # the peak would produce NaNs.
        similarities_tfidf = tfidf_raw

      similarities_combi = np.add(similarities_bert, similarities_tfidf)
      similarities[k + "_wTFIDF"] = similarities_combi
    print(similarities.keys())
    for k in similarities:
      ranks = []
      missing = 0
      # Both variants are built from the "job_centroid" embeddings, so their
      # columns carry that entry's esco ids.
      simdf = pd.DataFrame(similarities[k], columns=embeddings["job_centroid"]["esco_id"], index=testads["esco_id"])
      for row_pos in tqdm(range(len(simdf))):
        ad_row = simdf.iloc[row_pos]
        true_id = ad_row.name  # gold esco_id of this ad (row label)
        ranked = ad_row.sort_values(ascending=False).reset_index()
        # Position of the gold id in the descending similarity ranking (1-based).
        rank = ranked[ranked["index"] == true_id].index.item() + 1
        if rank > MRR_AT:
          missing += 1
          ranks.append(0)
        else:
          ranks.append(1 / rank)
      missing = missing / len(simdf)
      MRR.append({"model": model_name, "textkind": textkind, "embedding_kind": k, "MRR": np.mean(ranks), "missing": missing, "MRR@": MRR_AT})
      # Re-display and re-save after every variant so partial results survive
      # an interrupted run.
      df = pd.DataFrame(MRR).sort_values(by=["MRR"], ascending=[False]).reset_index(drop=True)
      display(df)
      df.to_excel(f"{evaluation_dir}/{currently}_evaluation.xlsx")