In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/TRAINING

In [None]:
# %%capture
# !pip install sentence_transformers
# !pip install pypdf
# !pip install xlsxwriter

In [None]:
from sentence_transformers import SentenceTransformer, util
from datetime import datetime
from IPython.core.display import display, HTML

import os

import random
random.seed(42)
import pandas as pd
from helpers import *
import pickle
import numpy as np
import torch
from pypdf import PdfReader
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm 
from collections import Counter
import math
from nltk import word_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer,pipeline
import nltk
nltk.download('punkt')

# Functions

In [None]:
def load_cvs(cv_dir="../00_data/CVs", n_cvs=5):
    """Read the numbered CV PDFs and return their extracted text.

    Files are expected to be named ``CV_1.pdf`` ... ``CV_<n_cvs>.pdf`` inside
    ``cv_dir``.

    Args:
        cv_dir: Folder that contains the CV PDFs.
        n_cvs: Number of CVs to read, starting at ``CV_1.pdf``.

    Returns:
        A list with one string per CV. Each string is the concatenation of the
        stripped text of every page (no separator between pages).
    """
    cvs = []
    for cv_no in range(1, n_cvs + 1):
        reader = PdfReader(f"{cv_dir}/CV_{cv_no}.pdf")
        # Iterate the pages directly instead of re-using the CV counter as a page index.
        cvs.append("".join(page.extract_text().strip() for page in reader.pages))
    return cvs

In [None]:
def load_pickle(filepath):
  """Open ``filepath`` in binary mode and return the unpickled object."""
  with open(filepath, "rb") as handle:
    return pickle.load(handle)

In [None]:
def setup_classifier():
    """Create the text-classification pipeline from the locally stored model.

    Both the sequence-classification model and its tokenizer are loaded from
    the same model directory.

    Returns:
        The ``pipeline("text-classification", ...)`` object.
    """
    model_dir = "../00_data/Classifier/model_classification_jobgbert/"
    return pipeline(
        "text-classification",
        AutoModelForSequenceClassification.from_pretrained(model_dir),
        tokenizer=AutoTokenizer.from_pretrained(model_dir),
    )


In [None]:
def ismanager(ad):
  """Tell whether the job title of ``ad`` contains one of the tracked keywords.

  The lower-cased ``ad["jobtitle"]`` is searched for the substrings
  "leiter", "führungskraft" and "arzt".
  """
  title = ad["jobtitle"].lower()
  keywords = ("leiter", "führungskraft", "arzt")
  # Disabled extra criterion: ad["esco_id"][0]=="1"
  return any(keyword in title for keyword in keywords)

In [None]:
def text_alphanum(text):
  """Return ``text`` with every non-alphanumeric, non-whitespace character removed."""
  kept_chars = [char for char in text if char.isalnum() or char.isspace()]
  return "".join(kept_chars)

In [None]:
def shorten_text(text,pipe):
    """Keep only the segments of ``text`` that ``pipe`` labels ``"LABEL_1"``.

    The text is split into lines; empty lines and lines equal to ``" ,"`` are
    dropped. If at most 2 or more than 50 lines remain, the line structure is
    not used: the text is tokenized with the German ``word_tokenize`` and split
    into roughly equal chunks of about 20 tokens instead.

    Args:
        text: Raw text to shorten.
        pipe: Callable that takes a string and returns a sequence whose first
            element is a dict with a ``"label"`` key.

    Returns:
        The segments labelled ``"LABEL_1"`` joined by single spaces; ``""``
        when ``text`` is blank or no segment receives that label.
    """
    # Blank input would otherwise request zero chunks from np.array_split.
    if not text or not text.strip():
        return ""

    # Both placeholder values must be excluded (an `or` here would always be True).
    segments = [line for line in text.split("\n") if line not in ("", " ,")]

    if len(segments) <= 2 or len(segments) > 50:
        tokens = word_tokenize(text, language="german")
        if not tokens:
            return ""
        no_chunks = math.ceil(len(tokens) / 20)
        segments = [" ".join(chunk) for chunk in np.array_split(tokens, no_chunks)]

    kept = []
    for segment in segments:
        try:
            label = pipe(segment)[0]["label"]
        except Exception:
            # Presumably the segment was too long for the classifier (TODO confirm);
            # retry with its first 250 characters but keep the full segment text.
            label = pipe(segment[:250])[0]["label"]
        if label == "LABEL_1":
            kept.append(segment)
    return " ".join(kept)

# Load Data

In [None]:
# Base directory that holds the SBERT model folders (relative to the notebook location).
prefix_path = "../00_data/SBERT_Models/models/"

In [None]:
# Test job ads as a DataFrame. NOTE(review): load_json is not defined in this notebook —
# presumably it comes from the star-import of `helpers`; confirm.
testads = pd.DataFrame(load_json(r"../00_data/EURES/eures_testads_final_short.json"))

# Load Model

In [None]:
# Print the entries of the model directory. The listing is sorted because os.listdir
# returns entries in arbitrary order, and the loop variable is not called `model`
# so it cannot be confused with the loaded SentenceTransformer further down.
for available_model in sorted(os.listdir(prefix_path)):
  print(f"Available model: {available_model}")

In [None]:
# Models to evaluate. An entry is either a hub id ("agne/jobGBERT", "deepset/gbert-base")
# or the name of a local model folder including the trailing slash.
# Commented-out entries are disabled for the current run.
paths = [#"deepset/gbert-base",
          #"agne/jobGBERT",
          #"jobgbert_TSDAE_epochs5/",
          #"gbert_TSDAE_epochs5/",
          #"jobgbert_batch16_woTSDAE_2e-05_f10/",
          #"jobgbert_batch16_wTSDAE_2e-05_f10/",
          "jobgbert_batch32_woTSDAE_2e-05_f10/",
          #"jobgbert_batch32_wTSDAE_2e-05_f10/",
          #"jobgbert_batch64_woTSDAE_2e-05_f10/",
          #"jobgbert_batch64_wTSDAE_2e-05_f10/",
          #"gbert_batch16_woTSDAE_2e-05_f10/",
          #"gbert_batch16_wTSDAE_2e-05_f10/",
          "gbert_batch32_woTSDAE_2e-05_f10/",
         # "gbert_batch32_wTSDAE_2e-05_f10/",
          #"gbert_batch64_woTSDAE_2e-05_f10/",
          #"gbert_batch64_wTSDAE_2e-05_f10/",
          ]
          

# Evaluate with Test Ads

In [None]:
# Evaluate every model in `paths`: embed the short test-ad texts, compare them with the
# job-centroid embeddings of that model and record the mean reciprocal rank (MRR).
MRR = []
MRR_AT = 100  # ranks beyond this cut-off count as misses (reciprocal rank 0)
currently = datetime.now().strftime("%Y%m%d%H%M%S")  # timestamp used in the result file name

for model_path in paths:
  print(f"Loading Model {model_path}")
  if model_path in ["agne/jobGBERT","deepset/gbert-base"]:
    # Hub models have no stored embeddings, so the job embeddings are computed here.
    # NOTE(review): encode_jobs is not defined in this notebook — presumably from `helpers`.
    model = SentenceTransformer(model_path)
    embeddings = encode_jobs(model)
  else:
    # Build the location from prefix_path so the path separator cannot be forgotten.
    model_dir = os.path.join(prefix_path, model_path)
    model = SentenceTransformer(model_dir)
    embeddings = load_pickle(os.path.join(model_dir, "embeddings.pkl"))
  print(embeddings.keys())
  print("Creating Embeddings.")
  encodings_short = model.encode(list(testads["short_texts"]), show_progress_bar=True)
  # encodings_long = model.encode(list(testads["description"]), show_progress_bar=True)

  testads["embeddings_short"] = encodings_short.tolist()
  #testads["embeddings_long"] = encodings_long.tolist()

  print("Finished creating Embeddings. Evaluating.")

  for textkind in ["embeddings_short"]:#,"embeddings_long"]:
    similarities = {}
    for k in ["job_centroid"]:
      similarities[k] = util.cos_sim(testads[textkind], embeddings[k]["embeddings"])

    for k in similarities:
      ranks = []
      missing = 0
      # Tensor-level max; .item() yields a plain float that can be written to Excel.
      max_similarity = similarities[k].max().item()
      simdf = pd.DataFrame(similarities[k], columns=embeddings[k]["esco_id"], index=testads["esco_id"])
      for i in tqdm(range(len(simdf))):
        row = simdf.iloc[i]
        # Candidate ids ordered from most to least similar; row.name is the gold id of the ad.
        ranked_ids = row.sort_values(ascending=False).index
        hits = np.flatnonzero(ranked_ids == row.name)
        # A gold id that is absent from the candidates is treated as a miss;
        # if the id occurs more than once, the best position is used.
        rank = hits[0] + 1 if len(hits) else None
        if rank is None or rank > MRR_AT:
          missing += 1
          ranks.append(0)
        else:
          ranks.append(1/rank)
      missing = missing/len(simdf)
      # Last path component, so hub ids are labelled by model name rather than organisation.
      model_name = model_path.rstrip("/").split("/")[-1]
      MRR.append({"model":model_name,"textkind": textkind,"embedding_kind":k,
                  "MRR":np.mean(ranks), "missing":missing, "max_similarity": max_similarity, "MRR@":MRR_AT})
      df = pd.DataFrame(MRR).sort_values(by=["MRR"], ascending=[False]).reset_index(drop=True)
      display(df)
      # Re-written after every evaluation so partial results survive an interrupted run.
      df.to_excel(f"../00_data/SBERT_Models/Evaluation/{currently}_evaluation.xlsx")

In [None]:
# df = pd.read_excel(f"/content/drive/MyDrive/TRAINING/_COSINESIM/20230403173019_evaluation.xlsx")

# Test with TF-IDF

In [None]:
def calculate_tfidf_similarity(inputtext):
  """Compute TF-IDF cosine similarities between the input texts and the stored matrix.

  The pickled vectorizer and TF-IDF matrix are loaded from the TF-IDF folder on
  every call.

  Args:
      inputtext: A single string or an iterable of strings.

  Returns:
      The result of ``cosine_similarity(vectorizer.transform(inputtext), tfidf_matrix)``,
      i.e. one row per input text.
  """
  if isinstance(inputtext,str):
    inputtext = [inputtext]
  path = "../00_data/TF-IDF_Vectorizer"
  # os.path.join inserts the separator that plain string concatenation left out.
  vectorizer = load_pickle(os.path.join(path, "1_vectorizer.pkl"))
  tfidf_matrix = load_pickle(os.path.join(path, "1_tfidf_matrix.pkl"))
  inputtfidf = vectorizer.transform(inputtext)
  return cosine_similarity(inputtfidf, tfidf_matrix)

In [None]:
# Quick check: TF-IDF similarities for the short texts of all test ads (shown as cell output).
calculate_tfidf_similarity(testads["short_texts"])

In [None]:
# Evaluate every model in `paths` twice: with the plain SBERT similarities ("_woTFIDF")
# and with rescaled TF-IDF similarities added on top ("_wTFIDF").
MRR = []
MRR_AT = 100  # ranks beyond this cut-off count as misses (reciprocal rank 0)
TFIDF_WEIGHT = 0.2  # TF-IDF scores are rescaled so their maximum equals this share of the best SBERT similarity
currently = datetime.now().strftime("%Y%m%d%H%M%S")  # timestamp used in the result file name

# The TF-IDF similarities depend only on the test ads, not on the SBERT model,
# so they are computed once instead of once per model.
similarities_tfidf_raw = calculate_tfidf_similarity(testads["short_texts"])
tfidf_peak = similarities_tfidf_raw.max()

for model_path in paths:
  print(f"Loading Model {model_path}")
  if model_path in ["agne/jobGBERT","deepset/gbert-base"]:
    # NOTE(review): encode_jobs is not defined in this notebook — presumably from `helpers`.
    model = SentenceTransformer(model_path)
    embeddings = encode_jobs(model)
  else:
    # Use the notebook's relative model directory instead of an absolute Colab path.
    model_dir = os.path.join(prefix_path, model_path)
    model = SentenceTransformer(model_dir)
    embeddings = load_pickle(os.path.join(model_dir, "embeddings.pkl"))
  print(embeddings.keys())
  print("Creating Embeddings.")
  encodings_short = model.encode(list(testads["short_texts"]), show_progress_bar=True)
  #encodings_long = model.encode(list(testads["description"]), show_progress_bar=True)

  testads["embeddings_short"] = encodings_short.tolist()
  #testads["embeddings_long"] = encodings_long.tolist()

  print("Finished creating Embeddings. Evaluating.")

  for textkind in ["embeddings_short"]:
    similarities = {}
    for k in ["job_centroid"]:
      similarities_bert = util.cos_sim(testads[textkind], embeddings[k]["embeddings"])
      similarities[k+"_woTFIDF"] = similarities_bert

      max_influence_tfidf = similarities_bert.max().item() * TFIDF_WEIGHT
      if tfidf_peak > 0:
        similarities_tfidf = similarities_tfidf_raw * (max_influence_tfidf / tfidf_peak)
      else:
        # All TF-IDF similarities are zero: nothing to rescale, and dividing would give NaN.
        similarities_tfidf = similarities_tfidf_raw

      similarities[k+"_wTFIDF"] = np.add(similarities_bert, similarities_tfidf)
    print(similarities.keys())
    for k in similarities:
      ranks = []
      missing = 0
      simdf = pd.DataFrame(similarities[k], columns=embeddings["job_centroid"]["esco_id"], index=testads["esco_id"])
      for i in tqdm(range(len(simdf))):
        row = simdf.iloc[i]
        # Candidate ids ordered from most to least similar; row.name is the gold id of the ad.
        ranked_ids = row.sort_values(ascending=False).index
        hits = np.flatnonzero(ranked_ids == row.name)
        # A gold id that is absent from the candidates is treated as a miss;
        # if the id occurs more than once, the best position is used.
        rank = hits[0] + 1 if len(hits) else None
        if rank is None or rank > MRR_AT:
          missing += 1
          ranks.append(0)
        else:
          ranks.append(1/rank)
      missing = missing/len(simdf)
      # Last path component, so hub ids are labelled by model name rather than organisation.
      model_name = model_path.rstrip("/").split("/")[-1]
      MRR.append({"model":model_name,"textkind": textkind,"embedding_kind":k, "MRR":np.mean(ranks), "missing":missing, "MRR@":MRR_AT})
      df = pd.DataFrame(MRR).sort_values(by=["MRR"], ascending=[False]).reset_index(drop=True)
      display(df)
      # Re-written after every evaluation so partial results survive an interrupted run.
      df.to_excel(f"../00_data/SBERT_Models/Evaluation/{currently}_evaluation.xlsx")