In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/TRAINING

In [None]:
# %%capture
# !pip install sentence_transformers
# !pip install pypdf
# !pip install xlsxwriter

In [None]:
# Import necessary libraries
import os

# The star import stays ahead of the explicit imports so they win on any name clash.
from helpers import *
from sentence_transformers import SentenceTransformer,util
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from pypdf import PdfReader

In [None]:
# Read the ESCO job records and build one dictionary that maps in both
# directions: ESCO id -> job title and job title -> ESCO id.
jobs = load_json("../00_data/ESCO/ESCO_JOBS_ALL.json")
esco_lookup = {}
for job in jobs:
  esco_lookup.update({
      job["jobid_esco"]: job["jobtitle"],
      job["jobtitle"]: job["jobid_esco"],
  })

# Similarity Calculation

In [None]:
# Calculate TF-IDF similarity
def calculate_tfidf_similarity(inputtext):
  """Score a text against the stored TF-IDF matrix using cosine similarity.

  The fitted vectorizer, the TF-IDF matrix and the matching ESCO ids are read
  from their pickle files on every call.

  Args:
    inputtext: A single string, or a list holding one string. The ranking
      frame is built with a single "similarity" row label, so a list with
      more than one text is not supported.

  Returns:
    tuple: ``(cosine_sim, cossim_df)`` where ``cosine_sim`` is the raw
    similarity array returned by ``cosine_similarity`` and ``cossim_df`` is a
    DataFrame with the columns ``esco_id``, ``similarity`` and ``jobtitle``,
    sorted by descending similarity.
  """
  texts = [inputtext] if isinstance(inputtext, str) else inputtext

  vectorizer = load_pickle("../00_data/TF-IDF_Vectorizer/1_vectorizer.pkl")
  tfidf_matrix = load_pickle("../00_data/TF-IDF_Vectorizer/1_tfidf_matrix.pkl")
  escoids = load_pickle("../00_data/TF-IDF_Vectorizer/1_escoids.pkl")

  # Project the input into the fitted TF-IDF space and compare it with every stored row.
  cosine_sim = cosine_similarity(vectorizer.transform(texts), tfidf_matrix)

  # One row per ESCO id, best match first.
  wide = pd.DataFrame(cosine_sim, columns=escoids, index=["similarity"])
  ranked = wide.T.sort_values(by="similarity", ascending=False)
  cossim_df = ranked.reset_index()
  cossim_df.columns = ["esco_id", "similarity"]
  # Resolve each id to its title through the notebook-level lookup table.
  cossim_df["jobtitle"] = cossim_df["esco_id"].map(esco_lookup)

  return cosine_sim, cossim_df

In [None]:
# Load a single CV from a PDF file
def load_single_cv(filepath):
  """Extract the text of every page of a PDF and return it as one string.

  Args:
    filepath: Location of the PDF file; passed straight to ``PdfReader``.

  Returns:
    str: The whitespace-stripped text of all non-empty pages, joined by
    newlines. An empty string if no page yields any text.
  """
  reader = PdfReader(filepath)
  # Defensive ``or ""``: a page without extractable text must not break ``.strip()``.
  page_texts = ((page.extract_text() or "").strip() for page in reader.pages)
  # Join with a newline: plain concatenation of stripped pages glued the last
  # word of one page to the first word of the next.
  return "\n".join(text for text in page_texts if text)

In [None]:
# Predict similarity between query and job embeddings
def predict_similarity(query, model,embeddings,TFIDF=False,topn=20,tfidf_weight=0.2):
  """Rank job centroids by their similarity to a query text.

  Args:
    query: Text to match (e.g. the extracted text of a CV).
    model: Object exposing ``encode(query)`` that returns the query embedding.
    embeddings: Mapping with a ``"job_centroid"`` entry that provides the
      keys ``"embeddings"``, ``"esco_id"`` and ``"jobtitle"``.
    TFIDF: If truthy, add a scaled TF-IDF similarity on top of the embedding
      similarity.
    topn: Number of best matches to return.
    tfidf_weight: Fraction of the best embedding similarity that the best
      TF-IDF match may contribute (experimental value, default 0.2).

  Returns:
    pandas.DataFrame: The ``topn`` best matches with the columns
    ``jobtitle``, ``esco_id`` and ``similarity``, best match first.
  """
  # Use the provided model to create a sentence embedding for the query.
  query_embedding = model.encode(query)
  # Compare the query embedding to the corpus of job centroids.
  similarity_calc = util.cos_sim(query_embedding, embeddings["job_centroid"]["embeddings"])
  # Continue with a plain NumPy array so the arithmetic below never mixes
  # tensor and ndarray operands.
  if hasattr(similarity_calc, "detach"):
    similarity_calc = similarity_calc.detach().cpu().numpy()
  else:
    similarity_calc = np.asarray(similarity_calc)

  if TFIDF:
    # Upper bound for the TF-IDF contribution, relative to the best embedding match.
    max_influence_tfidf = float(similarity_calc.max()) * tfidf_weight
    similarities_tfidf = np.asarray(calculate_tfidf_similarity(query)[0])
    tfidf_max = similarities_tfidf.max()
    # Without any TF-IDF overlap the maximum is 0; rescaling would divide by
    # zero and turn every score into NaN, so the TF-IDF term is skipped then.
    if tfidf_max > 0:
      # NOTE(review): the addition is positional and assumes the TF-IDF columns
      # are in the same order as the job centroids -- confirm against the
      # pickled escoids.
      similarity_calc = similarity_calc + similarities_tfidf * (max_influence_tfidf / tfidf_max)

  # Wrap the scores in a DataFrame with one row per ESCO id, then sort by similarity.
  results = pd.DataFrame(similarity_calc, columns=embeddings["job_centroid"]["esco_id"]).T.reset_index()
  results["jobtitle"] = embeddings["job_centroid"]["jobtitle"]
  results.columns = ["esco_id", "similarity","jobtitle"]
  results = results.sort_values(by="similarity",ascending=False).reset_index(drop=True)

  return results[["jobtitle","esco_id","similarity"]].iloc[:topn]

## Testing

In [None]:
# Load the pre-trained SBERT model and the embeddings pickle stored next to it
modelpath = "../00_data/SBERT_Models/models/gbert_batch32_woTSDAE_2e-05_f10/"
model = SentenceTransformer(modelpath)
# os.path.join copes with the trailing slash of modelpath; plain string
# formatting produced ".../f10//embeddings.pkl".
embeddings = load_pickle(os.path.join(modelpath, "embeddings.pkl"))

In [None]:
# Inspect the top-level keys of the loaded embeddings object
# (predict_similarity reads its "job_centroid" entry).
embeddings.keys()

In [None]:
# Load the first CV as a quick test input for the prediction function
path_to_cv = "../00_data/CVs/CV_1.pdf"  # plain literal: there is no placeholder to format
cv = load_single_cv(path_to_cv)

In [None]:
# Rank the job centroids for the test CV, with the TF-IDF term switched on
predict_similarity(query=cv, model=model, embeddings=embeddings, TFIDF=True)

# Load CVs for prediction

In [None]:
# Directory names of the SBERT models to compare, and the output location
# used for the evaluation results
modellist = ["jobgbert_batch32_woTSDAE_2e-05_f10", "gbert_batch32_woTSDAE_2e-05_f10"]
path_for_eval = "../00_data/CV_Evaluation/results"

## Make final predictions and save them as Excel files

In [None]:
# Load pickle file
def load_pickle(filepath):
    """Deserialize and return the object stored in a pickle file.

    Args:
        filepath: Location of the pickle file to read.

    Returns:
        The object that was unpickled from the file.

    Raises:
        OSError: If the file cannot be opened.

    Note:
        Unpickling can execute arbitrary code; only load files from a trusted
        source.
    """
    # Imported locally: no visible cell of this notebook imports pickle, so the
    # function would otherwise depend on the name leaking in from elsewhere.
    import pickle

    with open(filepath, "rb") as fIn:
        return pickle.load(fIn)

In [None]:
# Import joblib
import joblib

In [None]:
# Evaluate every CV with every model and save the predictions, one Excel file per CV
os.makedirs(path_for_eval, exist_ok=True)  # the writer does not create missing folders

# Loading a model and its embeddings does not depend on the CV, so each pair is
# loaded once and reused. This keeps every model in memory at the same time.
loaded_models = {}

for i in range(1,6):
  # open the CV
  print(f"Evaluating CV {i}")
  path_to_cv = f"../00_data/CVs/CV_{i}.pdf"
  cv = load_single_cv(path_to_cv)
  results = {}
  model_dict = {}
  # iterate over the models
  for model_no, model_name in enumerate(modellist):
    print(model_name)
    if model_name not in loaded_models:
      loaded_models[model_name] = (
          SentenceTransformer(f"../00_data/SBERT_Models/models/{model_name}"),
          load_pickle(f"../00_data/SBERT_Models/models/{model_name}/embeddings.pkl"),
      )
    # Rebind the notebook-level names as before, so cells further down that
    # read `model` / `embeddings` still see the last model of the list.
    model, embeddings = loaded_models[model_name]

    # make predictions with the current model, with TF-IDF
    results[f"M{model_no+1}_w_tfidf"] = predict_similarity(cv,model,embeddings,TFIDF=True,topn=20)
    # make predictions with the current model, without TF-IDF
    results[f"M{model_no+1}_wo_tfidf"] = predict_similarity(cv,model,embeddings,TFIDF=False,topn=20)
    # model names are too long for Excel sheet names, so keep a lookup of short ids
    model_dict[f"M{model_no+1}"] = model_name

  # Save everything in one Excel file per CV, written once after all models ran.
  # os.path.join adds the separator that was missing between folder and file name.
  with pd.ExcelWriter(os.path.join(path_for_eval, f"CV_{i}_x.xlsx"), engine='xlsxwriter') as writer:
    for k, v in results.items():
      v.to_excel(writer, sheet_name=k)
    # Unique job titles over all sheets, in order of first appearance (deterministic,
    # unlike a round trip through set()).
    unique_titles = pd.concat(list(results.values()))["jobtitle"].drop_duplicates().reset_index(drop=True)
    unique_titles.to_excel(writer,sheet_name="concat")
    pd.Series(model_dict).to_excel(writer,sheet_name="model_lookup")

In [None]:
# Load the CV records from JSON into a DataFrame and preview the first rows
cvs = pd.DataFrame(load_json("../00_data/CVs/cv_data.json"))
cvs.head()

In [None]:
# Encode the "text" column of the CV frame and store the vectors as plain lists
# in a new "embeddings" column.
# NOTE(review): `model` is whatever an earlier cell bound last (the evaluation
# loop rebinds it for every entry of modellist) -- confirm this is the intended model.
cv_embeddings = model.encode(cvs["text"].tolist(), show_progress_bar=True)
cvs["embeddings"] = cv_embeddings.tolist()

# Match CVs with Job Centroids

In [None]:
# Load the job centroid records from JSON into a DataFrame and preview the first rows
job_centroids = pd.DataFrame(load_json("../00_data/SBERT_Models/job_centroids.json"))
job_centroids.head()

In [None]:
# Cosine similarity of every CV embedding against every job-centroid embedding
similarities = cosine_similarity(
    np.array(cvs["embeddings"].to_list()),
    np.array(job_centroids["embeddings"].to_list()),
)
# One list of scores per CV, ordered like the rows of job_centroids
cvs["job_matches"] = list(map(list, similarities))

In [None]:
# Persist the CV frame, including the "embeddings" and "job_matches" columns.
# NOTE(review): write_pickle is not defined in the visible cells (presumably it
# comes from helpers) -- confirm it takes (path, object) in this order.
write_pickle("../00_data/Results/cv_job_matches.pkl", cvs)