In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/TRAINING

In [None]:
# %%capture
# !pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer,util
from helpers import load_json,write_pickle,load_pickle
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Read all ESCO job records and build a single two-way lookup table:
# ESCO id -> job title and job title -> ESCO id live in the same dict.
jobs = load_json(r"../00_data/ESCO/ESCO_JOBS_ALL.json")
esco_lookup = {}
for job in jobs:
  esco_lookup.update({
      job["jobid_esco"]: job["jobtitle"],
      job["jobtitle"]: job["jobid_esco"],
  })

# Functions

In [None]:
def load_single_cv(filepath):
  """Return the text of a PDF as one string.

  Each page's extracted text is stripped of leading/trailing whitespace and
  the pages are concatenated directly, with nothing inserted between them.

  NOTE(review): PdfReader is not imported in the visible notebook cells; it
  has to be in scope before this function is called.

  Args:
    filepath: Value handed to PdfReader to open the document.

  Returns:
    The concatenated page texts ("" when the document has no pages).
  """
  reader = PdfReader(filepath)
  return "".join(page.extract_text().strip() for page in reader.pages)

# Load Model

In [None]:
# Directory of the SentenceTransformer checkpoint loaded below; embeddings.pkl is
# also written to and read from this directory further down.
# NOTE(review): absolute Colab Drive path, while the Drive mount cell at the top of
# the notebook is commented out -- confirm the path resolves in your environment.
modelpath = "/content/drive/MyDrive/TRAINING/_COSINESIM/content/jobgbert_batch32_woTSDAE_2e-05_f10"

In [None]:
# Load the sentence-embedding model from modelpath; it encodes both the ESCO
# descriptions and the job advertisement texts below.
model = SentenceTransformer(modelpath)

# Creating Embedding Space

The embedding space consists of Job Centroids (JCs). Each JC is the mean of the ESCO job description embedding and, where job advertisements are available for that job, its unfiltered Job Advertisement Centroid (JAC).

**Steps:**
1. Encode Description of all ESCO Jobs
2. Encode all Job Advertisements
3. Create Job Advertisement centroid per ESCO Job (JACs)
4. Create the Job Centroid (JC) from both JAC & Description and save the embeddings


## Encode Description of all ESCO Jobs

In [None]:
# Re-reads the same ESCO jobs file that was already loaded into `jobs` above.
esco_jobs = load_json(r"../00_data/ESCO/ESCO_JOBS_ALL.json")

In [None]:
# Pull three parallel lists (same order as esco_jobs) out of the job records:
# description text, ESCO id and job title.
descriptions, esco_ids, jobtitles = (
    [record[field] for record in esco_jobs]
    for field in ("jobdescription", "jobid_esco", "jobtitle")
)

In [None]:
# Encode every ESCO job description; the result is aligned with
# descriptions / esco_ids / jobtitles.
desc_embeddings = model.encode(descriptions, show_progress_bar=True)

In [None]:
# Container for all embedding groups; each group holds three parallel
# sequences under the keys "jobtitle", "esco_id" and "embeddings".
embeddings = {
    "descriptions": {
        "jobtitle": jobtitles,
        "esco_id": esco_ids,
        "embeddings": desc_embeddings,
    },
}

## Encode Job Advertisements

In [None]:
# Job advertisement records; the cells below read the fields "title",
# "short_texts", "esco_id" and "esco_job" from them.
job_ads = load_json("../00_data/EURES/0_pars_short_ads_final.json")

In [None]:
def concat_short(ad):
  """Build the text that is encoded for one job advertisement.

  Concatenates ad["title"] and ad["short_texts"] with `+`, in that order;
  nothing is inserted between the two parts.

  Args:
    ad: Mapping-like record (dict or DataFrame row) with "title" and
      "short_texts" entries.

  Returns:
    The title followed directly by the shortened text.
  """
  title, shortened = ad["title"], ad["short_texts"]
  return title + shortened

In [None]:
# One row per job advertisement record.
ads_df = pd.DataFrame(job_ads)

In [None]:
# Row-wise (axis=1): build the text to encode for every advertisement.
ads_df["final_text"] = ads_df.apply(concat_short,axis=1)

In [None]:
# Advertisement texts are not unique, so every distinct text is encoded once and
# the resulting vector is mapped back onto all rows that share that text.
# dict.fromkeys de-duplicates in order of first appearance, which gives
# model.encode the same input order on every run (set() ordering varies).
unique_texts = list(dict.fromkeys(ads_df["final_text"]))
embeddings_jobads = model.encode(unique_texts,show_progress_bar=True,convert_to_tensor=True, batch_size=64)
# Built without a loop so that no loop variable rebinds `embeddings_jobads`;
# each vector is stored as a plain list of floats.
embedding_map = dict(zip(unique_texts, embeddings_jobads.tolist()))
ads_df["embeddings"] = ads_df["final_text"].map(embedding_map)

## Create Job Advertisement centroid per ESCO Job (JACs)

In [None]:
# Job Advertisement Centroids (JACs): for every ESCO id, the float32 mean of the
# embeddings of all advertisements assigned to that id.
JAC_DICT = {}
JAC_jobtitles = []
# groupby(sort=False) yields the ids in order of first appearance (the same order
# as .unique()) in a single pass, and avoids a loop variable named `id`, which
# would shadow the builtin for the rest of the kernel session.
for esco_id, ads_for_job in tqdm(ads_df.groupby("esco_id", sort=False)):
    # Title taken from the first advertisement of the group.
    JAC_jobtitles.append(ads_for_job["esco_job"].iloc[0])
    JAC_DICT[esco_id] = np.stack(list(ads_for_job["embeddings"])).mean(axis=0, dtype="float32")

In [None]:
# Show which embedding groups exist so far.
embeddings.keys()

In [None]:
# Register the JACs as their own group; ids and vectors are taken from JAC_DICT
# in insertion order, which is also the order of JAC_jobtitles.
embeddings["JACs"] = dict(
    jobtitle=JAC_jobtitles,
    esco_id=[*JAC_DICT],
    embeddings=[*JAC_DICT.values()],
)

In [None]:
# Long-format table with one row per (embedding group, ESCO job).
# Only the two source groups are listed explicitly: iterating over every key of
# `embeddings` would also pick up "job_centroids" when this cell is re-run after
# that group has been added (or after the pickle was reloaded), folding the
# centroids back into themselves.
combined_embeddings = [
    {"esco_id": esco_id, "jobtitle": jobtitle, "embeddings": vector, "kind": kind}
    for kind in ("descriptions", "JACs")
    for esco_id, jobtitle, vector in zip(
        embeddings[kind]["esco_id"],
        embeddings[kind]["jobtitle"],
        embeddings[kind]["embeddings"],
    )
]
centroid_df = pd.DataFrame(combined_embeddings)


In [None]:
# Job Centroids (JCs): per ESCO id, the float32 mean over all rows of
# centroid_df with that id.
JC_titles, JC_embeddings, JC_esco_ids = [], [], []
# groupby(sort=False) keeps the ids in order of first appearance (as .unique()
# did) without re-filtering the frame for every id, and without binding the
# builtin name `id` as a loop variable.
for esco_id, rows_for_job in tqdm(centroid_df.groupby("esco_id", sort=False)):
    JC_embeddings.append(
        np.stack(list(rows_for_job["embeddings"])).mean(axis=0, dtype="float32")
    )
    JC_esco_ids.append(esco_id)
    # Title of the first row of the group.
    JC_titles.append(rows_for_job["jobtitle"].iloc[0])

In [None]:
# Register the final Job Centroids as a third group, laid out like the others.
embeddings["job_centroids"] = dict(
    jobtitle=JC_titles,
    esco_id=JC_esco_ids,
    embeddings=JC_embeddings,
)

In [None]:
# Show the embedding groups that are about to be saved.
embeddings.keys()

In [None]:
# Save all embedding groups as embeddings.pkl inside the model directory.
# (The former leading `embeddings.keys()` was a no-op here: only the last
# expression of a cell is displayed.)
write_pickle(f"{modelpath}/embeddings.pkl",embeddings)

# Plotting



In [None]:
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import plotly.express as px

In [None]:
# Reload the embeddings file written above, so the plotting section can start here.
# NOTE(review): load_pickle comes from the project's helpers module; if it wraps
# pickle.load, only open files you produced yourself.
embeddings = load_pickle(f"{modelpath}/embeddings.pkl")


In [None]:
# 2-component t-SNE projection; random_state is fixed to 0.
tsne = TSNE(n_components=2,random_state=0)

In [None]:
# Project the job-centroid embeddings to 2-D and pair every point with its title.
Y = tsne.fit_transform(
    np.array(embeddings['job_centroids']['embeddings'], dtype=np.float32)
)
plotting = pd.DataFrame([
    {"jobtitle": title, "x": point[0], "y": point[1]}
    for title, point in zip(embeddings['job_centroids']["jobtitle"], Y)
])

In [None]:
# Cluster the 2-D t-SNE coordinates with DBSCAN (eps and min_samples are
# hard-coded here) and store one label per point; rows labelled -1 are
# filtered out before plotting below.
clustering = DBSCAN(eps=3, min_samples=10).fit(Y)
plotting["cluster"] = clustering.labels_

In [None]:
# Plot only the points that received a cluster label other than -1.
plotting_x = plotting[plotting["cluster"]!=-1]
# `color_discrete_map` expects a dict mapping values to colours, so the former
# `color_discrete_map=True` was not a valid setting and has been dropped.
# The cluster labels are integers, so plotly colours them on a continuous scale;
# cast the column to str if one discrete colour per cluster is wanted.
px.scatter(
    data_frame=plotting_x,
    x="x",
    y="y",
    color="cluster",
    hover_data=["jobtitle"],
)

In [None]:
# Same steps as for the 2-D view, with a 3-component t-SNE projection:
# project, pair each point with its job title, then cluster with DBSCAN.
tsne = TSNE(n_components=3, random_state=0)
Y = tsne.fit_transform(
    np.array(embeddings['job_centroids']['embeddings'], dtype=np.float32)
)

plotting = pd.DataFrame([
    {"jobtitle": title, "x": point[0], "y": point[1], "z": point[2]}
    for title, point in zip(embeddings['job_centroids']["jobtitle"], Y)
])

clustering = DBSCAN(eps=3, min_samples=10).fit(Y)
plotting["cluster"] = clustering.labels_