# Imports

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/TRAINING

In [None]:
import sys
sys.path.append('..')
from _utils import load_json, load_pickle, flatten_list

In [None]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pypdf import PdfReader
from sklearn.metrics.pairwise import cosine_similarity
from helpers import *
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import numpy as np

In [None]:
# Define a function to calculate similarity between user skills and job titles
def calculate_similarity_user_skills(input_text, top_n=10):
    """
    Rank job titles by TF-IDF cosine similarity to a user-supplied text.

    The fitted vectorizer, the job TF-IDF matrix and the job titles are loaded
    from the pickles stored under ``../00_data/TF-IDF_Vectorizer/``.

    Args:
        input_text (str or list of str): User input text. A bare string is
            wrapped in a one-element list. The result frame has a single
            "Similarity" column, so a list must contain exactly one text.
        top_n (int): Number of top matches to return.
    Returns:
        DataFrame: Indexed by job title, one "Similarity" column, sorted in
        descending order and truncated to ``top_n`` rows.
    """
    if isinstance(input_text, str):
        input_text = [input_text]
    # The trailing slash is required: file names are appended by plain string
    # concatenation, so without it the files would be looked up next to the
    # directory instead of inside it.
    path = "../00_data/TF-IDF_Vectorizer/"
    vectorizer = load_pickle(path + "1_vectorizer.pkl")
    tfidf_matrix = load_pickle(path + "1_tfidf_matrix.pkl")
    job_titles = load_pickle(path + "1_jobnames.pkl")
    input_tfidf = vectorizer.transform(input_text)
    # Jobs are the rows, the input text is the single column.
    cosine_sim = cosine_similarity(tfidf_matrix, input_tfidf)
    similarity_df = pd.DataFrame(
        cosine_sim, index=job_titles, columns=["Similarity"]
    ).sort_values(by="Similarity", ascending=False)
    return similarity_df.head(top_n)

In [1]:
def get_similarity_userskills_input(inputtext,vectorizer,tfidf_matrix):
  """
  Compute cosine similarities between input texts and the rows of a TF-IDF matrix.

  The input is transformed with the provided, already fitted vectorizer and
  compared against every row of ``tfidf_matrix``. No files are read: the
  function depends only on its arguments.

  Args:
      inputtext (str or list of str): Text(s) to compare. A bare string is
          wrapped in a one-element list.
      vectorizer: Fitted vectorizer exposing ``transform``; it must share the
          feature space of ``tfidf_matrix``.
      tfidf_matrix: Pre-computed matrix with one row per job / skill set.

  Returns:
      numpy.ndarray: 2D array of shape (number of input texts, number of rows
      in ``tfidf_matrix``); element [i, j] is the cosine similarity between
      the i-th input text and the j-th row.
  """
  if isinstance(inputtext, str):
    inputtext = [inputtext]

  inputtfidf = vectorizer.transform(inputtext)
  # Inputs first, so rows of the result correspond to the input texts.
  return cosine_similarity(inputtfidf, tfidf_matrix)

In [None]:
def evaluate(cosine_sim, k=100, row_ids=None, col_ids=None):
  """
  Compute the Mean Reciprocal Rank at ``k`` (MRR@k) for a similarity matrix.

  Each row is labelled with the id of its correct column. For every row the
  columns are sorted by descending similarity and the 1-based position of the
  column whose label equals the row label is looked up. The row contributes
  ``1 / rank`` if that rank is at most ``k`` and 0 otherwise. A row whose
  label does not occur among the column labels also contributes 0. If a
  column label occurs more than once, the best-ranked occurrence counts.

  Args:
      cosine_sim (numpy.ndarray): 2D similarity scores, one row per query
          and one column per candidate.
      k (int): Rank cutoff. Defaults to 100.
      row_ids (sequence, optional): Label of the correct candidate for each
          row. Defaults to the notebook-level ``testad_ids``.
      col_ids (sequence, optional): Label of each column. Defaults to the
          notebook-level ``job_ids``.

  Returns:
      float: Mean of the per-row reciprocal ranks.
  """
  # Fall back to the notebook-level labels so existing calls keep working.
  if row_ids is None:
    row_ids = testad_ids
  if col_ids is None:
    col_ids = job_ids

  simdf = pd.DataFrame(cosine_sim, columns=col_ids, index=row_ids)
  reciprocal_ranks = []
  for pos in range(len(simdf)):
    row = simdf.iloc[pos]
    target_id = row.name
    ranked_ids = row.sort_values(ascending=False).index
    # Positions (0-based) at which the correct label appears in the ranking.
    hits = np.flatnonzero(np.asarray(ranked_ids == target_id, dtype=bool))
    if hits.size == 0:
      # The correct candidate is not among the columns at all.
      reciprocal_ranks.append(0)
      continue
    rank = int(hits[0]) + 1
    reciprocal_ranks.append(1 / rank if rank <= k else 0)
  return np.mean(reciprocal_ranks)

In [None]:
# Load the German stop-word list from the NLTK corpus downloaded in the import cell;
# it is passed as `stop_words` to the vectorizers below.
german_stop_words = stopwords.words('german')

# Data

In [None]:
# Load the job data (ESCO_JOBS_ALL.json) into a DataFrame and show its column names.
jobs = pd.DataFrame(load_json(r"../00_data/ESCO/ESCO_JOBS_ALL.json"))
jobs.columns



In [None]:
# Extract job titles and IDs as plain lists, in DataFrame row order.
# `job_ids` supplies the column labels of the similarity frame built in evaluate().
job_titles = list(jobs["jobtitle"])
job_ids = list(jobs["jobid_esco"])

## Synonyms

In [10]:
def insert_jobtitle_to_synonyms(ad):
  """
  Return the synonyms of a job record, falling back to its title.

  Args:
      ad: Mapping-like record (e.g. a DataFrame row) with the keys
          "synonyms" and "jobtitle".

  Returns:
      The record's "synonyms" value unchanged when it is not None; otherwise
      the "jobtitle" string split on "/" into a list of title variants.
  """
  # Identity check: `== None` is evaluated element-wise by array-like values
  # and would then fail in the `if`.
  if ad["synonyms"] is None:
    return ad["jobtitle"].split("/")
  return ad["synonyms"]
# Row-wise: fill the "synonyms_altered" column using insert_jobtitle_to_synonyms.
jobs["synonyms_altered"] = jobs.apply(insert_jobtitle_to_synonyms,axis=1)

In [11]:
# Deduplicated synonym entries used to fit the vectorizer
# (flatten_list is a project helper — presumably flattens one nesting level).
unique_synonyms = list(set(flatten_list([x for x in jobs["synonyms_altered"] if x is not None])))
len(unique_synonyms)
# One document per job: its synonyms joined into a single space-separated string.
synonym_l_of_l = [" ".join(x) for x in jobs["synonyms_altered"]]

In [12]:
# Synonym-based model: fit on the unique synonyms, then vectorize one joined
# synonym document per job (unigrams + bigrams, German stop words removed).
# NOTE: `vectorizer`, `tfidf_matrix` and `features` are reassigned in the Skills section below.
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words=german_stop_words)
vectorizer = vectorizer.fit(unique_synonyms)
tfidf_matrix = vectorizer.transform(synonym_l_of_l)
features = vectorizer.get_feature_names_out()

## Skills

In [13]:
# Distinct entries of the "full_skills" column (a set, despite the name); the cell shows its size.
skilllist = set(flatten_list(jobs["full_skills"]))
len(skilllist)

In [14]:
# One document per job: its "full_skills" entries joined into a single space-separated string.
fullskills_jobs = [" ".join(skillset) for skillset in jobs["full_skills"]]

# print(len(fullskills_jobs))
# print(jobtitle[0])
# fullskills_jobs[0]

In [15]:
# Skill-based model: fit on the distinct skills, then vectorize one joined
# skill document per job. This replaces the synonym-based objects defined above.
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words=german_stop_words)
vectorizer = vectorizer.fit(skilllist)

tfidf_matrix = vectorizer.transform(fullskills_jobs)
features = vectorizer.get_feature_names_out()
tfidf_matrix

## Combination


In [16]:
# info_unique_concat = list(unique_synonyms) + list(skilllist)
# total_l_of_l = []

# for syns, skills in zip(jobs["synonyms_altered"],jobs["full_skills"]):
#   total_l_of_l.append(" ".join(syns+skills))

In [17]:
# vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words=german_stop_words)
# vectorizer = vectorizer.fit(info_unique_concat)
# tfidf_matrix = vectorizer.transform(total_l_of_l)
# features = vectorizer.get_feature_names_out()
# tfidf_matrix

## Descriptions

In [18]:
# unique_desc = (jobs["jobdescription"])
# vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words=german_stop_words)
# tfidf_matrix = vectorizer.fit_transform(unique_desc)
# features = vectorizer.get_feature_names_out()

# Application

In [19]:
# Output directory for the pickled artifacts; the trailing slash matters because
# the file names are appended directly in the f-strings below.
path = "../00_data/TF-IDF_Vectorizer/"

In [20]:
# idf_list = []
# for idf, word in zip(vectorizer.idf_, vectorizer.get_feature_names_out()):
#   idf_list.append({"idf":idf,"word":word})
# pd.DataFrame(idf_list).sort_values("idf")

In [36]:
# Persist the fitted vectorizer, the job TF-IDF matrix and the matching
# job titles / ids so they can be reloaded without refitting.
write_pickle(f"{path}1_vectorizer.pkl", vectorizer)
write_pickle(f"{path}1_tfidf_matrix.pkl", tfidf_matrix)
# `job_titles` is the list defined in the Data section; the previously used
# name `jobtitle` is not defined anywhere in this notebook.
write_pickle(f"{path}1_jobnames.pkl", job_titles)
write_pickle(f"{path}1_escoids.pkl", job_ids)



In [22]:
# Load the test ads used for the evaluation below.
testads = pd.DataFrame(load_json(r"../00_data/EURES/eures_testads_final_short.json"))

In [23]:
# Ad texts fed to the vectorizer, and the id labelling each ad's row in evaluate().
testad_texts = list(testads["short_texts"])
testad_ids = list(testads["esco_id"])

# Setup Vec & Matrix

In [24]:
# Grid search over vectorizer class, n-gram range and stop-word removal; each
# configuration is scored with evaluate() on the test ads.
# NOTE: the loop rebinds the notebook-level `vectorizer` and `tfidf_matrix`. After it
# finishes they hold the last configuration tried ((1,3) n-grams, no stop-word
# removal), which is what any cell executed afterwards sees.
evaluation = []
for vec in [TfidfVectorizer]:#, CountVectorizer]:
  for ngram in [(1,1),(1,2),(1,3)]:
    for stopword_removal in [True, False]:
      if stopword_removal:
        vectorizer = vec(ngram_range=ngram, stop_words=german_stop_words)
      else:
        vectorizer = vec(ngram_range=ngram)
      vectorizer = vectorizer.fit(skilllist)
      tfidf_matrix = vectorizer.transform(fullskills_jobs)
      #tfidf_matrix = vectorizer.fit_transform(unique_desc)
      cosine_sim = get_similarity_userskills_input(testad_texts,vectorizer,tfidf_matrix)
      # Highest similarity over all (ad, job) pairs.
      max_value = cosine_sim.max()
      MRR = evaluate(cosine_sim)
      # Take the class name from the class itself, not from its repr.
      evaluation.append({"vectorizer":vec.__name__,"n-gram range":ngram,
                         "stopword_removal":stopword_removal,"MRR@100":MRR,"max_similarity":max_value})
display(pd.DataFrame(evaluation))

In [25]:
# Bar chart of MRR@100 per n-gram range, one bar per stop-word setting.
df = pd.DataFrame(evaluation)
df["MRR@100"] = df["MRR@100"].round(3)
# pivot() arguments are passed by keyword: positional use raises a TypeError on pandas >= 2.0.
ax = (
    df[df["vectorizer"]=="TfidfVectorizer"]
    .pivot(index="n-gram range", columns="stopword_removal", values="MRR@100")
    .plot(kind="bar", figsize=(12, 5), ylabel="MRR@100", title="Comparison of TFIDF Settings")
)
# Write the rounded score on top of each bar.
for container in ax.containers:
    ax.bar_label(container)



In [26]:
# Load the CVs via load_cvs (not defined in this notebook — presumably provided by `from helpers import *`).
cvs = load_cvs()

In [27]:
def get_similarity_userskills(inputtext,vectorizer,tfidf_matrix, topn = 10):
  """
  Rank all job titles by cosine similarity to a single input text.

  Args:
      inputtext (str or list of str): Text to compare; a bare string is
          wrapped in a one-element list. The result has a single
          "Similarity" column, so a list must hold exactly one text.
      vectorizer: Fitted vectorizer sharing the feature space of ``tfidf_matrix``.
      tfidf_matrix: Matrix with one row per job.
      topn (int): Accepted for compatibility but currently not applied;
          the full ranking is returned and callers slice it themselves.

  Returns:
      DataFrame: Indexed by the job titles loaded from ``1_jobnames.pkl``,
      one "Similarity" column, sorted in descending order.
  """
  texts = [inputtext] if isinstance(inputtext, str) else inputtext
  titles = load_pickle(r"../00_data/TF-IDF_Vectorizer/1_jobnames.pkl")
  # Jobs are the rows, the input text is the single column.
  scores = cosine_similarity(tfidf_matrix, vectorizer.transform(texts))
  ranking = pd.DataFrame(scores, index = titles, columns=["Similarity"])
  return ranking.sort_values(by="Similarity", ascending=False)

In [28]:
# Use entry 1 of the loaded CVs as the example input for the cells below.
text = cvs[1]

In [29]:
# Vectorize the example text and list its non-zero TF-IDF weights, highest first.
# NOTE: this rebinds the notebook-level `features` and `df`.
inputtfidf = vectorizer.transform([text])
features = vectorizer.get_feature_names_out()
df = pd.DataFrame(inputtfidf.toarray(),index =["input"], columns = features).T.sort_values(by="input",ascending=False)
df[df["input"]!=0]

In [30]:
# The 20 most similar job titles for the example text.
get_similarity_userskills(text, vectorizer,tfidf_matrix).iloc[:20]

In [31]:
def get_tfidf_features_job(job, tfidf_matrix):
    """
    Return the non-zero TF-IDF weights of one job's row in ``tfidf_matrix``.

    Args:
        job (str): Job title exactly as it appears in the notebook-level
            ``job_titles`` list.
        tfidf_matrix: Sparse matrix whose rows follow the order of
            ``job_titles``.

    Returns:
        dict: Feature name -> weight for every non-zero entry of the row.
        Feature names come from the notebook-level ``features`` array, which
        must stem from the vectorizer that produced ``tfidf_matrix``.

    Raises:
        ValueError: If ``job`` is not contained in ``job_titles``.
    """
    # `job_titles` is the list built in the Data section; the previously used
    # name `jobtitle` is not defined anywhere in this notebook.
    index = job_titles.index(job)
    feature_dict = {}
    for v, feature in zip(tfidf_matrix[index].toarray()[0], features):
        if v != 0:
            feature_dict[feature] = v
    return feature_dict

In [32]:
def get_tfidf_features(text):
    """
    Return the non-zero TF-IDF weights of ``text``.

    The text is transformed with the notebook-level ``vectorizer``.

    Args:
        text (str): Text to vectorize.

    Returns:
        dict: Feature name -> weight for every feature with a non-zero weight.
    """
    weights = vectorizer.transform([(text)]).toarray()[0]
    names = vectorizer.get_feature_names_out()
    return {name: weight for weight, name in zip(weights, names) if weight != 0}

In [33]:
def compare_tfidf(text, job, tfidf_matrix):
    """
    Return the features shared by a text and a job, with the text-side weights.

    Args:
        text (str): Input text, vectorized via get_tfidf_features.
        job (str): Job title, looked up via get_tfidf_features_job.
        tfidf_matrix: Job TF-IDF matrix passed through to get_tfidf_features_job.

    Returns:
        dict: Feature name -> TF-IDF weight in ``text`` for every feature that
        is non-zero in both the text and the job, in the job's feature order.
    """
    text_weights = get_tfidf_features(text)
    job_weights = get_tfidf_features_job(job, tfidf_matrix)
    return {term: text_weights[term] for term in job_weights if term in text_weights}

In [34]:
# Features shared by the example text and the given job, with the text-side TF-IDF weights.
compare_tfidf(text,"Spieleentwickler/Spieleentwicklerin",tfidf_matrix)