In [None]:
# Import necessary libraries
import pandas as pd
import os
import matplotlib.pyplot as plt

# Comparison Training

In [None]:
# Show every entry of the evaluation directory whose name mentions TSDAE
evaluation_entries = os.listdir('../00_data/SBERT_Models/Evaluation/')
for model in filter(lambda entry: "TSDAE" in entry, evaluation_entries):
    print(f"Available model: {model}")



In [None]:
# Names of the evaluated model directories: every combination of base model,
# batch size and TSDAE setting (wo = without, w = with), jobgbert runs first.
models = [
    f"{base_model}_batch{batch_size}_{tsdae}_2e-05_f10"
    for base_model in ("jobgbert", "gbert")
    for batch_size in (16, 32, 64)
    for tsdae in ("woTSDAE", "wTSDAE")
]

In [None]:
# Map each model directory name to the training-details table found in its eval folder
results = {}
for model_path in models:
    eval_dir = f"../00_data/SBERT_Models/Evaluation/{model_path}/eval"
    for item in os.listdir(eval_dir):
        # Only files whose name contains 'trainig_details' (spelled exactly like
        # this in the check) are loaded; everything else in the folder is skipped
        if "trainig_details" not in item:
            continue
        # A later match for the same model overwrites an earlier one
        results[model_path] = pd.read_excel(f"{eval_dir}/{item}", index_col=0)

In [35]:
# results = {"gbert": [], "jobgbert": []}
# for model_path in models:
#     for item in os.listdir(f"content/{model_path}/eval"):
#         if "trainig_details" in item and "job" in model_path:
#             results["jobgbert"].append(pd.read_excel(f"content/{model_path}/eval/{item}", index_col=0))
#         elif "trainig_details" in item:
#             results["gbert"].append(pd.read_excel(f"content/{model_path}/eval/{item}", index_col=0))
#         else:
#             continue

In [None]:
# Stack all per-model tables and keep the column-wise maximum per (epoch, model)
concat_df = pd.concat(results.values()).groupby(["epoch", "model"]).max().reset_index()

# Split the model identifier on "_" once and derive the descriptive columns from its parts
name_parts = concat_df["model"].str.split("_")
concat_df["modelname"] = name_parts.str[:3].str.join("_")       # first three parts
concat_df["pretraining"] = name_parts.str[2]                    # third part, e.g. "wTSDAE"
concat_df["batchsize"] = name_parts.str[1].str.replace("batch", "", regex=False).map(int)
concat_df["basemodel"] = name_parts.str[0]                      # first part

# Shift the epoch numbers by one (presumably from zero-based to one-based counting)
concat_df["epoch"] = concat_df["epoch"] + 1

# Display names for the base models; only whole values are replaced, so the
# "gbert" rule cannot touch an already renamed "JobGBERT"
concat_df["basemodel"] = concat_df["basemodel"].replace({"jobgbert": "JobGBERT", "gbert": "GBERT"})

# Expose the column under the label 'base model'
concat_df = concat_df.rename(columns={"basemodel": "base model"})

In [None]:
# Plot the MRR of every model over the epochs.
# The pivot uses "epoch" as index (x-axis) and the MRR values as data (y-axis),
# so the "MRR@100" label belongs on the y-axis; the x-axis keeps the index name.
concat_df.pivot(index="epoch", columns="model", values="MRR").plot(ylabel="MRR@100")

In [None]:
# Table of MRR values: one row per epoch, one column per model
mrr_by_epoch = concat_df.pivot(columns="model", index="epoch", values="MRR")
mrr_by_epoch

In [None]:
# Best MRR per base model and batch size, drawn as a grouped bar chart.
best_mrr_by_batchsize = (
    concat_df
    # Only the MRR column is needed, so the maximum is taken on it alone
    .groupby(["base model", "batchsize"])["MRR"]
    .max()
    .round(3)
    .reset_index()
    # index/columns are passed by keyword: pandas >= 2.0 rejects positional arguments here
    .pivot(index="base model", columns="batchsize", values="MRR")
)
ax = best_mrr_by_batchsize.plot(
    kind="bar",
    title="Comparison of Model Results with Batch Size",
    ylabel="MRR@100",
)
# Write the value on top of every bar
for container in ax.containers:
    ax.bar_label(container)



In [None]:
# One panel per base model, both sharing the same y-axis
fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 5))
fig.suptitle('Comparison of base models', fontsize=16)

# Left panel: GBERT runs, right panel: JobGBERT runs (MRR per epoch, one line per model)
for panel, base_model in zip(axes, ("GBERT", "JobGBERT")):
    runs = concat_df[concat_df["base model"] == base_model].reset_index(drop=True)
    mrr_per_epoch = runs.pivot(index="epoch", values="MRR", columns="model")
    mrr_per_epoch.plot(ylabel="MRR@100", ax=panel, title=base_model, grid=True)

In [None]:
# Show all rows that belong to one particular model run
selected_model = "jobgbert_batch32_woTSDAE_2e-05_f10"
concat_df[concat_df["model"] == selected_model]

In [None]:
# Peek at the first two rows of the combined table
concat_df.iloc[:2]

In [None]:
# Helper that builds a "<base model>_<batchsize>" label for one row
def concat(row):
    """Return the row's 'base model' and 'batchsize' joined by an underscore.

    `row` is anything indexable by the keys "base model" and "batchsize"
    (e.g. a DataFrame row). The batch size is converted with `str()`; the base
    model is used as-is and therefore has to be a string already.
    """
    base_model = row["base model"]
    batch_size = str(row["batchsize"])
    return "_".join((base_model, batch_size))

In [None]:
# Best value per model run, compared between the two 'pretraining' settings.
# groupby's `sort` flag is a boolean (sorting of the group keys), so it is left
# at its default instead of being given a column name.
pretraing_df = (
    concat_df.groupby("model")[["batchsize", "base model", "MRR", "pretraining"]]
    .max()
    .reset_index(drop=True)
)
# Label every run as "<base model>_<batchsize>" (helper `concat` is defined in an earlier cell)
pretraing_df["modelname"] = pretraing_df.apply(concat, axis=1)

# NOTE: `fig` is the matplotlib Axes returned by DataFrame.plot, not a Figure.
fig = (
    pretraing_df
    # index/columns are passed by keyword: pandas >= 2.0 rejects positional arguments here
    .pivot(index="pretraining", columns="modelname", values="MRR")
    .T.round(3)
    .plot(
        kind="barh",
        title="Comparison of Models with and without pretraining",
        figsize=(11, 6),
    )
)
# In a horizontal bar chart the values run along the x-axis, so that is where
# the metric label goes; the y-axis keeps the model names.
fig.set_xlabel("MRR@100")

# Add legend and labels to the bars
fig.legend(loc="lower left")
for container in fig.containers:
    fig.bar_label(container)



In [None]:
# Best value per model run, compared between the two base models.
# groupby's `sort` flag is a boolean (sorting of the group keys), so it is left
# at its default instead of being given a column name.
model_comparison = (
    concat_df.groupby("model")[["modelname", "base model", "MRR"]]
    .max()
    .reset_index(drop=True)
)
# Drop the leading base-model part of the name so both base models share the same labels
model_comparison["modelname"] = model_comparison["modelname"].apply(lambda x: "_".join(x.split("_")[1:]))

# NOTE: `plot` is the matplotlib Axes returned by DataFrame.plot.
plot = (
    model_comparison
    # index/columns are passed by keyword: pandas >= 2.0 rejects positional arguments here
    .pivot(index="modelname", columns="base model", values="MRR")
    .sort_values("JobGBERT", ascending=True)
    .round(3)
    .plot(kind="barh", figsize=(11, 6), title="Comparison of Base Models")
)
# Horizontal bars: the metric runs along the x-axis
plot.set_xlabel("MRR@100")

# Add legend and labels to the bars
plot.legend(loc="lower left")
for container in plot.containers:
    plot.bar_label(container)



In [None]:
# Use 'epoch' as the index of every table in the results dictionary.
# Tables that are already indexed by 'epoch' are skipped, so the cell can be
# run more than once; a table that has no 'epoch' column at all still raises.
for model_name, frame in results.items():
    if frame.index.name != "epoch":
        # Re-assigning an existing key does not change the dict's size, so this
        # is safe while iterating over it
        results[model_name] = frame.set_index("epoch")

In [None]:
# For every model, report the index label at which its MRR is highest
for model_name, frame in results.items():
    best_epoch = frame["MRR"].idxmax(axis=0)
    print(model_name, best_epoch)



# Comparison Total

In [None]:
# Location of the Excel workbook with the final evaluation results
excel_name = (
    "../00_data/SBERT_Models/Evaluation/"
    "final_evaluation.xlsx"
)

In [None]:
# Load the final evaluation results and store 'pretraining' as integers
df_total = pd.read_excel(excel_name).astype({"pretraining": int})
# Keep only the rows for the 'description' and 'skillsets' embedding kinds
wanted_kinds = ["description", "skillsets"]
df_total = df_total[df_total["embedding_kind"].isin(wanted_kinds)]

In [None]:
# Map the raw model names to the short names used in the overview
replace_dict = {"jobgbert_batch32_woTSDAE":"jobgbert_trained",
                "gbert_batch32_woTSDAE":"gbert_trained",
                "jobgbert_TSDAE_epochs5":"jobgbert_pretrained",
                "gbert_TSDAE_epochs5":"gbert_pretrained",
                "jobgbert_untrained":"jobgbert_untrained",
                "gbert_untrained":"gbert_untrained"}
# Build a new frame with the renamed 'model' column. Calling
# `df_total["model"].replace(..., inplace=True)` would only modify a temporary
# column object: it triggers chained-assignment warnings and leaves the frame
# unchanged once pandas' Copy-on-Write behaviour is active.
df_total = df_total.assign(model=df_total["model"].replace(replace_dict))

In [None]:
# Keep only the rows whose model carries one of the short names from replace_dict
df_for_overview = df_total[df_total["model"].isin(replace_dict.values())]
# Round MRR to 3 decimals; assign() returns a new frame, so nothing is written
# into a filtered slice of df_total (no SettingWithCopyWarning)
df_for_overview = df_for_overview.assign(MRR=df_for_overview["MRR"].round(3))
# Column-wise maximum per model, reshaped to base model (rows) x training stage (columns)
df_for_overview = (
    df_for_overview.groupby(["model"])
    .max()
    .sort_values(by="MRR", ascending=False)
    .reset_index()
    # index/columns are passed by keyword: pandas >= 2.0 rejects positional arguments here
    .pivot(index="base model", columns="training", values="MRR")
    # Fix the order in which the training stages appear in the chart
    [["untrained", "pretraining", "trained"]]
)
# Create a bar plot to evaluate training effectiveness
ax = df_for_overview.plot(kind="bar",ylabel="MRR@100", title="Evaluation of training effectiveness")
# Add labels to the bars
for container in ax.containers:
    ax.bar_label(container)



In [52]:
# fig = px.bar(df_for_overview, y="model",x="MRR",color="basemodel", orientation="h")
# fig.update_layout(yaxis={'categoryorder':'total descending'})
# fig

# Comparison Shortened

In [None]:
# Load the final evaluation results
df_total = pd.read_excel(excel_name)
# Keep only the 'description' and 'skillsets' embedding kinds, then store
# 'pretraining' as integers. astype() returns a new frame, so no column is
# assigned on a filtered slice (which would raise SettingWithCopyWarning).
df_total = (
    df_total[df_total["embedding_kind"].isin(["description","skillsets"])]
    .astype({"pretraining": int})
)

In [None]:
# Peek at the first two rows of the filtered evaluation table
df_total.iloc[:2]

In [None]:
# Best MRR of the 'trained' models per model and text kind, one column per text kind
trained_models = df_total[df_total["training"]=="trained"]
df_shortened = (
    trained_models.groupby(["model", "textkind"])["MRR"]
    .max()
    .reset_index()
    # index/columns are passed by keyword: pandas >= 2.0 rejects positional arguments here
    .pivot(index="model", columns="textkind", values="MRR")
    .reset_index()
    # Rename columns for better readability
    .rename(columns={"embeddings_long":"original text","embeddings_short":"shortened text"})
)




In [None]:
# Two panels with a shared y-axis: GBERT on the left, JobGBERT on the right
fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(12, 5))
fig.suptitle('Comparison of Text Inputs', fontsize=16)

# Rows whose model name contains "job" go to the JobGBERT panel, the rest to GBERT
is_job_model = df_shortened["model"].str.contains("job")
shared_plot_args = dict(x="model", ylabel="MRR@100", kind="bar")

# Right panel (fig1 and fig2 hold the Axes objects returned by DataFrame.plot)
fig1 = df_shortened[is_job_model].round(3).plot(title="JobGBERT", ax=axes[1], **shared_plot_args)
fig1.legend(loc="lower left")
for container in fig1.containers:
    fig1.bar_label(container)

# Left panel, drawn without its own legend
fig2 = df_shortened[~is_job_model].round(3).plot(title="GBERT", ax=axes[0], legend=False, **shared_plot_args)
for container in fig2.containers:
    fig2.bar_label(container)

# Comparison Embeddings


In [None]:
# Load the final evaluation results and prepare them for the embedding comparison.
# Every step returns a new frame, so no column is assigned on a filtered slice
# (which would raise SettingWithCopyWarning).
df_total = (
    pd.read_excel(excel_name)
    # Exclude the 'job_centroid' embedding kind
    .loc[lambda frame: frame["embedding_kind"] != "job_centroid"]
    # Store 'pretraining' as integers
    .astype({"pretraining": int})
    .assign(
        # Round the 'MRR' column to 3 decimal places
        MRR=lambda frame: frame["MRR"].round(3),
        # Readable names for the embedding kinds; regex=False makes the literal
        # substring replacement explicit and independent of the pandas version
        embedding_kind=lambda frame: (
            frame["embedding_kind"]
            .str.replace("adcentroid_filtered", "filtered JAC", regex=False)
            .str.replace("adcentroid_unfiltered", "unfiltered JAC", regex=False)
        ),
    )
)

In [None]:
# Column-wise maximum per embedding kind, ordered by ascending MRR
max_per_kind = df_total.groupby(["embedding_kind"]).max()
df_embeddings = max_per_kind.sort_values(by="MRR")
display(df_embeddings)

# Horizontal bar chart of the rounded MRR values
df_embeddings["MRR"] = df_embeddings["MRR"].round(3)
bar_options = dict(
    kind="barh",
    y="MRR",
    xlabel="MRR@100",
    ylabel="Embedding kind",
    figsize=(11, 5),
    title="Comparison of Embeddings",
)
ax = df_embeddings["MRR"].plot(**bar_options)
# Label the bars; note that `ax` is rebound to the value returned by bar_label
ax = ax.bar_label(ax.containers[0])



# Job Centroids

In [None]:
# Load the final evaluation results and keep only the 'job_centroid' embeddings.
# Every step returns a new frame, so no column is assigned on a filtered slice
# (which would raise SettingWithCopyWarning).
df_total = (
    pd.read_excel(excel_name)
    # Include only the 'job_centroid' embedding kind
    .loc[lambda frame: frame["embedding_kind"] == "job_centroid"]
    # Store 'pretraining' as integers
    .astype({"pretraining": int})
    # Round the 'MRR' column to 3 decimal places
    .assign(MRR=lambda frame: frame["MRR"].round(3))
)

In [None]:
# Column-wise maximum per model, ordered by ascending MRR
df_embeddings = df_total.groupby(["model"]).max().sort_values(by="MRR")
# A bare expression is only rendered when it is the last statement of a cell,
# so the table is shown explicitly (same as in the embedding-kind comparison)
display(df_embeddings)
df_embeddings["MRR"] = df_embeddings["MRR"].round(3)
# Horizontal bar chart of the rounded MRR values
ax = df_embeddings["MRR"].plot(kind="barh",y="MRR",xlabel="MRR@100",
                               ylabel="model",figsize=(11,5), title="Comparison of Job Centroid")
# Add labels to the bars; note that `ax` is rebound to the value returned by bar_label
ax = ax.bar_label(ax.containers[0])