# Imports

In [23]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/TRAINING

In [24]:
# %%capture
# !pip install sentence_transformers

In [1]:
from sentence_transformers import SentenceTransformer, InputExample
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util
from tqdm import tqdm
import random
from sentence_transformers import evaluation
import json
import pandas as pd
from collections import Counter
import os
from sklearn.model_selection import train_test_split
from helpers import *
import numpy as np
from datetime import datetime
from sklearn.model_selection import KFold
from transformers import set_seed
set_seed(42)

In [2]:
def create_trainig_samples(pos_dev_samples,neg_pairs):
  dev_set_total =[]
  anchors = set([x[0] for x in pos_dev_samples])
  neg_dev_samples = [x for x in neg_pairs if x[0] in anchors]
  print("Creating Devset")
  for anchor in tqdm(anchors):
    pos_pairs_filtered = [x[1] for x in pos_dev_samples if x[0]==anchor]
    neg_pairs_filtered = [x[1] for x in neg_dev_samples if x[0]==anchor]
    dev_set_total.append({"query":anchor,"positive":pos_pairs_filtered,"negative":neg_pairs_filtered})
  return dev_set_total

In [3]:
data_dict = load_data_pairs()
pos_pairs = flatten_list([data_dict[x] for x in data_dict if "pos" in x])
neg_pairs = flatten_list([data_dict[x] for x in data_dict if "neg" in x])

100%|██████████| 6/6 [00:00<00:00,  6.05it/s]


# Variables

In [10]:
# modelname = "../00_data/SBERT_Models/models/jobgbert_TSDAE_epochs5"
# modelname = "agne/jobgbert"
# modelname = "google/rembert"
modelname = "google-bert/bert-base-multilingual-cased"

In [11]:
model = SentenceTransformer(modelname)

No sentence-transformers model found with name google-bert/bert-base-multilingual-cased. Creating a new one with mean pooling.


In [30]:
if modelname == "agne/jobgbert":
  TSDAE = "woTSDAE"
elif modelname == "../00_data/SBERT_Models/models/jobgbert_TSDAE_epochs5":
  TSDAE = "wTSDAE"
else:
  raise TypeError

TypeError: 

In [6]:
if modelname == "google/rembert":
  TSDAE = "woTSDAE"

In [12]:
if modelname == "google-bert/bert-base-multilingual-cased":
  TSDAE = "woTSDAE"

In [13]:
batch_size = 32
lr = 2e-5
num_epochs = 1
fold_size = 10
output_path = f"../00_data/SBERT_Models/models/rembert_{batch_size}_{TSDAE}_{lr}_f{fold_size}"
output_path

'../00_data/SBERT_Models/models/rembert_32_woTSDAE_2e-05_f10'

In [None]:
batch_size = 32
lr = 2e-5
num_epochs = 1
fold_size = 10
output_path = f"../00_data/SBERT_Models/models/consultantbert_{batch_size}_{TSDAE}_{lr}_f{fold_size}"
output_path

In [8]:
kf = KFold(n_splits=fold_size, random_state=42, shuffle=True)

In [9]:
MRR = []
MRR_AT = 100
training_start = "".join([c for c in str(datetime.now()).split('.')[0] if c.isdigit()])
max_MRR = 0

for epoch, (train_index, dev_index) in enumerate(kf.split(pos_pairs)):

  pos_train_samples = [pos_pairs[i] for i in train_index]
  pos_dev_samples = [pos_pairs[i] for i in dev_index]
  warmup = len(pos_train_samples)*0.1
 
  dev_set_total = create_trainig_samples(pos_dev_samples,neg_pairs)
  train_examples = []
  for item in pos_train_samples:
    train_examples.append(InputExample(texts=[item[0], item[1]]))
  train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
  train_loss = losses.MultipleNegativesRankingLoss(model)
  evaluator = evaluation.RerankingEvaluator(dev_set_total,at_k=100,show_progress_bar=True)
  # train the model 
  
  model.fit(train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup,
    evaluator=evaluator,
    checkpoint_path="./model_checkpoints",  # Changed path
    checkpoint_save_total_limit=1,
    #save_best_model = True,
    optimizer_params={'lr':lr},
    checkpoint_save_steps = 1000,
    output_path= output_path,
  )
  # load the test ads and encode them with the current model
  testads = pd.DataFrame(load_json("../00_data/EURES/eures_testads_final_short.json"))
  encodings_short = model.encode(list(testads["short_texts"]), show_progress_bar=True)
  testads["embeddings_short"] = encodings_short.tolist()
  embeddings = encode_jobs(model)

  # make evaluation
  similarities = {}
  for k in embeddings:
    similarities[k] = (util.cos_sim(testads["embeddings_short"],embeddings[k]["embeddings"]))
  for k in similarities.keys():
    ranks = []
    missing = 0
    simdf = pd.DataFrame(similarities[k],columns=embeddings[k]["esco_id"], index=testads["esco_id"])
    for i in tqdm(range(len(simdf))):
      id = simdf.iloc[i].name
      series = simdf.iloc[i].sort_values(ascending=False).reset_index()
      #print(series)
      rank = (series[series["index"]==id].index.item()+1)
      #print(rank)
      if rank > MRR_AT:
        missing +=1
        ranks.append(0)
      else:
        ranks.append(1/rank)
    missing = missing/len(simdf)
    current_run = {"model":output_path.split("/")[-1],"epoch":epoch,
                   "embedding_kind":k, "MRR":np.mean(ranks),
                   "missing":missing, "MRR@":MRR_AT,
                   "training_details":[training_start, batch_size, lr, warmup, num_epochs, fold_size, TSDAE]}
    MRR.append(current_run)
    df = pd.DataFrame(MRR)
    display(df)
    # safe model separately, if new model has higher MRR than best model before 
    if np.mean(ranks) > max_MRR:
      print(f"New best Model saved after epoch {epoch}")
      max_MRR = np.mean(ranks)
      best_model_to_save = model
      best_model_to_save.save(f"{output_path}_best")
      write_json(f"{output_path}/model_info.json",current_run)
    df.to_excel(f"{output_path}/eval/{training_start}_trainig_details.xlsx")
best_model_to_save.save(f"{output_path}")

Creating Devset


100%|██████████| 2848/2848 [00:43<00:00, 65.24it/s]


Step,Training Loss,Validation Loss,Map,Mrr@100,Ndcg@100
500,2.6234,,,,
1000,1.7508,,,,
1500,1.5357,,,,
2000,1.38,,,,
2500,1.3147,,,,
3000,1.2375,,,,
3500,1.2152,,,,
3695,1.2152,No log,0.671629,0.869754,0.820508


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/10281 [00:00<?, ?it/s]

Batches:   0%|          | 0/71 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

100%|██████████| 2250/2250 [00:01<00:00, 1176.09it/s]


Unnamed: 0,model,epoch,embedding_kind,MRR,missing,MRR@,training_details
0,rembert_32_woTSDAE_2e-05_f10,0,skillsets,0.286642,0.178222,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."


New best Model saved after epoch 0
Sucessfully saved file: ../00_data/SBERT_Models/models/rembert_32_woTSDAE_2e-05_f10/model_info.json


100%|██████████| 2250/2250 [00:01<00:00, 1169.24it/s]


Unnamed: 0,model,epoch,embedding_kind,MRR,missing,MRR@,training_details
0,rembert_32_woTSDAE_2e-05_f10,0,skillsets,0.286642,0.178222,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
1,rembert_32_woTSDAE_2e-05_f10,0,desc,0.376811,0.144444,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."


New best Model saved after epoch 0
Sucessfully saved file: ../00_data/SBERT_Models/models/rembert_32_woTSDAE_2e-05_f10/model_info.json


100%|██████████| 2250/2250 [00:01<00:00, 1186.29it/s]


Unnamed: 0,model,epoch,embedding_kind,MRR,missing,MRR@,training_details
0,rembert_32_woTSDAE_2e-05_f10,0,skillsets,0.286642,0.178222,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
1,rembert_32_woTSDAE_2e-05_f10,0,desc,0.376811,0.144444,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
2,rembert_32_woTSDAE_2e-05_f10,0,jobtitle,0.330851,0.16,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."


Creating Devset


100%|██████████| 2850/2850 [00:43<00:00, 65.51it/s]


Step,Training Loss,Validation Loss,Map,Mrr@100,Ndcg@100
500,1.0593,,,,
1000,0.9971,,,,
1500,0.9711,,,,
2000,0.9658,,,,
2500,0.9937,,,,
3000,0.9895,,,,
3500,0.9931,,,,
3695,0.9931,No log,0.718659,0.887589,0.848324


Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/10292 [00:00<?, ?it/s]

Batches:   0%|          | 0/71 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

100%|██████████| 2250/2250 [00:01<00:00, 1136.33it/s]


Unnamed: 0,model,epoch,embedding_kind,MRR,missing,MRR@,training_details
0,rembert_32_woTSDAE_2e-05_f10,0,skillsets,0.286642,0.178222,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
1,rembert_32_woTSDAE_2e-05_f10,0,desc,0.376811,0.144444,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
2,rembert_32_woTSDAE_2e-05_f10,0,jobtitle,0.330851,0.16,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
3,rembert_32_woTSDAE_2e-05_f10,1,skillsets,0.290252,0.18,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."


100%|██████████| 2250/2250 [00:01<00:00, 1137.56it/s]


Unnamed: 0,model,epoch,embedding_kind,MRR,missing,MRR@,training_details
0,rembert_32_woTSDAE_2e-05_f10,0,skillsets,0.286642,0.178222,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
1,rembert_32_woTSDAE_2e-05_f10,0,desc,0.376811,0.144444,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
2,rembert_32_woTSDAE_2e-05_f10,0,jobtitle,0.330851,0.16,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
3,rembert_32_woTSDAE_2e-05_f10,1,skillsets,0.290252,0.18,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
4,rembert_32_woTSDAE_2e-05_f10,1,desc,0.385673,0.148889,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."


New best Model saved after epoch 1
Sucessfully saved file: ../00_data/SBERT_Models/models/rembert_32_woTSDAE_2e-05_f10/model_info.json


100%|██████████| 2250/2250 [00:01<00:00, 1130.92it/s]


Unnamed: 0,model,epoch,embedding_kind,MRR,missing,MRR@,training_details
0,rembert_32_woTSDAE_2e-05_f10,0,skillsets,0.286642,0.178222,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
1,rembert_32_woTSDAE_2e-05_f10,0,desc,0.376811,0.144444,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
2,rembert_32_woTSDAE_2e-05_f10,0,jobtitle,0.330851,0.16,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
3,rembert_32_woTSDAE_2e-05_f10,1,skillsets,0.290252,0.18,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
4,rembert_32_woTSDAE_2e-05_f10,1,desc,0.385673,0.148889,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
5,rembert_32_woTSDAE_2e-05_f10,1,jobtitle,0.335263,0.175111,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."


Creating Devset


100%|██████████| 2852/2852 [00:44<00:00, 64.01it/s]


Step,Training Loss,Validation Loss,Map,Mrr@100,Ndcg@100
500,0.9018,,,,
1000,0.855,,,,
1500,0.8256,,,,
2000,0.8371,,,,
2500,0.8631,,,,
3000,0.8749,,,,
3500,0.8901,,,,
3695,0.8901,No log,0.74588,0.899958,0.864061


Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/10286 [00:00<?, ?it/s]

Batches:   0%|          | 0/71 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

100%|██████████| 2250/2250 [00:01<00:00, 1145.26it/s]


Unnamed: 0,model,epoch,embedding_kind,MRR,missing,MRR@,training_details
0,rembert_32_woTSDAE_2e-05_f10,0,skillsets,0.286642,0.178222,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
1,rembert_32_woTSDAE_2e-05_f10,0,desc,0.376811,0.144444,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
2,rembert_32_woTSDAE_2e-05_f10,0,jobtitle,0.330851,0.16,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
3,rembert_32_woTSDAE_2e-05_f10,1,skillsets,0.290252,0.18,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
4,rembert_32_woTSDAE_2e-05_f10,1,desc,0.385673,0.148889,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
5,rembert_32_woTSDAE_2e-05_f10,1,jobtitle,0.335263,0.175111,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
6,rembert_32_woTSDAE_2e-05_f10,2,skillsets,0.28778,0.177778,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."


100%|██████████| 2250/2250 [00:01<00:00, 1146.04it/s]


Unnamed: 0,model,epoch,embedding_kind,MRR,missing,MRR@,training_details
0,rembert_32_woTSDAE_2e-05_f10,0,skillsets,0.286642,0.178222,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
1,rembert_32_woTSDAE_2e-05_f10,0,desc,0.376811,0.144444,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
2,rembert_32_woTSDAE_2e-05_f10,0,jobtitle,0.330851,0.16,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
3,rembert_32_woTSDAE_2e-05_f10,1,skillsets,0.290252,0.18,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
4,rembert_32_woTSDAE_2e-05_f10,1,desc,0.385673,0.148889,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
5,rembert_32_woTSDAE_2e-05_f10,1,jobtitle,0.335263,0.175111,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
6,rembert_32_woTSDAE_2e-05_f10,2,skillsets,0.28778,0.177778,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
7,rembert_32_woTSDAE_2e-05_f10,2,desc,0.389333,0.153333,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."


New best Model saved after epoch 2
Sucessfully saved file: ../00_data/SBERT_Models/models/rembert_32_woTSDAE_2e-05_f10/model_info.json


100%|██████████| 2250/2250 [00:01<00:00, 1159.36it/s]


Unnamed: 0,model,epoch,embedding_kind,MRR,missing,MRR@,training_details
0,rembert_32_woTSDAE_2e-05_f10,0,skillsets,0.286642,0.178222,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
1,rembert_32_woTSDAE_2e-05_f10,0,desc,0.376811,0.144444,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
2,rembert_32_woTSDAE_2e-05_f10,0,jobtitle,0.330851,0.16,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
3,rembert_32_woTSDAE_2e-05_f10,1,skillsets,0.290252,0.18,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
4,rembert_32_woTSDAE_2e-05_f10,1,desc,0.385673,0.148889,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
5,rembert_32_woTSDAE_2e-05_f10,1,jobtitle,0.335263,0.175111,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
6,rembert_32_woTSDAE_2e-05_f10,2,skillsets,0.28778,0.177778,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
7,rembert_32_woTSDAE_2e-05_f10,2,desc,0.389333,0.153333,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."
8,rembert_32_woTSDAE_2e-05_f10,2,jobtitle,0.333842,0.173778,100,"[20250202195054, 32, 2e-05, 11821.0, 1, 10, wo..."


Creating Devset


100%|██████████| 2847/2847 [00:44<00:00, 63.66it/s]


Step,Training Loss
500,0.8253
1000,0.7634
1500,0.7358
2000,0.7677
2500,0.7746


KeyboardInterrupt: 