# Imports

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/TRAINING

In [2]:
# %%capture
# !pip install sentence_transformers

In [3]:
from sentence_transformers import SentenceTransformer, InputExample
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util
from tqdm import tqdm
import random
from sentence_transformers import evaluation
import json
import pandas as pd
from collections import Counter
import os
from sklearn.model_selection import train_test_split
from helpers import *
import numpy as np
from datetime import datetime
from sklearn.model_selection import KFold
from transformers import set_seed
import accelerate
set_seed(42)

In [4]:
def create_trainig_samples(pos_dev_samples,neg_pairs):
  dev_set_total =[]
  anchors = set([x[0] for x in pos_dev_samples])
  neg_dev_samples = [x for x in neg_pairs if x[0] in anchors]
  print("Creating Devset")
  for anchor in tqdm(anchors):
    pos_pairs_filtered = [x[1] for x in pos_dev_samples if x[0]==anchor]
    neg_pairs_filtered = [x[1] for x in neg_dev_samples if x[0]==anchor]
    dev_set_total.append({"query":anchor,"positive":pos_pairs_filtered,"negative":neg_pairs_filtered})
  return dev_set_total

In [5]:
data_dict = load_data_pairs()
pos_pairs = flatten_list([data_dict[x] for x in data_dict if "pos" in x])
neg_pairs = flatten_list([data_dict[x] for x in data_dict if "neg" in x])

100%|█████████████████████████████████████████████| 6/6 [00:00<00:00,  6.07it/s]


# Variables

In [6]:
modelname = "../00_data/SBERT_Models/models/gbert_TSDAE_epochs5"
# modelname = "deepset/gbert"

In [7]:
model = SentenceTransformer(modelname)

In [8]:
if modelname == "deepset/gbert":
  TSDAE = "woTSDAE"
elif modelname == "../00_data/SBERT_Models/models/gbert_TSDAE_epochs5":
  TSDAE = "wTSDAE"
else:
  raise TypeError

In [9]:
batch_size = 16
lr = 2e-5
num_epochs = 1
fold_size = 10
output_path = f"../00_data/SBERT_Models/models/gbert_batch{batch_size}_{TSDAE}_{lr}_f{fold_size}"
output_path

'../00_data/SBERT_Models/models/gbert_batch16_wTSDAE_2e-05_f10'

In [10]:
kf = KFold(n_splits=fold_size, random_state=42, shuffle=True)

In [12]:
MRR = []
MRR_AT = 100
training_start = "".join([c for c in str(datetime.now()).split('.')[0] if c.isdigit()])
max_MRR = 0

for epoch, (train_index, dev_index) in enumerate(kf.split(pos_pairs)):

  pos_train_samples = [pos_pairs[i] for i in train_index]
  pos_dev_samples = [pos_pairs[i] for i in dev_index]
  warmup = len(pos_train_samples)*0.1
 
  dev_set_total = create_trainig_samples(pos_dev_samples,neg_pairs)
  train_examples = []
  for item in pos_train_samples:
    train_examples.append(InputExample(texts=[item[0], item[1]]))
  train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
  train_loss = losses.MultipleNegativesRankingLoss(model)
  evaluator = evaluation.RerankingEvaluator(dev_set_total,at_k=100,show_progress_bar=True)
  # train the model 
  
  model.fit(train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup,
    evaluator=evaluator,
    checkpoint_path="00_data/SBERT_models/modeltrain",
    checkpoint_save_total_limit=1,
    #save_best_model = True,
    optimizer_params={'lr':lr},
    checkpoint_save_steps = 1000,
    output_path= output_path,
  )
  # load the test ads and encode them with the current model
  testads = pd.DataFrame(load_json("../00_data/EURES/eures_testads_final_short.json"))
  encodings_short = model.encode(list(testads["short_texts"]), show_progress_bar=True)
  testads["embeddings_short"] = encodings_short.tolist()
  embeddings = encode_jobs(model)

  # make evaluation
  similarities = {}
  for k in embeddings:
    similarities[k] = (util.cos_sim(testads["embeddings_short"],embeddings[k]["embeddings"]))
  for k in similarities.keys():
    ranks = []
    missing = 0
    simdf = pd.DataFrame(similarities[k],columns=embeddings[k]["esco_id"], index=testads["esco_id"])
    for i in tqdm(range(len(simdf))):
      id = simdf.iloc[i].name
      series = simdf.iloc[i].sort_values(ascending=False).reset_index()
      #print(series)
      rank = (series[series["index"]==id].index.item()+1)
      #print(rank)
      if rank > MRR_AT:
        missing +=1
        ranks.append(0)
      else:
        ranks.append(1/rank)
    missing = missing/len(simdf)
    current_run = {"model":output_path.split("/")[-1],"epoch":epoch,
                   "embedding_kind":k, "MRR":np.mean(ranks),
                   "missing":missing, "MRR@":MRR_AT,
                   "training_details":[training_start, batch_size, lr, warmup, num_epochs, fold_size, TSDAE]}
    MRR.append(current_run)
    df = pd.DataFrame(MRR)
    display(df)
    # safe model separately, if new model has higher MRR than best model before 
    if np.mean(ranks) > max_MRR:
      print(f"New best Model saved after epoch {epoch}")
      max_MRR = np.mean(ranks)
      best_model_to_save = model
      best_model_to_save.save(f"{output_path}_best")
      write_json(f"{output_path}/model_info.json",current_run)
    df.to_excel(f"{output_path}/eval/{training_start}_trainig_details.xlsx")
best_model_to_save.save(f"{output_path}")

Creating Devset


100%|███████████████████████████████████████| 2848/2848 [00:43<00:00, 65.23it/s]


Step,Training Loss,Validation Loss,Map,Mrr@100,Ndcg@100
500,2.1859,,,,
1000,1.6875,,,,
1500,1.4932,,,,
2000,1.3347,,,,
2500,1.2384,,,,
3000,1.1625,,,,
3500,1.1116,,,,
4000,1.029,,,,
4500,1.0156,,,,
5000,0.9721,,,,


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

Batches:   0%|          | 0/10281 [00:00<?, ?it/s]

Batches:   0%|          | 0/71 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

Batches:   0%|          | 0/92 [00:00<?, ?it/s]

100%|█████████████████████████████████████| 2250/2250 [00:01<00:00, 1136.54it/s]


Unnamed: 0,model,epoch,embedding_kind,MRR,missing,MRR@,training_details
0,gbert_batch16_wTSDAE_2e-05_f10,0,skillsets,0.212598,0.262222,100,"[20241230185335, 16, 2e-05, 11821.0, 1, 10, wT..."


New best Model saved after epoch 0
Sucessfully saved file: ../00_data/SBERT_Models/models/gbert_batch16_wTSDAE_2e-05_f10/model_info.json


OSError: Cannot save file into a non-existent directory: '../00_data/SBERT_Models/models/gbert_batch16_wTSDAE_2e-05_f10/eval'