# mpnet, no preprocessing, no tuning, mean_pooling

In [2]:
# Sanity check: uses data loaded directly from Hugging Face, without fp16.

# installs
!pip install datasets transformers torch

# dependencies
import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer
from torch.nn.functional import cosine_similarity

# load the encoder and its tokenizer, then put the model in evaluation mode
model_name = 'sentence-transformers/all-mpnet-base-v2'
model = AutoModel.from_pretrained(model_name).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()

# collect the distinct ESCO skill names in sorted order and map each to its position
synthetic = load_dataset('jensjorisdecorte/Synthetic-ESCO-skill-sentences', split='train')
skills = sorted(set(synthetic['skill']))
skill2idx = dict(zip(skills, range(len(skills))))

# embed the skill names 128 at a time with attention-masked mean pooling
skill_chunks = []
with torch.no_grad():
    for start in range(0, len(skills), 128):
        encoded = tokenizer(skills[start:start + 128], padding='longest', truncation=True, return_tensors='pt')
        encoded = {name: tensor.to('cuda') for name, tensor in encoded.items()}
        token_states = model(**encoded).last_hidden_state
        # zero out padded positions, then divide by the number of real tokens per row
        mask = encoded['attention_mask'].unsqueeze(-1)
        skill_chunks.append((token_states * mask).sum(dim=1) / mask.sum(dim=1))
    skill_embeddings = torch.cat(skill_chunks, dim=0)


def calculate_metrics(similarities, true_labels, sentences, is_p=False, *, temperature=0.01, k=5):
    """Compute sentence-level MRR, RP@k and ATP for ranked label predictions.

    Entries that share the same sentence are merged into one evaluation unit:
    their labels are pooled into a set, and the scores of the first entry are
    used for ranking (rows for the same sentence are assumed to be identical).

    Args:
        similarities: 2-D tensor; row ``i`` holds the scores of entry ``i``
            against every candidate label (column index == label index).
        true_labels: Label index per entry. Values of -1 or below are ignored.
        sentences: Hashable sentence key per entry, used for grouping.
        is_p: When True, ``similarities`` is used as-is. Otherwise each row is
            turned into a distribution with a temperature-scaled softmax.
        temperature: Softmax temperature applied when ``is_p`` is False.
        k: Rank cutoff used for the RP@k metric.

    Returns:
        A tuple ``(mrr, rp_at_k, atp)`` averaged over the sentences that have
        at least one valid label, or ``(None, None, None)`` if there are none.
        Per sentence: MRR is the reciprocal rank of the first correct label,
        RP@k is the number of correct labels in the top ``k`` divided by
        ``min(k, number of labels)``, and ATP is the mean score assigned to
        the sentence's labels.

    Raises:
        ValueError: If ``temperature`` is not positive or ``k`` is below 1.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # convert sims to softmax distribution if needed
    if not is_p:
        similarities = torch.softmax(similarities / temperature, dim=1)

    # group entry positions by unique sentence (insertion order is preserved)
    rows_by_sentence = {}
    for row, sentence in enumerate(sentences):
        rows_by_sentence.setdefault(sentence, []).append(row)

    mrr_sum = rp_sum = atp_sum = 0.0
    count = 0

    for rows in rows_by_sentence.values():
        # pool all valid labels attached to this sentence
        gold = {true_labels[row] for row in rows if true_labels[row] > -1}
        if not gold:
            # no relevant labels for this sentence, skip
            continue

        # use the first entry's scores (assumed identical for the same sentence)
        scores = similarities[rows[0]]
        ranking = torch.argsort(scores, descending=True).tolist()

        # MRR: reciprocal rank of the first correct label (0 if none is found)
        first_hit = next(
            (pos for pos, pred in enumerate(ranking, start=1) if pred in gold),
            None,
        )
        mrr_sum += 1.0 / first_hit if first_hit is not None else 0.0

        # RP@k: correct labels within the top k, capped by how many labels exist
        rp_sum += len(gold.intersection(ranking[:k])) / min(k, len(gold))

        # ATP: mean score placed on the correct labels
        atp_sum += scores[sorted(gold)].sum().item() / len(gold)

        count += 1

    # return averages, handle case when no sentence had a valid label
    if count == 0:
        return None, None, None
    return mrr_sum / count, rp_sum / count, atp_sum / count

# function to compute batched cosine similarities
def batched_cosine_similarity(sent_embeddings, skill_embeddings, batch_size=128):
    """Return the cosine similarity of every sentence against every skill.

    Both inputs are L2-normalised along their last dimension, after which the
    similarity of a pair is a plain dot product. This computes each chunk as a
    single matrix product instead of broadcasting a
    (batch, 1, dim) tensor against a (1, num_skills, dim) tensor.

    Args:
        sent_embeddings: 2-D tensor of shape (num_sentences, dim).
        skill_embeddings: 2-D tensor of shape (num_skills, dim).
        batch_size: Number of sentence rows processed per matrix product.

    Returns:
        Tensor of shape (num_sentences, num_skills). An input with zero
        sentence rows yields an empty (0, num_skills) tensor.
    """
    num_skills = skill_embeddings.shape[0]
    if len(sent_embeddings) == 0:
        # torch.cat cannot concatenate an empty list, so answer directly
        return sent_embeddings.new_zeros((0, num_skills))

    normalize = torch.nn.functional.normalize
    # eps=1e-8 mirrors the default eps of torch.nn.functional.cosine_similarity
    skill_unit = normalize(skill_embeddings, dim=-1, eps=1e-8)

    chunks = []
    for start in range(0, len(sent_embeddings), batch_size):
        sent_unit = normalize(sent_embeddings[start:start + batch_size], dim=-1, eps=1e-8)
        chunks.append(sent_unit @ skill_unit.T)
    return torch.cat(chunks, dim=0)

# evaluate the encoder on each skill-extraction benchmark
for dataset_name in ['tech', 'house', 'techwolf']:
    print(f"\nProcessing {dataset_name}...")

    # fetch the test split of this benchmark
    benchmark = load_dataset(f'jensjorisdecorte/skill-extraction-{dataset_name}', split='test')

    # keep only (sentence, label) pairs whose label is one of the known skills
    valid_entries = [
        pair
        for pair in zip(benchmark['sentence'], benchmark['label'])
        if pair[1] in skill2idx
    ]
    if len(valid_entries) == 0:
        print(f"No valid entries found in {dataset_name}")
        continue

    sentences, labels = zip(*valid_entries)
    label_indices = [skill2idx[name] for name in labels]

    # embed the sentences 128 at a time with attention-masked mean pooling
    sentence_list = list(sentences)
    sent_chunks = []
    with torch.no_grad():
        for start in range(0, len(sentence_list), 128):
            encoded = tokenizer(sentence_list[start:start + 128], padding='longest', truncation=True, return_tensors='pt')
            encoded = {name: tensor.to('cuda') for name, tensor in encoded.items()}
            token_states = model(**encoded).last_hidden_state
            # zero out padded positions, then divide by the number of real tokens per row
            mask = encoded['attention_mask'].unsqueeze(-1)
            sent_chunks.append((token_states * mask).sum(dim=1) / mask.sum(dim=1))
        sent_embeddings = torch.cat(sent_chunks, dim=0)

    # score every sentence against every skill, then aggregate the ranking metrics
    similarities = batched_cosine_similarity(sent_embeddings, skill_embeddings, batch_size=128)
    mrr, rp5, atp = calculate_metrics(similarities, label_indices, sentence_list)

    print(f"Processed entries: {len(valid_entries)}")
    print(f"MRR: {mrr:.4f}")
    print(f"RP@5: {rp5:.4f}")
    print(f"ATP: {atp:.4f}")


Processing tech...
Processed entries: 673
MRR: 0.3882
RP@5: 0.3971
ATP: 0.1827

Processing house...
Processed entries: 568
MRR: 0.2633
RP@5: 0.2617
ATP: 0.0996

Processing techwolf...
Processed entries: 588
MRR: 0.2958
RP@5: 0.3348
ATP: 0.1498
