Full Fine-tune Text Embeddings for AI Job Search

Imports

In [4]:
! pip install sentence-transformers==3.4.1

Collecting sentence-transformers==3.4.1
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Installing collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 2.7.0
    Uninstalling sentence-transformers-2.7.0:
      Successfully uninstalled sentence-transformers-2.7.0
Successfully installed sentence-transformers-3.4.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    # SentenceTransformerTrainer,
    losses, evaluation,InputExample,
    # SentenceTransformerTrainingArguments,
)
from torch.utils.data import DataLoader
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

import pre-trained model

In [2]:
# Base checkpoint that is fine-tuned in the cells below (author's note: acc = 0.88).
model_name = "sentence-transformers/all-distilroberta-v1"
model = SentenceTransformer(model_name_or_path=model_name)

# Other checkpoints noted by the author, with the accuracy recorded for each:
#   "microsoft/mpnet-base"                             -> acc = 0.57
#   "sentence-transformers/msmarco-bert-base-dot-v5"   -> acc = 0.09
#   "sentence-transformers/msmarco-distilbert-dot-v5"  -> acc = 0.13

load dataset

In [3]:
# Load the dataset by its identifier. Later cells read its "train", "validation"
# and "test" splits and the "query", "job_description_pos" and
# "job_description_neg" columns.
dataset_id = "shawhin/ai-job-embedding-finetuning"
dataset = load_dataset(dataset_id)

evaluate pre-trained model on eval data

In [4]:
# Baseline: build a triplet evaluator from the validation split (query as anchor,
# plus the positive and negative job descriptions) and score the pre-trained model.
validation_split = dataset["validation"]
evaluator_valid = TripletEvaluator(
    name="ai-job-validation",
    anchors=validation_split["query"],
    positives=validation_split["job_description_pos"],
    negatives=validation_split["job_description_neg"],
)
evaluator_valid(model)  # last expression, so the metrics dict is displayed

{'ai-job-validation_cosine_accuracy': 0.8811880946159363}

define loss function

In [5]:
# Loss object used for fine-tuning; it is constructed around the model it will train.
loss = MultipleNegativesRankingLoss(model=model)

define training args

In [6]:
# Hyperparameters and the name under which the fine-tuned model is stored.
num_epochs = 2
batch_size = 2
# lr = 2e-5
finetuned_model_name = "distilroberta-ai-job-embeddings"

# Pair each training query with its positive job description. Only these two
# columns are used; the negative column is not read in this cell.
train_split = dataset["train"]
train_examples = [
    InputExample(texts=[anchor, positive])
    for anchor, positive in zip(train_split["query"], train_split["job_description_pos"])
]

# Shuffled mini-batches of InputExample objects, consumed by model.fit.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)

# Configuration for the SentenceTransformerTrainer API, kept for reference
# (currently disabled; it relies on the commented-out `lr` above):
# train_args = SentenceTransformerTrainingArguments(
#     output_dir=f"models/{finetuned_model_name}",
#     num_train_epochs=num_epochs,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     learning_rate=lr,
#     warmup_ratio=0.1,
#     batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
#     eval_strategy="steps",
#     eval_steps=100,
#     logging_steps=100,
# )


In [7]:
# Run fine-tuning on the single (dataloader, loss) objective. The validation
# evaluator is handed to fit, so its accuracy shows up in the training log.
train_objective = (train_dataloader, loss)
model.fit(
    train_objectives=[train_objective],
    epochs=num_epochs,
    warmup_steps=100,
    evaluator=evaluator_valid,
)



Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Ai-job-validation Cosine Accuracy
405,No log,No log,0.990099
500,0.070000,No Log,No Log
810,0.070000,No log,0.990099


save fine-tuned model (training itself runs in the model.fit cell above)

In [8]:
# Disabled alternative that trains through SentenceTransformerTrainer
# (it depends on the commented-out `train_args`):
# %%time
# trainer = SentenceTransformerTrainer(
#     model=model,
#     args=train_args,
#     train_dataset=dataset["train"],
#     eval_dataset=dataset["validation"],
#     loss=loss,
#     evaluator=evaluator_valid,
# )
# trainer.train()

# Write the fine-tuned model to disk under models/<finetuned_model_name>.
save_dir = f"models/{finetuned_model_name}"
model.save(save_dir)

evaluate fine-tuned model

In [9]:
# Same triplet evaluation as for validation, built from the test split.
test_split = dataset["test"]
evaluator_test = TripletEvaluator(
    name="ai-job-test",
    anchors=test_split["query"],
    positives=test_split["job_description_pos"],
    negatives=test_split["job_description_neg"],
)

# Score the fine-tuned model on both splits, validation first.
validation_metrics = evaluator_valid(model)
print("Validation:", validation_metrics)
test_metrics = evaluator_test(model)
print("Test:", test_metrics)

Validation: {'ai-job-validation_cosine_accuracy': 0.9900990128517151}
Test: {'ai-job-test_cosine_accuracy': 1.0}


push fine-tuned model to HF hub

In [10]:
import os

from huggingface_hub import login

# The Hub token is read from the environment rather than hard-coded.
# NOTE(review): os.getenv returns None when HUGGINGFACE_KEY is unset, and that
# None is passed straight to login — confirm the resulting behaviour is intended.
access_token = write_key = os.getenv("HUGGINGFACE_KEY")
login(token=write_key)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
# Upload the fine-tuned model to the Hub repository krshahvivek/<finetuned_model_name>.
model_id = f"krshahvivek/{finetuned_model_name}"
model.push_to_hub(model_id)

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

'https://huggingface.co/krshahvivek/distilroberta-ai-job-embeddings/commit/e2eed1ca0a858eddd0c2839fe2b17d807a25c019'