In [None]:
from datasets import load_dataset

from sentence_transformers import SentenceTransformer
import torch

# Name of the checkpoint handed to SentenceTransformer; it is also reused
# further down when building output paths.
model_checkpoint = "paraphrase-multilingual-MiniLM-L12-v2"

# Prefer CUDA when torch can see a GPU, otherwise stay on the CPU.
model = SentenceTransformer(
    model_checkpoint,
    device=torch.device("cpu" if not torch.cuda.is_available() else "cuda"),
)

# Small hand-written sample: three job descriptions ...
jobs = [
    "Lập trình viên Python tại công ty ABC.",
    "Chuyên viên phân tích dữ liệu tại công ty XYZ.",
    "Nhân viên kinh doanh phần mềm cho công ty DEF.",
]

# ... and three CV snippets to compare against them.
cvs = [
    "Tôi có 3 năm kinh nghiệm phát triển Python và sử dụng Flask.",
    "Kỹ năng phân tích dữ liệu với Python, R, và SQL, 2 năm kinh nghiệm.",
    "Tôi từng làm nhân viên kinh doanh phần mềm trong 4 năm.",
]

job_embeddings, cv_embeddings = model.encode(jobs), model.encode(cvs)

# Cosine similarity between every (CV, job) pair; the CV embeddings are
# passed first and the job embeddings second.
similarities = model.similarity(cv_embeddings, job_embeddings)
print("Cosine Similarity Matrix:", similarities, sep="\n")

In [None]:
# Load the five datasets in a fixed order; each repository id is passed
# unchanged to load_dataset and bound to the matching name on the left.
(
    binary_dataset,
    triplet_dataset,
    similarity_dataset,
    job_paraphrase_dataset,
    cv_paraphrase_dataset,
) = map(
    load_dataset,
    (
        "HZeroxium/job-cv-binary",
        "HZeroxium/cv-job-triplet",
        "HZeroxium/cv-job-similarity",
        "HZeroxium/job-paraphrase",
        "HZeroxium/cv-paraphrase",
    ),
)

In [None]:
# Show the loaded binary dataset object as this cell's output.
binary_dataset

In [None]:
# Show the loaded triplet dataset object as this cell's output.
triplet_dataset

In [None]:
# Show the loaded similarity dataset object as this cell's output.
similarity_dataset

In [None]:
# Show the loaded job-paraphrase dataset object as this cell's output.
job_paraphrase_dataset

In [None]:
# Show the loaded CV-paraphrase dataset object as this cell's output.
cv_paraphrase_dataset

In [None]:
# For every dataset, the "train" split is used for training and the "test"
# split for evaluation.
train_binary_dataset, eval_binary_dataset = (
    binary_dataset["train"],
    binary_dataset["test"],
)
train_triplet_dataset, eval_triplet_dataset = (
    triplet_dataset["train"],
    triplet_dataset["test"],
)
train_similarity_dataset, eval_similarity_dataset = (
    similarity_dataset["train"],
    similarity_dataset["test"],
)
train_job_paraphrase_dataset, eval_job_paraphrase_dataset = (
    job_paraphrase_dataset["train"],
    job_paraphrase_dataset["test"],
)
train_cv_paraphrase_dataset, eval_cv_paraphrase_dataset = (
    cv_paraphrase_dataset["train"],
    cv_paraphrase_dataset["test"],
)

In [None]:
# Task name -> training split. The same task names are used as keys for the
# evaluation splits below and for the losses dictionary.
train_dataset = dict(
    binary=train_binary_dataset,
    triplet=train_triplet_dataset,
    similarity=train_similarity_dataset,
    job_paraphrase=train_job_paraphrase_dataset,
    cv_paraphrase=train_cv_paraphrase_dataset,
)

# Task name -> evaluation split.
eval_dataset = dict(
    binary=eval_binary_dataset,
    triplet=eval_triplet_dataset,
    similarity=eval_similarity_dataset,
    job_paraphrase=eval_job_paraphrase_dataset,
    cv_paraphrase=eval_cv_paraphrase_dataset,
)

In [None]:
from sentence_transformers.losses import (
    ContrastiveLoss,
    TripletLoss,
    CoSENTLoss,
    MultipleNegativesRankingLoss,
)



# One loss per task, keyed by the same task names as the dataset
# dictionaries. All losses wrap the same `model` instance.
losses = {
    "binary": ContrastiveLoss(model),
    "triplet": TripletLoss(model),
    "similarity": CoSENTLoss(model),
    "job_paraphrase": MultipleNegativesRankingLoss(model),
    "cv_paraphrase": MultipleNegativesRankingLoss(model),
}

# Individual handles to the very same loss objects stored in `losses`.
binary_loss = losses["binary"]
triplet_loss = losses["triplet"]
similarity_loss = losses["similarity"]
job_paraphrase_loss = losses["job_paraphrase"]
cv_paraphrase_loss = losses["cv_paraphrase"]

In [None]:
from sentence_transformers.evaluation import (
    BinaryClassificationEvaluator,
    TripletEvaluator,
    EmbeddingSimilarityEvaluator,
)

# Every paraphrase pair is labelled as a positive (1); presumably the
# paraphrase datasets ship without a label column.
# NOTE(review): with positive-only labels the threshold / accuracy / AP values
# reported for the two paraphrase evaluators are likely uninformative --
# consider adding negative pairs. TODO confirm.
job_scores = [1] * len(eval_job_paraphrase_dataset["text1"])
cv_scores = [1] * len(eval_cv_paraphrase_dataset["text1"])

# Each evaluator gets a distinct `name`. The name is prefixed to the metric
# keys the evaluator returns; without it several evaluators emit identical
# keys (e.g. "cosine_accuracy") and overwrite one another when their results
# are merged by SequentialEvaluator.
binary_evaluator = BinaryClassificationEvaluator(
    sentences1=eval_binary_dataset["text1"],
    sentences2=eval_binary_dataset["text2"],
    labels=eval_binary_dataset["label"],
    name="binary",
)

triplet_evaluator = TripletEvaluator(
    anchors=eval_triplet_dataset["anchor"],
    positives=eval_triplet_dataset["positive"],
    negatives=eval_triplet_dataset["negative"],
    name="triplet",
)

similarity_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=eval_similarity_dataset["text1"],
    sentences2=eval_similarity_dataset["text2"],
    scores=eval_similarity_dataset["score"],
    name="similarity",
)

job_paraphrase_evaluator = BinaryClassificationEvaluator(
    sentences1=eval_job_paraphrase_dataset["text1"],
    sentences2=eval_job_paraphrase_dataset["text2"],
    labels=job_scores,
    name="job_paraphrase",
)

cv_paraphrase_evaluator = BinaryClassificationEvaluator(
    sentences1=eval_cv_paraphrase_dataset["text1"],
    sentences2=eval_cv_paraphrase_dataset["text2"],
    labels=cv_scores,
    name="cv_paraphrase",
)


from sentence_transformers.evaluation import SequentialEvaluator

# Chain the individual evaluators so that a single call runs all of them in
# the order listed here.
evaluator = SequentialEvaluator(
    evaluators=[
        binary_evaluator,
        triplet_evaluator,
        similarity_evaluator,
        job_paraphrase_evaluator,
        cv_paraphrase_evaluator,
    ],
)

# Score the model as loaded; the result is shown as the cell output.
evaluator(model)

In [None]:
from sentence_transformers import (
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.training_args import BatchSamplers

# Training configuration shared by all tasks of the multi-dataset run.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=f"./models/{model_checkpoint}-job-cv-multi-dataset",
    # Optional training parameters:
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # fp16 mixed precision is only requested when CUDA is available, matching
    # the CPU fallback used when the model was loaded. Set to False if you get
    # an error that your GPU can't run on FP16.
    fp16=torch.cuda.is_available(),
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=200,
    run_name="triplet-job-cv-multi-dataset",  # Will be used in W&B if `wandb` is installed
)

In [None]:
# Build the trainer from the shared model and training arguments. The train
# datasets, eval datasets and losses are passed as dictionaries that use the
# same task names as keys; the sequential evaluator is attached as well.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=losses,
    evaluator=evaluator,
)

In [None]:
import numpy as np
from transformers.trainer_callback import TrainerState
import os
import json
import dataclasses


# Replacement for save_to_json that converts NumPy values to plain Python
# values before they are written as JSON.
def custom_save_to_json(self, json_path: str):
    """Write this dataclass instance to ``json_path`` as indented, key-sorted JSON.

    NumPy scalars (floats, integers, booleans, ...) and NumPy arrays found
    anywhere in the instance's fields are converted to their plain Python
    equivalents, because the ``json`` module cannot encode them itself.

    Args:
        self: A dataclass instance; it is flattened with ``dataclasses.asdict``.
        json_path: Destination file path. The file is written as UTF-8.

    Raises:
        TypeError: If a value is neither JSON-serializable nor a NumPy
            scalar/array.
    """

    def convert_to_native(obj):
        """Fallback used by ``json`` for objects it cannot encode natively."""
        if isinstance(obj, np.generic):
            # Any NumPy scalar type, not only float32/float64.
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Handing the object back unchanged would make json report a
        # misleading "Circular reference detected" error.
        raise TypeError(
            f"Object of type {type(obj).__name__} is not JSON serializable"
        )

    # Serialize completely before opening the file, so a serialization error
    # never leaves a truncated file behind.
    payload = json.dumps(
        dataclasses.asdict(self),
        indent=2,
        sort_keys=True,
        default=convert_to_native,
    )
    with open(json_path, "w", encoding="utf-8") as f:
        f.write(payload)


# Monkey-patch TrainerState so that its save_to_json method is
# custom_save_to_json from now on.
TrainerState.save_to_json = custom_save_to_json

In [None]:
# Run the training loop with the trainer configured earlier in the notebook.
trainer.train()

In [None]:
# Run the sequential evaluator on the model again; the result is shown as the
# cell output.
evaluator(model=model)

In [None]:
# Re-encode the same sample jobs and CVs with the model in its current state.
job_embeddings, cv_embeddings = model.encode(jobs), model.encode(cvs)

# Cosine similarity between every (CV, job) pair; the CV embeddings are
# passed first and the job embeddings second.
similarities = model.similarity(cv_embeddings, job_embeddings)
print("Cosine Similarity Matrix:", similarities, sep="\n")

In [None]:
# Save the model to a local directory; the path is the same string that is
# used as `output_dir` in the training arguments.
model.save_pretrained(path=f"./models/{model_checkpoint}-job-cv-multi-dataset")

In [None]:
from pathlib import Path

# Patch Path.read_text so that files are read as UTF-8 by default.
# NOTE(review): presumably needed so that the hub upload below can read files
# on platforms whose default text encoding is not UTF-8 -- confirm.
#
# Keep a handle on the unpatched implementation. If this cell is executed
# again, reuse the handle stored on the wrapper instead of wrapping the
# wrapper a second time.
original_read_text = getattr(Path.read_text, "_wrapped_read_text", Path.read_text)


def read_text_utf8(path, *args, **kwargs):
    """Read ``path`` as text, defaulting the encoding to UTF-8.

    UTF-8 is applied only when the caller did not choose an encoding. An
    encoding supplied positionally or by keyword is passed through unchanged,
    which also avoids a "multiple values for argument 'encoding'" TypeError
    for positional callers.

    Args:
        path: The ``Path`` instance being read (``self`` of the patched method).
        *args: Positional arguments forwarded to the original ``read_text``;
            the first one, if present, is the encoding.
        **kwargs: Keyword arguments forwarded to the original ``read_text``.

    Returns:
        The decoded file contents as returned by the original ``read_text``.
    """
    if args:
        if args[0] is None:
            # Encoding passed positionally as None: treat it as "not chosen".
            args = ("utf-8",) + tuple(args[1:])
    elif kwargs.get("encoding") is None:
        kwargs["encoding"] = "utf-8"
    return original_read_text(path, *args, **kwargs)


# Remember what was wrapped so that re-running this cell stays idempotent.
read_text_utf8._wrapped_read_text = original_read_text
Path.read_text = read_text_utf8

In [None]:
# Upload via the trainer's push_to_hub method.
# NOTE(review): this presumably requires Hugging Face Hub credentials to be
# configured in the environment -- confirm before running.
trainer.push_to_hub()