In [1]:
import logging


# Notebook-wide logger; the threshold is lowered to DEBUG so records of every
# standard level pass the logger's own level check.
logger = logging.getLogger(name="Notebook")
logger.setLevel(level=logging.DEBUG)

In [3]:
import json
import pathlib

# JSON export of the annotated ranking tasks consumed by the cells below.
DATASET_PATH = pathlib.Path(
    "dataset/annotated/project-1-at-2025-11-15-15-33-207ebb0e.json"
)

# Decode explicitly as UTF-8: the export contains non-ASCII (Persian) text, and
# without `encoding=` Python falls back to the platform's locale encoding,
# which can garble the text or raise UnicodeDecodeError on some systems.
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    dataset_json = json.load(f)

In [None]:
def task_to_datapoint(task: dict) -> dict | None:
    """
    Convert one Label Studio ranking task -> canonical datapoint:

        {
            "query": str,
            "negatives": [str, ...],
            "positives": [str, ...]
        }

    Cancelled annotations are ignored. When several annotations remain, the
    last one is used and a warning is logged. Positives are the choices of the
    first result whose ``from_name`` is ``"rel"``; negatives are the remaining
    candidate options, in their original order.

    Args:
        task: Task dict providing ``"id"``, ``"data"`` (with ``"query"`` and
            ``"options"``, each option carrying a ``"value"``) and
            ``"annotations"`` (each with ``"was_cancelled"`` and ``"result"``).

    Returns:
        The datapoint dict, or ``None`` when the task has no non-cancelled
        annotation or the selected annotation has no ``"rel"`` result.

    Raises:
        KeyError: If one of the keys listed above is missing.
    """
    # --- extract query ---
    data = task["data"]
    query = data["query"]

    # --- extract candidate options ---
    # some items have only score + value; we keep only value
    candidates = [opt["value"] for opt in data["options"]]

    annotations = [
        annotation
        for annotation in task["annotations"]
        if not annotation["was_cancelled"]
    ]
    if not annotations:
        return None

    if len(annotations) > 1:
        logger.warning(
            "Task %s has more than one annotation, selecting the last", task["id"]
        )

    for res in annotations[-1]["result"]:
        if res["from_name"] != "rel":
            continue

        # LabelStudio "choices" format
        positives = res["value"]["choices"]
        negatives = [n for n in candidates if n not in positives]
        return {"query": query, "negatives": negatives, "positives": positives}

    # The selected annotation carries no "rel" result.
    return None


# ---- Load dataset ----
# Tasks that yield no datapoint (see task_to_datapoint) are skipped.
dataset = []
for task in dataset_json:
    datapoint = task_to_datapoint(task)
    if datapoint is not None:
        dataset.append(datapoint)


print("Loaded tasks:", len(dataset))
if dataset:  # avoid an IndexError when nothing could be converted
    print("Sample:", dataset[0])


Task 2153 has more than one annotation, selecting the last
Task 2223 has more than one annotation, selecting the last
Task 2277 has more than one annotation, selecting the last
Task 2677 has more than one annotation, selecting the last
Task 3025 has more than one annotation, selecting the last


Loaded tasks: 969
Sample: {'query': 'کور', 'negatives': ['kor', 'korkar', 'kor eng', 'کورتی', 'korrea', 'كورد', 'کویر', 'korosh'], 'positives': ['کور', 'موش کور']}


In [39]:
from datasets import Dataset


def expand_rows(rows):
    """
    Flatten ranking datapoints into labelled sentence pairs.

    Each row contributes one (query, positive) pair labelled 1 per positive,
    followed by one (query, negative) pair labelled 0 per negative.

    Args:
        rows: Iterable of dicts with "query", "positives" and "negatives" keys.

    Returns:
        Dict of three equally long lists: "sentence1" (queries), "sentence2"
        (candidates) and "label" (1 for positives, 0 for negatives).
    """
    triples = []
    for record in rows:
        anchor = record["query"]
        # Positives first, then negatives, preserving each list's order.
        triples.extend((anchor, candidate, 1) for candidate in record["positives"])
        triples.extend((anchor, candidate, 0) for candidate in record["negatives"])

    return {
        "sentence1": [anchor for anchor, _, _ in triples],
        "sentence2": [candidate for _, candidate, _ in triples],
        "label": [flag for _, _, flag in triples],
    }


# Flatten the ranking datapoints into (sentence1, sentence2, label) columns and
# wrap them in a Hugging Face `Dataset`.
expanded = expand_rows(dataset)
hf_dataset = Dataset.from_dict(expanded)

print(hf_dataset)


Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 9690
})


In [None]:
# NOTE(review): `load_from_disk` is called without a dataset path, and this
# assignment would replace the `hf_dataset` built from `expanded` above —
# looks like an unfinished scratch cell; confirm the intended path or remove.
hf_dataset = Dataset.load_from_disk()

In [None]:
# NOTE(review): incomplete expression — presumably meant to become
# `hf_dataset.save_to_disk(<path>)`; confirm and finish, or remove this cell.
hf_dataset.sav

### Model + LoRA (PEFT) setup

In [None]:
from sentence_transformers import (
    SentenceTransformer,
    InputExample,
    losses,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)

# 1) Load base Persian model
# `trust_remote_code` is the keyword SentenceTransformer accepts for opting in
# to model code shipped with the checkpoint (the later training cell uses it);
# `allow_remote_code` is not a constructor parameter.
model = SentenceTransformer("xmanii/maux-gte-persian-v3", trust_remote_code=True)


In [None]:
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    losses,
    util,
)


# Load the base model; `trust_remote_code=True` matches the later training
# cell, which loads this same checkpoint with that flag.
model = SentenceTransformer("xmanii/maux-gte-persian-v3", trust_remote_code=True)

loss = losses.OnlineContrastiveLoss(
    model=model,
    # default metric = 1 - cos_sim. Good for embedding.
    margin=0.5,
)

args = SentenceTransformerTrainingArguments(
    output_dir="/kaggle/working/gte-persian-seo",
    num_train_epochs=2,
    per_device_train_batch_size=32,  # adjust to VRAM; increase via grad_accum
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    warmup_ratio=0.05,
    # fp16=True,
    logging_steps=50,
    save_steps=1000,
    run_name="gte-persian-seo",
)

trainer = SentenceTransformerTrainer(
    model=model,
    # Hand the arguments to the trainer; without this the configuration above
    # is never used and the trainer falls back to its defaults.
    args=args,
    train_dataset=hf_dataset,
    loss=loss,
)


NameError: name 'model' is not defined

In [None]:
# Display the configured training arguments (the dangling `args.` attribute
# access was a syntax error).
args

In [None]:
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    losses,
    util,
)


# Base model to fine-tune; loaded with remote code enabled.
model = SentenceTransformer("xmanii/maux-gte-persian-v3", trust_remote_code=True)

loss = losses.OnlineContrastiveLoss(
    model=model,
    # default metric = 1 - cos_sim. Good for embedding.
    margin=0.5,
)

args = SentenceTransformerTrainingArguments(
    output_dir="/kaggle/working/gte-persian-seo",
    num_train_epochs=2,
    per_device_train_batch_size=32,  # adjust to VRAM; increase via grad_accum
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    warmup_ratio=0.05,
    # fp16=True,
    logging_steps=50,
    save_steps=1000,
    run_name="gte-persian-seo",
)

trainer = SentenceTransformerTrainer(
    model=model,
    # Hand the arguments to the trainer; without this the configuration above
    # is never used and the trainer falls back to its defaults.
    args=args,
    train_dataset=hf_dataset,
    loss=loss,
)


In [None]:
# Start fine-tuning with the trainer built in the previous cell.
trainer.train()


In [None]:
from peft import LoraConfig, TaskType

# 2) Add a LoRA adapter
# NOTE(review): this cell sits after `trainer.train()` in the notebook, so in
# top-to-bottom order the adapter is attached only after fine-tuning has run —
# confirm the intended cell order.
lora = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    # NOTE(review): assumes the base model names its layers "query"/"key"/
    # "value"/"dense" — verify against the loaded architecture.
    target_modules=["query", "key", "value", "dense"],
)
model.add_adapter(lora)  # SBERT integrates with PEFT


In [None]:
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
# NOTE(review): `MODEL_NAME` is not defined in any visible cell — confirm where
# it is set. This also rebinds the notebook-level `model`.
model = SentenceTransformer(MODEL_NAME)

# Run inference
sentences = [
    "is toprol xl the same as metoprolol?",
    "Metoprolol succinate is also known by the brand name Toprol XL. It is the extended-release form of metoprolol. Metoprolol succinate is approved to treat high blood pressure, chronic chest pain, and congestive heart failure.",
    "Metoprolol starts to work after about 2 hours, but it can take up to 1 week to fully take effect. You may not feel any different when you take metoprolol, but this doesn't mean it's not working. It's important to keep taking your medicine",
]

embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]

# Get the similarity scores for the embeddings
# (first sentence, the query, against the two remaining passages)
similarities = model.similarity(embeddings[0], embeddings[1:])
print(similarities)
# tensor([[0.7913, 0.4976]])

In [None]:
# NOTE(review): `encode` is called with no input sentences — looks like an
# unfinished scratch call; supply the sentences to embed or remove this cell.
model.encode()
# model.prompts

NameError: name 'model' is not defined

In [None]:
# Spot-check the conversion on a single task. The loaded task list is
# `dataset_json`; `tasks` is not defined in any visible cell.
task_to_datapoint(dataset_json[9])