# Installation

In [None]:
!pip install -qU pip
# !pip install -qU --no-deps unsloth
# Now install the dependencies explicitly without upgrading
!pip install -qU transformers peft trl datasets huggingface_hub fsspec evaluate sacrebleu
!pip install -qU unbabel-comet fastembed qdrant-client python-dotenv
!pip install -q -U bitsandbytes
!pip install -q sentence-transformers wandb

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo evaluate
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install -U transformers
    !pip install --no-deps unsloth
!pip install -qU sacrebleu
!pip install -qU unbabel-comet
!pip install fastembed qdrant-client python-dotenv
!pip install -q sentence-transformers wandb

In [2]:
!pip install -qU unbabel-comet

In [1]:
import os
import random
import json
import numpy as np
import pandas as pd
import torch
import warnings
from tqdm import tqdm

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Setup environment variables
DEVELOPMENT_MODE = True
IN_GITHUB = os.getenv("GITHUB_ACTIONS") == "true"
try:
    import google.colab

    IN_COLAB = True
    print("Running as a Colab notebook")
except:
    IN_COLAB = False
    print("Running as a Jupyter notebook - intended for development only!")
    # Uncomment for auto-reloading modules during development
    # %load_ext autoreload
    # %autoreload 2

# Base directory for data
BASE_DIR_PREFIX = "."
if IN_COLAB:
    from google.colab import drive

    drive.mount('/content/drive')
    BASE_DIR_PREFIX = '/content/drive/MyDrive/CLEF2025'

# Task configuration
TASK = "task1"
TYPE = "retrieval"
LANG = "EN"
FILE_BASE = f"joker_{TASK}_{TYPE}_corpus25"


# Helper function to load secrets
def get_secret(name: str):
    """Get a secret from environment or Google Colab userdata"""
    if IN_COLAB:
        from google.colab import userdata
        return userdata.get(name)
    else:
        from dotenv import load_dotenv
        load_dotenv()
        return os.getenv(name)


# Helper function to load JSON files
def load_json(path):
    """Load and parse a JSON file with UTF-8 encoding"""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Configure random seed for reproducibility
def set_random_seed(seed=42):
    """Set random seeds for reproducible experiments"""
    random.seed(seed)
    np.random.seed(seed)
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:2"
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


Running as a Colab notebook
Mounted at /content/drive


In [2]:
!mkdir -p utils
# !wget -O utils/cpo_trainer.py "https://raw.githubusercontent.com/fe1ixxu/ALMA/master/utils/cpo_trainer.py"
# !wget -O utils/cpo_config.py "https://raw.githubusercontent.com/fe1ixxu/ALMA/master/utils/cpo_config.py"
# Copy from drive
!cp -r /content/drive/MyDrive/CLEF2025/utils/cpo_trainer.py utils/cpo_trainer.py
!cp -r /content/drive/MyDrive/CLEF2025/utils/cpo_config.py utils/cpo_config.py

In [3]:
# Login to HuggingFace and WandB
from huggingface_hub import login

login(get_secret("HF_TOKEN"))
import wandb

wandb.login(key=get_secret("WANDB_KEY"))

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33migorktech01[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
TEAM_NAME = "Skommarkhos"

In [5]:
SEED = 3407

# Task 1

In [None]:
import pandas as pd
import json

# load json data
TASK = "task1"
TYPE = "retrieval"
FILE_BASE = "joker_task1_retrieval_corpus25"
LANG = 'EN'
# with open(f'data/{TASK}/{LANG}/{FILE_BASE}_{LANG}.json') as f:
#     data = json.load(f)
# # convert to dataframe
# df = pd.DataFrame(data)
df = pd.read_json(os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/{FILE_BASE}_{LANG}.json'))
print(len(df))
print(df.head())

77658
   docid                                               text
0      1  He has a green body, no visible nose, and live...
1      2  On the software side, a port is a numerical id...
2      3  A garbage man going through the trash says: "I...
3      4   The term "run out" can have several meanings ...
4      5  According to the Centers for Disease Control a...


## LLM analysis
#
Query Generation

In [None]:
import instructor
from openai import OpenAI
from pydantic import BaseModel, Field

class Analysis(BaseModel):
    reasoning: str = Field(..., description="Short concise reasoning about whether the text is wordplay")
    is_wordplay: bool = Field(..., description="True if the text is a humorous wordplay instance, else False")
    generated_query: str = Field(..., description="A concise retrieval query if the text is wordplay, else an empty string")

def structured_response(doc_text, use_openai=True):
    """
    Extract structured data from text using either OpenAI API or self-hosted model.

    Args:
        doc_text: Text to analyze
        use_openai: If True, use OpenAI API. If False, use self-hosted model.

    Returns:
        Dictionary with analysis results
    """
    system_prompt = """
    You are an assistant that classifies short documents as wordplay (jokes) or not, and — if it is wordplay — generates a concise search-style query that would retrieve this joke.
    For example:
    - Text: "Why did the scarecrow win an award? Because he was outstanding in his field."
      is_wordplay: True
      generated_query: "scarecrow award"
    - Text: "The mitochondria is the powerhouse of the cell."
      is_wordplay: False
      generated_query: ""
    """.strip()

    api_key = get_secret("OPENAI_API_KEY")

    if use_openai:
        # Use instructor with OpenAI
        client = OpenAI(api_key=api_key)

        response = client.responses.parse(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": doc_text}
            ],
            text_format=Analysis,
            temperature=0.4,
        )
        return response.model_dump()
    else:
        # Use standard OpenAI client with self-hosted endpoint
        client = instructor.from_openai(OpenAI(
            base_url="http://127.0.0.1:8080/v1",
            api_key="token"
        ))

        response = client.chat.completions.create(
            model='Qwen3-1.7B-Q8_0.gguf',
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": doc_text}
            ],
            response_model=Analysis,
            temperature=0.4,
            max_tokens=256,
        )

        # Parse the response text manually since instructor isn't used
        return response.model_dump()

In [None]:
# Example corpus entry to analyze
doc_text = "Why did the bicycle fall over? Because it was two-tired."
analysis = structed_response(doc_text)
print(analysis)


In [None]:
from tqdm import tqdm
import random
import concurrent.futures

def run_multithreaded_analysis(corpus, num_workers=4):
    """Run analysis on corpus using multiple threads."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(structed_response, doc['text']): doc for doc in corpus}
        results = []
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            doc = futures[future]
            try:
                analysis = future.result()
                doc['analysis'] = analysis
                results.append(doc)
            except Exception as e:
                print(f"Error processing docid {doc['docid']}: {e}")
                doc['analysis'] = ""  # Fallback to empty analysis
                results.append(doc)
    return results

In [None]:
for LANG in ['EN', 'PT']:
    # Load corpus and sample 05%
    corpus_path = os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/joker_{TASK}_{TYPE}_corpus25_{LANG}.json')  # docid and text
    corpus = load_json(corpus_path)
    corpus = random.sample(corpus, int(len(corpus) * 0.1))
    # run multithreaded analysis
    processed_corpus = run_multithreaded_analysis(corpus, num_workers=4)
    # Save the updated corpus with analysis
    with open(os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/joker_{TASK}_{TYPE}_corpus25_{LANG}_analysis_1.json'), 'w', encoding='utf-8') as f:
        json.dump(processed_corpus, f, ensure_ascii=False, indent=4)

## Train sentence-transformers

## Prepare Data, generate triplets

In [None]:
# Hard-negative Triplet Generation
from dataclasses import dataclass
import json
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import mine_hard_negatives
from datasets import Dataset, DatasetDict


@dataclass
class MiningConfig:
    # Input files
    corpus_path: str
    queries_path: str
    qrels_path: str

    # Model & mining parameters
    model_name: str = 'all-MiniLM-L12-v2'#'all-MiniLM-L6-v2'
    range_min: int = 8
    range_max: int = 100
    max_score: float = 0.8
    relative_margin: float = 0.05
    num_negatives: int = 5
    sampling_strategy: str = 'random'  # 'top' or 'random'
    batch_size: int = 32
    use_faiss: bool = False

    # Split ratios
    test_size: float = 0.1
    dev_size: float = 0.5  # fraction of test split used for dev

    # Output prefix
    output_prefix: str = 'triplets'


# Configure
config = MiningConfig(
    corpus_path=os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/joker_{TASK}_{TYPE}_corpus25_{LANG}.json'),  # docid and text
    queries_path=os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/joker_{TASK}_{TYPE}_queries_train25_{LANG}.json'),  # qid and query
    qrels_path=os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/joker_{TASK}_{TYPE}_qrels_train25_{LANG}.json'),  # qid and docid
    output_prefix='hardneg'
)

# Load data
corpus = load_json(config.corpus_path)
queries = load_json(config.queries_path)
qrels = load_json(config.qrels_path)

# Build anchor-positive pairs
qid2query = {q['qid']: q['query'] for q in queries}
print(qid2query)
doc_map = {int(d['docid']): d['text'] for d in corpus}
pairs = [
    {'query': qid2query[r['qid']], 'answer': doc_map[int(r['docid'])]}
    for r in qrels if r.get('qrel', 0) > 0
]

# load analysis  to build query-answer pairs if analysis wordplay is true

# Load analysis data to augment pairs
analysis_path = os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/joker_{TASK}_{TYPE}_corpus25_{LANG}_analysis_1.json")
analysis_data = load_json(analysis_path)
print(f"Loaded {len(analysis_data)} documents from analysis")
# Add pairs from analysis where wordplay is true
wordplay_pairs = [
    {'query': doc['analysis']['generated_query'],
     'answer': doc['text']}
    for doc in analysis_data
    if doc.get('analysis') and
       isinstance(doc['analysis'], dict) and
       doc['analysis'].get('is_wordplay') and
       doc['analysis'].get('generated_query')
]
# Filter out empty queries and deduplicate
valid_pairs = [p for p in wordplay_pairs if p['query'].strip()]
print(f"Found {len(valid_pairs)} valid wordplay pairs from analysis")
pairs.extend(valid_pairs)
# Convert to Hugging Face Dataset
dataset = Dataset.from_list(pairs)
print(dataset)
# Mine hard negatives
model = SentenceTransformer(config.model_name)
mined = mine_hard_negatives(
    dataset=dataset,
    model=model,
    range_min=config.range_min,
    range_max=config.range_max,
    max_score=config.max_score,
    relative_margin=config.relative_margin,
    num_negatives=config.num_negatives,
    sampling_strategy=config.sampling_strategy,
    batch_size=config.batch_size,
    use_faiss=config.use_faiss,
)

# 6) Split into train/validation/test
split1 = mined.train_test_split(test_size=config.test_size, seed=42)
split2 = split1['test'].train_test_split(test_size=config.dev_size, seed=42)
ds = DatasetDict({
    'train': split1['train'],
    'validation': split2['train'],
    'test': split2['test']
})

# 7) Save each split to JSONL
for name, subset in ds.items():
    out_file = os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/{config.output_prefix}_{name}.jsonl")
    subset.to_json(out_file, orient='records', lines=True,index=False)
    print(f"Wrote {len(subset)} rows to {out_file}")



{'qid_train_1': 'matemática', 'qid_train_2': 'astrologia', 'qid_train_3': 'Tomás', 'qid_train_4': 'animal doméstico', 'qid_train_5': 'ciência', 'qid_train_6': 'fotografia', 'qid_train_7': 'árvore', 'qid_train_8': 'dinheiro', 'qid_train_9': 'elevador', 'qid_train_10': 'corpo', 'qid_train_11': 'emergência', 'qid_train_12': 'pão', 'qid_train_13': 'legume', 'qid_train_14': 'culinária', 'qid_train_15': 'cores', 'qid_train_16': 'limpeza', 'qid_train_17': 'agricultura', 'qid_train_18': 'bombeiro', 'qid_train_19': 'tomate', 'qid_train_20': 'impressora', 'qid_train_21': 'vidente', 'qid_train_22': 'sapo', 'qid_train_23': 'itália', 'qid_train_24': 'pizza', 'qid_train_25': 'férias', 'qid_train_26': 'alpinista', 'qid_train_27': 'mistério', 'qid_train_28': 'agradecimento', 'qid_train_29': 'fantasma'}
Loaded 4512 documents from analysis
Found 164 valid wordplay pairs from analysis
Dataset({
    features: ['query', 'answer'],
    num_rows: 467
})
Found 191 unique queries out of 467 total queries.
Foun

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Metric       Positive       Negative     Difference
Count             467            980               
Mean           0.4602         0.3219         0.3259
Median         0.4288         0.3373         0.3387
Std            0.2359         0.1028         0.1411
Min           -0.0253         0.0418        -0.0645
25%            0.2768         0.2435         0.2334
50%            0.4288         0.3375         0.3391
75%            0.6921         0.4026         0.4158
Max            0.9494         0.5963         0.8475
Skipped 4,604 potential negatives (17.22%) due to the relative_margin of 0.05.
Could not find enough negatives for 1355 samples (58.03%). Consider adjusting the range_max, range_min, relative_margin and max_score parameters if you'd like to find more valid negatives.


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Wrote 882 rows to /content/drive/MyDrive/CLEF2025/data/task1/PT/hardneg_train.jsonl


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Wrote 49 rows to /content/drive/MyDrive/CLEF2025/data/task1/PT/hardneg_validation.jsonl


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Wrote 49 rows to /content/drive/MyDrive/CLEF2025/data/task1/PT/hardneg_test.jsonl


In [None]:
# === Find queries present in BOTH train and test ===
import json

# Load train and test query files
with open(os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/joker_{TASK}_{TYPE}_queries_train25_{LANG}.json'), 'r', encoding='utf-8') as f:
    train_qs = json.load(f)
with open(os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/joker_{TASK}_{TYPE}_queries_test25_{LANG}.json'), 'r', encoding='utf-8') as f:
    test_qs = json.load(f)

# Extract just the query texts
train_set = {q['query'] for q in train_qs}
test_set = {q['query'] for q in test_qs}

# Compute intersection
common_queries = train_set & test_set

print(f"Found {len(common_queries)} queries in BOTH:")
for q in sorted(common_queries):
    print(" •", q)

# (Optionally write out)
with open('common_queries.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(sorted(common_queries)))

# Unique to train
train_only = train_set - test_set
print(f"\n{len(train_only)} queries only in TRAIN:")
for q in sorted(train_only):
    print(" •", q)

# Unique to test
test_only = test_set - train_set
print(f"\n{len(test_only)} queries only in TEST:")
for q in sorted(test_only):
    print(" •", q)


### Losses

In [None]:
from __future__ import annotations
import torch
from torch import Tensor, nn
import torch.nn.functional as F
from sentence_transformers.SentenceTransformer import SentenceTransformer
from sentence_transformers import util


class AdaptiveMarginSigLIPLoss(nn.Module):
    """
    Adaptive-margin (log-sum-exp) loss with learnable temperature & bias,
    extended to handle multiple in-batch candidates (positives + negatives).
    """

    def __init__(
            self,
            model: SentenceTransformer,
            scale: int = 1,
            init_log_temp: float = 0.0,
            init_bias: float = 0.0,
            similarity_fct=util.cos_sim
    ) -> None:
        super().__init__()
        self.model = model
        self.scale = scale
        self.log_t = nn.Parameter(torch.tensor([init_log_temp], dtype=torch.float))
        self.bias = nn.Parameter(torch.tensor([init_bias], dtype=torch.float))
        self.similarity_fct = similarity_fct

    def forward(
            self,
            sentence_features: list[dict[str, Tensor]],
            labels: Tensor | None = None  # unused
    ) -> Tensor:
        # 1) Embed all views: anchor + (positive + negatives...)
        reps = [self.model(sf)["sentence_embedding"] for sf in sentence_features]
        anchors = reps[0]  # (batch_size, dim)
        candidates = torch.cat(reps[1:],  # (batch_size * num_views, dim)
                               dim=0)

        # # 2) Optional normalization (uncomment if needed):
        # anchors   = F.normalize(anchors,   p=2, dim=1)
        # candidates = F.normalize(candidates, p=2, dim=1)

        # 3) Compute scaled logits: dot-product * t + b
        t = torch.exp(self.log_t)
        sim = self.similarity_fct(anchors, candidates)
        logits = sim * t * self.scale + self.bias  # (N, M)

        # 4) Log-sum-exp over all candidates (neg_agg) & true-positive scores (pos)
        neg_agg = torch.logsumexp(logits, dim=1)  # (N,)
        # True-positive is assumed to be at position j=i within each block of size batch_size
        # If you pack in order [positive1, neg1_1,..., neg1_n, positive2, ...],
        # Here we assume the first batch_size entries of `candidates` are the positives:
        pos = logits.diagonal(offset=0)  # picks logits[i,i]

        # 5) Compute adaptive-margin loss
        loss = (neg_agg - pos).mean()
        return loss

    def get_config_dict(self) -> dict[str, any]:
        return {
            "init_log_temp": float(self.log_t.detach().cpu()),
            "init_bias": float(self.bias.detach().cpu())
        }

    @property
    def citation(self) -> str:
        return """
"""

### Training

In [None]:
import os

os.environ["WANDB_PROJECT"] = "sentence-transformers"

In [None]:
set_random_seed(42)

In [None]:
import os

os.environ["ACCELERATE_USE_MPS_DEVICE"] = "False"

In [None]:
import warnings

# Disable all warnings
warnings.filterwarnings("ignore")

In [None]:
# load from local jsonl files
from datasets import load_dataset
dataset = load_dataset("json", data_files={
    "train": os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/hardneg_train.jsonl"),
    "dev": os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/hardneg_validation.jsonl"),
    "test": os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/hardneg_test.jsonl")
})
train_dataset = dataset["train"]
eval_dataset = dataset["dev"]
test_dataset = dataset["test"]


Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.util import dot_score
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
import wandb

model_name = "intfloat/multilingual-e5-small"#"intfloat/multilingual-e5-base"
model = SentenceTransformer(
    model_name,
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)
# model = model.to("mps")

# 4. Define a loss function
# loss = SigLIPLoss(model)
# loss = MultipleNegativesRankingLoss(model, scale=1)
# loss = AdaptiveSigLIPLoss(model)
loss = AdaptiveMarginSigLIPLoss(model, scale=1)

short_model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
run_name = f"{short_model_name}-jokes-{LANG}"
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=f"models/{run_name}",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if GPU can't handle FP16
    # bf16=False,  # Set to True if GPU supports BF16
    # use_mps_device=False,  # Set to True if using Apple silicon
    # use_cpu=True,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    run_name=run_name,  # Used in W&B if `wandb` is installed
    report_to="wandb",
    push_to_hub=True,
    hub_model_id=f"igorktech/{run_name}",
    hub_private_repo=True,

)

# Create an evaluators & evaluate the base model
dev_evaluator = TripletEvaluator(
    anchors=eval_dataset["query"],
    positives=eval_dataset["answer"],
    negatives=eval_dataset["negative"],
    name="triplet-dev",
)

test_evaluator = TripletEvaluator(
    anchors=test_dataset["query"],
    positives=test_dataset["answer"],
    negatives=test_dataset["negative"],
    name="triplet-test",
)

# evaluate the model on dev set
results = dev_evaluator(model)
print("Dev metrics before:", results)

wandb.init(project="sentence-transformers", name= run_name)
wandb.log({f"eval/dense_dev_{k}": v for k,v in results.items()})

# Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)

trainer.train()

# 8. Save the trained model
model.save_pretrained("models/mpnet-base-jokes-triplet/final")

# 9. (Optional) Push it to the Hugging Face Hub
# model.push_to_hub("mpnet-base-jokes-triplet")

# Evaluate on dev and test, log to W&B
dev_metrics = dev_evaluator(model)
print("Dev metrics:", dev_metrics)
test_metrics = test_evaluator(model)
print("Test metrics:", test_metrics)
wandb.log({f"eval/dense_dev_{k}": v for k,v in dev_metrics.items()})
wandb.log({f"eval/dense_test_{k}": v for k,v in test_metrics.items()})

Dev metrics before: {'triplet-dev_cosine_accuracy': 1.0}


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Triplet-dev Cosine Accuracy
10,3.3458,2.664879,1.0
20,3.0379,2.293495,1.0
30,2.783,2.20218,1.0
40,2.7374,2.174546,1.0
50,2.7199,2.158144,1.0


Dev metrics: {'triplet-dev_cosine_accuracy': 1.0}
Test metrics: {'triplet-test_cosine_accuracy': 1.0}


In [None]:
wandb.finish()

0,1
eval/dense_dev_triplet-dev_cosine_accuracy,▁▁
eval/dense_test_triplet-test_cosine_accuracy,▁
eval/loss,█▃▂▁▁
eval/runtime,█▁▁▃▁
eval/samples_per_second,▁██▅█
eval/steps_per_second,▁██▅█
eval/triplet-dev_cosine_accuracy,▁▁▁▁▁
train/epoch,▁▁▃▃▄▄▆▆▇▇█
train/global_step,▁▁▃▃▄▄▆▆▇▇███
train/grad_norm,▁▅▄█▂

0,1
eval/dense_dev_triplet-dev_cosine_accuracy,1.0
eval/dense_test_triplet-test_cosine_accuracy,1.0
eval/loss,2.15814
eval/runtime,0.6832
eval/samples_per_second,71.722
eval/steps_per_second,5.855
eval/triplet-dev_cosine_accuracy,1.0
total_flos,0.0
train/epoch,1.0
train/global_step,56.0


## Reranker

In [None]:
# === Load Triplet Dataset ===
raw_dataset = load_dataset("json", data_files={
    "train": os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/hardneg_train.jsonl"),
    "dev": os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/hardneg_validation.jsonl"),
    "test": os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/hardneg_test.jsonl")
})
train_dataset = raw_dataset["train"]
dev_dataset   = raw_dataset["dev"]
test_dataset  = raw_dataset["test"]


In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import mine_hard_negatives

def prepare_hard_negatives(train_dataset, eval_dataset, test_dataset, dataset,
                           num_hard_negatives=5, num_eval_negatives=30,
                           embedder_name="sentence-transformers/static-retrieval-mrl-en-v1"):
    """
    Returns hard_train, hard_dev, hard_test ready for CrossEncoderTrainer/evaluator.

    - train_dataset, eval_dataset, test_dataset: your HF Dataset splits.
    - dataset: full HF Dataset dict with .data_files for corpus.
    """
    # 1) Initialize a lightweight SentenceTransformer for mining
    embedder = SentenceTransformer(embedder_name)

    # 2) Mine labeled pairs for training
    hard_train = mine_hard_negatives(
        train_dataset,
        embedder,
        num_negatives=num_hard_negatives,
        margin=0,
        range_min=0,
        range_max=100,
        sampling_strategy="top",
        batch_size=4096,
        output_format="labeled-pair",
        use_faiss=False,
    )

    # 3) Build full answer corpus
    corpus = (
            dataset["train"]["answer"]
            + dataset["dev"]["answer"]
            + dataset["test"]["answer"]
    )

    # 4) Mine n-tuple samples for dev and test (positives included)
    hard_dev = mine_hard_negatives(
        eval_dataset,
        embedder,
        corpus=corpus,
        num_negatives=num_eval_negatives,
        batch_size=4096,
        include_positives=True,
        output_format="n-tuple",
        use_faiss=False,
    )
    hard_test = mine_hard_negatives(
        test_dataset,
        embedder,
        corpus=corpus,
        num_negatives=num_eval_negatives,
        batch_size=4096,
        include_positives=True,
        output_format="n-tuple",
        use_faiss=False,
    )

    return hard_train, hard_dev, hard_test


In [None]:
from sentence_transformers.cross_encoder.evaluation import (
    CrossEncoderNanoBEIREvaluator,
    CrossEncoderRerankingEvaluator,
)
from sentence_transformers.evaluation.SequentialEvaluator import SequentialEvaluator
def build_evaluators(hard_dev, hard_test, batch_size=16):
    """
    Returns:
      - dev_evaluator: combines reranking + NanoBEIR
      - test_evaluator: reranking on test split
    """
    # Lightweight BEIR benchmarking
    beir_eval = CrossEncoderNanoBEIREvaluator(
        dataset_names=["msmarco", "nfcorpus", "nq"],
        batch_size=batch_size,
    )

    rerank_dev = CrossEncoderRerankingEvaluator(
        samples=[
            {
                "query": sample["query"],
                "positive": [sample["answer"]],
                "documents": [sample[column_name] for column_name in hard_dev.column_names[2:]],
            }
            for sample in hard_dev
        ],
        batch_size=batch_size,
        name="ir-dev",
        always_rerank_positives=False,
    )

    rerank_test = CrossEncoderRerankingEvaluator(
        samples=[
            {
                "query": sample["query"],
                "positive": [sample["answer"]],
                "documents": [sample[column_name] for column_name in hard_test.column_names[2:]],
            }
            for sample in hard_test
        ],
        batch_size=batch_size,
        name="ir-test",
        always_rerank_positives=False,
    )

    dev_evaluator  = SequentialEvaluator([rerank_dev, beir_eval])
    test_evaluator = rerank_test
    return dev_evaluator, test_evaluator

In [None]:
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import (
    CrossEncoder,
    CrossEncoderModelCardData,
    CrossEncoderTrainer,
    CrossEncoderTrainingArguments,
)


from sentence_transformers.cross_encoder.losses.BinaryCrossEntropyLoss import BinaryCrossEntropyLoss

num_hard_negatives = 5  # How many hard negatives should be mined for each question-answer pair
train_batch_size = 16
model_name = "cross-encoder/ms-marco-MiniLM-L12-v2"

# prepare hard negatives
hard_train, hard_dev, hard_test =  prepare_hard_negatives(    train_dataset,
                                                              dev_dataset,
                                                              test_dataset,
                                                              raw_dataset,
                                                              num_hard_negatives=num_hard_negatives,
                                                              num_eval_negatives=30)
# Build evaluators
dev_evaluator, test_evaluator = build_evaluators(hard_dev, hard_test, batch_size=train_batch_size)

model = CrossEncoder(model_name, num_labels=1)
# model.to("mps")
loss = BinaryCrossEntropyLoss(model=model, pos_weight=torch.tensor(num_hard_negatives))

short_model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
run_name = f"reranker-{short_model_name}-jokes-{LANG}"
args = CrossEncoderTrainingArguments(
    # Required parameter:
    output_dir=f"models/{run_name}",
    # Optional training parameters:
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if GPU can't handle FP16
    # bf16=False,  # Set to True if GPU supports BF16
    # use_mps_device=False,  # Set to True if using Apple silicon
    # use_cpu=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_ir-dev_ndcg@10",
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    run_name=run_name,  # Used in W&B if `wandb` is installed
    report_to="wandb",
    push_to_hub=True,
    hub_model_id=f"igorktech/{run_name}",
    hub_private_repo=True,
)

wandb.init(project="sentence-transformers", name=run_name)

trainer = CrossEncoderTrainer(
    model=model,
    args=args,
    train_dataset=hard_train,
    loss=loss,
    evaluator=dev_evaluator,
)

trainer.train()
# model.save(output_dir)

# Evaluate and log metrics
dev_metrics = dev_evaluator(model)
test_metrics = test_evaluator(model)
wandb.log({f"eval/rerank_dev_{metric}": value for metric, value in dev_metrics.items()})
wandb.log({f"eval/rerank_test_{metric}": value for metric, value in test_metrics.items()})


modules.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/670k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/125M [00:00<?, ?B/s]



Found 162 unique queries out of 882 total queries.
Found an average of 5.444 positives per query.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Metric       Positive       Negative     Difference
Count             882          4,385               
Mean           0.5699         0.2353         0.3332
Median         0.6206         0.2353         0.3395
Std            0.2068         0.1086         0.1733
Min           -0.0010        -0.0048         0.0006
25%            0.4154         0.1787         0.1999
50%            0.6223         0.2353         0.3395
75%            0.7197         0.2966         0.4541
Max            0.9615         0.6260         0.8334
Skipped 741 potential negatives (3.29%) due to the absolute_margin of 0.
Could not find enough negatives for 25 samples (0.57%). Consider adjusting the range_max and absolute_margin parameters if you'd like to find more valid negatives.
Setting range_max to 33 based on the provided parameters.
Found 38 unique queries out of 49 total queries.
Found an average of 1.289 positives per query.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Metric       Positive       Negative     Difference
Count              49          1,470               
Mean           0.5485         0.2011         0.3474
Median         0.6009         0.1900         0.3835
Std            0.2056         0.1087         0.1855
Min           -0.0010         0.0217        -0.1326
25%            0.3959         0.1349         0.2199
50%            0.6009         0.1901         0.3836
75%            0.7129         0.2331         0.5075
Max            0.8478         0.8478         0.6689
Setting range_max to 35 based on the provided parameters.
Found 39 unique queries out of 49 total queries.
Found an average of 1.256 positives per query.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Metric       Positive       Negative     Difference
Count              49          1,470               
Mean           0.5264         0.1857         0.3407
Median         0.5767         0.1684         0.3746
Std            0.2269         0.1086         0.2099
Min            0.0124         0.0376        -0.3073
25%            0.3653         0.1184         0.1947
50%            0.5767         0.1684         0.3749
75%            0.7067         0.2189         0.4804
Max            0.8930         0.8930         0.7617


Loading NanoBEIR datasets:   0%|          | 0/3 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5043 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/3.03k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/445k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Loading NanoBEIR datasets:  33%|███▎      | 1/3 [00:12<00:25, 12.86s/it]

README.md:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2953 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/2.62k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/250k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Loading NanoBEIR datasets:  67%|██████▋   | 2/3 [00:24<00:12, 12.02s/it]

README.md:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/1.77M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5035 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/443k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]



config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

Step,Training Loss,Validation Loss,Ir-dev Map,Ir-dev Mrr@10,Ir-dev Ndcg@10,Ir-dev Base Map,Ir-dev Base Mrr@10,Ir-dev Base Ndcg@10,Nanomsmarco R100 Map,Nanomsmarco R100 Mrr@10,Nanomsmarco R100 Ndcg@10,Nanomsmarco R100 Base Map,Nanomsmarco R100 Base Mrr@10,Nanomsmarco R100 Base Ndcg@10,Nanonfcorpus R100 Map,Nanonfcorpus R100 Mrr@10,Nanonfcorpus R100 Ndcg@10,Nanonfcorpus R100 Base Map,Nanonfcorpus R100 Base Mrr@10,Nanonfcorpus R100 Base Ndcg@10,Nanonq R100 Map,Nanonq R100 Mrr@10,Nanonq R100 Ndcg@10,Nanonq R100 Base Map,Nanonq R100 Base Mrr@10,Nanonq R100 Base Ndcg@10,Nanobeir R100 Mean Map,Nanobeir R100 Mean Mrr@10,Nanobeir R100 Mean Ndcg@10,Nanobeir R100 Mean Base Map,Nanobeir R100 Mean Base Mrr@10,Nanobeir R100 Mean Base Ndcg@10,Sequential Score
100,0.0628,No log,0.863095,0.863095,0.892489,0.878231,0.876531,0.897391,0.637504,0.631595,0.695325,0.489577,0.4775,0.540426,0.362324,0.560556,0.402289,0.260995,0.499833,0.32504,0.722765,0.746333,0.771285,0.419606,0.42669,0.500647,0.574198,0.646161,0.622966,0.390059,0.468008,0.455371,0.622966
200,0.0053,No log,0.860058,0.860058,0.890182,0.878231,0.876531,0.897391,0.640521,0.63469,0.697686,0.489577,0.4775,0.540426,0.355316,0.559111,0.400136,0.260995,0.499833,0.32504,0.723306,0.746333,0.771705,0.419606,0.42669,0.500647,0.573048,0.646712,0.623176,0.390059,0.468008,0.455371,0.623176


In [None]:

final_output_dir = f"models/{run_name}/final"
# model.save_pretrained(final_output_dir)
model.push_to_hub(run_name)



HfHubHTTPError: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-683a4faf-443f7e0c13f48536263a2482;7d29921b-87de-48b3-9670-7f60283a2d85)

You already created this model repo

## Build Index

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
from qdrant_client import QdrantClient
client = QdrantClient(
    ":memory:",
    timeout=None
    )

In [None]:
from fastembed import TextEmbedding, SparseTextEmbedding

In [None]:
dense_embedding_model = SentenceTransformer(f"igorktech/multilingual-e5-small-jokes-{LANG}")
dense_embedding_model = SentenceTransformer("intfloat/multilingual-e5-base")
bm25_embedding_model = SparseTextEmbedding("Qdrant/bm25")



modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/179k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
import json
import pickle
from typing import List

from tqdm import tqdm
from qdrant_client import QdrantClient, models as rest
from qdrant_client.models import Distance, VectorParams, models
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer


def build_indexes(
    client: QdrantClient,
    corpus_file: str,
    index_name: str,
    dense_model: SentenceTransformer,
    sparse_model: SparseTextEmbedding
) -> None:
    """
    Build a Qdrant collection with:
      - Dense vectors (SentenceTransformer, COSINE)
      - Sparse vectors (BM25 via IDF modifier)
    """
    # Load docs
    corpus_data = load_json(corpus_file)

    # Extract ids and documents from the loaded data
    valid_corpus_data = [obj for obj in corpus_data if obj.get('text') is not None]

    ids: List[int] = [int(obj['docid']) for obj in valid_corpus_data]
    documents: List[str] = [obj['text'] for obj in valid_corpus_data]

    # Dense embeddings (normalized for COSINE)
    embeddings = dense_model.encode(
        documents,
        batch_size=128,
        convert_to_numpy=True,
        normalize_embeddings=True
    )
    bm25_embeddings = list(sparse_model.embed(doc for doc in documents))

    # Delete if exists
    existing = [c.name for c in client.get_collections().collections]
    if index_name in existing:
        client.delete_collection(collection_name=index_name)

    client.create_collection(
        collection_name=index_name,
        vectors_config={
            "dense": rest.VectorParams(
                size=embeddings.shape[1],
                distance=rest.Distance.COSINE
            )
        },
        sparse_vectors_config={
            "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF)
        }
    )

    points = []
    for idx, dense_embedding, bm25_embedding, doc in tqdm(zip(ids, embeddings, bm25_embeddings, documents),desc="Preparing Points"):

        point = rest.PointStruct(
            id=idx,
            vector={
                "dense": dense_embedding.tolist(),
                "bm25": bm25_embedding.as_object(),
            },
            payload={"document": doc}
        )
        points.append(point)

    operation_info = client.upsert(
        collection_name=index_name,
        points=points
    )
    print(f"Loaded {len(ids)} documents into {index_name}")
    print(f"Operation info: {operation_info}")

In [None]:
import pickle
from typing import List, Tuple

from qdrant_client import QdrantClient, models as rest
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.feature_extraction.text import CountVectorizer


def hybrid_retrieve(
    client: QdrantClient,
    queries: List[str],
    k: int,
    index_name: str,
    dense_model: SentenceTransformer,
    sparse_model: SparseTextEmbedding,
    ranker: CrossEncoder,
) -> List[List[Tuple[str, float]]]:
    """
    For each query:
      1) Encode dense + sparse (raw counts)
      2) Use Qdrant Query API prefetch (sparse+dense) + RRF fusion
      3) Rerank top-k with CrossEncoder
    Returns lists of (docid, score).
    """

    # 1) Dense encodings
    denses = dense_model.encode(
        queries,
        batch_size=64,
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    results = []
    for query, d_emb in zip(queries, denses):
        # Sparse count vector
        sparse_vectors = next(sparse_model.query_embed(query))
        sparse_q = rest.SparseVector(**sparse_vectors.as_object())

        # # Hybrid fetch + fusion
        # response = client.query_points(
        #     collection_name=index_name,
        #     prefetch=[
        #         rest.Prefetch(query=sparse_q, using="bm25", limit=k),
        #         rest.Prefetch(query=d_emb.tolist(), using="dense", limit=k)
        #     ],
        #     query=rest.FusionQuery(fusion=rest.Fusion.RRF),
        #     limit=k,
        #     with_payload=["document"]
        # )
        response = client.search(
            collection_name=index_name,
            query_vector=rest.NamedVector(name="dense", vector=d_emb.tolist()),
            limit=k,
            with_payload=["document"]
        )
        # print(response)
        # hits = response.points

        # # Prepare for CrossEncoder rerank
        # docs = [(str(hit.id), hit.payload["document"]) for hit in hits]
        # pairs = [[query, text] for _, text in docs]
        # ce_scores = ranker.predict(pairs).tolist()

        # # Sort by CE score
        # reranked = sorted(
        #     zip(docs, ce_scores),
        #     key=lambda x: x[1],
        #     reverse=True
        # )[:k]

        # results.append([(docid, float(score)) for ((docid, _), score) in reranked])
                # response is a list of ScoredPoint
        if not response:
            results.append([])
            continue

        # rerank with CrossEncoder
        if ranker is not None:
            # Extract (id, document_text)
            docs_and_ids = [(str(pt.id), pt.payload["document"]) for pt in response]
            # Prepare pairs for CrossEncoder: [[query_str, doc_text], ...]
            pairs = [[query, text] for (_, text) in docs_and_ids]
            ce_scores = ranker.predict(pairs).tolist()

            # Combine and sort by CE score (descending)
            reranked = sorted(
                zip(docs_and_ids, ce_scores), key=lambda x: x[1], reverse=True
            )[:k]
            results.append([(docid, float(score)) for ((docid, _), score) in reranked])
        else:
            # If no reranker, just return (docid, raw_score) from Qdrant
            results.append([(str(pt.id), pt.score) for pt in response])
    return results

In [None]:
import pandas as pd
import json

# load json data
TASK = "task1"
TYPE = "retrieval"
FILE_BASE = "joker_task1_retrieval_corpus25"
LANG = 'EN'
# with open(f'data/{TASK}/{LANG}/{FILE_BASE}_{LANG}.json') as f:
#     data = json.load(f)
# # convert to dataframe
# df = pd.DataFrame(data)
df = pd.read_json(os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/{FILE_BASE}_{LANG}.json'))
print(len(df))
print(df.head())

77658
   docid                                               text
0      1  He has a green body, no visible nose, and live...
1      2  On the software side, a port is a numerical id...
2      3  A garbage man going through the trash says: "I...
3      4   The term "run out" can have several meanings ...
4      5  According to the Centers for Disease Control a...


In [None]:
build_indexes(client,
              os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/{FILE_BASE}_{LANG}.json'),
              f"{FILE_BASE}_{LANG}",
              dense_embedding_model,
              bm25_embedding_model)

Preparing Points: 77656it [00:07, 10447.13it/s]


Loaded 77656 documents into joker_task1_retrieval_corpus25_EN
Operation info: operation_id=0 status=<UpdateStatus.COMPLETED: 'completed'>


In [None]:
collection_info = client.get_collection(collection_name=f"{FILE_BASE}_{LANG}")
list(collection_info)

[('status', <CollectionStatus.GREEN: 'green'>),
 ('optimizer_status', <OptimizersStatusOneOf.OK: 'ok'>),
 ('vectors_count', None),
 ('indexed_vectors_count', 0),
 ('points_count', 77656),
 ('segments_count', 1),
 ('config',
  CollectionConfig(params=CollectionParams(vectors={'dense': VectorParams(size=768, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None)}, shard_number=None, sharding_method=None, replication_factor=None, write_consistency_factor=None, read_fan_out_factor=None, on_disk_payload=None, sparse_vectors={'bm25': SparseVectorParams(index=None, modifier=<Modifier.IDF: 'idf'>)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=None, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flus

In [None]:
from sentence_transformers import CrossEncoder
ranker = CrossEncoder(f"igorktech/reranker-ms-marco-MiniLM-L12-v2-jokes-{LANG}")

In [None]:
from tqdm import tqdm
# Load test queries
FILE_BASE = "joker_task1_retrieval_queries_test25"
queries = load_json(os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/{FILE_BASE}_{LANG}.json'))
qids = [q["qid"] for q in queries]
texts = [q["query"] for q in queries]

# Retrieve top-k (we’ll request 100)
FILE_BASE = "joker_task1_retrieval_corpus25"
raw_results = hybrid_retrieve(
    client,
    queries=texts,
    k=1000,
    index_name=f"{FILE_BASE}_{LANG}",
    dense_model=dense_embedding_model,
    sparse_model=bm25_embedding_model,
    ranker=ranker
)

# Format and normalize scores, then output JSON
run_id = f"{TEAM_NAME}_task_1_BM25_E5_MiniLM"
manual = 0
output = []
for qid, hits in tqdm(zip(qids, raw_results)):
    if not hits:
        continue
    # Extract scores and normalize
    _, scores = zip(*hits)
    max_score = max(scores)
    for rank, (docid, score) in tqdm(enumerate(hits, start=1)):
        normalized = score / max_score if max_score > 0 else 0.0
        output.append({
            "run_id": run_id,
            "system": run_id,
            "manual": manual,
            "qid": qid,
            "docid": docid,
            "rank": rank,
            "score": round(normalized, 6)
        })

# # 5. Write to file
# with open(os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/hybrid_run_train_{LANG}_20.json"), "w") as out_f:
#     # json.dump(output, out_f, indent=2)
#     for row in output:
#         out_f.write(json.dumps(row) + "\n")
df = pd.DataFrame(output).reset_index(drop=True)

# --- write ONE array (no line breaks between objects) --------------------
# pred_path = Path("prediction.json")                 # must live at zip root
df.to_json(os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/hybrid_run_train_{LANG}_1000_only_dense.json"), orient="records", indent=2)   # JSON array


0it [00:00, ?it/s]
1000it [00:00, 533355.04it/s]

1000it [00:00, 532474.80it/s]

1000it [00:00, 563750.54it/s]

1000it [00:00, 557530.77it/s]

1000it [00:00, 581492.31it/s]

1000it [00:00, 599957.66it/s]

1000it [00:00, 601247.71it/s]

1000it [00:00, 590747.04it/s]

1000it [00:00, 496661.22it/s]

1000it [00:00, 428514.92it/s]

1000it [00:00, 485227.21it/s]

1000it [00:00, 530253.35it/s]

1000it [00:00, 356506.93it/s]

1000it [00:00, 355088.38it/s]

1000it [00:00, 357936.85it/s]

1000it [00:00, 354698.01it/s]

1000it [00:00, 350606.37it/s]

1000it [00:00, 583920.92it/s]

1000it [00:00, 565422.49it/s]

1000it [00:00, 395390.65it/s]
20it [00:00, 190.07it/s]
1000it [00:00, 518455.38it/s]

1000it [00:00, 492404.79it/s]

1000it [00:00, 577091.91it/s]

1000it [00:00, 582299.60it/s]

1000it [00:00, 542320.14it/s]

1000it [00:00, 580205.28it/s]

1000it [00:00, 530387.46it/s]

1000it [00:00, 568410.90it/s]

1000it [00:00, 572210.64it/s]

1000it [00:00, 565270.08it/s]

1000it [00:00, 448348.90it/

In [None]:
!pip install -q ranx

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.3/99.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.2/249.2 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m87.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m859.0/859.0 kB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.0/135.0 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for warc3-wet-clueweb09 (setup.py) ...

In [None]:
import json
from ranx import Qrels, Run, evaluate

# Load your data
qrels_path = os.path.join(BASE_DIR_PREFIX,f'data/{TASK}/{LANG}/joker_{TASK}_{TYPE}_qrels_train25_{LANG}.json')
run_path = os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/hybrid_run_train_{LANG}_100_only_dense.json")

qrels_data = load_json(qrels_path)
run_data = load_json(run_path)

# Format data for ranx
# Create Qrels object
# The format for ranx Qrels is { "qid": { "docid": relevance } }
qrels_dict = {}
for item in qrels_data:
    qid = str(item['qid'])
    docid = str(item['docid'])
    relevance = item.get('qrel', 1) # Assuming 1 if qrel is not present
    if qid not in qrels_dict:
        qrels_dict[qid] = {}
    qrels_dict[qid][docid] = relevance

qrels = Qrels(qrels_dict)

# Create Run object
# The format for ranx Run is { "qid": { "docid": score } }
run_dict = {}
for item in run_data:
    qid = str(item['qid'])
    docid = str(item['docid'])
    score = item['score']
    if qid not in run_dict:
        run_dict[qid] = {}
    run_dict[qid][docid] = score

run = Run(run_dict)


# Define the metrics
metrics = [
    "map",
    "ndcg",
    "precision@1", "precision@5", "precision@10",
    "recall@5", "recall@10", "recall@100", "recall@1000",
    "bpref",
    "mrr"
]

# Evaluate the run
results = evaluate(qrels, run, metrics)
print(results)

{'map': 0.025870743801685567, 'ndcg': 0.11918164053403614, 'precision@1': 0.0, 'precision@5': 0.049999999999999996, 'precision@10': 0.041666666666666664, 'recall@5': 0.0006868131868131869, 'recall@10': 0.012820512820512818, 'recall@100': 0.24135568922802964, 'recall@1000': 0.24135568922802964, 'bpref': nan, 'mrr': 0.0863882874437674}


In [None]:
import pandas as pd
df = pd.DataFrame([results])

df.to_csv(os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/{LANG}/hybrid_run_train_{LANG}.csv"))
df

Unnamed: 0,map,ndcg,precision@1,precision@5,precision@10,recall@5,recall@10,recall@100,recall@1000,bpref,mrr
0,0.086246,0.333092,0.0,0.016667,0.058333,0.000229,0.013278,0.381094,0.744232,,0.053331


PT {'map': np.float64(0.0823441875960505), 'ndcg': np.float64(0.2882948131610417), 'precision@1': np.float64(0.0), 'precision@5': np.float64(0.04137931034482759), 'precision@10': np.float64(0.055172413793103454), 'recall@5': np.float64(0.05019157088122606), 'recall@10': np.float64(0.1471264367816092), 'recall@100': np.float64(0.49470684825654404), 'recall@1000': np.float64(0.8201606741495835), 'bpref': np.float64(0.21264287945407517), 'mrr': np.float64(0.08382701295704029)}


EN {'map': np.float64(0.08624599733801996), 'ndcg': np.float64(0.3330917764370759), 'precision@1': np.float64(0.0), 'precision@5': np.float64(0.016666666666666666), 'precision@10': np.float64(0.05833333333333333), 'recall@5': np.float64(0.00022893772893772896), 'recall@10': np.float64(0.013278388278388278), 'recall@100': np.float64(0.3810942800304502), 'recall@1000': np.float64(0.7442320583278029), 'bpref': np.float64(nan), 'mrr': np.float64(0.05333082326273786)}

# Task 2 & Task 3 utils

In [6]:
import os
import json
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import apply_chat_template
from transformers import GenerationConfig
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
# from trl import CPOTrainer, CPOConfig
from trl import SFTTrainer, SFTConfig
# import trl
import sys
# sys.path.append('./')
# sys.path.append('/content/')
from utils.cpo_trainer import CPOTrainer
from utils.cpo_config import CPOConfig
import sacrebleu.metrics as sbmetrics
from evaluate import load
from transformers import TrainerCallback

In [7]:
USE_TRANSFORMERS=False

In [8]:
import nltk
# Download necessary NLTK data for BLEU
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
def push_model_to_hub(
    run_name: str,
    use_transformers: bool,
    model=None,
    tokenizer=None,
    generation_config=None,
    merge_method: str = None,
    save_dir: str = "model",
    hub_repo_prefix: str = "igorktech"
):
    repo_name = f"{hub_repo_prefix}/{run_name}"
    model.generation_config = generation_config
    if use_transformers:
        if merge_method == 'lora':
            model.push_to_hub(repo_name)
            tokenizer.push_to_hub(repo_name)
        else:
            merged_model = model.merge_and_unload()
            merged_model.push_to_hub(repo_name)
            tokenizer.push_to_hub(repo_name)
    else:
        if merge_method not in {None, 'merged_16bit', 'merged_4bit', 'lora'}:
            raise ValueError("merge_method must be one of 'merged_16bit', 'merged_4bit', 'lora', or None.")
        if merge_method:
            model.save_pretrained_merged(save_dir, tokenizer, save_method=merge_method)
            model.push_to_hub_merged(repo_name, tokenizer, save_method=merge_method)
            tokenizer.push_to_hub(repo_name)

In [10]:
from typing import List, Optional
import json
import torch
from tqdm.auto import tqdm
from transformers import (
    PreTrainedModel,
    PreTrainedTokenizerBase,
    GenerationConfig,
)

# Helper
def generate(
    prompts: List[str],
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizerBase,
    generation_config: GenerationConfig,
    batch_size: int = 8,
) -> List[str]:
    """
    Generates only the model’s new text for each prompt, in batches.
    """
    # ensure pad/eos IDs are set
    print("Setting pad/eos token IDs")
    if generation_config.pad_token_id is None:
        print("Set pad token to eos")
        generation_config.pad_token_id = tokenizer.eos_token_id
    if generation_config.eos_token_id is None:
        print("Set eos token to eos")
        generation_config.eos_token_id = [tokenizer.eos_token_id]

    completions: List[str] = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i : i + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(model.device)

        with torch.no_grad():
            generations = model.generate(
                **inputs,
                generation_config=generation_config,
            )

        # strip off the prompt tokens from each generation
        for input_ids, gen_ids in zip(inputs.input_ids, generations):
            new_tokens = gen_ids[input_ids.shape[-1] :].tolist()
            text = tokenizer.decode(new_tokens, skip_special_tokens=True)
            completions.append(text.lstrip())

    return completions

In [11]:
import warnings

import torch
from transformers import DataCollatorForLanguageModeling
from typing import Any, Optional, Union


class DataCollatorForCompletionsOnlyLM(DataCollatorForLanguageModeling):
    """
    Data collator used for completion tasks. It ensures that all the tokens of the labels are set to an 'ignore_index'
    when they do not come from the assistant. This ensure that the loss is only
    calculated on the all completions made by the assistant.

    Args:
        response_template (`Union[str, list[int]]`): the template form that indicates the start of the response, typically something like
            '### Response:\n'. It can also be passed as tokenized ids, which can be useful when using a tokenizer that encodes the response
            differently if it does not have proper context.
        mlm (`bool`, *optional*, defaults to `False`): Whether to use masked language modeling in the underlying
            `DataCollatorForLanguageModeling` class. Note that this option currently has no effect but is present
             for flexibility and backwards-compatibility.
        ignore_index (`int`, *optional*, defaults to `-100`):
            The index to use to ignore the initial tokens with
    """

    def __init__(
            self,
            response_template: Union[str, list[int]],
            *args,
            mlm: bool = False,
            ignore_index: int = -100,
            **kwargs,
    ):
        super().__init__(*args, mlm=mlm, **kwargs)

        self.response_template = response_template
        if isinstance(response_template, str):
            # The user provides a string, must tokenize
            self.response_token_ids = self.tokenizer.encode(self.response_template, add_special_tokens=False)
        else:
            # The user already provides the token ids
            self.response_token_ids = response_template

        if not self.mlm and self.tokenizer.pad_token_id == self.tokenizer.eos_token_id:
            warnings.warn(
                "The pad_token_id and eos_token_id values of this tokenizer are identical. "
                "If you are planning for multi-turn training, "
                "it can result in the model continuously generating questions and answers without eos token. "
                "To avoid this, set the pad_token_id to a different value.",
                UserWarning,
            )

        self.ignore_index = ignore_index

    def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
        batch = super().torch_call(examples)

        for i in range(len(examples)):
            response_token_ids_start_indexes = []
            eos_token_indexes = []

            for idx in torch.where(batch["labels"][i] == self.response_token_ids[0])[0]:
                # `response_token_ids` is `'### Response:\n'`, here we are just making sure that the token IDs match
                if (
                        self.response_token_ids
                        == batch["labels"][i][idx: idx + len(self.response_token_ids)].tolist()
                ):
                    response_token_ids_start_indexes.append(idx+len(self.response_token_ids))

            for idx in torch.where(batch["labels"][i] == self.tokenizer.eos_token_id)[0]:
                eos_token_indexes.append(idx)
            # Filter out EOS token indexes keeping only those that are directly after the corresponding response token
            filtered_eos_token_indexes = []
            for resp_idx in response_token_ids_start_indexes:
                # Find the first EOS token that comes after this response start
                next_eos = next((eos_idx for eos_idx in eos_token_indexes if eos_idx > resp_idx), None)
                if next_eos is not None:
                    filtered_eos_token_indexes.append(next_eos)
            eos_token_indexes = filtered_eos_token_indexes

            if not response_token_ids_start_indexes or not eos_token_indexes:
                warnings.warn(
                    f"Could not find response key `{self.response_template}` in the following instance: "
                    f"{self.tokenizer.decode(batch['input_ids'][i])}. This instance will be ignored in loss "
                    "calculation. Note, if this happens often, consider increasing the `max_seq_length`.",
                    UserWarning,
                )
                batch["labels"][i, :] = self.ignore_index
            else:
                # Keep tokens for responses, rest of the tokens are ignored
                new_labels = torch.full_like(batch["labels"][i], self.ignore_index).to(batch["labels"][i].device)
                for start_idx, end_idx in zip(response_token_ids_start_indexes, eos_token_indexes):
                    new_labels[start_idx:end_idx+1] = batch["labels"][i][start_idx:end_idx+1]

                batch["labels"][i] = new_labels

        return batch

# Task 2

In [12]:
# Set up environment and paths
set_random_seed(SEED)
os.environ["WANDB_PROJECT"] = "joker-pun-translation"
TASK = "task2"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define paths
train_path = os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/joker_pun_translation_2025_train.json")
test_path = os.path.join(BASE_DIR_PREFIX,f"data/{TASK}/joker_pun_translation_2025_test.json")

In [13]:
SFT_MODEL_ID = "croissantllm/CroissantLLMChat-v0.1"#"croissantllm/CroissantLLMChat-v0.1"#"croissantllm/CroissantLLMChat-v0.1" # OpenLLM-France/Lucie-7B-Instruct-v1.1
CPO_MODEL_ID = "Skommarkhos-pun-translation-sft"

In [14]:
train_data = load_json(train_path)
test_data = load_json(test_path)

print(f"Loaded {len(train_data)} training examples and {len(test_data)} test examples")


Loaded 5838 training examples and 4537 test examples


In [15]:
# Initialize sacrebleu metrics
bleu_calc = sbmetrics.BLEU()
chrf_calc = sbmetrics.CHRF(word_order=2)
# WMT22 ensemble metric
comet22_metric = load(
    "comet",
    module_type="metric",
    model_id="Unbabel/wmt22-comet-da"
)

Downloading builder script:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.40k [00:00<?, ?B/s]

checkpoints/model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

## Metrics

In [16]:
#@title Compute translation metrics
def compute_translation_metrics(
    preds,
    refs,
    language_pairs=None,
    prompts=None,
    comet_metric=None,
):
    """
    Compute BLEU, chrF++ and optional COMET metrics for cleaned predictions and references.

    Args:
        preds: List of model-generated translations (EOS-stripped, cleaned)
        refs: List of lists of reference translations (EOS-stripped, cleaned)
        language_pairs: Optional list of source-target codes for per-language metrics
        prompts: Optional list of source prompts (required for COMET)
        comet_metric: Optional sacreCOMET metric instance

    Returns:
        Dict of metric names to scores
    """
    # Corpus-level BLEU and chrF++
    metrics = {
        'bleu': bleu_calc.corpus_score(preds, refs).score,
        'chrf++': chrf_calc.corpus_score(preds, refs).score,
    }

    # COMET if provided
    if comet_metric is not None and prompts is not None:
        comet_out = comet_metric.compute(
            sources=prompts,
            predictions=preds,
            references=[r[0] for r in refs]
        )
        metrics['comet'] = comet_out.get('mean_score')

    # Per-language breakdown
    if language_pairs is not None:
        lang_group = {}
        for idx, pair in enumerate(language_pairs):
            tgt = pair.split('-')[-1]
            group = lang_group.setdefault(tgt, {'preds': [], 'refs': []})
            group['preds'].append(preds[idx])
            group['refs'].append(refs[idx])
        for lang, data in lang_group.items():
            metrics[f'bleu_{lang}'] = bleu_calc.corpus_score(data['preds'], data['refs']).score
            metrics[f'chrf++_{lang}'] = chrf_calc.corpus_score(data['preds'], data['refs']).score

    return metrics

In [17]:
#@title Custom evaluation callback
from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)
from trl.trainer.callbacks import _generate_completions
class SacreBleuCallback(TrainerCallback):
    def __init__(
        self,
        trainer,
        eval_dataset: Dataset = None,
        generation_config=None,
        eval_steps: int = 50,
        comet_metric=None,
        language_pairs=None,
    ):
        super().__init__()
        self.trainer           = trainer
        if eval_dataset is None:
            self.eval_dataset  = trainer.eval_dataset
        else:
            self.eval_dataset  = eval_dataset
        self.tokenizer         = trainer.tokenizer
        self.generation_config = generation_config
        self.eval_steps        = eval_steps
        self.comet_metric      = comet_metric
        self.language_pairs    = language_pairs

    def _run_eval(self, args: TrainingArguments, state: TrainerState):
        # tokenizer = kwargs["processing_class"]
        self.tokenizer.padding_side = "left"
        accelerator = self.trainer.accelerator
        model = getattr(self.trainer, "ref_model", None)
        # At this point, there are two cases where `ref_model` is None:
        # 1. The method doesn't require a reference model.
        # 2. The method uses a reference model, but `ref_model` is set to None.
        #    This occurs when using PEFT, where the reference model can be obtained by simply disabling the model's adapter.
        #    In theory, we should disable the adapter here, but since it's zero-initialized at the start of training,
        #    the model behaves identically with or without the adapter.
        #    Therefore, there's no need to explicitly disable it at this point.
        if model is None:
            model = self.trainer.model_wrapped
        # Generate completions using TRL’s helper
        # with accelerator.split_between_processes(self.eval_dataset["prompt"]) as prompts:
        prompts = self.eval_dataset["prompt"]
        predictions = _generate_completions(
                prompts=prompts,
                model=model,
                tokenizer=self.tokenizer,
                accelerator=accelerator,
                generation_config=self.generation_config,
                batch_size=args.per_device_eval_batch_size,
            )

        column = "completion" if "completion" in self.eval_dataset.column_names else "chosen"
        # replace eos_token with ""
        # predictions = [pred.replace(self.tokenizer.eos_token, "") for pred in predictions]
        references = [[t.replace(self.tokenizer.eos_token, "")] for t in self.eval_dataset[column]]

        # bleu = bleu_calc.corpus_score(predictions, references).score
        # chrf = chrf_calc.corpus_score(predictions, references).score

        batch = self.tokenizer(
            prompts,
            add_special_tokens=False,
            padding=True,
            return_tensors="pt"
        )

        # Batch-decode, skipping all special tokens
        sources = self.tokenizer.batch_decode(
            batch["input_ids"],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )

        metrics = compute_translation_metrics(
            preds=predictions,
            refs=references,
            language_pairs=self.language_pairs,
            prompts=sources,
            comet_metric=self.comet_metric,
        )
        # Logging
        # if self.trainer.accelerator.is_main_process:
        # Log to HF and optionally W&B
        # Log with eval_ prefix
        self.trainer.log({f'eval_{k}': v for k, v in metrics.items()})
        if "wandb" in args.report_to:
                import wandb
                if wandb.run is not None:
                    table = wandb.Table(columns=["prompt","pred","ref"])
                    for p,pr,rf in zip(self.eval_dataset["prompt"], predictions, [r[0] for r in references]):
                        table.add_data(p, pr, rf)
                    wandb.log({"eval/translations": table})

    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        if state.global_step % self.eval_steps != 0:
            return control

        self._run_eval(args, state)
        return control

    def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        self._run_eval(args, state)
        return control

## SFT

### Data

In [18]:
def get_prompt(source_text, source_language="English", target_language = "French"):
    return f"Translate the following text from {source_language} into {target_language}.\n{source_language}: {source_text} \n{target_language}: "

In [39]:
#%%
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
import json
import os

def prepare_sft_dataset(split_ratio=(0.8, 0.05, 0.15), system_prompt=None):
    """
    Load local JSON and HuggingFace parallel data, normalize HF format,
    format prompts/completions with one function, concatenate, and split into
    train/validation/test.
    """
    # Load local JSON data
    train_json_path = os.path.join(BASE_DIR_PREFIX, "data/task2/joker_pun_translation_2025_train.json")
    with open(train_json_path, "r", encoding="utf-8") as f:
        train_list = json.load(f)
    print(f"Loaded {len(train_list)} preference examples")

    ds_joker = Dataset.from_list(train_list)

    # Load HF parallel dataset (fr-en subset)
    hf = load_dataset("haoranxu/X-ALMA-Parallel-Data", "fr-en", split="train")

    # Normalize HF examples to joker-like fields 'en' and 'fr'
    def normalize_hf(example):
        return {
            "en": example["translation"]["en"].strip(),
            "fr": example["translation"]["fr"].strip()
        }

    hf_normalized = hf.map(normalize_hf, remove_columns=["translation"])

    # Define formatter
    def format_joker(example):
        prompt_messages = []
        if system_prompt:
            prompt_messages.append({"role": "system", "content": system_prompt})
        prompt_messages.append({"role": "user", "content": get_prompt(example["en"].strip())})
        example["prompt"] = prompt_messages
        example["completion"] = [{"role": "assistant", "content": example["fr"].strip()}]
        return example

    # Apply formatting
    # ds_joker = ds_joker.map(format_joker, remove_columns=["id_en"])
    # hf_normalized = hf_normalized.map(format_joker)

    # Concatenate
    joined = concatenate_datasets([ds_joker, hf_normalized])
    def join_completion(example):
        prompt_messages = []
        if system_prompt:
            prompt_messages.append({"role": "system", "content": system_prompt})
        prompt_messages.append({"role": "user", "content": get_prompt(example["en"].strip())})
        prompt_messages.append({"role": "assistant", "content": example["fr"].strip()})
        example["messages"] = prompt_messages
#         example["text"]=tokenizer.apply_chat_template(
#     prompt_messages,
#     tokenize = False, add_generation_prompt = False
# ).removeprefix(tokenizer.bos_token) #sft duplicates bos
        return example
    # joined = joined.map(
    #     lambda examples: {
    #         "text": apply_chat_template(
    #             example["prompt"] + example["completion"],
    #             tokenizer,
    #             add_generation_prompt=False, # We want the full conversation for SFT
    #         )
    #     }
    # )
    joined = joined.map(join_completion, remove_columns=["en", "fr"])

    # Compute split fractions
    train_frac, val_frac, test_frac = split_ratio

    # Split train vs. rest
    split1 = joined.train_test_split(test_size=1 - train_frac, seed=SEED)
    train_split = split1["train"]
    rest = split1["test"]

    # Split validation vs. test
    split2 = rest.train_test_split(test_size=test_frac / (val_frac + test_frac), seed=SEED)
    val_split = split2["train"]
    test_split = split2["test"]

    ds = DatasetDict({
        "train": train_split,
        "validation": val_split,
        "test": test_split
    })

    return ds


### Utils

In [None]:
#@title Prepare model Transformers
# Load and prepare the model
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
def prepare_model(model_id="croissantllm/CroissantLLMChat-v0.1", quantization_config=None,
                  peft_config=None, peft_model=None, generation_config=None,):#"Qwen/Qwen3-0.6B"
    """
    Load and prepare the base model with PEFT configuration.
    """
    print("Preparing model...")

    # Load base model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Load model in 8-bit for memory efficiency
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        quantization_config=quantization_config,  # Use quantization config if needed
    )
    print(model)
    if generation_config:
        generation_config.eos_token_id = [tokenizer.eos_token_id]
        generation_config.pad_token_id = tokenizer.eos_token_id
        model.generation_config = generation_config

    if quantization_config:
        model = prepare_model_for_kbit_training(model)

    # Apply LoRA to model
    if peft_config and peft_model is None:
        print("Applying LoRA...")
        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()
    else:
        model = PeftModel.from_pretrained(model, peft_model)

    return model, tokenizer

In [20]:
#@title Prepare model Unsloth
from unsloth import FastLanguageModel
from dataclasses import asdict
from peft import LoraConfig, TaskType
def prepare_model_with_unsloth(
    model_id: str = "croissantllm/CroissantLLMChat-v0.1",
    max_seq_length: int = 2048,
    load_in_8bit: bool = True,
    peft_config = None,
    generation_config = None,
):
    """
    Load a model via Unsloth's FastLanguageModel to get optimized kernels
    with quantization, then attach LoRA adapters via get_peft_model.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_id,
        max_seq_length = max_seq_length,
        load_in_4bit      = False,
        load_in_8bit      = load_in_8bit,
        dtype             = None,
    )  #

    if peft_config:
        model = FastLanguageModel.get_peft_model(
            model,
            # Extract parameters from peft_config and pass directly
            r=peft_config.r,
            target_modules=list(peft_config.target_modules),
            lora_alpha=peft_config.lora_alpha,
            lora_dropout=peft_config.lora_dropout,
            bias=peft_config.bias,
            use_gradient_checkpointing = True,
            random_state = SEED,
            max_seq_length = max_seq_length,
        )


    if generation_config:
        generation_config.eos_token_id = [tokenizer.eos_token_id]
        generation_config.pad_token_id = tokenizer.eos_token_id
        print(generation_config)
        model.generation_config = generation_config

    if peft_config:
        model.print_trainable_parameters()

    return model, tokenizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
#@title validaiton function
# Validation function
def validate_model(model, tokenizer, dataset, batch_size=4):
    """
    Validate the model on a dataset and compute BLEU and chrF++ scores.
    """
    device = model.device
    model.eval()

    references = []
    predictions = []
    language_pairs = []

    for i in tqdm(range(0, len(dataset), batch_size), desc="Validating"):
        batch = dataset[i:i + batch_size]
        prompts = batch["prompt"]
        target_texts = batch["completion"]

        # Prepare references
        for text in target_texts:
            references.append([text])  # BLEU expects list of lists

        # Generate translations
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=512,
                num_beams=3,
                temperature=0.7,
                do_sample=True
            )

        # Decode outputs (strip prompts)
        for j, output in enumerate(outputs):
            prompt_tokens = inputs.input_ids[j]
            prompt_len = len(prompt_tokens)

            # Get only the generated part (exclude prompt)
            generation = output[prompt_len:]
            decoded = tokenizer.decode(generation, skip_special_tokens=True)
            predictions.append(decoded)

    # Compute metrics
    metrics = compute_translation_metrics(
        predictions=predictions,
        references=references,
        language_pairs=language_pairs
    )

    return metrics, predictions, references

## Training


In [21]:
from trl.trainer.utils import get_quantization_config
from dataclasses import dataclass

@dataclass
class QuantizationConfig:
    load_in_4bit: bool = False
    bnb_4bit_quant_type: str = "nf4"
    use_bnb_nested_quant: bool = True
    torch_dtype: str = "bfloat16"
    load_in_8bit: bool = True  # mutually exclusive with 4-bit
quantization_args = QuantizationConfig()

quantization_config = get_quantization_config(quantization_args)

In [22]:
# Define LoRA configuration for parameter-efficient fine-tuning
peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        bias="none",
        r=32,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["lm_head", "q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    )

In [23]:
GENERATION_CONFIG = GenerationConfig(
            max_new_tokens=256,
            temperature=0.2,
            top_p=0.9,
            do_sample=True,
            num_beams=3,
            repetition_penalty=1.3,
            early_stopping=True)
generation_config = GENERATION_CONFIG

In [24]:
if USE_TRANSFORMERS:
    model, tokenizer = prepare_model(SFT_MODEL_ID, quantization_config=quantization_config, peft_config=peft_config, generation_config=GENERATION_CONFIG)
else:
    model, tokenizer = prepare_model_with_unsloth(SFT_MODEL_ID,max_seq_length=512, peft_config=peft_config, generation_config=GENERATION_CONFIG)

==((====))==  Unsloth 2025.6.2: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/397M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/19.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


Unsloth: Making `model.base_model.model.model` require gradients
GenerationConfig {
  "do_sample": true,
  "early_stopping": true,
  "eos_token_id": [
    32000
  ],
  "max_new_tokens": 256,
  "num_beams": 3,
  "pad_token_id": 32000,
  "repetition_penalty": 1.3,
  "temperature": 0.2,
  "top_p": 0.9
}

trainable params: 31,072,320 || all params: 1,376,503,872 || trainable%: 2.2573


In [25]:
if DEVELOPMENT_MODE:
    print(tokenizer.eos_token)

<|im_end|>


In [40]:
sft_dataset = prepare_sft_dataset(
    split_ratio=(0.96, 0.015, 0.025),
    # system_prompt="You are a helpful assistant that translates text from English to French."
    )
if DEVELOPMENT_MODE:
    print(sft_dataset)
    print(sft_dataset['train'][0])

Loaded 5838 preference examples


Map:   0%|          | 0/10332 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id_en', 'messages'],
        num_rows: 9918
    })
    validation: Dataset({
        features: ['id_en', 'messages'],
        num_rows: 155
    })
    test: Dataset({
        features: ['id_en', 'messages'],
        num_rows: 259
    })
})
{'id_en': None, 'messages': [{'content': 'Translate the following text from English into French.\nEnglish: Scuffles also broke out later in the day with police using their batons to contain the fighting. \nFrench: ', 'role': 'user'}, {'content': 'Des échauffourées ont également éclaté plus tard dans la journée, tandis que la police faisait usage de matraques pour contenir les affrontements.', 'role': 'assistant'}]}


In [27]:
if DEVELOPMENT_MODE:
    print(sft_dataset['train'][100]["text"])

<|im_start|>user
Translate the following text from English into French.
English: She was only a Gardener's daughter, but she knows all the rakes. 
French: <|im_end|>
<|im_start|>assistant
Ce n’était qu’une fille de jardinier mais elle prenait beaucoup de râteaux.<|im_end|>



In [39]:
tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"


In [28]:
run_name = "Skommarkhos-pun-translation-sft-v12-unsloth"

In [41]:
training_args = SFTConfig(
    output_dir=f"models/{run_name}",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    max_grad_norm=1.0,
    weight_decay=0.01,
    warmup_ratio=0.01,
    lr_scheduler_type="inverse_sqrt",
    optim="adamw_8bit",
    logging_steps=1,
    save_steps=20,
    eval_strategy="steps",
    eval_steps=2,
    save_total_limit=3,
    fp16=not torch.cuda.is_bf16_supported(),  # not is_bfloat16_supported()
    bf16=torch.cuda.is_bf16_supported(),   # is_bfloat16_supported()
    # load_best_model_at_end=True,
    # metric_for_best_model="eval_chrf++",
    seed=SEED,
    report_to="wandb",
    run_name=run_name,
    logging_dir="./logs",
    push_to_hub=True,
    hub_model_id=f"igorktech/{run_name}",
    hub_private_repo=True,

    max_length=512,

    max_seq_length = 512,
    eos_token=tokenizer.eos_token,
    pad_token=tokenizer.pad_token,
    # completion_only_loss=True,
    packing=False,
    dataset_num_proc=2,
)

if USE_TRANSFORMERS:
    training_args.completion_only_loss=True

    sft_dataset = sft_dataset.remove_columns(["text"])
    sft_trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=sft_dataset["train"],
        eval_dataset=sft_dataset["validation"],
        processing_class=tokenizer
    )
else:
    # training_args.dataset_text_field = "text"

    training_args.completion_only_loss=False # Do not
    # sft_dataset = sft_dataset.remove_columns(["text"])
    # sft_dataset = sft_dataset.remove_columns(["prompt", "completion"])
#     training_args.dataset_kwargs={"skip_prepare_dataset": True}
#     sft_dataset = sft_dataset.map(
#     lambda ex: tokenizer(
#         ex["text"],
#         truncation=True,
#         max_length=512,
#         return_attention_mask=True
#     ),
#     batched=True,
#     remove_columns=["text"]
# )
    response_template = "<|start_header_id|>assistant<|end_header_id|>\n\n" if 'Lucie' in SFT_MODEL_ID else "<|im_start|>assistant"
        #set back eos_token
    # tokenizer.eos_token = "<|eot_id|>" if 'Lucie' in SFT_MODEL_ID else "<|im_end|>"
    # tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
    # tokenizer.pad_token = "<pad>" if 'Lucie' in SFT_MODEL_ID else "</s>"
    # tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    tokenizer.padding_side = "right"
    sft_trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        # tokenizer = tokenizer,
        train_dataset=sft_dataset["train"],
        eval_dataset=sft_dataset["validation"],
        args=training_args,
        data_collator=DataCollatorForCompletionsOnlyLM(
        response_template=response_template,
        tokenizer=tokenizer,
        mlm=False,
        ignore_index=-100,
    ),
    )
    sft_trainer.processing_class.eos_token = "<|eot_id|>" if 'Lucie' in SFT_MODEL_ID else "<|im_end|>"
    sft_trainer.processing_class.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
    sft_trainer.processing_class.pad_token = "<pad>" if 'Lucie' in SFT_MODEL_ID else "</s>"
    sft_trainer.processing_class.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    # If you only want to train on the assistant’s responses (ignoring prompt tokens),
    # wrap the trainer so that the loss is computed only over the generated portion:
    # from unsloth.chat_templates import train_on_responses_only
    # print(sft_trainer.train_dataset)
    # sft_trainer = train_on_responses_only(
    #     sft_trainer,
    #     instruction_part="<|start_header_id|>user<|end_header_id|>\n\n" if 'Lucie' in SFT_MODEL_ID else "<|im_start|>user\n", #"<|start_header_id|>user<|end_header_id|>\n\n",#
    #     response_part="<|start_header_id|>assistant<|end_header_id|>\n\n" if 'Lucie' in SFT_MODEL_ID else "<|im_start|>assistant", #"<|start_header_id|>assistant<|end_header_id|>\n\n"#
    #     tokenizer=tokenizer
    #     )
    print(sft_trainer.train_dataset)


generation_config = model.generation_config
# set do_sample to False for reproducibility
# generation_config.do_sample = False

sft_trainer.add_callback(
    SacreBleuCallback(
        sft_trainer,
        generation_config=generation_config,
        eval_steps=50,
        comet_metric=comet22_metric
    )
)


ValueError: Column name ['text'] not in the dataset. Current columns in the dataset: ['id_en', 'messages']

In [None]:
wandb.init(project="joker-pun-translation", name=run_name)

In [30]:
if DEVELOPMENT_MODE:
    print(sft_trainer.processing_class.eos_token)
    print(sft_trainer.processing_class.eos_token_id)
    print(sft_trainer.processing_class.pad_token)
    print(sft_trainer.processing_class.pad_token_id)
    print(sft_trainer.processing_class.bos_token)
    print(sft_trainer.processing_class.bos_token_id)

<|im_end|>
32000
</s>
2
<s>
1


In [31]:
if DEVELOPMENT_MODE:
    print(sft_trainer.processing_class.pad_token)

</s>


In [32]:
if DEVELOPMENT_MODE:
    print(sft_trainer.processing_class.eos_token_id)

32000


In [33]:
if DEVELOPMENT_MODE:
    print(generation_config)

GenerationConfig {
  "do_sample": true,
  "early_stopping": true,
  "eos_token_id": [
    32000
  ],
  "max_new_tokens": 256,
  "num_beams": 3,
  "pad_token_id": 32000,
  "repetition_penalty": 1.3,
  "temperature": 0.2,
  "top_p": 0.9
}



In [34]:
if DEVELOPMENT_MODE:
    print(sft_trainer.processing_class.chat_template)

{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}


In [35]:
#@title test trainer
if DEVELOPMENT_MODE:
    train_dataloader = sft_trainer.get_train_dataloader()
    first_batch = next(iter(train_dataloader))
    print(first_batch)
    print(first_batch['input_ids'][0])

    input_ids_batch = first_batch["input_ids"]
    decoded_texts = [tokenizer.decode(input_ids, skip_special_tokens=False) for input_ids in input_ids_batch]
    # tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in first_batch["labels"][0]]).replace(tokenizer.pad_token, " ")

    # print(decoded_texts)
    print(tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in first_batch["labels"][0]]).replace(tokenizer.pad_token, " "))
    print(tokenizer.decode([x for x in first_batch["input_ids"][0]]).replace(tokenizer.pad_token, " "))

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`completion_mask` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [36]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in sft_trainer.train_dataset[0]["labels"]]).replace(tokenizer.pad_token, " ")

KeyError: 'labels'

In [None]:
if DEVELOPMENT_MODE:
    print(sft_trainer.train_dataset[100])
    print(tokenizer.decode(sft_trainer.train_dataset[100]["input_ids"]))
    print("Print no pad")
    print(tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in sft_trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " "))

{'text': "<|im_start|>user\nTranslate the following text from English into French.\nEnglish: She was only a Gardener's daughter, but she knows all the rakes. \nFrench: <|im_end|>\n<|im_start|>assistant\nCe n’était qu’une fille de jardinier mais elle prenait beaucoup de râteaux.<|im_end|>\n<unk>", 'input_ids': [1, 32001, 3592, 19814, 27785, 1135, 3602, 3737, 1446, 6938, 2127, 9197, 3832, 16614, 133, 3676, 1463, 2155, 1110, 12710, 1944, 1361, 1106, 8588, 5661, 1663, 2293, 13510, 1543, 1135, 1501, 182, 1457, 1424, 145, 30206, 133, 488, 32000, 1424, 32001, 20036, 6493, 16570, 8652, 5921, 1125, 8650, 1374, 1723, 2399, 18267, 1370, 3713, 1125, 1177, 5190, 9484, 32000, 1424, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -10

In [None]:
sft_trainer.train()

In [None]:
%%capture
run = wandb.run
if run is not None:
    run.notes = (
        "Training Croissant transformers, 2 epochs sft, completions only, new random seed"
        ""
    )
    run.save()
wandb.finish()

In [None]:
sft_trainer.push_to_hub()
push_model_to_hub(
    run_name= run_name,
    use_transformers=USE_TRANSFORMERS,
    model=trainer.model,
    tokenizer=tokenizer,
    generation_config=GENERATION_CONFIG,
    save_dir="model",
    hub_repo_prefix="igorktech"
)

## ARPO

### Utils

#### Generate preference

In [None]:
from openai import OpenAI

client = OpenAI(
    base_url="http://127.0.0.1:8080/v1",  # Adjust the port if needed
    api_key="token",
)

def translate_text(client, source, target_language="French"):
    """
    Translate a given source text to the target language using the OpenAI API.
    """
    response = client.chat.completions.create(
        model='X-ALMA-13B-Group4.Q8_0.gguf',  # replace with your model identifier
        messages=[
            {"role": "user", "content": f"Translate this from English to {target_language}:\nEnglish: {source}\n{target_language}:"}
        ],
        temperature=0.9,
        max_tokens=512,
        top_p=0.6,
    )
    return response.choices[0].message.content

# Generate rejected responses for DPO training
def generate_rejected_responses(dataset):
    """
    Generate slightly worse translations for DPO training.
    These will serve as 'rejected' examples.
    """

    for row in tqdm(dataset, desc="Generating rejected responses"):
        row["prompt"] = get_prompt(row["en"])
        row["chosen"] = row["fr"]

        translation = translate_text(row["en"], target_language="French")
        row["rejected"] = translation

    return dataset

### Generate rejected responses

In [None]:
dataset_with_rejections = generate_rejected_responses(train_data)
# save the dataset with rejections to a new JSON file
def save_json(data, path):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
save_json(dataset_with_rejections, os.path.join(BASE_DIR_PREFIX,"data/task2/joker_pun_translation_2025_train_with_rejections.json"))


### Training

In [None]:
# import sys
# sys.path.append('./')
# sys.path.append('/content/')
from utils.cpo_trainer import CPOTrainer
from utils.cpo_config import CPOConfig

In [None]:
#@title prepare cpo dataset
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
import random
def prepare_cpo_dataset(train_json_path,add_factor=0.5, hf_dataset_name="haoranxu/X-ALMA-Preference", hf_direction_filter="en-fr", split_ratio=(0.8,0.1,0.1), system_prompt=None):

    train_list = load_json(train_json_path)
    print(f"Loaded Generated {len(train_list)} preference examples")
    ds = Dataset.from_list(train_list)
    train_size = split_ratio[0]
    val_size = split_ratio[1] / (split_ratio[1] + split_ratio[2])
    train_split = ds.train_test_split(test_size=1-split_ratio[0], seed=42)['train']
    # increase set from itself by factor 1.5 of train_split
    num_to_add = int(len(train_split) * add_factor)

    # Randomly select samples from train_split to duplicate
    # Use replace=True to allow selecting the same sample multiple times
    indices_to_duplicate = np.random.choice(len(train_split), size=num_to_add, replace=True)
    samples_to_duplicate = train_split.select(indices_to_duplicate)

    # Concatenate the original train_split with the duplicated samples
    train_split = Dataset.from_list(train_split.to_list() + samples_to_duplicate.to_list())

    rest = ds.train_test_split(test_size=1-split_ratio[0], seed=42)['test']
    val_split = rest.train_test_split(test_size=split_ratio[2]/(split_ratio[1]+split_ratio[2]), seed=42)['train']
    test_split = rest.train_test_split(test_size=split_ratio[2]/(split_ratio[1]+split_ratio[2]), seed=42)['test']

    # # Load HF dataset and filter
    # hf = load_dataset(hf_dataset_name)

    # hf_pref = hf['train'].filter(lambda x: x['directions'] == hf_direction_filter,
    #                              num_proc=4)
    # print(f"Loaded HF {len(hf_pref)} preference examples")
    # # Project HF to required columns
    # hf_examples = [{'prompt': get_prompt(ex['source']), 'chosen': ex['chosen'], 'rejected': ex['reject']} for ex in hf_pref]
    # random.seed(42)
    # hf_examples = random.sample(hf_examples, k=len(hf_examples)//2)
    # hf_ds = Dataset.from_list(hf_examples)
    # Combine HF into training
    combined_train = train_split #concatenate_datasets([train_split, hf_ds])

    # Apply system prompt and chat template if requested
    def prompt_format(example):
        prompt_messages = []
        if system_prompt:
            prompt_messages.append({"role": "system", "content": system_prompt})
        prompt_messages.append({"role": "user", "content": example['prompt']})
        example['prompt'] = prompt_messages
        example['chosen'] = [{"role": "assistant", "content": example['chosen']}]
        example['rejected'] = [{"role": "assistant", "content": example['rejected']}]
        return example

    dataset = DatasetDict({'train': combined_train, 'validation': val_split, 'test': test_split})
    dataset = dataset.map(prompt_format)
    # dataset = dataset.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer})
    # Save splits
    return dataset

In [None]:
def save_dataset_to_json(dataset, path):
    df = pd.DataFrame(dataset['train'])
    df.to_json(path, orient='records', lines=True, force_ascii=False)

In [None]:
# convert dataset to df and save to json
tokenizer = AutoTokenizer.from_pretrained("igorktech/Skommarkhos-pun-translation-sft-v10")#"Qwen/Qwen3-0.6B"
train_path = os.path.join(BASE_DIR_PREFIX,"data/task2/joker_pun_translation_2025_train_with_rejections.json")
cpo_dataset = prepare_cpo_dataset(
    train_json_path=train_path,
    hf_dataset_name="haoranxu/X-ALMA-Preference",
    hf_direction_filter="en-fr",
    split_ratio=(0.95, 0.025, 0.025),
    system_prompt=None,
)
print(cpo_dataset['train'][0])
save_dataset_to_json(cpo_dataset, os.path.join(BASE_DIR_PREFIX,"data/task2/joker_pun_translation_2025_preprocessed_arpo_only_joker-v10.json"))

tokenizer_config.json:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.67M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/284 [00:00<?, ?B/s]

Loaded Generated 5838 preference examples


Map:   0%|          | 0/8319 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

Map:   0%|          | 0/146 [00:00<?, ? examples/s]

{'id_en': 'en_5481', 'en': "There was a massive outcry against the plan to build a power line across a nature preserve. The public couldn't bear the tension.", 'fr': 'On voulait alimenter la ville en électricité par une ligne qui traverserait une réserve naturelle : cela généra beaucoup de tension.', 'prompt': [{'content': "Translate the following text from English into French.\nEnglish: There was a massive outcry against the plan to build a power line across a nature preserve. The public couldn't bear the tension. \nFrench: ", 'role': 'user'}], 'chosen': [{'content': 'On voulait alimenter la ville en électricité par une ligne qui traverserait une réserve naturelle : cela généra beaucoup de tension.', 'role': 'assistant'}], 'rejected': [{'content': 'Il y a eu un tollé massif contre le projet de construire une ligne électrique à travers une réserve naturelle. Le public ne pouvait pas supporter la tension.', 'role': 'assistant'}]}


In [None]:
from trl.trainer.utils import get_quantization_config
from dataclasses import dataclass

@dataclass
class QuantizationConfig:
    load_in_4bit: bool = False
    bnb_4bit_quant_type: str = "nf4"
    use_bnb_nested_quant: bool = True
    torch_dtype: str = "bfloat16"
    load_in_8bit: bool = True  # mutually exclusive with 4-bit

quantization_args = QuantizationConfig()

quantization_config = get_quantization_config(quantization_args)

In [None]:
peft_model = "igorktech/Skommarkhos-pun-translation-sft-v10"
model, tokenizer = prepare_model(model_id=SFT_MODEL_ID, quantization_config=quantization_config, peft_model=peft_model, generation_config=generation_config)

Preparing model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(65024, 4096, padding_idx=3)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=12288, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=12288, bias=False)
          (down_proj): Linear8bitLt(in_features=12288, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
 

In [None]:
# prompt: clean torch cache

import torch

# Clear GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("PyTorch CUDA cache cleared.")
else:
    print("CUDA not available. No cache to clear.")



PyTorch CUDA cache cleared.


In [None]:
# Define ARPO based on CPO training configuration with SimPO settings
run_name = "Skommarkhos-pun-translation-arpo-v10"
cpo_config = CPOConfig(
    output_dir=f"models/{run_name}",
    dataset_num_proc = 2,
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-7,
    max_completion_length=256,
    max_prompt_length=256,
    # max_target_length=256,
    max_length=512,
    logging_steps=1,
    max_grad_norm=1.0,
    lr_scheduler_type="inverse_sqrt",
    loss_type="simpo",
    cpo_alpha=0.0,
    weight_decay=0.01,
    warmup_ratio=0.01,
    save_steps=10,
    save_total_limit=3,
    eval_steps=10,
    eval_strategy="steps",
    fp16=torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    # load_best_model_at_end=True,
    seed=SEED,

    report_to="wandb",
    run_name=run_name,
    logging_dir="./logs",

    # HF Hub settings
    push_to_hub=True,
    hub_model_id=f"igorktech/{run_name}",
    hub_private_repo=True,
)

In [None]:
wandb.init(project="joker-pun-translation", name=run_name)

In [None]:
# Initialize ARPO trainer with SimPO settings
cpo_trainer = CPOTrainer(
    model=model,
    args=cpo_config,
    train_dataset=cpo_dataset["train"],
    eval_dataset=cpo_dataset["validation"],
    processing_class=tokenizer
)
# model.generation_config = generation_config
# set do_sample to False for reproduccibility
# generation_config.do_sample = False

cpo_trainer.add_callback(
    SacreBleuCallback(
        cpo_trainer,
        generation_config=generation_config,
        eval_steps=40,
        comet_metric=comet22_metric
    )
)

Map (num_proc=2):   0%|          | 0/8319 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/8319 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/146 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/146 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/8319 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/146 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [None]:
generation_config

GenerationConfig {
  "do_sample": true,
  "early_stopping": true,
  "eos_token_id": [
    267
  ],
  "max_new_tokens": 128,
  "num_beams": 3,
  "pad_token_id": 267,
  "repetition_penalty": 1.3,
  "temperature": 0.2,
  "top_p": 0.9
}

In [None]:
cpo_trainer.processing_class.eos_token = "<|im_end|>"
cpo_trainer.processing_class.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

In [None]:
cpo_trainer.processing_class.eos_token

'<|im_end|>'

In [None]:
cpo_trainer.processing_class.pad_token

'</s>'

In [None]:
#@title test trainer
cpo_trainer.train_dataset[0]

In [None]:
cpo_trainer.train()

In [None]:
%%capture
run = wandb.run
if run is not None:
    run.notes = (
        "Lucie ARPO only joker dataset"
        ""
    )
    run.save()
wandb.finish()

In [None]:
cpo_trainer.push_to_hub()
push_model_to_hub(
    run_name= run_name,
    use_transformers=USE_TRANSFORMERS,
    model=trainer.model,
    tokenizer=tokenizer,
    generation_config=GENERATION_CONFIG,
    save_dir="model",
    hub_repo_prefix="igorktech"
)

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.34G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


In [None]:
merged_model.generation_config = generation_config
merged_model.save_pretrained(f"models/{run_name}/final")

In [None]:
merged_model = AutoModelForCausalLM.from_pretrained(f"models/{run_name}/final")

In [None]:
tokenizer = AutoTokenizer.from_pretrained(SFT_MODEL_ID)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [None]:
# Test model
print("Running testing...")
pre_val_metrics, _, _ = validate_model(merged_model, tokenizer, cpo_dataset["test"])

## Submission

In [None]:
from typing import List, Optional
import json
import torch
from tqdm.auto import tqdm
from transformers import (
    PreTrainedModel,
    PreTrainedTokenizerBase,
    GenerationConfig,
)

def run_submission_inference(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizerBase,
    submission_json_path: str,
    run_id: str = "team1_task_2_Method",
    manual: int = 0,
    batch_size: int = 16,
    output_path: str = "submission.json",
):
    raw_examples = load_json(submission_json_path)  # List[dict]
    # build prompts
    prompts = [get_prompt(ex["en"]) for ex in raw_examples]

    # generate all outputs in one go
    generations = generate(
        prompts=prompts,
        model=model,
        tokenizer=tokenizer,
        generation_config=GENERATION_CONFIG,
        batch_size=batch_size,
    )

    # combine with metadata
    outputs = []
    for ex, gen in zip(raw_examples, generations):
        outputs.append({
            "run_id": run_id,
            "manual": manual,
            "id_en":  ex.get("id_en"),
            "en":     ex.get("en"),
            "fr":     gen,
        })

    # save
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(outputs, f, ensure_ascii=False, indent=2)

    print(f"Wrote {len(outputs)} records to {output_path}")


In [None]:
merged_model.generation_config

GenerationConfig {
  "do_sample": true,
  "early_stopping": true,
  "eos_token_id": [
    267
  ],
  "max_new_tokens": 256,
  "num_beams": 3,
  "pad_token_id": 267,
  "repetition_penalty": 1.3,
  "temperature": 0.2,
  "top_p": 0.9
}

In [None]:
submission_json_path = os.path.join(BASE_DIR_PREFIX,f'data/task2/joker_pun_translation_2025_test.json')
submission_run_id = f"{TEAM_NAME}_task_2_Lucie_SFT_ARPO"
manual = 0
run_submission_inference(
    merged_model, tokenizer, submission_json_path,
    run_id=submission_run_id, manual=manual,
    batch_size=32, output_path=os.path.join(BASE_DIR_PREFIX,f'data/task2/{run_name}.json')
)

Translating:   0%|          | 0/142 [00:00<?, ?it/s]

Wrote 4537 records to /content/drive/MyDrive/CLEF2025/data/task2/Skommarkhos-pun-translation-arpo-v10.json


# Task 3

In [None]:
import os
import json
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import apply_chat_template
from transformers import GenerationConfig
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
# import sacrebleu.metrics as sbmetrics
from transformers import TrainerCallback

In [None]:
set_random_seed(SEED)
os.environ["WANDB_PROJECT"] = "joker-wordplay-translation"
TASK = "task3"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
USE_TRANSFORMERS = True

In [None]:
SFT_MODEL_ID = "OpenLLM-France/Lucie-7B-Instruct-v1.1"#"croissantllm/CroissantLLMChat-v0.1"#

In [None]:
GENERATION_CONFIG = GenerationConfig(
            max_new_tokens=256,
            temperature=0.2,
            top_p=0.9,
            do_sample=True,
            num_beams=3,
            repetition_penalty=1.3,
            early_stopping=True)

In [None]:
# Define LoRA configuration for parameter-efficient fine-tuning
peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        bias="none",
        r=32,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["lm_head", "q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    )

In [None]:
from trl.trainer.utils import get_quantization_config
from dataclasses import dataclass

@dataclass
class QuantizationConfig:
    load_in_4bit: bool = False
    bnb_4bit_quant_type: str = "nf4"
    use_bnb_nested_quant: bool = True
    torch_dtype: str = "bfloat16"
    load_in_8bit: bool = True  # mutually exclusive with 4-bit

quantization_args = QuantizationConfig()

quantization_config = get_quantization_config(quantization_args)

In [None]:
def get_prompt(en: str, description: str,
               src_lang: str = "English",
               tgt_lang: str = "French") -> str:
    return (
        f"Translate the following {src_lang} text into {tgt_lang} (with context):"
        f"\nContext: {description}\n"
        f"{src_lang}: {en}\n{tgt_lang}: "
    )

In [None]:
def load_joker_dataset(path: str, is_train: bool):
    items = load_json(path)
    ds = Dataset.from_list(items)
    def fmt(ex):
        ex['prompt'] = get_prompt(ex['en'], ex.get('description', ''))
        if is_train:
            ex['completion'] = ex['fr']
        return ex
    remove_cols = ['fr'] if is_train else []
    return ds.map(fmt, remove_columns=remove_cols)

train_ds = load_joker_dataset(f"{BASE_DIR_PREFIX}/data/task3/joker_onomastic_2025_train.json", True)
print("Train dataset length:", len(train_ds))
val_ds = train_ds.train_test_split(test_size=0.05, seed=42)

test_ds = load_joker_dataset(f"{BASE_DIR_PREFIX}/data/task3/joker_onomastic_2025_test.json", False)

# convert to one Dataset
sft_dataset = DatasetDict({'train': train_ds, 'validation': val_ds})

Map:   0%|          | 0/353 [00:00<?, ? examples/s]

Train dataset length: 353


Map:   0%|          | 0/2696 [00:00<?, ? examples/s]

In [None]:
model, tokenizer = prepare_model(SFT_MODEL_ID, quantization_config=quantization_config, peft_config=peft_config, generation_config=GENERATION_CONFIG)

Preparing model...


tokenizer_config.json:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.82M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/703 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.51G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/161 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(65024, 4096, padding_idx=3)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=12288, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=12288, bias=False)
          (down_proj): Linear8bitLt(in_features=12288, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
 

In [None]:
GENERATION_CONFIG.eos_token_id = [tokenizer.convert_tokens_to_ids(tokenizer.eos_token)]

In [None]:
run_name = "Skommarkhos-wordplay-translation-sft-v2"

In [None]:
training_args = SFTConfig(
    output_dir=f"models/{run_name}",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    max_grad_norm=1.0,
    weight_decay=0.01,
    warmup_ratio=0.01,
    lr_scheduler_type="inverse_sqrt",
    optim="adamw_8bit",
    logging_steps=1,
    save_steps=10,
    eval_strategy="steps",
    eval_steps=2,
    save_total_limit=2,
    fp16=False,  # not is_bfloat16_supported()
    bf16=True,
    seed=SEED,
    report_to="wandb",
    run_name=run_name,
    logging_dir="./logs",
    push_to_hub=True,
    hub_model_id=f"igorktech/{run_name}",
    hub_private_repo=True,

    max_length=512,

    max_seq_length = 512,
    eos_token=tokenizer.eos_token,
    pad_token=tokenizer.pad_token,
    completion_only_loss=True,
    packing=False,
    dataset_num_proc=2,
)
sft_trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=sft_dataset["train"],
    eval_dataset=sft_dataset["validation"],
    processing_class=tokenizer
)

Converting train dataset to ChatML (num_proc=2):   0%|          | 0/353 [00:00<?, ? examples/s]

Adding EOS to train dataset (num_proc=2):   0%|          | 0/353 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/353 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/353 [00:00<?, ? examples/s]

Converting train dataset to ChatML (num_proc=2):   0%|          | 0/335 [00:00<?, ? examples/s]

Adding EOS to train dataset (num_proc=2):   0%|          | 0/335 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/335 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/335 [00:00<?, ? examples/s]

Converting test dataset to ChatML (num_proc=2):   0%|          | 0/18 [00:00<?, ? examples/s]

Adding EOS to test dataset (num_proc=2):   0%|          | 0/18 [00:00<?, ? examples/s]

Tokenizing test dataset (num_proc=2):   0%|          | 0/18 [00:00<?, ? examples/s]

Truncating test dataset (num_proc=2):   0%|          | 0/18 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
#@title test trainer
train_dataloader = sft_trainer.get_train_dataloader()
first_batch = next(iter(train_dataloader))
print(first_batch['input_ids'][0])

input_ids_batch = first_batch["input_ids"]
decoded_texts = [tokenizer.decode(input_ids, skip_special_tokens=False) for input_ids in input_ids_batch]
print(decoded_texts)

In [None]:
# %%capture
wandb.init(project="joker-wordplay-translation", name=run_name)

In [None]:
sft_trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss,Train Loss,Test Loss
2,5.3765,No log,4.760882,4.340468
4,4.3822,No log,3.244724,2.727705
6,2.6611,No log,2.253819,1.789622
8,2.493,No log,2.123108,1.702559
10,2.4263,No log,1.958215,1.624847
12,2.4153,No log,1.83967,1.480369
14,1.5159,No log,1.764908,1.427814
16,1.3338,No log,1.70239,1.368856
18,1.6426,No log,1.631485,1.306916
20,1.2891,No log,1.576118,1.276182


TrainOutput(global_step=24, training_loss=2.490927671392759, metrics={'train_runtime': 755.6415, 'train_samples_per_second': 0.934, 'train_steps_per_second': 0.032, 'total_flos': 5160813244760064.0, 'train_loss': 2.490927671392759})

In [None]:
# %%capture
import wandb
run = wandb.run
if run is not None:
    run.notes = (
        f"{run_name} {SFT_MODEL_ID} only joker dataset"
        ""
    )
    run.save()
wandb.finish()



0,1
eval/mean_token_accuracy,▁▁▁▂▅▆▅▆▅▆▆▆▆▆▆▆▆▆▆█▇█▇█
eval/num_tokens,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▆▆▆▆▇▇████
eval/test_loss,█▄▂▂▂▂▁▁▁▁▁▁
eval/test_runtime,▂▁▁▁▃▆▂▂▂▄█▂
eval/test_samples_per_second,▆███▆▃▇▇▇▅▁▇
eval/test_steps_per_second,▆███▆▃▇▇▇▅▁▇
eval/train_loss,█▅▃▂▂▂▂▂▁▁▁▁
eval/train_runtime,▁▁▂▂▂▄▃▃▄▃█▄
eval/train_samples_per_second,██▇▇▇▅▆▆▅▆▁▅
eval/train_steps_per_second,██▇▇▇▄▆▆▅▆▁▅

0,1
eval/mean_token_accuracy,0.83529
eval/num_tokens,80654.0
eval/test_loss,1.23344
eval/test_runtime,2.1339
eval/test_samples_per_second,8.435
eval/test_steps_per_second,1.406
eval/train_loss,1.46539
eval/train_runtime,32.7109
eval/train_samples_per_second,10.241
eval/train_steps_per_second,1.284


In [None]:
sft_trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/igorktech/Skommarkhos-wordplay-translation-sft-v2/commit/28903aa97d64aa84cf274ad71bec279b4ee90a04', commit_message='End of training', commit_description='', oid='28903aa97d64aa84cf274ad71bec279b4ee90a04', pr_url=None, repo_url=RepoUrl('https://huggingface.co/igorktech/Skommarkhos-wordplay-translation-sft-v2', endpoint='https://huggingface.co', repo_type='model', repo_id='igorktech/Skommarkhos-wordplay-translation-sft-v2'), pr_revision=None, pr_num=None)

In [None]:
push_model_to_hub(
    run_name=run_name,
    use_transformers=USE_TRANSFORMERS,
    model=sft_trainer.model,
    tokenizer=tokenizer,
    generation_config=GENERATION_CONFIG,
    save_dir="model",
    hub_repo_prefix="igorktech"
)

No files have been modified since last commit. Skipping to prevent empty commit.


README.md:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


In [None]:
peft_model = f"igorktech/{run_name}"
model, tokenizer = prepare_model(model_id=SFT_MODEL_ID, quantization_config=quantization_config, peft_model=peft_model, generation_config=GENERATION_CONFIG)

Preparing model...


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32002, 2048)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear8bitLt(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear8bitLt(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRM

adapter_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/386M [00:00<?, ?B/s]

In [None]:
GENERATION_CONFIG.eos_token_id = [tokenizer.convert_tokens_to_ids(tokenizer.eos_token)]

In [None]:
from tqdm import tqdm
from transformers import GenerationConfig
def run_inference(sub_path, run_id, model, manual=0, batch_size=8,generation_config: GenerationConfig=GENERATION_CONFIG, out_path='submission.json'):
    examples = load_json(sub_path)
    outputs = []
    for i in tqdm(range(0, len(examples), batch_size)):
        batch = examples[i:i+batch_size]
        prompts = [get_prompt(e['en'], e.get('description','')) for e in batch]
        decs = generate(prompts, model, tokenizer, generation_config, batch_size)
        for ex, fr in zip(batch, decs):
            outputs.append({
                'run_id': run_id,
                'manual': manual,
                'id': ex['id'],
                'en': ex['en'],
                'fr': fr,
            })
    json.dump(outputs, open(out_path,'w',encoding='utf-8'), ensure_ascii=False, indent=2)
    print(f"Wrote {len(outputs)} to {out_path}")

In [None]:
GENERATION_CONFIG.max_new_tokens = 64
# model = model.merge_and_unload()
run_inference(
    f"{BASE_DIR_PREFIX}/data/task3/joker_onomastic_2025_train.json",
    f"{TEAM_NAME}_task3_CroissantLLMChat-v0.1_SFT_Q8B_LoRA",
    model=model,
    manual=0,
    batch_size=16,
    generation_config=GENERATION_CONFIG,
    out_path=f"{BASE_DIR_PREFIX}/data/task3/{run_name}.json"
)

  0%|          | 0/23 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Setting pad/eos token IDs


  4%|▍         | 1/23 [00:10<03:47, 10.35s/it]

Setting pad/eos token IDs


  9%|▊         | 2/23 [00:30<05:41, 16.26s/it]

Setting pad/eos token IDs


 13%|█▎        | 3/23 [00:39<04:18, 12.90s/it]

Setting pad/eos token IDs


 17%|█▋        | 4/23 [00:50<03:49, 12.07s/it]

Setting pad/eos token IDs


 22%|██▏       | 5/23 [01:05<03:59, 13.29s/it]

Setting pad/eos token IDs


 26%|██▌       | 6/23 [01:16<03:28, 12.26s/it]

Setting pad/eos token IDs


 30%|███       | 7/23 [01:29<03:20, 12.51s/it]

Setting pad/eos token IDs


 35%|███▍      | 8/23 [01:44<03:20, 13.38s/it]

Setting pad/eos token IDs


 39%|███▉      | 9/23 [01:55<02:57, 12.65s/it]

Setting pad/eos token IDs


 43%|████▎     | 10/23 [02:08<02:44, 12.63s/it]

Setting pad/eos token IDs


 48%|████▊     | 11/23 [02:19<02:28, 12.34s/it]

Setting pad/eos token IDs


 52%|█████▏    | 12/23 [02:41<02:45, 15.08s/it]

Setting pad/eos token IDs


 57%|█████▋    | 13/23 [02:52<02:20, 14.06s/it]

Setting pad/eos token IDs


 61%|██████    | 14/23 [03:10<02:15, 15.11s/it]

Setting pad/eos token IDs


 65%|██████▌   | 15/23 [03:29<02:10, 16.31s/it]

Setting pad/eos token IDs


 70%|██████▉   | 16/23 [03:40<01:43, 14.78s/it]

Setting pad/eos token IDs


 74%|███████▍  | 17/23 [03:48<01:15, 12.63s/it]

Setting pad/eos token IDs


 78%|███████▊  | 18/23 [04:02<01:05, 13.04s/it]

Setting pad/eos token IDs


 83%|████████▎ | 19/23 [04:37<01:18, 19.62s/it]

Setting pad/eos token IDs


 87%|████████▋ | 20/23 [05:22<01:22, 27.43s/it]

Setting pad/eos token IDs


 91%|█████████▏| 21/23 [05:35<00:45, 22.85s/it]

Setting pad/eos token IDs


 96%|█████████▌| 22/23 [05:53<00:21, 21.49s/it]

Setting pad/eos token IDs


100%|██████████| 23/23 [05:56<00:00, 15.52s/it]

Wrote 353 to /content/drive/MyDrive/CLEF2025/data/task3/Skommarkhos-wordplay-translation-sft-v2.json



