In [4]:
%load_ext autoreload
%autoreload 2

from boolrank import *
from my_processing import paths_to_dataset
import numpy as np

# Loss variant handed to the model constructor. The same string is also the
# first directory component of `model_path` built in the configuration cell.
loss = "siglip"
# loss = "clip"

# Backbone selection: leave exactly one constructor call uncommented.
# DualSiglip2Model is provided by the `boolrank` star-import above; the first
# argument looks like a Hugging Face model id -- TODO confirm against boolrank.
# model = DualSiglip2Model('BAAI/llm-embedder', loss)
model = DualSiglip2Model('BAAI/bge-small-en-v1.5', loss)
# model = DualSiglip2Model('dmis-lab/biobert-v1.1', loss)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# --- Optimisation settings ---------------------------------------------------
# (alternative schedule that was tried: epochs = 5 * batch_size)
batch_size, epochs, lr = 2, 10, 1e-7
eval_batch = 30
power = 1  # exponent applied to each example's quality score to get its sampling weight

# --- Column names read from every dataset record -----------------------------
bool_key, nl_key, qual_key = "bool_query", "nl_query", "quality"

# --- Input files and train/test composition ----------------------------------
data_path = "training"
path = "data/{}.jsonl"
pubmed, TAR, sysrev = (path.format(stem) for stem in (data_path, "TAR_data", "sysrev_conv"))
train_sources = ['pubmed-searchrefiner']
# train_sources += ['pubmed-query', 'raw-jsonl']
dataset = paths_to_dataset(
    [pubmed, TAR, sysrev],
    train_sources=train_sources,
    test_only_sources=['TAR', 'sysrev'],
)

print(dataset)

# Per-example sampling weights, consumed by the WeightedRandomSampler in the training cell.
weights = np.power(np.array(dataset["train"][qual_key]), power)

# --- Run name ----------------------------------------------------------------
# A setting only shows up in the name when it differs from its default value
# (batch size 2, learning rate 1e-7, power 1).
b_n = f"b{batch_size}_" if batch_size != 2 else ""
lr_n = f"lr{lr:.0E}_" if lr != 1e-7 else ""
pow_n = f"^{power}" if power != 1 else ""
# Distinct training sources, each truncated to 10 characters, joined by "_".
data_n = '_'.join([source[:10] for source in np.unique(dataset['train']['source'])])

# Keep only the part of the model id after the last "/".
model_name = model.model_name.rsplit("/", 1)[-1]
model_path = f"{loss}/{model_name}/{b_n}{lr_n}({data_n}){pow_n}"
print(model_path)

In [None]:
import os
import re
from transformers import Trainer, TrainingArguments
from transformers.utils.notebook import NotebookProgressCallback
from custom_trainer import NotebookProgressCallbackNoTable, WandbCallbackAveraged
from evaluation import compute_metrics
from torch.utils.data import WeightedRandomSampler

from torch.utils.data import DataLoader

# Draw len(weights) indices per pass, with replacement, with probability
# proportional to each example's entry in `weights`.
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)


class WeightedTrainer(Trainer):
    """Trainer whose training batches are drawn by the notebook-level `sampler`.

    Only the training dataloader is overridden; evaluation keeps the stock
    Trainer behaviour.

    NOTE(review): the stock `get_train_dataloader` may apply extra wrapping
    (e.g. for multi-device runs) that this override does not -- confirm before
    using this class outside a single-process setup.
    """

    def get_train_dataloader(self):
        """Build the training DataLoader using the weighted sampler.

        Returns:
            A DataLoader over `self.train_dataset` that uses the configured
            train batch size, the notebook-level `sampler`, and the trainer's
            data collator.

        Raises:
            ValueError: if no training dataset was given, or if the sampler
                was built for a different number of examples than the
                training dataset holds (typically stale kernel state after
                the dataset was rebuilt without re-running this cell).
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        if len(sampler.weights) != len(self.train_dataset):
            raise ValueError(
                f"sampler was built for {len(sampler.weights)} examples but the "
                f"training dataset has {len(self.train_dataset)}; rebuild `weights` "
                "and `sampler` from the current dataset."
            )
        # DataLoader is imported explicitly above instead of being reached
        # through a `torch` name that only a star-import could have provided.
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=sampler,
            collate_fn=self.data_collator,
        )

# Weights & Biases settings picked up from the environment:
# project name, and WANDB_LOG_MODEL set to "false".
os.environ.update({
    "WANDB_PROJECT": "Boolean-Ranking",
    "WANDB_LOG_MODEL": "false",
})

# epochs = 10
training_args = TrainingArguments(
    # Where checkpoints go and how the run is labelled.
    output_dir=f"models/{model_path}",
    run_name=model_path,
    # Optimisation.
    num_train_epochs=epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    optim="adamw_bnb_8bit",
    bf16=True,
    # max_steps=1000,
    # Keep every dataset column: the custom collate function reads them by name.
    remove_unused_columns=False,
    # Evaluation: once before training starts, then every 200 steps.
    eval_strategy="steps",
    eval_steps=200,
    eval_on_start=True,
    per_device_eval_batch_size=eval_batch,
    # Logging and checkpointing cadence; only one checkpoint is retained.
    logging_steps=100,
    save_steps=1000,
    save_total_limit=1,
)

def collate_fn(batch):
    """Regroup a list of example records into two parallel lists.

    Args:
        batch: iterable of records, each indexable by the notebook-level
            `bool_key` and `nl_key` column names.

    Returns:
        dict with "in_bool" (the `bool_key` value of every record, in batch
        order) and "in_text" (the `nl_key` value of every record, in batch
        order). These are presumably the keyword names the model's forward
        expects -- verify against boolrank.
    """
    bool_queries, nl_queries = [], []
    for example in batch:
        bool_queries.append(example[bool_key])
        # Disabled variant that strips bracketed tags from the text first:
        #   re.sub("\[.*?\]", "", example[nl_key])
        nl_queries.append(example[nl_key])
    return {"in_bool": bool_queries, "in_text": nl_queries}

# Use plain `Trainer(...)` here instead to train without the weighted sampler.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

# Drop the stock notebook progress callback, then register the two project
# callbacks from custom_trainer (a progress display without the table, judging
# by its name, and the averaged W&B logger).
trainer.remove_callback(NotebookProgressCallback)
for callback in (NotebookProgressCallbackNoTable, WandbCallbackAveraged):
    trainer.add_callback(callback)

# Evaluate on the TAR test split only; training itself is currently disabled.
trainer.evaluate(dataset["test"]["TAR"])
# trainer.train()
# try: trainer.train(resume_from_checkpoint=True)
# except: trainer.train(resume_from_checkpoint=False)

In [None]:
# Call the model directly on the complete TAR test split: every Boolean query
# and every natural-language query of the split is passed in one call.
tar = dataset["test"]["TAR"]
# NOTE(review): the meaning of the third positional argument (False) is defined
# by the model's call signature in boolrank -- confirm what it switches off.
model(tar[bool_key], tar[nl_key], False)

In [12]:
from pathlib import Path
from evaluation import evaluate_on_generated

# Directory passed to evaluate_on_generated; the table below is that call's result
# for the model as it currently sits in memory.
DIR = Path("data", "combined_outputs")
evaluate_on_generated(DIR, model)

Unnamed: 0,model,spearman,norm_offset_sum,avg_queries_per_prompt,med_queries_per_prompt
0,gpt-3,0.192,0.556,5.625,6.0
1,gpt-4-1106-preview,0.553,0.346,3.324,3.0
2,gpt-4o-mini,0.435,0.439,5.05,5.0
3,HuggingfaceH4,0.206,0.528,4.462,5.0
4,meta-llama,0.194,0.575,5.513,6.0
5,mistralai,0.22,0.504,4.205,4.0
6,o1-2024-12-17,0.01,0.613,4.436,5.0
7,open-mistral-7b,0.442,0.407,4.333,5.0
8,open-mixtral-8x7b,0.27,0.524,4.692,5.0
9,Average,0.28,0.499,4.627,4.889


In [18]:
# Checkpoint paths use forward slashes so they resolve on POSIX systems as well
# as on Windows (the backslash form only works on Windows).
# NOTE(review): this checkpoint lives under the "clip" run directory while the
# first cell currently builds the model with loss = "siglip" -- confirm the
# in-memory model matches the checkpoint before trusting the table below.
model.load("models/clip/bge-small-en-v1.5/b16_lr1E-05_(pubmed-que_pubmed-sea_raw-jsonl)^4/checkpoint-11288/model.safetensors")
# model.load("models/clip/biobert-v1.1/b16_lr1E-05_(pubmed-que_pubmed-sea_raw-jsonl)^4/checkpoint-14110/model.safetensors")
evaluate_on_generated(DIR, model)

Unnamed: 0,model,spearman,norm_offset_sum,avg_queries_per_prompt,med_queries_per_prompt
0,gpt-3,0.111,0.587,5.625,6.0
1,gpt-4-1106-preview,0.13,0.568,3.324,3.0
2,gpt-4o-mini,0.001,0.64,5.05,5.0
3,HuggingfaceH4,0.024,0.632,4.462,5.0
4,meta-llama,0.014,0.65,5.513,6.0
5,mistralai,-0.218,0.749,4.205,4.0
6,o1-2024-12-17,0.023,0.65,4.436,5.0
7,open-mistral-7b,0.108,0.561,4.333,5.0
8,open-mixtral-8x7b,0.095,0.616,4.692,5.0
9,Average,0.032,0.628,4.627,4.889


In [19]:
# Same evaluation for the run without an `lr...` tag in its directory name.
# The path uses forward slashes so it resolves on POSIX systems as well as on
# Windows (the backslash form only works on Windows).
model.load("models/clip/bge-small-en-v1.5/b16_(pubmed-que_pubmed-sea_raw-jsonl)^4/checkpoint-11288/model.safetensors")
evaluate_on_generated(DIR, model)

Unnamed: 0,model,spearman,norm_offset_sum,avg_queries_per_prompt,med_queries_per_prompt
0,gpt-3,0.205,0.567,5.625,6.0
1,gpt-4-1106-preview,0.517,0.345,3.324,3.0
2,gpt-4o-mini,0.364,0.47,5.05,5.0
3,HuggingfaceH4,0.164,0.556,4.462,5.0
4,meta-llama,0.086,0.611,5.513,6.0
5,mistralai,0.213,0.534,4.205,4.0
6,o1-2024-12-17,0.051,0.613,4.436,5.0
7,open-mistral-7b,0.393,0.463,4.333,5.0
8,open-mixtral-8x7b,0.228,0.548,4.692,5.0
9,Average,0.247,0.523,4.627,4.888889


In [None]:
from evaluation import evaluate
# Checkpoint directories to evaluate. `None` means "evaluate the model currently
# in memory and store the figures under models/<model_path>".
# paths = [r"models/siglip2/old/b2-bf-8b/e4", r"models/siglip2/old/b3-bf-8b"]
paths = [None]
amt = eval_batch  # number of examples taken from the start of each test split
fig_format = "pdf"  # not named `format`, which would shadow the builtin for the whole kernel

# The loop variable is not called `path`, so the "data/{}.jsonl" template
# defined in the configuration cell is left intact.
for ckpt_dir in paths:
    if ckpt_dir is not None:
        model.load(ckpt_dir + "/model.safetensors")
        print(ckpt_dir)
        # Explicit entries already name the checkpoint directory (they are used
        # as-is for loading), so figures go next to the weights instead of
        # under an extra "models/" prefix.
        out_dir = ckpt_dir
    else:
        out_dir = f"models/{model_path}"
    # savefig does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    for key, data in dataset["test"].items():
        res = evaluate(model, data[bool_key][:amt], data[nl_key][:amt], plot=True, title=key)
        # One file per test split; a shared file name would leave only the
        # last split's figure on disk.
        res["plot"].savefig(f"{out_dir}/test_stats_{key}.{fig_format}", format=fig_format)