In [1]:
import torch
import os
import transformers
from tokenizers import AddedToken
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import fire
from dataset_utils import LANG_TABLE, load_mt_dataset
import inference
import model_utils

In [2]:
os.environ['CUDA_VISIBLE_DEVICES']='1'

In [3]:
model_path = "models/gemma-2-2b-it"

In [2]:
ft_model_lora_path = "trains/ft_mmt_gemma2_lora/checkpoint-58702"

In [3]:
model, tokenizer = model_utils.load_model(ft_model_lora_path)

Loading tokenizer and model from: trains/ft_mmt_gemma2_lora/checkpoint-58702


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
model, tokenizer = model_utils.load_quantized_model(model_path)

Loading tokenizer and model with quantization config from: models/gemma-2-2b-it


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
pair = "de-en"
src_text = "Die Ware hat unter 20 Euro gekostet."
tgt_text = "The goods cost less than 20 euros."

In [5]:
reponse = inference.translate(model, tokenizer, src_text, pair)

In [6]:
reponse

'The goods cost less than 20 euros. No explanation. " secto." "Libra." "summary". "Vasyl". "Tristan". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan\'s weapon of choice". "Tristan'

## Evaluate

In [8]:
import evaluate

metric = evaluate.load("sacrebleu")

In [9]:
result = metric.compute(predictions=[reponse], references=[tgt_text])

In [10]:
result

{'score': 35.640264633541825,
 'counts': [6, 4, 2, 0],
 'totals': [7, 6, 5, 4],
 'precisions': [85.71428571428571, 66.66666666666667, 40.0, 12.5],
 'bp': 0.8668778997501817,
 'sys_len': 7,
 'ref_len': 8}

## Test set

In [15]:
import dataset_utils
from transformers import GenerationConfig
from torch.utils.data import DataLoader
from tqdm import tqdm

In [12]:
deen_dataset = load_mt_dataset("haoranxu/WMT22-Test", pair="de-en", split="test[:50%]", tokenizer=tokenizer)

In [13]:
deen_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'label_text'],
    num_rows: 992
})

In [16]:
print(deen_dataset[0])

{'input_ids': [[2, 6176, 36142, 235292, 80056, 736, 774, 5132, 577, 4645, 235269, 793, 10200, 673, 108, 6176, 4820, 235292, 108, 6721, 29852, 3954, 8411, 235248, 235284, 235276, 3445, 1465, 21789, 4947, 235265, 108, 6176, 41589, 235292, 108]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'label_text': 'The goods cost less than 20 euros.'}


In [16]:
deen_dataset.set_format("torch")

In [19]:
# Create a DataLoader
batch_size = 16
test_dataloader = DataLoader(deen_dataset, batch_size=batch_size)

## Manual test

In [20]:
gen_config = GenerationConfig(
    max_new_tokens=512,
    do_sample=True,
    top_p=0.9,
    top_k=40,
    temperature=0.1,
    repetition_penalty=1.05,
)

In [22]:
for batch in test_dataloader:
    # Process your batch here

    print(batch)
    break
    

RuntimeError: stack expects each tensor to be equal size, but got [36] at entry 0 and [37] at entry 1

In [35]:
hypotheses = []
references = []

for example in tqdm(deen_dataset, desc="Generating translations"):
    references.append(example['label_text'])
    input_ids = torch.tensor(example['input_ids']).to(model.device)
    attention_mask = torch.tensor(example['attention_mask']).to(model.device)
    # del example['label_text']
    
    # import pdb; pdb.set_trace()
    # example = example.to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=gen_config
        )
    # import pdb; pdb.set_trace()
    translations = tokenizer.batch_decode(outputs[:, input_ids.size(1):], skip_special_tokens=True)[0].strip()
    hypotheses.append(translations)
    
    # break

Generating translations:   2%|‚ñè         | 21/992 [00:25<19:56,  1.23s/it]


KeyboardInterrupt: 

In [39]:
len(hypotheses)

21

In [43]:
len(references)

21

In [44]:
result = metric.compute(predictions=hypotheses, references=references)

In [45]:
result

{'score': 31.892819797030025,
 'counts': [230, 138, 84, 55],
 'totals': [336, 315, 294, 273],
 'precisions': [68.45238095238095,
  43.80952380952381,
  28.571428571428573,
  20.146520146520146],
 'bp': 0.879874328284017,
 'sys_len': 336,
 'ref_len': 379}

In [17]:


def batch_translate(batch, model, tokenizer, batch_size=32):
    all_translations = []
    for i in range(0, len(batch['input_ids']), batch_size):
        input_ids = torch.tensor(batch['input_ids'][i:i+batch_size])
        attention_mask = torch.tensor(batch['attention_mask'][i:i+batch_size])
        
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                generation_config=gen_config
            )
        
        translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        all_translations.extend(translations)
    
    return all_translations

In [22]:
batched_dataset = deen_dataset.batch(batch_size=32)

AttributeError: 'Dataset' object has no attribute 'batch'

In [25]:
type(deen_dataset)

datasets.arrow_dataset.Dataset

In [21]:
# Generate translations
hypotheses = []
references = []

for batch in tqdm(deen_dataset.batch(32), desc="Generating translations"):
    batch_translations = batch_translate(batch, model, tokenizer)
    hypotheses.extend(batch_translations)
    references.extend(batch['label_text'])

    break

AttributeError: 'Dataset' object has no attribute 'batch'

## Trainer with test set

In [8]:
from transformers import Trainer, TrainingArguments
import evaluate

In [9]:
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

In [10]:
def compute_metrics(eval_preds):
    import pdb;pdb.set_trace()
    preds, labels = eval_preds
    # Decode generated translations
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Decode reference translations
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Compute BLEU score
    bleu_results = bleu.compute(predictions=decoded_preds, references=[[r] for r in decoded_labels])
    
    # Compute ROUGE scores
    rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    
    return {
        "bleu_score": bleu_results["score"],
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "rougeLsum": rouge_results["rougeLsum"],
    }

In [14]:
train_args = TrainingArguments(
    output_dir="test", 
    do_train=False, 
    do_eval=True,
    per_device_eval_batch_size=16, 
    prediction_loss_only=False
)

In [15]:
trainer = Trainer(model=model, args=train_args, tokenizer=tokenizer, compute_metrics=compute_metrics,)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
# Run evaluation
eval_results = trainer.evaluate(eval_dataset=deen_dataset)

TypeError: device() received an invalid combination of arguments - got (NoneType), but expected one of:
 * (torch.device device)
      didn't match because some of the arguments have invalid types: (!NoneType!)
 * (str type, int index)


In [None]:
print("Evaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value}")