In [1]:
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, TaskType, get_peft_model
import torch
from torch.utils.data import DataLoader, random_split
import sys, os, copy
from functools import partial

# be able to import from src
sys.path.append(os.path.abspath(".."))

from src.models.causal_lm import NMTModel
from src.data import Medline, collate_translations, get_translation_prompt_skeleton, full_lang_name

  from .autonotebook import tqdm as notebook_tqdm


### Load model

In [2]:
# # NOTE: replace with accurate model directory, preferably not newer than 2023
# model_dir = "../data/qwen0-5b"
# out_dir = "../data/out"

# # load model
# model = NMTModel(model_dir, "cpu")
from transformers import AutoTokenizer, AutoModelForCausalLM

model = "Qwen/Qwen2-0.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(
    model,
    device_map='cpu',
    dtype="auto",
)

In [3]:
# prompt format adapted from "How Good Are GPT Models at Machine Translation? A Comprehensive Evaluation"
# example_prompts = [
#     "Translate this from English to Japanese: English: Would you like something to eat? Japanese: ",
#     "Translate this from English to German: English: Would you like something to eat? German: ",
# ]

# model.prompt_batch(example_prompts)

## Load data

In [4]:
dset = Medline("de", "en", "../data/wmt22")

train_dset, test_dset = random_split(dset, [0.8, 0.2])

In [15]:
prompt_form = get_translation_prompt_skeleton(full_lang_name(dset.lang_from), full_lang_name(dset.lang_to))

torch.cuda.memory._record_memory_history()

dloader = DataLoader(
    dset,
    batch_size=32,
    collate_fn=partial(collate_translations, tokenizer=tokenizer, prompt_form=prompt_form, device="cpu")
)

torch.cuda.memory._dump_snapshot("after_dataloader.pickle")


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [14]:
next(iter(dloader))

# get shape of a batch
batch = next(iter(dloader))
for k, v in batch.items():
    print(k, v.shape)

input_ids torch.Size([32, 1284])
attention_mask torch.Size([32, 1284])
labels torch.Size([32, 1284])


### PEFT

In [7]:
# load finetuneable model
# TODO: find good parameters
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
model_peft = get_peft_model(model, peft_config)

model_peft.print_trainable_parameters()

trainable params: 540,672 || all params: 494,573,440 || trainable%: 0.1093


In [8]:
# TODO: find good parameters
training_args = TrainingArguments(
    output_dir=out_dir,
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# TODO: find good parameters
# TODO: override get_train_dataloader, get_eval_dataloader, and get_test_dataloader
# TODO: set ignore_index in the loss function
trainer = EncoderTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dset,
    eval_dataset=test_dset,
    processing_class=model.tokenizer,
    compute_metrics=...,
)

trainer.train()

NameError: name 'out_dir' is not defined