In [1]:
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, DataCollatorForLanguageModeling, GenerationConfig, pipeline
from args import TrainingArguments, DataTrainingArguments, ArgumentParser

from peft import get_peft_model

from arithmetics import PromptArithmeticsConfig

from tasks import Preprocessor

# from safetensors import safe_open

In [2]:
parser = ArgumentParser(
    (TrainingArguments, DataTrainingArguments, PromptArithmeticsConfig)
)

training_args, data_args, pt_args = parser.parse_toml_file("./configs/prompt_tuning/single-task/llama3_8b.toml")

In [3]:
model = AutoModelForCausalLM.from_pretrained(training_args.model_name_or_path, torch_dtype=torch.bfloat16).to("cuda")
model = get_peft_model(model, peft_config=pt_args)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained(data_args.data_tokenizer_name_or_path, trust_remote_code=True, padding_side="left")
tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
preprocessor = Preprocessor(
            [data_args.dataset_names[0]], data_args, training_args, pt_args, tokenizer
        )

train_dataset, valid_dataset, test_dataset = preprocessor.get_data()

Max target lengths: [2]


Running yelp_polarity_text_preprocessor on dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Running preprocess_function on train_dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Running yelp_polarity_text_preprocessor on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Running preprocess_function on valid_dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Running yelp_polarity_text_preprocessor on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Running preprocess_function on test_dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

In [6]:
print(train_dataset[0]["input_ids"].count(128001))
print(train_dataset[0]["input_ids"].count(128002))
print(train_dataset[0]["attention_mask"].count(0))
print(train_dataset[6]["labels"])
print(tokenizer.decode(train_dataset[6]["input_ids"]))
print(tokenizer.decode(train_dataset[2]["labels"][-2:]))
print(tokenizer.decode(train_dataset[2]["input_ids"][-2:]))
print(train_dataset[6]["labels"][-3:])
print(train_dataset[6]["input_ids"][-3:])
print(train_dataset[1]["labels"].count(-100))

1
166
166
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,

In [7]:
print(tokenizer.decode(train_dataset[1]["input_ids"]))
print(len(train_dataset[0]["input_ids"]))
print(len(train_dataset[0]["attention_mask"]))

print(train_dataset[0]["attention_mask"].count(0))
print(train_dataset[0]["attention_mask"][199])
print(train_dataset[0]["input_ids"][199])

print(len(train_dataset[0]["labels"]), len(train_dataset[0]["input_ids"]), len(train_dataset[0]["attention_mask"]))

<|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_t

In [9]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

In [10]:
trainer = Trainer(
                    model=model,
                    tokenizer=tokenizer,
                    args=training_args,
                    train_dataset=train_dataset,
                    eval_dataset=list(valid_dataset.values())[0],
                    data_collator=data_collator,
                )

[codecarbon INFO @ 12:47:12] [setup] RAM Tracking...
[codecarbon INFO @ 12:47:12] [setup] GPU Tracking...
[codecarbon INFO @ 12:47:12] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 12:47:12] [setup] CPU Tracking...
[codecarbon INFO @ 12:47:13] CPU Model on constant consumption mode: AMD EPYC 7543 32-Core Processor
[codecarbon INFO @ 12:47:13] >>> Tracker's metadata:
[codecarbon INFO @ 12:47:13]   Platform system: Linux-5.19.0-50-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 12:47:13]   Python version: 3.11.8
[codecarbon INFO @ 12:47:13]   CodeCarbon version: 2.3.5
[codecarbon INFO @ 12:47:13]   Available RAM : 503.092 GB
[codecarbon INFO @ 12:47:13]   CPU count: 128
[codecarbon INFO @ 12:47:13]   CPU model: AMD EPYC 7543 32-Core Processor
[codecarbon INFO @ 12:47:13]   GPU count: 1
[codecarbon INFO @ 12:47:13]   GPU model: 1 x NVIDIA A40


In [11]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrobert-belanec[0m ([33mrbelanec[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


[codecarbon INFO @ 12:47:41] Energy consumed for RAM : 0.000786 kWh. RAM Power : 188.65951824188232 W
[codecarbon INFO @ 12:47:41] Energy consumed for all GPUs : 0.001149 kWh. Total GPU Power : 275.46775112283564 W
[codecarbon INFO @ 12:47:41] Energy consumed for all CPUs : 0.000469 kWh. Total CPU Power : 112.5 W
[codecarbon INFO @ 12:47:41] 0.002404 kWh of electricity used since the beginning.
[codecarbon INFO @ 12:47:56] Energy consumed for RAM : 0.001572 kWh. RAM Power : 188.65951824188232 W
[codecarbon INFO @ 12:47:56] Energy consumed for all GPUs : 0.002368 kWh. Total GPU Power : 292.87485926338667 W
[codecarbon INFO @ 12:47:56] Energy consumed for all CPUs : 0.000938 kWh. Total CPU Power : 112.5 W
[codecarbon INFO @ 12:47:56] 0.004878 kWh of electricity used since the beginning.
[codecarbon INFO @ 12:48:11] Energy consumed for RAM : 0.002358 kWh. RAM Power : 188.65951824188232 W
[codecarbon INFO @ 12:48:11] Energy consumed for all GPUs : 0.003618 kWh. Total GPU Power : 300.082135

KeyboardInterrupt: 

[codecarbon INFO @ 12:54:11] Energy consumed for RAM : 0.021034 kWh. RAM Power : 188.65951824188232 W
[codecarbon INFO @ 12:54:11] Energy consumed for all GPUs : 0.031366 kWh. Total GPU Power : 166.46805439375427 W
[codecarbon INFO @ 12:54:11] Energy consumed for all CPUs : 0.012656 kWh. Total CPU Power : 112.5 W
[codecarbon INFO @ 12:54:11] 0.065056 kWh of electricity used since the beginning.


In [20]:
example_input = test_dataset["qnli_text"][0]["input_ids"][:-5]
example_attn_mask = test_dataset["qnli_text"][0]["attention_mask"][:-5]

print("input:", tokenizer.decode(example_input, skip_special_tokens=True))

outputs = model.generate(torch.tensor(example_input).unsqueeze(-1).reshape(1, -1).to("cuda"), attention_mask=torch.tensor(example_attn_mask).unsqueeze(-1).reshape(1, -1).to("cuda"))

print("output:", tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


input: qnli question: How big are phycobilisomes? sentence: Phycobilins come in all colors, though phycoerytherin is one of the pigments that makes many red algae red. label: 




output: qnli question: How big are phycobilisomes? sentence: Phycobilins come in all colors, though phycoerytherin is one of the pigments that makes many red algae red. label: not_entailment


In [41]:
model.generation_config.pad_token_id = tokenizer.pad_token_id

correct = 0
total = len(test_dataset["qnli_text"])

for example in test_dataset["qnli_text"]:
    example_label = torch.tensor(example["labels"]).index_select(0,index=(torch.tensor(example["labels"]) != -100).nonzero().flatten())
    example_input = torch.tensor(example["input_ids"])[:-len(example_label)]
    example_attn_mask = torch.tensor(example["attention_mask"])[:-len(example_label)]

    example_output = model.generate(torch.tensor(example_input).unsqueeze(-1).reshape(1, -1).to("cuda"), attention_mask=torch.tensor(example_attn_mask).unsqueeze(-1).reshape(1, -1).to("cuda"))[0]
    n_new_tokens = len(example_output) - len(example_input)

    decoded_output = tokenizer.decode(example_output[-n_new_tokens:], skip_special_tokens=True)
    decoded_label = tokenizer.decode(example_label, skip_special_tokens=True)

    if decoded_output == decoded_label:
        correct += 1

    print("output:", decoded_output , "label:", decoded_label)

  example_output = model.generate(torch.tensor(example_input).unsqueeze(-1).reshape(1, -1).to("cuda"), attention_mask=torch.tensor(example_attn_mask).unsqueeze(-1).reshape(1, -1).to("cuda"))[0]


output: not_entailment label: not_entailment
output: entailment label: entailment
output: not_entailment label: not_entailment
output: entailment label: entailment
output: entailment label: entailment
output: entailment label: entailment
output: entailment label: entailment
output: entailment label: entailment
output: entailment label: entailment
output: entailment label: entailment
output: not_entailment label: not_entailment
output: entailment label: entailment
output: not_entailment label: not_entailment
output: not_entailment label: not_entailment
output: entailment label: entailment
output: entailment label: entailment
output: entailment label: entailment
output: entailment label: entailment
output: not_entailment label: not_entailment
output: entailment label: entailment
output: not_entailment label: not_entailment
output: not_entailment label: not_entailment
output: entailment label: entailment
output: entailment label: entailment
output: entailment label: entailment
output: not

In [44]:
total / correct

1.0