<a href="https://colab.research.google.com/github/kotlyar-shapirov/word2vec-pytorch/blob/master/course/en/chapter3/section3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning a model with the Trainer API or Keras

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece] trl peft

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting trl
  Downloading trl-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m

### Trying to Run our SFTTrainer

In [None]:
# load config file
raw_dict = load_config(config_name)
# parse configs
misc_config = MiscConfig.model_validate(raw_dict)
data_config = DatasetConfig.model_validate(raw_dict)
model_config = ModelInference.model_validate(raw_dict)
peft_config = PeftConfig.model_validate(raw_dict)
training_config = TrainingConfig.model_validate(raw_dict)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "gpt2"
model = AutoModelForCausalLM.from_pretrained(peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
tokenizer.pad_token = tokenizer.eos_token


tokenizer.padding_side = "left"
tokenizer.truncation_side = 'left'
tokenizer.pad_token = config.special_tokens.pad_token

model.generation_config.pad_token_id = tokenizer.encode(tokenizer.pad_token)[0]
model.generation_config.end_token_id = tokenizer.encode(tokenizer.pad_token)[0]

In [None]:
import os
import tempfile

import click
import transformers
from trl import SFTTrainer

from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from trl import DataCollatorForCompletionOnlyLM


from analysis.model_inference.utils.starcoder2_utils import download_model_from_clearml, prepare_model_and_tokenizer


@click.command()
@click.option('--config_name', type=str, required=True, help='Name of the config file', prompt=True)
def main(config_name: str) -> None:
    os.makedirs("./out/", exist_ok=True)

    # use configs
    os.makedirs(training_config.output_dir, exist_ok=True)
    train_data, eval_data = get_train_eval_data(data_config)
    peft_params = get_peft_params(peft_config, model_config.params_for_model.path_to_model)

    model, tokenizer = download_model_from_clearml(model_config.params_for_model)
    model, tokenizer = prepare_model_and_tokenizer(model, tokenizer, model_config.prompt_params)

    if task_type == CollatorTask.CONTINUATION:
        collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
    elif task_type == CollatorTask.LANGUAGE_MODELLING:
        collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=eval_data,
        max_seq_length=misc_config.max_seq_length,
        args=transformers.TrainingArguments(**training_config.model_dump()),
        peft_config=peft_params,
        dataset_text_field=data_config.dataset_text_field,
        data_collator=collator,
    )
    print_trainable_parameters(model)
    print('first step evaluation')
    trainer.evaluate()
    print("Training...")
    trainer.train()
    print("Training Done! 💥")

    task = Task.init(
        project_name=os.environ["CLEARML_PROJECT"], task_name=os.environ["CLEARML_TASK"], continue_last_task=True
    )
    task.connect_configuration(peft_config.model_dump(), "peft_config")

    if misc_config.save_all:
        peft_model = trainer.model
        if peft_config.adapter_type == "lora":
            merged_model = peft_model.merge_and_unload()
        else:
            merged_model = peft_model

        with tempfile.TemporaryDirectory() as tmp_dir_name:
            merged_model.save_pretrained(tmp_dir_name, safe_serialization=True)
            task.update_output_model(tmp_dir_name, model_name="merged_model")


if __name__ == '__main__':
    main()


### DeepSeeker response example is shit it doesn't understand the Trainer

### Transformers PEFT example - LORA


In [32]:
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "gpt2"
model = AutoModelForCausalLM.from_pretrained(peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
tokenizer.pad_token = tokenizer.eos_token

In [33]:
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [34]:
# Prepare the dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [37]:
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 3760
    })
})

In [36]:
trainer = Trainer(model=model,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["validation"],)

trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

### Transformers classifier example


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [3]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [4]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [5]:
training_args.per_device_eval_batch_size=2
training_args.per_device_train_batch_size=2

In [6]:
training_args.eval_steps = 0.1

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [9]:
# trainer.train()

In [10]:
# predictions = trainer.predict(tokenized_datasets["validation"])
# print(predictions.predictions.shape, predictions.label_ids.shape)

In [28]:
# import numpy as np

# preds = np.argmax(predictions.predictions, axis=-1)

In [11]:
# import evaluate

# metric = evaluate.load("glue", "mrpc")
# metric.compute(predictions=preds, references=predictions.label_ids)

In [30]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [36]:
from transformers import Trainer

training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.535402,0.759804,0.840909
2,0.611800,0.381774,0.835784,0.883072
3,0.397700,0.551837,0.857843,0.901361


TrainOutput(global_step=1377, training_loss=0.4276924659656976, metrics={'train_runtime': 219.7511, 'train_samples_per_second': 50.075, 'train_steps_per_second': 6.266, 'total_flos': 405114969714960.0, 'train_loss': 0.4276924659656976, 'epoch': 3.0})

In [12]:
from datasets import load_dataset
ds = load_dataset("bigcode/the-stack-smol", data_dir="data/python", token='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')

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-Coder-0.5B')
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-Coder-0.5B')

In [42]:
tokenized_datasets = ds.map(lambda x: tokenizer(x["content"],return_tensors='pt', padding=True, truncation=True, max_length=512).to('cuda'), batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [43]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['content', 'avg_line_length', 'max_line_length', 'alphanum_fraction', 'licenses', 'repository_name', 'path', 'size', 'lang', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [46]:
from transformers import Trainer

model.train()

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['train'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,

)

  trainer = Trainer(


In [47]:
trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.