In [1]:
!pip install trl > /dev/null

One can easily fine-tune your SFT model using SFTTrainer from TRL. Let us assume your dataset is imdb, the text you want to predict is inside the text field of the dataset, and you want to fine-tune the facebook/opt-350m model.

In [1]:
import trl
trl.__version__

'0.7.11'

In [1]:
import os
os.getpid()

6622

In [None]:
proc = os.getpid()
# os.kill(proc,9)

In [1]:
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

dataset = load_dataset("imdb", split="train")
dataset = dataset.train_test_split(test_size=0.2)
dataset = dataset['test'].train_test_split(test_size=0.1)
# dataset

In [2]:
args = TrainingArguments(
    output_dir='/home/aicoder/training/sftt_opt',
    push_to_hub=False,
    report_to="none",
    per_device_eval_batch_size=3,
    per_device_train_batch_size=4,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    num_train_epochs=1
)

In [3]:
trainer = SFTTrainer(
    "facebook/opt-350m",
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="text",
    max_seq_length=512,
    args=args
)
# may be a seperate training_arg object has to be passed



Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

- facebook/opt-350m is 663MB on hdd and loads to 1450 MB inside GPU, why?
- train_bs = 1 and test_bs = 1 ==> training okay
- train_ys = 2 and test_bs = 1 ==> training okay
- train_ys = 2 and test_bs = 2 ==> training okay (8.6GB)
- train_ys = 3 and test_bs = 3 ==> training okay (9.2GB)
- train_ys = 3 and test_bs = 4 ==> training okay (10.8GB)
- train_bs = 4 and test_bs = 4 ==> training fail (11.96GB)

In [4]:
trainer.train()

Step,Training Loss,Validation Loss
200,No log,3.566108


KeyboardInterrupt: 

You can use the DataCollatorForCompletionOnlyLM to train your model on the **generated prompts only**. Note that this works only in the case when packing=False. To instantiate that collator for instruction data, pass a response template and the tokenizer.

In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments
)
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [2]:
dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train")

model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

dataset = dataset.train_test_split(test_size=0.3)['test'].train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 4805
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 1202
    })
})

In [5]:
dataset['train'][0]

{'instruction': 'Use JavaScript to encrypt a given string using MD5 algorithm.',
 'input': 'string = "This is a test."',
 'output': 'let encryptedString = CryptoJS.MD5(string).toString();'}

In [3]:
def formatting_prompts_func(example):
    output_texts = []
    # traverse the batches 
    for i in range(len(example['instruction'])):
        # and make the batches as Question and output as answers
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

response_template = " ### Answer:"
# create a DataCollator that is imported from trl for CompletionLM
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)


In [4]:
args = TrainingArguments(
    output_dir='/home/aicoder/training/sftt_opt/',
    push_to_hub=False,
    report_to="none",
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='steps',
    save_steps=200,
    num_train_epochs=1
)

In [5]:
trainer = SFTTrainer(
    model,
    args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    formatting_func=formatting_prompts_func,
    data_collator=collator,
)



Map:   0%|          | 0/4805 [00:00<?, ? examples/s]

Map:   0%|          | 0/1202 [00:00<?, ? examples/s]

- train_bs = 3 and test_bs = 4 ==> training fail (11.8GB)
- train_bs = 1 and test_bs = 4 ==> training fail (11.9GB)
- train_bs = 1 and test_bs = 1 ==> training okay (9.6GB)

In [6]:
trainer.train()

Step,Training Loss,Validation Loss
200,No log,2.649155


KeyboardInterrupt: 

To instantiate that collator for assistant style conversation data, pass a response template, an instruction template and the tokenizer. Here is an example of how it would work to fine-tune opt-350m **on assistant completions** only on the Open Assistant Guanaco dataset:

In [6]:
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    TrainingArguments
)
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [3]:
dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")
dataset = dataset.train_test_split(test_size=0.3)['test'].train_test_split(test_size=0.2)

model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
dataset

Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2363
    })
    test: Dataset({
        features: ['text'],
        num_rows: 591
    })
})

In [4]:
instruction_template = "### Human:"
response_template = "### Assistant:"
collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template,
                                           response_template=response_template,
                                           tokenizer=tokenizer,
                                           mlm=False)


In [7]:
args = TrainingArguments(
    output_dir='/home/aicoder/training/sftt_opt/',
    push_to_hub=False,
    report_to="none",
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='steps',
    save_steps=200,
    num_train_epochs=1
)
# - train_bs = 1 and test_bs = 1 ==> training okay (9.6GB)

In [1]:
trainer = SFTTrainer(
    model,
    args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="text",
    data_collator=collator,
)

trainer.train()

NameError: name 'SFTTrainer' is not defined

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

def print_tokens_with_ids(txt):
    tokens = tokenizer.tokenize(txt, add_special_tokens=False)  # only tokenize the text 
    token_ids = tokenizer.encode(txt, add_special_tokens=False) # encode the text
    print(list(zip(tokens, token_ids)))  # zip them together and return

prompt = """### User: Hello\n\n### Assistant: Hi, how can I help you?"""

print_tokens_with_ids(prompt) 
# [..., ('▁Hello', 15043), ('<0x0A>', 13), ('<0x0A>', 13), ('##', 2277), ('#', 29937), ('▁Ass', 4007), ('istant', 22137), (':', 29901), ...]

response_template = "### Assistant:"

print_tokens_with_ids(response_template) 

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

[('▁###', 835), ('▁User', 4911), (':', 29901), ('▁Hello', 15043), ('<0x0A>', 13), ('<0x0A>', 13), ('##', 2277), ('#', 29937), ('▁Ass', 4007), ('istant', 22137), (':', 29901), ('▁Hi', 6324), (',', 29892), ('▁how', 920), ('▁can', 508), ('▁I', 306), ('▁help', 1371), ('▁you', 366), ('?', 29973)]
[('▁###', 835), ('▁Ass', 4007), ('istant', 22137), (':', 29901)]


The **setup_chat_format() function** in trl easily sets up a model and tokenizer for conversational AI tasks. This function:

- Adds special tokens to the tokenizer, e.g. <|im_start|> and <|im_end|>, to indicate the start and end of a conversation.

- Resizes the model’s embedding layer to accommodate the new tokens.

- Sets the chat_template of the tokenizer, which is used to format the input data into a chat-like format. The default is chatml from OpenAI.

- optionally you can pass resize_to_multiple_of to resize the embedding layer to a multiple of the resize_to_multiple_of argument, e.g. 64. If you want to see more formats being supported in the future, please open a GitHub issue on trl

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments
)
from datasets import load_dataset
from trl import setup_chat_format, SFTTrainer

In [15]:
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

# Set up the chat format with default 'chatml' format
model, tokenizer = setup_chat_format(model, tokenizer)

In [12]:
# load jsonl dataset
# dataset = load_dataset("json", data_files="path/to/dataset.jsonl", split="train")
# load dataset from the HuggingFace Hub
dataset = load_dataset("philschmid/dolly-15k-oai-style", split="train")
dataset = dataset.train_test_split(test_size=0.3)['test'].train_test_split(test_size=0.3)
dataset

In [9]:
dataset[0]

{'messages': [{'content': "When did Virgin Australia start operating?\nVirgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.",
   'role': 'user'},
  {'content': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.',
   'role': 'assistant'}]}

In [7]:
args = TrainingArguments(
    output_dir='/home/aicoder/training/sftt_opt/',
    push_to_hub=False,
    report_to="none",
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='steps',
    save_steps=200,
    num_train_epochs=1
)
# - train_bs = 1 and test_bs = 1 ==> training okay (9.8GB)

In [None]:
# You didn't pass a `max_seq_length` argument to the SFTTrainer, this will default to 1024
# No chat template is defined for this tokenizer - using the default template for the 
# GPT2TokenizerFast class.

In [None]:
tokenizer.chat_template

In [16]:
trainer = SFTTrainer(
    "facebook/opt-350m",
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    packing=True,
)



Generating train split: 0 examples [00:00, ? examples/s]


No chat template is defined for this tokenizer - using the default template for the GPT2TokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



Generating train split: 0 examples [00:00, ? examples/s]

In [17]:
trainer.train()  # training goes through

Step,Training Loss,Validation Loss
200,No log,2.803561


Checkpoint destination directory /home/aicoder/training/sftt_opt/checkpoint-200 already exists and is non-empty. Saving will proceed but saved results may be invalid.


KeyboardInterrupt: 

The following is very powerful way of tackling dataset loading

In [2]:
from trl import SFTTrainer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments
)
from datasets import load_dataset

In [6]:
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

In [8]:
# dataset = load_dataset("philschmid/dolly-15k-oai-style", split="train")
dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train")
dataset = dataset.train_test_split(test_size=0.3)['test'].train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 4204
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 1803
    })
})

In [4]:
args = TrainingArguments(
    output_dir='/home/aicoder/training/sftt_opt/',
    push_to_hub=False,
    report_to="none",
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='steps',
    save_steps=200,
    num_train_epochs=1
)
# - train_bs = 1 and test_bs = 1 ==> training okay (9.8GB)

In [10]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

trainer = SFTTrainer(
    model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    formatting_func=formatting_prompts_func,
)

Map:   0%|          | 0/4204 [00:00<?, ? examples/s]

Map:   0%|          | 0/1803 [00:00<?, ? examples/s]

SFTTrainer **supports example packing**, where multiple short examples are packed in the same input sequence to increase training efficiency. This is done with the ConstantLengthDataset utility class that returns constant length chunks of tokens from a stream of examples. 

To enable the usage of this dataset class, simply pass packing=True to the SFTTrainer constructor.

In [13]:
def formatting_func(example):
    text = f"### Question: {example['instruction']}\n ### Answer: {example['output']}"
    return text

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    packing=True,
    formatting_func=formatting_func
)

# trainer.train()



Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [1]:
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments
)

dataset = load_dataset("imdb", split="train")
dataset = dataset.train_test_split(test_size=0.3)['test'].train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5250
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2250
    })
})

In [2]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [3]:
args = TrainingArguments(
    output_dir='/home/aicoder/training/sftt_opt/',
    push_to_hub=False,
    report_to="none",
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='steps',
    save_steps=200,
    num_train_epochs=1
)
# - train_bs = 1 and test_bs = 1 ==> training okay (9.8GB)

In [5]:
# "EleutherAI/gpt-neo-125m" is 586 MB on hdd
trainer = SFTTrainer(
    "EleutherAI/gpt-neo-125m",
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="text",
    peft_config=peft_config,
    args=args
)
# with peft takes 3.8GB for training
trainer.train()



Map:   0%|          | 0/5250 [00:00<?, ? examples/s]

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
200,No log,3.746261


Checkpoint destination directory /home/aicoder/training/sftt_opt/checkpoint-200 already exists and is non-empty. Saving will proceed but saved results may be invalid.


KeyboardInterrupt: 

In [4]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# trainig adapter with 8-bit model
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-125m",
    load_in_8bit=True,
    device_map="auto",
)

trainer = SFTTrainer(
    model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="text",
    peft_config=peft_config,
)  # takes 4.3GB for training

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Map:   0%|          | 0/5250 [00:00<?, ? examples/s]

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

In [5]:
trainer.train()



Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# this enables use of flash_attention1

with torch.backends.cuda.sdp_kernel(enable_flash=True,
                                    enable_math=False,
                                    enable_mem_efficient=False):
    trainer.train()

To use Flash Attention 2, first install the latest flash-attn package:

pip install flash_attention

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,
    attn_implementation="flash_attention_2"
)

### Using model creation utility

In [8]:
from trl import (
    ModelConfig,
    SFTTrainer,
    get_kbit_device_map,
    get_peft_config, 
    get_quantization_config
)
from transformers import TrainingArguments

In [3]:
model_config = ModelConfig(
    model_name_or_path="facebook/opt-350m",
    attn_implementation=None, # or "flash_attention_2"
)

In [4]:
torch_dtype = (
    model_config.torch_dtype
    if model_config.torch_dtype in ["auto", None]
    else getattr(torch, model_config.torch_dtype)
)

In [5]:
quantization_config = get_quantization_config(model_config)
quantization_config

In [9]:
args = TrainingArguments(
    output_dir='/home/aicoder/training/sftt_opt/',
    push_to_hub=False,
    report_to="none",
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='steps',
    save_steps=200,
    num_train_epochs=1
)

In [10]:
model_kwargs = dict(
    revision=model_config.model_revision,
    trust_remote_code=model_config.trust_remote_code,
    attn_implementation=model_config.attn_implementation,
    torch_dtype=torch_dtype,
    use_cache=False if args.gradient_checkpointing else True,
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
)

In [11]:
model_kwargs

{'revision': 'main',
 'trust_remote_code': False,
 'attn_implementation': None,
 'torch_dtype': None,
 'use_cache': True,
 'device_map': None,
 'quantization_config': None}

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path,
                                             **model_kwargs)

trainer = SFTTrainer(
    ...,
    model=model_config.model_name_or_path,
    peft_config=get_peft_config(model_config),
)

### Reward Trainer

The reward model should be trained on a dataset of paired examples, where each example is a tuple of two sequences. The reward model should be trained to predict which example in the pair is more relevant to the task at hand.

The reward trainer expects a very specific format for the dataset. The dataset should contain two 4 entries at least if you don’t use the default RewardDataCollatorWithPadding data collator. 

Therefore the final dataset object should contain two 4 entries at least if you use 
the default RewardDataCollatorWithPadding data collator. The entries should be named:

input_ids_chosen

attention_mask_chosen

input_ids_rejected

attention_mask_rejected

You should pass an **AutoModelForSequenceClassification model** to the RewardTrainer, along with a RewardConfig which configures the hyperparameters of the training.

In [14]:
from peft import LoraConfig, TaskType
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import RewardTrainer, RewardConfig
from datasets import load_dataset

In [49]:
dataset = load_dataset("Anthropic/hh-rlhf")
dataset = dataset['test'].train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 5986
    })
    test: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 2566
    })
})

In [46]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token 

In [47]:
def pre_process(row):
    chosen = tokenizer(row['chosen'], max_length=512, truncation=True)
    rejected = tokenizer(row['rejected'], max_length=512, truncation=True)
    final = {}
    final['input_ids_chosen'] = chosen['input_ids']
    final['attention_mask_chosen'] = chosen['attention_mask']
    final['input_ids_rejected'] = rejected['input_ids'] 
    final['attention_mask_rejected'] = rejected['attention_mask']
    return final

In [50]:
dataset = dataset.map(pre_process,
                      remove_columns=['chosen','rejected'],)
dataset

Map:   0%|          | 0/5986 [00:00<?, ? examples/s]

Map:   0%|          | 0/2566 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 5986
    })
    test: Dataset({
        features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 2566
    })
})

In [22]:
model = AutoModelForSequenceClassification.from_pretrained("gpt2")

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def add_margin(row):
    # Assume you have a score_chosen and score_rejected columns that you want to use to compute the margin
    return {'margin': row['chosen'] - row['rejected']}

# dataset = dataset.map(add_margin)

In [51]:
trainer = RewardTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=peft_config,
)

In [52]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Accuracy
200,No log,0.835181,0.5265


Checkpoint destination directory /home/aicoder/training/sftt_opt/checkpoint-200 already exists and is non-empty. Saving will proceed but saved results may be invalid.


KeyboardInterrupt: 