# Instruction Finetuning.

In this notebook we fine-tune all parameters of a pretrained language model on an instruction dataset, then compare its generations with those of the base model.

In [1]:
import os

from enum import Enum
from functools import partial
import pandas as pd

# DL
import torch
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments
)
from datasets import load_dataset
from trl import SFTTrainer

In [2]:
# Read the Hugging Face token from the environment instead of embedding it in
# the notebook. When HF_TOKEN is unset, token=None makes `login` prompt for it.
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
# Report the installed PyTorch build and whether a CUDA device is visible.
print("Torch version: ", torch.__version__)
print("Is CUDA available: ", torch.cuda.is_available())

Torch version:  2.1.0+cu118
Is CUDA available:  True


## Data processing: create datasets and dataloaders

In [4]:
# Base checkpoint to finetune and the instruction dataset used for training.
model_name = "TinyLlama/TinyLlama_v1.1"
dataset_name = "HuggingFaceH4/no_robots"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# ChatML-style Jinja template: every message is rendered as
# <|im_start|> + role + newline + content + <|im_end|> + newline, and when
# add_generation_prompt is set an opening assistant header is appended after
# the last message.
template = """{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"""
tokenizer.chat_template = template

In [5]:
def preprocess(samples):
    """Render every conversation of a batch as one chat-formatted string.

    Args:
        samples: Batch mapping whose "messages" entry holds one conversation
            (a list of role/content messages) per example.

    Returns:
        A dict with a single key, "context", mapping to the list of rendered
        (untokenized) strings, one per conversation and in the same order.
    """
    rendered = [
        tokenizer.apply_chat_template(messages, tokenize=False)
        for messages in samples["messages"]
    ]
    return {"context": rendered}

In [6]:
# Download every split and add a "context" column with the chat-formatted text.
dataset = load_dataset(dataset_name)
dataset = dataset.map(
    preprocess,
    batched=True,
    # The original columns are kept alongside the new "context" column.
)
# Show the split layout and one fully processed training example.
print(dataset)
print(dataset['train'][0])

DatasetDict({
    train: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'context'],
        num_rows: 9500
    })
    test: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'context'],
        num_rows: 500
    })
})
{'prompt': 'Please summarize the goals for scientists in this text:\n\nWithin three days, the intertwined cup nest of grasses was complete, featuring a canopy of overhanging grasses to conceal it. And decades later, it served as Rinkert’s portal to the past inside the California Academy of Sciences. Information gleaned from such nests, woven long ago from species in plant communities called transitional habitat, could help restore the shoreline in the future. Transitional habitat has nearly disappeared from the San Francisco Bay, and scientists need a clearer picture of its original species composition—which was never properly documented. With that insight, conservation research groups like the San Francisco Bay Bird 

In [7]:
# Inspect the first training example, including its rendered "context" field.
dataset['train'][0]

{'prompt': 'Please summarize the goals for scientists in this text:\n\nWithin three days, the intertwined cup nest of grasses was complete, featuring a canopy of overhanging grasses to conceal it. And decades later, it served as Rinkert’s portal to the past inside the California Academy of Sciences. Information gleaned from such nests, woven long ago from species in plant communities called transitional habitat, could help restore the shoreline in the future. Transitional habitat has nearly disappeared from the San Francisco Bay, and scientists need a clearer picture of its original species composition—which was never properly documented. With that insight, conservation research groups like the San Francisco Bay Bird Observatory can help guide best practices when restoring the native habitat that has long served as critical refuge for imperiled birds and animals as adjacent marshes flood more with rising sea levels. “We can’t ask restoration ecologists to plant nonnative species or to 

In [8]:
# Inspect the first test example (a multi-turn chat with a system message).
dataset['test'][0]

{'prompt': 'Aster is a chatbot who answers questions with rhymes.',
 'prompt_id': 'd6c011ffb1ff8a9abe9bd24caf3f9817454a1f054d5d0e0360d19bf50cf6b20c',
 'messages': [{'content': 'Aster is a chatbot who answers questions with rhymes.',
   'role': 'system'},
  {'content': 'Where did chocolate originate?', 'role': 'user'},
  {'content': 'Chocolate is 4000 years old/Mexico is where it was first sold',
   'role': 'assistant'},
  {'content': 'Where was milk chocolate invented?', 'role': 'user'},
  {'content': 'Switzerland was the first to add milk/To make their chocolate smooth as silk',
   'role': 'assistant'},
  {'content': 'What are some good desserts that use chocolate?',
   'role': 'user'},
  {'content': 'Pie, tart, cookies, and cake/Chocolate is great to bake',
   'role': 'assistant'}],
 'category': 'Chat',
 'context': '<|im_start|>system\nAster is a chatbot who answers questions with rhymes.<|im_end|>\n<|im_start|>user\nWhere did chocolate originate?<|im_end|>\n<|im_start|>assistant\nCh

## Load pretrained model and tokenizer

In [9]:
class ChatmlSpecialTokens(str, Enum):
    """Special tokens of the ChatML-style format used in this notebook.

    Members are also `str` instances, so each one compares equal to its
    literal token text.
    """

    # Role headers that open a message.
    user = "<|im_start|>user"
    assistant = "<|im_start|>assistant"
    system = "<|im_start|>system"
    # End-of-message marker, also used as the end-of-sequence token.
    eos_token = "<|im_end|>"
    bos_token = "<s>"
    pad_token = "<pad>"

    @classmethod
    def list(cls):
        """Return the string value of every member, in definition order."""
        return [member.value for member in cls.__members__.values()]

In [10]:
# Reload the tokenizer, this time overriding the pad/bos/eos tokens and
# registering the ChatML markers as additional special tokens.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    pad_token=ChatmlSpecialTokens.pad_token.value,
    bos_token=ChatmlSpecialTokens.bos_token.value,
    eos_token=ChatmlSpecialTokens.eos_token.value,
    additional_special_tokens=ChatmlSpecialTokens.list(),
    trust_remote_code=True
)
# The fresh tokenizer object needs the chat template attached again.
tokenizer.chat_template = template
model = AutoModelForCausalLM.from_pretrained(model_name)
# Resize the embedding matrix so it matches the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))

  return self.fget.__get__(instance, owner)()


Embedding(32005, 2048)

Store the base model's predictions on a random subset of 25 samples from the test split.

In [11]:
# Pad on the left so that, in a batch, generated tokens directly follow each prompt.
tokenizer.padding_side = "left"

def get_predictions_batched(samples, column_name):
    """Generate one assistant reply per conversation in a batch.

    Each conversation has its last message removed and is rendered with the
    chat template plus a generation prompt; the model's continuation is then
    decoded and stripped down to the assistant's text.

    Args:
        samples: Batch mapping whose "messages" entry holds one conversation
            (list of role/content messages) per example. The last message of
            each conversation is assumed to be the reference assistant reply
            — TODO confirm for every dataset this is used with.
        column_name: Name of the output column that receives the predictions.

    Returns:
        A dict ``{column_name: [reply, ...]}`` with exactly one string per
        input conversation, in input order.
    """
    # Build every prompt first; generation must run once for the whole batch,
    # not inside this loop (which previously returned after the first item).
    prompts = [
        tokenizer.apply_chat_template(
            conversation[:-1],
            tokenize=False,
            add_generation_prompt=True
        )
        for conversation in samples["messages"]
    ]
    if not prompts:
        # Nothing to generate; keep the output schema consistent.
        return {column_name: []}

    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    # Follow the model's device instead of assuming CUDA.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        top_p=0.95,
        temperature=0.2,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id
    )
    decoded = tokenizer.batch_decode(outputs)
    # Keep only the text after the last assistant header and before the first
    # end-of-message marker that follows it.
    replies = [
        text.split("<|im_start|>assistant")[-1].split("<|im_end|>")[0].strip()
        for text in decoded
    ]
    return {column_name: replies}

In [12]:
# Move the model to the GPU before running generation.
model.to("cuda")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32005, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
    (rotary_emb): LlamaRo

In [13]:
# Draw a fixed random subset of 25 test examples. The explicit seed keeps the
# evaluation set identical across kernel restarts and re-runs.
test_dataset = load_dataset(dataset_name)["test"].shuffle(seed=42).select(range(25))
# Store the base (not yet finetuned) model's replies in their own column.
test_dataset = test_dataset.map(
    partial(
        get_predictions_batched,
        column_name="base_assistant_message"
    ),
    batched=True,
    batch_size=1
)
print(test_dataset)
print(test_dataset[0])

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'category', 'base_assistant_message'],
    num_rows: 25
})
{'prompt': 'My sibling and I are having an argument about if cats or dogs are better. I happen to know for a fact that cats are better, I just need some facts that prove it. Brainstorm a list of at least half a dozen things about cats and state how each one makes them better than dogs. ', 'prompt_id': '51113c20a33637c45ffa7790adf858b82411041fd590a33eb7237a8dcc20f759', 'messages': [{'content': 'My sibling and I are having an argument about if cats or dogs are better. I happen to know for a fact that cats are better, I just need some facts that prove it. Brainstorm a list of at least half a dozen things about cats and state how each one makes them better than dogs. ', 'role': 'user'}, {'content': "I'm sorry to hear that you and your sibling are arguing. Here are some facts about why cats are better than dogs: \n1. Purring. This is something cats do that dogs are not phys

## Training

In [14]:
# Training hyperparameters, collected in one place.
output_dir = "tinyllama_instruct"
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 16  # gradients are accumulated over 16 batches of size 1
logging_steps = 25
learning_rate = 2e-5
max_grad_norm = 1.0
max_steps = 250  # NOTE(review): not passed to TrainingArguments in the visible cells — confirm intent
num_train_epochs = 1
warmup_ratio = 0.1
lr_scheduler_type = "cosine"
max_sq_length = 2048  # forwarded to SFTTrainer as max_seq_length

In [15]:
# Bundle the hyperparameters defined above into the trainer configuration.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Checkpoint and evaluate once per epoch.
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    weight_decay=0.1,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    fp16=True,
    report_to=["tensorboard", "wandb"],
    # Upload the result to a private Hub repository.
    hub_private_repo=True,
    push_to_hub=True,
    num_train_epochs=num_train_epochs,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False}
)



In [16]:
# Confirm the evaluation split carries the "context" column the trainer reads.
dataset["test"]

Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'category', 'context'],
    num_rows: 500
})

In [17]:
# Supervised finetuning on the rendered "context" text.
# NOTE(review): the recorded output warns that these SFTTrainer arguments are
# deprecated in favour of SFTConfig — migrate when the trl version is pinned.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    packing=True,
    dataset_text_field="context",
    max_seq_length=max_sq_length
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]



In [None]:
# Run the finetuning loop, then save the resulting model.
trainer.train()
trainer.save_model()

## Load the finetuned model

In [None]:
# Reload the checkpoint from the repository, place it on the GPU, cast it to
# half precision and switch it to inference mode.
model = AutoModelForCausalLM.from_pretrained(
    'my_repo_endpoint',
    trust_remote_code=True,
)
model.to('cuda').to(torch.float16)
model.eval()

In [None]:
# Generate replies with the reloaded model on the same examples and store them
# in a separate column next to the base model's replies.
predict_instruct = partial(
    get_predictions_batched,
    column_name='instruct_assistant_message',
)
test_dataset = test_dataset.map(predict_instruct, batched=True, batch_size=1)
print(test_dataset)
print(test_dataset[0])

## Comparing outputs of base model and instruction finetuned model

In [None]:
# Convert to pandas for a side-by-side view of the conversation and both replies.
test_dataset = test_dataset.to_pandas()
pd.set_option('display.max_colwidth', 300)
# The dataset column is named 'messages'; there is no 'message' column.
test_dataset[['messages', 'base_assistant_message', 'instruct_assistant_message']][:25]

## Generate on some random instruction

In [None]:
# Single-turn conversation holding one free-form instruction for the model.
messages = [
    {
        'role': 'user',
        'content': 'Write an essay on generative AI'
    }
]
# Render the conversation and append the assistant header so the model answers.
text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = tokenizer(text, return_tensors='pt')
inputs = {k:v.to('cuda') for k, v in inputs.items()}
outputs = model.generate(
    **inputs,
    max_new_tokens=2000,
    do_sample=True,
    top_p=0.95,
    temperature=0.2,
    repetition_penalty=1.1,
    eos_token_id=tokenizer.eos_token_id
)
# Decode the full sequence (prompt plus generated reply, special tokens included).
print(tokenizer.decode(outputs[0]))