# Instruction Finetuning.

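In this notebook we'll fine-tune all parameters of a pretrained model (TinyLlama 1.1B) on the `HuggingFaceH4/no_robots` instruction dataset, formatting conversations with a ChatML-style template.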

In [1]:
import os

from enum import Enum
from functools import partial
import pandas as pd

# DL
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments
)
from datasets import load_dataset
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report the PyTorch build and whether a CUDA device is visible to it.
for label, value in (
    ("Torch version: ", torch.__version__),
    ("Is CUDA available: ", torch.cuda.is_available()),
):
    print(label, value)

Torch version:  2.0.1+cu117
Is CUDA available:  True


## Data processing: create datasets and dataloaders

In [3]:
# Hub identifiers of the base checkpoint and the instruction dataset.
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T"
dataset_name = "HuggingFaceH4/no_robots"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# ChatML-style Jinja template: every message is rendered as
# "<|im_start|>{role}\n{content}<|im_end|>\n"; when `add_generation_prompt`
# is set, "<|im_start|>assistant\n" is appended after the last message.
template = """{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"""
# Attach the template so `tokenizer.apply_chat_template` uses it.
tokenizer.chat_template = template

In [4]:
def preprocess(samples):
    """Render each conversation of a batch into one chat-formatted string.

    Uses the notebook-level `tokenizer` and its chat template; the text is
    returned untokenized.

    Args:
        samples: Batch mapping whose "messages" entry holds one conversation
            per example.

    Returns:
        dict: {"context": [...]} with one rendered string per conversation,
        in input order.
    """
    rendered = [
        tokenizer.apply_chat_template(chat, tokenize=False)
        for chat in samples["messages"]
    ]
    return {"context": rendered}

In [5]:
# Download every split and add the rendered "context" column in one pass;
# with batched=True, `preprocess` is called on batches of examples.
dataset = load_dataset(dataset_name).map(preprocess, batched=True)

# Show the split summary, then the first training record.
print(dataset, dataset['train'][0], sep="\n")

DatasetDict({
    train: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'context'],
        num_rows: 9500
    })
    test: Dataset({
        features: ['prompt', 'prompt_id', 'messages', 'category', 'context'],
        num_rows: 500
    })
})
{'prompt': 'Please summarize the goals for scientists in this text:\n\nWithin three days, the intertwined cup nest of grasses was complete, featuring a canopy of overhanging grasses to conceal it. And decades later, it served as Rinkert’s portal to the past inside the California Academy of Sciences. Information gleaned from such nests, woven long ago from species in plant communities called transitional habitat, could help restore the shoreline in the future. Transitional habitat has nearly disappeared from the San Francisco Bay, and scientists need a clearer picture of its original species composition—which was never properly documented. With that insight, conservation research groups like the San Francisco Bay Bird 

In [6]:
# Display the first training record, including the added "context" field.
dataset['train'][0]

{'prompt': 'Please summarize the goals for scientists in this text:\n\nWithin three days, the intertwined cup nest of grasses was complete, featuring a canopy of overhanging grasses to conceal it. And decades later, it served as Rinkert’s portal to the past inside the California Academy of Sciences. Information gleaned from such nests, woven long ago from species in plant communities called transitional habitat, could help restore the shoreline in the future. Transitional habitat has nearly disappeared from the San Francisco Bay, and scientists need a clearer picture of its original species composition—which was never properly documented. With that insight, conservation research groups like the San Francisco Bay Bird Observatory can help guide best practices when restoring the native habitat that has long served as critical refuge for imperiled birds and animals as adjacent marshes flood more with rising sea levels. “We can’t ask restoration ecologists to plant nonnative species or to 

In [7]:
# Display the first test record to check the rendered multi-turn "context".
dataset['test'][0]

{'prompt': 'Aster is a chatbot who answers questions with rhymes.',
 'prompt_id': 'd6c011ffb1ff8a9abe9bd24caf3f9817454a1f054d5d0e0360d19bf50cf6b20c',
 'messages': [{'content': 'Aster is a chatbot who answers questions with rhymes.',
   'role': 'system'},
  {'content': 'Where did chocolate originate?', 'role': 'user'},
  {'content': 'Chocolate is 4000 years old/Mexico is where it was first sold',
   'role': 'assistant'},
  {'content': 'Where was milk chocolate invented?', 'role': 'user'},
  {'content': 'Switzerland was the first to add milk/To make their chocolate smooth as silk',
   'role': 'assistant'},
  {'content': 'What are some good desserts that use chocolate?',
   'role': 'user'},
  {'content': 'Pie, tart, cookies, and cake/Chocolate is great to bake',
   'role': 'assistant'}],
 'category': 'Chat',
 'context': '<|im_start|>system\nAster is a chatbot who answers questions with rhymes.<|im_end|>\n<|im_start|>user\nWhere did chocolate originate?<|im_end|>\n<|im_start|>assistant\nCh

## Load pretrained model and tokenizer

In [8]:
class ChatmlSpecialTokens(str, Enum):
    """Special-token strings used by the ChatML conversation format.

    Members are also `str` instances, so they compare equal to their text.
    """

    # Role markers that open a message.
    user = "<|im_start|>user"
    assistant = "<|im_start|>assistant"
    system = "<|im_start|>system"

    # Sequence-level tokens.
    eos_token = "<|im_end|>"
    bos_token = "<s>"
    pad_token = "<pad>"

    @classmethod
    def list(cls):
        """Return the string value of every member, in definition order."""
        values = []
        for member in cls:
            values.append(member.value)
        return values

In [9]:
# Reload the tokenizer, overriding pad/bos/eos and registering the ChatML
# markers as additional special tokens.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# Hub repo to run — confirm it is actually needed for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    pad_token=ChatmlSpecialTokens.pad_token.value,
    bos_token=ChatmlSpecialTokens.bos_token.value,
    eos_token=ChatmlSpecialTokens.eos_token.value,
    additional_special_tokens=ChatmlSpecialTokens.list(),
    trust_remote_code=True
)
# This is a fresh tokenizer object, so the chat template is attached again.
tokenizer.chat_template = template
model = AutoModelForCausalLM.from_pretrained(model_name)
# Resize the embedding matrix to the tokenizer's current length so the added
# tokens have rows (the recorded run reports Embedding(32005, 2048)).
model.resize_token_embeddings(len(tokenizer))

Embedding(32005, 2048)

Store the base model's predictions for a subset of 25 samples from the test split (as a baseline before finetuning).

In [10]:
# Pad prompts on the left for generation — presumably so that, in a padded
# batch, every prompt ends right where the generated tokens begin.
tokenizer.padding_side = "left"

def get_predictions_batched(samples, column_name):
    """Generate one assistant reply for every conversation in a batch.

    For each conversation the last message (the reference answer) is dropped
    and the remaining turns are rendered with the chat template plus a
    generation prompt. The whole batch is then tokenized, sampled from the
    model, decoded, and trimmed down to the text of the final assistant turn.

    Relies on the notebook-level `tokenizer` and `model`.

    Args:
        samples: Batch mapping whose "messages" entry holds one conversation
            (a list of role/content dicts) per example.
        column_name: Name of the output column the replies are stored under.

    Returns:
        dict: {column_name: [reply, ...]} with exactly one reply per input
        conversation, in input order.
    """
    # Build every prompt first; generation must run once over the full batch,
    # not inside this loop (which previously returned after the first item).
    prompts = [
        tokenizer.apply_chat_template(
            conversation[:-1],
            tokenize=False,
            add_generation_prompt=True
        )
        for conversation in samples["messages"]
    ]
    if not prompts:
        # Nothing to generate for an empty batch.
        return {column_name: []}

    encoded = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    # Follow the model's device instead of assuming it lives on "cuda".
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    generated = model.generate(
        **encoded,
        max_new_tokens=100,
        do_sample=True,
        top_p=0.95,
        temperature=0.2,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id
    )
    decoded = tokenizer.batch_decode(generated)
    # Keep only the text after the last assistant marker, up to the first
    # end-of-message token.
    replies = [
        text.split("<|im_start|>assistant")[-1].split("<|im_end|>")[0].strip()
        for text in decoded
    ]
    return {column_name: replies}

In [11]:
# Move the model onto the GPU. The recorded run hit a CUDA out-of-memory
# error here with only ~21 MiB free, so check that the device is not already
# occupied before running this cell.
model.to("cuda")

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.93 GiB total capacity; 1.41 GiB already allocated; 21.44 MiB free; 1.41 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# NOTE(review): `test_dataset` is only assigned in a later cell, so this cell
# raises NameError on a fresh Restart & Run All — move it after that cell.
test_dataset[0]

{'prompt': 'Write a picnic announcement letter about a spontaneous picnic on the African plains. The author is a hyena, and the hosts are lions.',
 'prompt_id': 'd181278e1b38176c0969f3353fff5e6c420e04b575910d912fc520e607435836',
 'messages': [{'content': 'Write a picnic announcement letter about a spontaneous picnic on the African plains. The author is a hyena, and the hosts are lions.',
   'role': 'user'},
  {'content': 'Hi everybody!\n\nJoin us for an impromptu picnic on the plains!\nEveryone is welcome. Bring your appetite!\n\nLars Roarful, alpha of the local lion pride has announced that due to a slight miscalculation about how much of his extended family was coming to visit, they have lots of leftovers and he hopes that we will join him and his wife and kids for a picnic party! Ha ha ha, I know I’ll be there!\n\nHere are the details for you:\n\nTime: ASAP\nLocation: Across from the shaded pond (the one Lilac hangs out at with her hippo family)\n\nThere will also be live entertainm

In [None]:
# Draw a fixed random subset of 25 test conversations. The shuffle is seeded
# so the same examples are selected on every run, keeping the base-model
# predictions comparable across runs.
test_dataset = load_dataset(dataset_name)["test"].shuffle(seed=42).select(range(25))

# Add the base model's reply as a new column. batch_size=1 generates one
# conversation at a time.
test_dataset = test_dataset.map(
    partial(
        get_predictions_batched,
        column_name="base_assistant_message"
    ),
    batched=True,
    batch_size=1
)
print(test_dataset)
print(test_dataset[0])

Map:   0%|          | 0/25 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 25/25 [01:42<00:00,  4.10s/ examples]

Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'category', 'base_assistant_message'],
    num_rows: 25
})
{'prompt': "I'm traveling to Europe and I want to visit some historical World War I locations. Where should I visit? I don't want to visit France though. Use bullet points for the list. I want no more than 10 suggestions.", 'prompt_id': 'eb4fee5c685b36bffcd84b2c68db01fe4842302c24190e144f391b7016bb2330', 'messages': [{'content': "I'm traveling to Europe and I want to visit some historical World War I locations. Where should I visit? I don't want to visit France though. Use bullet points for the list. I want no more than 10 suggestions.", 'role': 'user'}, {'content': 'Here are a few WWI historical locations to consider visiting while in Europe, excluding France:\n\n• The Latin Bridge, Sarajevo, Bosnia: This is where the war started when Austro-Hungarian Empire Archduke Franz Ferdinand and his wife were assassinated.\n• Langemark German Cemetery, Belgium: A cemetery of 44




## Training

In [None]:
# Where checkpoints and logs are written.
output_dir = "tinyllama_instruct"

# Batching: one sequence per device per step, with gradients accumulated
# over 16 steps.
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 16
max_sq_length = 2048

# Optimisation schedule.
learning_rate = 2e-5
lr_scheduler_type = "cosine"
warmup_ratio = 0.1
max_grad_norm = 1.0

# Run length. NOTE(review): `max_steps` is not passed to the
# TrainingArguments call below — only `num_train_epochs` is.
num_train_epochs = 1
max_steps = 250

# Logging cadence, in optimizer steps.
logging_steps = 25

In [None]:
# Trainer configuration, grouped by concern.
training_arguments = TrainingArguments(
    # Output location, experiment tracking and Hub publishing.
    # NOTE(review): push_to_hub=True needs Hugging Face Hub credentials; the
    # recorded run failed with a 401 on repo creation when the trainer was built.
    output_dir=output_dir,
    report_to=["tensorboard", "wandb"],
    push_to_hub=True,
    hub_private_repo=True,
    # Batching.
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Run length and optimisation schedule.
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=0.1,
    max_grad_norm=max_grad_norm,
    # Precision and memory savings.
    fp16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # Logging, evaluation and checkpoint cadence.
    logging_steps=logging_steps,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Confirm the evaluation split carries the "context" column the trainer reads.
dataset["test"]

Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'category', 'context'],
    num_rows: 500
})

In [23]:
# Build the supervised fine-tuning trainer.
# NOTE(review): the recorded run warns that these SFT-specific arguments are
# deprecated in favour of SFTConfig, and then fails with a 401 from the Hub
# while creating the repo (see push_to_hub in the training arguments).
trainer = SFTTrainer(
    # Model, tokenizer and training configuration.
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    # Data: the pre-rendered chat text stored in the "context" column.
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="context",
    # Sequence construction.
    packing=True,
    max_seq_length=max_sq_length,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 1571 examples [00:03, 432.48 examples/s]
Generating train split: 84 examples [00:00, 426.75 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling para

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6675d8f8-3451be5e1f5247e96b15ffd5;4f45a026-9c53-4d41-8cdd-3496e91fb67f)

Invalid username or password.