## Fine tune a llama3 model with m-a-p/Code-Feedback

### Setup

In [44]:
#!pip install -q transformers[torch] datasets
#!pip install -q bitsandbytes trl peft
#!pip install flash-attn --no-build-isolation

### Load Data + Preprocessing

In [2]:
from datasets import load_dataset, DatasetDict

In [3]:
raw_datasets = load_dataset("m-a-p/Code-Feedback")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'messages'],
        num_rows: 66383
    })
})

In [4]:
# SPLIT data in train/test

indices_1 = range(0,100)
indices_2 = range(101,201)
dataset_dict = {
    "train": raw_datasets["train"].select(indices_1),
    "test": raw_datasets["train"].select(indices_2)
}
raw_dataset = DatasetDict(dataset_dict)
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'messages'],
        num_rows: 100
    })
    test: Dataset({
        features: ['id', 'messages'],
        num_rows: 100
    })
})

In [5]:
raw_dataset["train"][0]

{'id': 1,
 'messages': [{'role': 'user',
   'content': 'Write a Ruby code to convert a double-linked list to a single-linked list without using any built-in methods or data structures.'},
  {'role': 'assistant',
   'content': 'Here is a Ruby code that converts a double-linked list to a single-linked list without using any built-in methods or data structures:\n\n```ruby\nclass Node\n  attr_accessor :value, :next_node, :prev_node\n\n  def initialize(value, next_node = nil, prev_node = nil)\n    @value = value\n    @next_node = next_node\n    @prev_node = prev_node\n  end\nend\n\ndef convert_to_single_linked_list(head)\n  current_node = head\n  while current_node.next_node != nil\n    current_node.next_node.prev_node = nil\n    current_node = current_node.next_node\n  end\n  current_node\nend\n\n# Create a double-linked list\nnode1 = Node.new(1)\nnode2 = Node.new(2)\nnode3 = Node.new(3)\n\nnode1.next_node = node2\nnode2.prev_node = node1\nnode2.next_node = node3\nnode3.prev_node = node2\n

#### Tokenizer

In [49]:
from transformers import AutoTokenizer
from huggingface_hub import login

In [50]:
model_id = "meta-llama/Meta-Llama-3-8B"

In [51]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

# set pad_token_id equal to the eos_token_id if not set
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id = tokenizer.eos_token_id

# Set reasonable default for models without max length
if tokenizer.model_max_length > 100_000:
  tokenizer.model_max_length = 2048

loading file tokenizer.json from cache at /home/ec2-user/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/ec2-user/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/ec2-user/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [52]:
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE


In [53]:
import re
import random
from multiprocessing import cpu_count


def apply_chat_template(example, tokenizer):
    messages = example["messages"]
    # We add an empty system message if there is none
    if messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)

    return example

In [54]:
column_names = list(raw_dataset["train"].features)
column_names


['id', 'messages']

In [55]:
raw_dataset = raw_dataset.map(apply_chat_template,
                                num_proc=cpu_count(),
                                fn_kwargs={"tokenizer": tokenizer},
                                remove_columns=column_names,
                                desc="Applying chat template",)

In [56]:
train_data = raw_dataset["train"]
test_data = raw_dataset["test"]

for index in random.sample(range(len(raw_dataset["train"])), 3):
  print(f"Sample {index} of the processed training set:\n\n{raw_dataset['train'][index]['text']}")
  

Sample 81 of the processed training set:

<|system|>
<|end_of_text|>
<|user|>
You are tasked with implementing a multithreaded program to process a list of tasks concurrently. Each task is represented by a function that takes no arguments and returns a result. Your goal is to design a solution that efficiently utilizes multiple threads to execute these tasks in parallel.

You are given a list of `n` tasks and a predefined number of threads to use for concurrent execution. Your task is to implement a function `execute_tasks(tasks: List[Callable[[], Any]], num_threads: int) -> List[Any]` that takes in the list of tasks and the number of threads and returns a list of results obtained after executing all the tasks concurrently using the specified number of threads.

The function signature is as follows:
```python
from typing import List, Callable, Any

def execute_tasks(tasks: List[Callable[[], Any]], num_threads: int) -> List[Any]:
    pass
```

Your implementation should ensure that the 

### Model Definition

In [57]:
# QloRa SFT
from transformers import BitsAndBytesConfig
import torch

quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
)

device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None
device_map


{'': 0}

In [58]:
model_kwargs = dict(
    #attnt_implementation=True,
    torch_dtype="auto",
    use_cache=False,
    device_map=device_map,
    quantization_config=quant_config
)

### SFT Trainer

In [59]:
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig
from transformers import TrainingArguments

In [3]:
output_dir = "../../model_saved/llama_code_feedback-8B"

In [61]:
# based on config
training_args = TrainingArguments(
    fp16=True, # specify bf16=True instead when training on GPUs that support bf16
    do_eval=True,
    eval_strategy="epoch",
    gradient_accumulation_steps=128,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2.0e-05,
    log_level="info",
    logging_steps=5,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    max_steps=-1,
    num_train_epochs=1,
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_eval_batch_size=1, # originally set to 8
    per_device_train_batch_size=1, # originally set to 8
    save_strategy="no",
    save_total_limit=None,
    seed=42,
)

# based on config
peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [62]:
trainer = SFTTrainer(
        model=model_id,
        model_init_kwargs=model_kwargs,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        dataset_text_field="text",
        tokenizer=tokenizer,
        packing=True,
        peft_config=peft_config,
        max_seq_length=tokenizer.model_max_length,
    )


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
PyTorch: setting up devices
loading configuration file config.json from cache at /home/ec2-user/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Meta-Llama-3-8B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "trans

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /home/ec2-user/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": 128001,
  "max_length": 4096,
  "temperature": 0.6,
  "top_p": 0.9
}

PyTorch: setting up devices


Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2140 > 2048). Running this sequence through the model will result in indexing errors
Using auto half precision backend


In [63]:
train_result = trainer.train()

***** Running training *****
  Num examples = 63
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 128
  Total optimization steps = 1
  Number of trainable parameters = 54,525,952


Epoch,Training Loss,Validation Loss



***** Running Evaluation *****
  Num examples = 65
  Batch size = 1
  Num examples = 65
  Batch size = 1


### Saving Model

In [64]:
metrics = train_result.metrics
max_train_samples = 100
metrics["train_samples"] = min(max_train_samples, len(train_data))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
trainer.save_model(training_args.output_dir)

Saving model checkpoint to ../model_saved/llama_code_feedback-8B


***** train metrics *****
  epoch                    =        1.0
  total_flos               =  5450195GF
  train_loss               =     0.5522
  train_runtime            = 0:04:01.95
  train_samples            =        100
  train_samples_per_second =       0.26
  train_steps_per_second   =      0.004


loading configuration file config.json from cache at /home/ec2-user/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.0",
  "use_cache": true,
  "vocab_size": 128256
}

tokenizer config file saved in ../model_saved/llama_code_feedback-8B/tokenizer_config.json
Special tokens file sav

### Inference

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

output_dir = "../../model_saved/llama_code_feedback-8B"

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir, load_in_4bit=True, device_map="auto")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
import torch

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {"role": "user", "content": "Write a Ruby code to convert a double-linked list to a single-linked list"},
]

DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

# prepare the messages for the model
input_ids = tokenizer.apply_chat_template(messages, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

'''
# inference
outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
'''

'\n# inference\noutputs = model.generate(\n        input_ids=input_ids,\n        max_new_tokens=256,\n        do_sample=True,\n        temperature=0.7,\n        top_k=50,\n        top_p=0.95\n)\nprint(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])\n'