## Fine-tune Llama 3 8B (`meta-llama/Meta-Llama-3-8B`) on m-a-p/Code-Feedback

### Setup

In [1]:
# Third-party packages this notebook relies on. Uncomment to install them into a
# fresh environment. NOTE(review): versions are unpinned — consider pinning them
# and using %pip so the install targets the running kernel.
#!pip install -q transformers[torch] datasets
#!pip install -q bitsandbytes trl peft
#!pip install flash-attn --no-build-isolation

### Load Data + Preprocessing

In [2]:
from datasets import load_dataset, DatasetDict

In [3]:
# Load the m-a-p/Code-Feedback dataset; the repr below shows that it ships a
# single "train" split with 'id' and 'messages' columns.
raw_datasets = load_dataset("m-a-p/Code-Feedback")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'messages'],
        num_rows: 66383
    })
})

In [4]:
# Split the single upstream "train" split into two disjoint, contiguous subsets:
# rows [0, N_TRAIN) for training and rows [N_TRAIN, N_TRAIN + N_TEST) for testing.
# (The test range used to start at 1001, which silently skipped row 1000.)

N_TRAIN = 1000
N_TEST = 1000

# Fail early with a clear message if the requested subsets do not fit.
assert N_TRAIN + N_TEST <= len(raw_datasets["train"]), "not enough rows for the requested split"

indices_1 = range(0, N_TRAIN)
indices_2 = range(N_TRAIN, N_TRAIN + N_TEST)
dataset_dict = {
    "train": raw_datasets["train"].select(indices_1),
    "test": raw_datasets["train"].select(indices_2)
}
raw_dataset = DatasetDict(dataset_dict)
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'messages'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'messages'],
        num_rows: 1000
    })
})

In [5]:
# Inspect the first raw training example to see how a conversation is stored
# (a list of {'role', 'content'} messages).
raw_dataset["train"][0]

{'id': 1,
 'messages': [{'role': 'user',
   'content': 'Write a Ruby code to convert a double-linked list to a single-linked list without using any built-in methods or data structures.'},
  {'role': 'assistant',
   'content': 'Here is a Ruby code that converts a double-linked list to a single-linked list without using any built-in methods or data structures:\n\n```ruby\nclass Node\n  attr_accessor :value, :next_node, :prev_node\n\n  def initialize(value, next_node = nil, prev_node = nil)\n    @value = value\n    @next_node = next_node\n    @prev_node = prev_node\n  end\nend\n\ndef convert_to_single_linked_list(head)\n  current_node = head\n  while current_node.next_node != nil\n    current_node.next_node.prev_node = nil\n    current_node = current_node.next_node\n  end\n  current_node\nend\n\n# Create a double-linked list\nnode1 = Node.new(1)\nnode2 = Node.new(2)\nnode3 = Node.new(3)\n\nnode1.next_node = node2\nnode2.prev_node = node1\nnode2.next_node = node3\nnode3.prev_node = node2\n

#### Tokenizer

In [6]:
from transformers import AutoTokenizer
from huggingface_hub import login

In [7]:
# Hub id of the checkpoint to fine-tune; reused below for both the tokenizer
# and the SFTTrainer model argument.
model_id = "meta-llama/Meta-Llama-3-8B"

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reuse the EOS token for padding when the checkpoint ships without a pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# A very large model_max_length means no real limit was configured;
# cap it at 2048 tokens so downstream sequence lengths stay manageable.
if tokenizer.model_max_length > 100_000:
    tokenizer.model_max_length = 2048

In [9]:
# Jinja chat template used to flatten a conversation into one string.
# Each message is rendered as a role tag (<|user|>, <|system|> or <|assistant|>)
# followed by a newline, the message content and the tokenizer's eos_token.
# Messages with any other role are skipped. When add_generation_prompt is set,
# a trailing <|assistant|> tag is emitted after the last message.
DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
# Replace the tokenizer's chat template with the one defined above.
tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE


In [10]:
import re
import random
from multiprocessing import cpu_count


def apply_chat_template(example, tokenizer):
    """Render one conversation into a single training string.

    Args:
        example: A dataset row holding a ``"messages"`` list of
            ``{"role": ..., "content": ...}`` dicts.
        tokenizer: Object exposing
            ``apply_chat_template(messages, tokenize=False)``.

    Returns:
        The same ``example`` mapping, with the rendered conversation stored
        under the ``"text"`` key.
    """
    # Work on a shallow copy so the caller's message list is never mutated.
    messages = list(example["messages"])
    # We add an empty system message if there is none; the emptiness check
    # also keeps an empty conversation from raising IndexError.
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)

    return example

In [11]:
# Names of the original columns ('id', 'messages'). They are passed as
# remove_columns to map() below so only the rendered text column remains.
column_names = list(raw_dataset["train"].features)
column_names


['id', 'messages']

In [12]:
# Render every conversation into a single "text" field, in parallel across all
# CPU cores, and drop the original columns.
# NOTE(review): this rebinds raw_dataset, so re-running this cell on the
# already-mapped dataset will not find the "messages" column any more.
raw_dataset = raw_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=column_names,
    num_proc=cpu_count(),
    desc="Applying chat template",
)

Applying chat template (num_proc=8):   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template (num_proc=8):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
train_data, test_data = raw_dataset["train"], raw_dataset["test"]

# Print three randomly chosen rendered training examples as a sanity check.
n_train = len(raw_dataset["train"])
for index in random.sample(range(n_train), 3):
    rendered = raw_dataset['train'][index]['text']
    print(f"Sample {index} of the processed training set:\n\n{rendered}")
  

Sample 492 of the processed training set:

<|system|>
<|end_of_text|>
<|user|>
Explain the step-by-step process of Insertion Sort and discuss its time and space complexity. Additionally, compare Insertion Sort with other sorting algorithms like Quick Sort and Merge Sort, and analyze their advantages and disadvantages in terms of performance and use cases. Provide a piece of erroneous code that attempts to implement Insertion Sort incorrectly and explain the error and how it can be fixed.<|end_of_text|>
<|assistant|>
Insertion Sort is a simple sorting algorithm that works by dividing the input array into a sorted and an unsorted portion. It repeatedly takes elements from the unsorted portion and inserts them into their correct position in the sorted portion.

Here is the step-by-step process of Insertion Sort:

1. Start with the second element (index 1) of the array.
2. Compare the second element with the first element. If the second element is smaller, swap them.
3. Move to the next un

### Model Definition

In [14]:
# QLoRA SFT: quantize the base model to 4-bit NF4 and run compute in bfloat16.
from transformers import BitsAndBytesConfig
import torch

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Place the whole model on the current CUDA device; None when no GPU is available.
if torch.cuda.is_available():
    device_map = {"": torch.cuda.current_device()}
else:
    device_map = None
device_map


{'': 0}

In [15]:
# Keyword arguments handed to SFTTrainer as model_init_kwargs for loading the model.
model_kwargs = {
    # "attn_implementation": "flash_attention_2",  # optional; needs flash-attn installed
    "torch_dtype": "auto",
    "use_cache": False,  # turn off the generation cache for training
    "device_map": device_map,
    "quantization_config": quant_config,
}

### SFT Trainer

In [16]:
from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments

In [17]:
output_dir = "../model_saved/llama_code_feedback-8B"

In [18]:
# Training hyperparameters (based on config), grouped by purpose.
training_args = TrainingArguments(
    # --- output location and reproducibility ---
    output_dir=output_dir,
    overwrite_output_dir=True,
    seed=42,
    # --- schedule ---
    num_train_epochs=1,
    max_steps=-1,
    learning_rate=2.0e-05,
    lr_scheduler_type="cosine",
    # --- batching: 1 sample per device, accumulated over 128 steps ---
    per_device_train_batch_size=1,  # originally set to 8
    per_device_eval_batch_size=1,  # originally set to 8
    gradient_accumulation_steps=128,
    # --- precision and memory ---
    fp16=True,  # specify bf16=True instead when training on GPUs that support bf16
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # --- evaluation ---
    do_eval=True,
    eval_strategy="epoch",
    # --- logging ---
    log_level="info",
    logging_strategy="steps",
    logging_steps=5,
    # --- checkpointing (disabled) ---
    save_strategy="no",
    save_total_limit=None,
)

# LoRA adapter settings (based on config): rank-64 adapters attached to the
# q/k/v/o projection modules of a causal language model.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)


In [19]:
# Build the supervised fine-tuning trainer. The model is given by its Hub id and
# loaded with model_init_kwargs; peft_config supplies the LoRA settings.
# NOTE(review): the cell output warns that some of these arguments are deprecated
# on SFTTrainer and should move into an SFTConfig.
trainer = SFTTrainer(
    model=model_id,
    model_init_kwargs=model_kwargs,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="text",
    max_seq_length=tokenizer.model_max_length,
    packing=True,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2151 > 2048). Running this sequence through the model will result in indexing errors


Generating train split: 0 examples [00:00, ? examples/s]

Using auto half precision backend


In [20]:
# Run fine-tuning and keep the value returned by train() in train_result.
train_result = trainer.train()

***** Running training *****
  Num examples = 661
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 128
  Total optimization steps = 5
  Number of trainable parameters = 54,525,952
