# Tutorial 
https://www.datacamp.com/tutorial/llama3-fine-tuning-locally

# Imports

In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)

from peft import (
    LoraConfig,
    get_peft_model
)

import os, torch 
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

  from .autonotebook import tqdm as notebook_tqdm
2024-06-29 15:51:00.972518: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-29 15:51:00.998214: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Params

In [2]:
# Hugging Face Hub identifiers: the base checkpoint to fine-tune and the
# dataset used for fine-tuning.
base_model = "meta-llama/Meta-Llama-3-8B"
dataset_name = "ruslanmv/ai-medical-chatbot"

# Settings consumed when the model is loaded: `torch_dtype` is used as the
# 4-bit compute dtype and `attn_implementation` is forwarded to
# `from_pretrained`.
torch_dtype = torch.float16
attn_implementation = "eager"

# Load Llama 3 with 4-bit quantization

In [2]:
# Quantization settings: weights are loaded in 4-bit using the "nf4" quant
# type with double quantization enabled; `torch_dtype` is the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
)

# Load the base checkpoint with the quantization config applied.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    attn_implementation=attn_implementation,
    device_map="auto",
    quantization_config=bnb_config,
)


NameError: name 'BitsAndBytesConfig' is not defined

# Load tokenizer

In [None]:
# Tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# setup_chat_format hands back a (model, tokenizer) pair; both names are
# rebound so every later cell works with the returned objects.
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

# Add Adapter

In [None]:
# LoRA adapter configuration.
# NOTE: the task type is spelled "CAUSAL_LM" (causal language modelling).
# A misspelled value is not a known PEFT task type, so the model would not be
# wrapped as a causal-LM PEFT model (and newer PEFT versions reject it).
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Projection layers that receive LoRA adapters.
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
)

# Wrap the loaded model with the adapter described above.
model = get_peft_model(model, peft_config)

# Load fine-tuning dataset

In [None]:
# Load every split as a single dataset, shuffle it with a fixed seed, and keep
# the first 1000 rows so the demo stays small.
dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=65)
dataset = dataset.select(range(1000))

def format_chat_template(row):
    """Render one dataset row as a chat transcript string.

    Reads the 'Patient' and 'Doctor' fields of ``row``, turns them into a
    user/assistant message pair, and stores the output of the module-level
    ``tokenizer.apply_chat_template`` (called with ``tokenize=False``) under
    ``row['text']``.

    Returns the same ``row`` object, now carrying the extra 'text' field.
    """
    speaker_columns = (('user', 'Patient'), ('assistant', 'Doctor'))
    conversation = [
        {'role': speaker, 'content': row[column]}
        for speaker, column in speaker_columns
    ]
    # tokenize=False: keep the rendered template as plain text, not token ids.
    row['text'] = tokenizer.apply_chat_template(conversation, tokenize=False)
    return row

# Add the rendered "text" column to every row, using 4 worker processes.
dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)

# Sanity check: show one formatted conversation. Indexing the row first avoids
# pulling the whole "text" column just to read a single entry.
print(dataset[3]['text'])

# Hold out 10% of the rows for evaluation. The split is seeded (same value as
# the shuffle seed used when the subset was selected) so the train/test
# partition is identical on every Restart & Run All.
dataset = dataset.train_test_split(test_size=0.1, seed=65)

# Set training arguments

In [None]:
training_arguments = TrainingArguments(
    # Directory passed as the run's output location.
    output_dir='llama_3_8b_chat_doctor',

    # Optimisation schedule.
    num_train_epochs=1,
    learning_rate=2e-4,
    warmup_steps=10,
    optim="paged_adamw_32bit",

    # Batching: 1 sample per device per step, gradients accumulated over
    # 2 steps before each optimizer update.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    group_by_length=True,

    # Evaluation is step-based; per the TrainingArguments docs an eval_steps
    # value below 1 is read as a fraction of the total training steps.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=0.2,

    # Log every step.
    logging_strategy="steps",
    logging_steps=1,

    # Mixed precision is switched off for both 16-bit modes. bf16 would need
    # an Ampere-or-newer NVIDIA GPU (or CPU / Ascend NPU) and is experimental.
    fp16=False,
    bf16=False,
)


In [None]:
# Supervised fine-tuning on the "text" column built by format_chat_template.
trainer = SFTTrainer(
    model=model,
    # Pass the tokenizer that went through setup_chat_format. If it is
    # omitted, SFTTrainer loads its own tokenizer for the checkpoint, which
    # does not carry the chat-format changes the "text" column was rendered
    # with.
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    # LoRA config: fewer trainable parameters, more efficient training.
    # NOTE(review): `model` was already wrapped with get_peft_model in an
    # earlier cell, so this may be redundant - confirm for the installed TRL.
    peft_config=peft_config,
    max_seq_length=512,  # cap sequence length to avoid GPU out-of-memory errors
    dataset_text_field="text",
    args=training_arguments,
    packing=False,
)

trainer.train()