In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch

import os


  from .autonotebook import tqdm as notebook_tqdm


### Fine-tuning Llama 3.2

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format,SFTConfig

In [3]:
run = wandb.init(
    project='Fine-tune Llama 3.2 on Customer Support Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Currently logged in as: [33mshreyak[0m ([33mshreyakchakraborty[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [6]:
base_model = "meta-llama/Llama-3.2-3B-Instruct"
new_model = "llama-3.2-3b-it-Ecommerce-ChatBot"
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

In [7]:
# Determine the appropriate settings based on GPU capabilities
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [19]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation="eager"
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Add a new padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Resize the model embeddings to match the new tokenizer size
model.resize_token_embeddings(len(tokenizer))

Loading checkpoint shards:  50%|█████     | 1/2 [00:08<00:08,  8.00s/it]


KeyboardInterrupt: 

In [9]:
# Add a new padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Resize the model embeddings to match the new tokenizer size
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(128257, 3072)

In [13]:
#Importing the dataset
dataset = load_dataset(dataset_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['flags', 'instruction', 'category', 'intent', 'response'],
        num_rows: 26872
    })
})

In [14]:

# Shuffle the dataset
dataset = dataset.shuffle(seed=65)

train_dataset = dataset['train'].select(range(1150))

# Split the 'train' dataset into training and evaluation sets
split_dataset = train_dataset.train_test_split(test_size=0.1, seed=65)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

instruction = """You are a top-rated customer service agent named John. 
    Be polite to customers and answer all their questions.
    """
def format_chat_template(row):
    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["instruction"]},
               {"role": "assistant", "content": row["response"]}]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

# Apply the formatting function to the training dataset
train_dataset = train_dataset.map(
    format_chat_template,
    num_proc=4
)

# Apply the formatting function to the evaluation dataset
eval_dataset = eval_dataset.map(
    format_chat_template,
    num_proc=4
)

Map (num_proc=4): 100%|██████████| 1035/1035 [00:00<00:00, 3232.54 examples/s]
Map (num_proc=4): 100%|██████████| 115/115 [00:00<00:00, 591.92 examples/s]


In [15]:
print(train_dataset)
print(eval_dataset)


Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response', 'text'],
    num_rows: 1035
})
Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response', 'text'],
    num_rows: 115
})


In [10]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [11]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)

model = get_peft_model(model, peft_config)

In [16]:
training_config = SFTConfig(
    max_seq_length=512,
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_8bit",  # Use 8-bit AdamW optimizer for efficiency
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=100,  # Evaluate every 100 steps
    logging_steps=10,  # Log every 10 steps
    warmup_steps=10,
    learning_rate=2e-4,
    fp16=True,  # Enable mixed precision training
    logging_dir="./logs",
    report_to="none",  # Set to "wandb" to use Weights & Biases
    dataset_text_field="text",  # Specify the text field in the dataset
    packing=False  # Set packing parameter here
)


trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_config,
    tokenizer=tokenizer
)

  trainer = SFTTrainer(
Applying chat template to train dataset: 100%|██████████| 1035/1035 [00:00<00:00, 13727.07 examples/s]
Tokenizing train dataset: 100%|██████████| 1035/1035 [00:00<00:00, 1293.46 examples/s]
Tokenizing train dataset: 100%|██████████| 1035/1035 [00:00<00:00, 3019.16 examples/s]
Applying chat template to eval dataset: 100%|██████████| 115/115 [00:00<00:00, 8841.93 examples/s]
Tokenizing eval dataset: 100%|██████████| 115/115 [00:00<00:00, 1228.51 examples/s]
Tokenizing eval dataset: 100%|██████████| 115/115 [00:00<00:00, 2810.51 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [17]:
trainer.train()

Step,Training Loss,Validation Loss
100,0.7656,0.739997
200,0.6495,0.663197
300,0.6203,0.627453
400,0.5674,0.602982
500,0.6,0.587307




TrainOutput(global_step=517, training_loss=0.7003643332874291, metrics={'train_runtime': 512.1486, 'train_samples_per_second': 2.021, 'train_steps_per_second': 1.009, 'total_flos': 3471377783808000.0, 'train_loss': 0.7003643332874291})

In [24]:
wandb.finish()

0,1
eval/loss,█▅▃▂▁
eval/mean_token_accuracy,▁▅▆▇█
eval/runtime,▁▇▆▆█
eval/samples_per_second,█▂▃▃▁
eval/steps_per_second,█▂▃▃▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▇▄▂▇█▄▅▃▃▄▃▃▃▂▂▂▃▂▃▃▄▄▂▁▂▄▂▂▂▂▃▂▂▁▅▂▃▄▃▃
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▄▃▃▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.57581
eval/mean_token_accuracy,0.82149
eval/runtime,21.5624
eval/samples_per_second,5.333
eval/steps_per_second,5.333
total_flos,3387638257809408.0
train/epoch,0.99903
train/global_step,517.0
train/grad_norm,0.92126
train/learning_rate,0.0


In [18]:
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

adapter_model.safetensors: 100%|██████████| 1.67G/1.67G [01:00<00:00, 27.5MB/s]


CommitInfo(commit_url='https://huggingface.co/shreyakqss/llama-3.2-3b-it-Ecommerce-ChatBot/commit/181c58c704551996422f7847c69a50e6d2c3cb9e', commit_message='Upload model', commit_description='', oid='181c58c704551996422f7847c69a50e6d2c3cb9e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shreyakqss/llama-3.2-3b-it-Ecommerce-ChatBot', endpoint='https://huggingface.co', repo_type='model', repo_id='shreyakqss/llama-3.2-3b-it-Ecommerce-ChatBot'), pr_revision=None, pr_num=None)