In [None]:
!pip install -q -U bitsandbytes  # For 4-bit quantization (QLoRA)
!pip install -q -U transformers accelerate peft trl  # Hugging Face tools
!pip install -q datasets

In [None]:
from datasets import load_dataset
# Identifier of the public customer-support dataset used for fine-tuning
dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"
dataset = load_dataset(dataset_name)

In [None]:
def format_prompt(sample):
    """Render one dataset row as a single prompt-plus-answer training string.

    Args:
        sample: Mapping that provides the keys ``"instruction"`` (the customer
            question) and ``"response"`` (the reference answer).

    Returns:
        A dict with a single key ``"text"`` holding the agent preamble, the
        question under ``### Question:`` and the answer under ``### Response:``.
    """
    question = sample["instruction"]
    answer = sample["response"]

    # Fixed preamble that frames the model as a support agent.
    preamble = (
        "You are a helpful and professional customer support agent. "
        "Answer the customer question concisely and resolve their issue.\n\n"
    )
    # Question and response sections, separated by one blank line.
    body = f"### Question:\n{question}\n\n" + f"### Response:\n{answer}"

    return {"text": preamble + body}

# Run format_prompt over the dataset so each example carries a "text" field
formatted_dataset = dataset.map(function=format_prompt)

In [None]:
import torch
from transformers import BitsAndBytesConfig

# Quantization settings for loading the base model: 4-bit weights using the
# "nf4" quantization type, with bfloat16 as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Tokenizer for the same checkpoint; EOS doubles as the padding token
# (presumably the tokenizer defines no pad token of its own — confirm).
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Base model, quantized according to bnb_config; device_map="auto" leaves
# the placement of the model layers on the available device(s) to the loader.
model_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_kwargs)

In [None]:
from peft import LoraConfig

# LoRA hyper-parameters, named so they are easy to tune in one place
lora_rank = 16  # rank of the low-rank update matrices
lora_scaling = 32  # lora_alpha scaling factor
lora_targets = ["q_proj", "v_proj"]  # attention projection modules that receive adapters

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_rank,
    lora_alpha=lora_scaling,
    lora_dropout=0.05,
    target_modules=lora_targets,
    bias="none",
)

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir="./fine_tuned_customer_agent",
    save_strategy="epoch",
    # --- optimisation schedule ---
    num_train_epochs=1,
    learning_rate=2e-4,
    # Small per-device batch to save memory; accumulation compensates
    # (2 x 8 = 16 examples per optimizer step on each device).
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # --- precision / memory ---
    bf16=True,  # BFloat16 training; intended for modern GPUs
    gradient_checkpointing=True,  # trade extra compute for lower memory use
    gradient_checkpointing_kwargs=dict(use_reentrant=False),
    # --- logging ---
    logging_steps=25,
    report_to="none",  # no external experiment tracker (e.g. Weights & Biases)
)

In [None]:
from trl import SFTTrainer

# Supervised fine-tuning on the "train" split, with LoRA adapters configured
# through lora_config.
train_split = formatted_dataset["train"]
trainer = SFTTrainer(
    model=model,
    peft_config=lora_config,
    train_dataset=train_split,
    args=training_args,
)
trainer.train()

# Write the trained adapter weights to ./final_adapter
fine_tuned_model = trainer.model
fine_tuned_model.save_pretrained("./final_adapter")

In [None]:
test_question = "I haven't received my refund after 10 days. My account is X990."

# Same template as format_prompt, but it stops right after the response
# header so the model has to produce the answer itself.
prompt = (
    "You are a helpful and professional customer support agent. "
    "Answer the customer question concisely and resolve their issue.\n\n"
    f"### Question:\n{test_question}\n\n"
    "### Response:\n"
)

# The training run above leaves the model in train mode. Switch to eval mode
# so dropout (including the LoRA dropout of 0.05) is disabled; otherwise the
# greedy decoding below is not deterministic.
model.eval()

# Tokenize and move the tensors to wherever the model actually lives instead
# of hard-coding 'cuda' (the model was placed with device_map="auto").
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Use the fine-tuned model to generate the response. Passing the full encoding
# supplies the attention mask along with the input ids; pad_token_id is given
# explicitly so generate() does not have to guess it.
output = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode and display (prompt followed by the generated answer)
print(tokenizer.decode(output[0], skip_special_tokens=True))