# **Final packages and Libraries**

In [None]:
# üîÑ Clean install for T4 fine-tuning (CUDA 12.1 compatible)
!pip uninstall -y torch torchvision torchaudio bitsandbytes triton transformers accelerate datasets peft trl

# 1Ô∏è‚É£ Core PyTorch stack (CUDA 12.1 build ‚Äî works with T4)
!pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121

# 2Ô∏è‚É£ Quantization + model fine-tuning libs
!pip install bitsandbytes==0.43.0 triton==2.2.0

# 3Ô∏è‚É£ Hugging Face ecosystem
!pip install transformers==4.43.3 peft==0.10.0 trl==0.9.4 accelerate==0.30.1 datasets==2.19.0




^C
Collecting bitsandbytes==0.43.0
  Downloading bitsandbytes-0.43.0-py3-none-win_amd64.whl.metadata (1.9 kB)


ERROR: Could not find a version that satisfies the requirement triton==2.2.0 (from versions: none)
ERROR: No matching distribution found for triton==2.2.0


In [None]:
# Sanity check of the installed stack: print each library version and whether
# PyTorch can see a CUDA device.
import torch
import bitsandbytes
import transformers

for label, value in (
    ("‚úÖ Torch:", torch.__version__),
    ("‚úÖ Torch CUDA available:", torch.cuda.is_available()),
    ("‚úÖ bitsandbytes:", bitsandbytes.__version__),
    ("‚úÖ Transformers:", transformers.__version__),
):
    print(label, value)


In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
################################################################################
# MODEL AND DATA CONFIGURATION
################################################################################

# Base model pulled from the Hugging Face Hub.
model_name = "meta-llama/Llama-3.1-8B-Instruct"

#model_name = "meta-llama/Llama-3.1-7B"

# Local fine-tuning dataset (JSONL format).
# The Windows path is a raw string: in a normal string literal "\U" starts a
# unicode escape, so "C:\Users\..." does not even parse.
#dataset_name = "/content/drive/MyDrive/Finetuning_Llama_3.1/llama31_finetune_dataset.jsonl"
dataset_name = r"C:\Users\Lavanya\Desktop\synkcode\Llama_3.1_finetuning\en_llama31_finetune_dataset.jsonl"

# Name under which the fine-tuned adapter/model is saved.
new_model = "Llama-3.1_octa_finetuned"

################################################################################
# QLoRA PARAMETERS
################################################################################

# Rank (attention dimension) of the LoRA update matrices.
# r=64 is a moderate setting; 32 is used here for a small reduction in memory usage.
lora_r = 32

# Scaling factor (alpha) applied to the LoRA update.
lora_alpha = 16

# Dropout probability inside the LoRA layers.
lora_dropout = 0.1

################################################################################
# BITSANDBYTES (4-BIT QUANTIZATION) PARAMETERS
################################################################################

# Switch for loading the base model in 4-bit precision.
use_4bit = True

# Name of the torch dtype used for computation on top of the 4-bit weights.
# Use "bfloat16" only on A100-class (or newer) GPUs.
bnb_4bit_compute_dtype = "float16"

# 4-bit quantization scheme; 'nf4' is the recommended one.
bnb_4bit_quant_type = "nf4"

# Nested (double) quantization; usually left off on a T4.
use_nested_quant = False

################################################################################
# TRAINING ARGUMENTS
################################################################################

# Where logs and checkpoints are written.
output_dir = "./results"

# How many passes over the training data.
num_train_epochs = 1

# Mixed precision: a T4 supports fp16 only, so bf16 stays off.
fp16, bf16 = True, False

# Per-device batch sizes that are safe on a T4.
per_device_train_batch_size = 2
per_device_eval_batch_size = 2

# Number of steps whose gradients are accumulated (simulates a larger batch).
gradient_accumulation_steps = 4

# Trade compute for memory by checkpointing activations.
gradient_checkpointing = True

# Gradient clipping threshold.
max_grad_norm = 0.3

# Optimizer: name, learning rate and weight decay.
optim = "paged_adamw_32bit"
learning_rate = 2e-4
weight_decay = 0.001

# Learning-rate schedule and warmup fraction.
lr_scheduler_type = "cosine"
warmup_ratio = 0.03

# Step control: -1 means "train for the full epoch count" instead of a fixed
# number of steps.
max_steps = -1
save_steps = 100
logging_steps = 25

# Batch together sequences of similar length for efficiency.
group_by_length = True

################################################################################
# SFT (SUPERVISED FINE-TUNING) PARAMETERS
################################################################################

# Upper bound on the tokenized sequence length (good for instruction tuning).
max_seq_length = 256

# Whether several short examples are packed into one sequence to save space.
packing = False

# Device placement strategy for the model ("auto" for Colab).
device_map = "auto"

################################################################################
# NOTES
################################################################################
# ✅ Designed for Google Colab (T4 GPU, 15GB VRAM)
# ✅ QLoRA fine-tuning with 4-bit quantization
# ✅ Works with bilingual JSONL dataset (messages format)
# ✅ Recommended dataset key: {"messages": [{"role": "...", "content": "..."}]}
################################################################################


In [None]:
# Load the JSONL fine-tuning dataset (further processing can be added here).
# `load_dataset` comes from the notebook's import cell; the repeated in-cell
# imports were redundant and have been dropped.
# A file passed through `data_files` is exposed as the "train" split, which is
# selected directly here.
dataset = load_dataset("json", data_files=dataset_name, split="train")

# Convert each conversation into a single "text" field
def format_conversation(example):
    """Flatten one chat example into a single prompt string.

    Args:
        example: Mapping with a "messages" list; every message is a mapping
            with a "role" and a string "content".

    Returns:
        ``{"text": ...}`` where the text holds one ``"<Label>: <content>"``
        line per message, in order, with surrounding whitespace removed.

    Role labels: "user" -> "User", "system" -> "System", anything else ->
    "Assistant". System turns used to be labelled "Assistant" as well, which
    presented system prompts to the model as if they were assistant replies.
    """
    role_labels = {"user": "User", "system": "System"}
    lines = []
    for message in example["messages"]:
        label = role_labels.get(message["role"], "Assistant")
        lines.append(f"{label}: {message['content'].strip()}")
    # The final strip mirrors the original behaviour when the last content is empty.
    return {"text": "\n".join(lines).strip()}

# Add the flattened "text" column to every example.
dataset = dataset.map(function=format_conversation)

# Quick visual check of the first formatted sample.
first_example = dataset[0]
print(first_example["text"])



# Load the base model with the QLoRA settings from the configuration cell.
#
# The quantization config used to hard-code its values and ignored that cell.
# In particular it computed in torch.bfloat16 although the config selects
# "float16" (T4 -> fp16 only) and training runs with fp16=True. All values now
# come from the configuration variables, so there is one place to tune them.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,                       # 4-bit quantization switch
    bnb_4bit_quant_type=bnb_4bit_quant_type,     # e.g. "nf4"
    bnb_4bit_use_double_quant=use_nested_quant,  # nested quantization
    bnb_4bit_compute_dtype=compute_dtype,        # compute precision
    llm_int8_enable_fp32_cpu_offload=True,       # allow CPU offload
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)

# Disable the generation cache for training.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Tokenizer of the base model. The EOS token doubles as padding token, and
# padding goes on the right (fixes a weird overflow issue with fp16 training).
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter settings, taken from the configuration cell.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type="CAUSAL_LM",
    bias="none",
)

# Set training parameters from the configuration cell.
# `gradient_checkpointing` and `per_device_eval_batch_size` are defined in that
# cell but were never forwarded, so gradient checkpointing stayed off although
# the configuration enables it for memory efficiency.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
)



# Build the supervised fine-tuning trainer from the pieces prepared above:
# model, tokenizer, LoRA config, dataset ("text" column) and training arguments.
trainer = SFTTrainer(
    args=training_arguments,
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    packing=packing,
    max_seq_length=max_seq_length,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Write the trained model held by the trainer to the directory named by `new_model`.
trainer.model.save_pretrained(save_directory=new_model)

In [None]:
# Load the TensorBoard notebook extension and open it on the results/runs directory.
# NOTE(review): presumably this is where the trainer writes its logs
# (output_dir is "./results" and report_to is "tensorboard") - confirm.
%load_ext tensorboard
%tensorboard --logdir results/runs

**Step 6: Use the text generation pipeline to ask questions like “What is a large language model?” Note that I’m formatting the input to match the Llama 2 prompt template.**

In [None]:
# Ignore warnings: only CRITICAL messages from transformers are shown.
logging.set_verbosity(logging.CRITICAL)

# Run the text generation pipeline with the fine-tuned model.
# The prompt follows the "User: ...\nAssistant: ..." layout that
# format_conversation builds for the training data. The Llama 2 style
# "<s>[INST] ... [/INST]" wrapper used before does not match that layout.
prompt = "What is a HScode?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"User: {prompt}\nAssistant:")
print(result[0]['generated_text'])

**Step 5: Check the plots on tensorboard, as follows**

In [None]:
# Empty VRAM: drop the training objects and give their GPU memory back.
import gc

del model
del pipe
del trainer

# Collect unreachable objects so the tensors they hold are actually released.
gc.collect()
gc.collect()

# Deleting Python references only returns memory to PyTorch's caching
# allocator; empty_cache() releases the cached blocks back to the GPU.
torch.cuda.empty_cache()

**Step 7: Store New Llama2 Model (Llama-2-7b-chat-finetune)**

How can we store our new Llama-2-7b-chat-finetune model now? We need to merge the weights from LoRA with the base model. Unfortunately, as far as I know, there is no straightforward way to do it: we need to reload the base model in FP16 precision and use the peft library to merge everything.

In [None]:
# Reload the base model in FP16, attach the saved LoRA adapter and fold the
# adapter weights into the base weights.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    return_dict=True,
)
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer instance so it can be saved next to the merged model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

**Step 8: Push Model to Hugging Face Hub**

In [None]:
# Make locale.getpreferredencoding() always report UTF-8.
# The replacement keeps the standard `do_setlocale` parameter: library code
# commonly calls `locale.getpreferredencoding(False)`, which raised a
# TypeError with the previous zero-argument lambda.
import locale
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
!huggingface-cli login

model.push_to_hub("Lavan1999/Llama-2-7b-chat-finetune", check_pr=True)

tokenizer.push_to_hub("Lavan1999/Llama-2-7b-chat-finetune",check_pr=True)
