In [None]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

# Understanding how Llama is initialised and Inferenced

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch


base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"

tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

messages = [{"role": "user", "content": "Who is the prime minister of India !"}]

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(prompt, max_new_tokens=120, do_sample=True)

print(outputs[0]["generated_text"])

In [None]:
from IPython.display import Markdown, display

messages = [
    {
        "role": "system",
        "content": "You are a skilled Java developer specializing in data structures and algorithms.",
    },
    {
        "role": "user",
        "content": "Can you write me code to solve a two sum problem ?",
    },
]

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(prompt, max_new_tokens=512, do_sample=True)

display(
    Markdown(
            outputs[0]["generated_text"].split(
                "<|start_header_id|>assistant<|end_header_id|>"
            )[1]
        )
    )

# Finetuning Llama

In [1]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

2025-09-24 21:36:05.835459: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758749765.859548     156 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758749765.866526     156 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# -------------------------------
# 1. Load Dataset
# -------------------------------
df = pd.read_csv("/kaggle/input/synthetic-dataset/balanced_dataset.csv")  # columns: description, category

# Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["category"], random_state=42)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [3]:
# -------------------------------
# 2. Define Classes
# -------------------------------
labels = df["category"].unique().tolist()
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

In [4]:
# -------------------------------
# 3. Load Tokenizer + Model
# -------------------------------
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Quantization Config (4-bit)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,              # enable 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16,  # computation dtype
    bnb_4bit_use_double_quant=True,        # double quantization (saves memory)
    bnb_4bit_quant_type="nf4"              # normal float 4
)

# Model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# -------------------------------
# 4. Convert to Instruction Format
# -------------------------------
def format_example(example):
    prompt = (
        f"Classify the following bank transaction into one of these categories:\n"
        f"{', '.join(labels)}\n\n"
        f"Description: {example['description']}\n\nCategory:"
    )
    target = f" {example['category']}"
    return {"text": prompt + target}

train_dataset = train_dataset.map(format_example)
test_dataset = test_dataset.map(format_example)

Map:   0%|          | 0/6009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1503 [00:00<?, ? examples/s]

In [6]:
# -------------------------------
# 5. Tokenization
# -------------------------------
def tokenize_fn(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=512,
    )

train_tokenized = train_dataset.map(tokenize_fn, batched=True, remove_columns=train_dataset.column_names)
test_tokenized = test_dataset.map(tokenize_fn, batched=True, remove_columns=test_dataset.column_names)

Map:   0%|          | 0/6009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1503 [00:00<?, ? examples/s]

In [7]:
# -------------------------------
# 6. LoRA Config
# -------------------------------
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [9]:
# -------------------------------
# 7. Training Setup
# -------------------------------
train_args = TrainingArguments(
    output_dir="./llama-classifier",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    eval_strategy="epoch",  # evaluate after each epoch
    fp16=True,
    optim="adamw_torch",
    warmup_ratio=0.05,
    lr_scheduler_type="linear",
    report_to="none"
)

In [10]:
# -------------------------------
# 8. Data Collator
# -------------------------------
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [16]:
# -------------------------------
# 9. Custom Evaluation
# -------------------------------
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto"
)

def evaluate_model(test_df):
    y_true, y_pred = [], []
    for _, row in test_df.iterrows():
        prompt = (
            f"Classify the following bank transaction into one of these categories:\n"
            f"{', '.join(labels)}\n\n"
            f"Description: {row['description']}\n\nCategory:"
        )
        
        # Generate
        out = pipe(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]

        # Try extracting after "Category:"
        if "Category:" in out:
            candidate = out.split("Category:")[-1].strip()
        else:
            candidate = out  # fallback: whole output

        # Take first word only (avoid extra sentences)
        if candidate.strip():
            pred = candidate.split()[0]
        else:
            pred = "Miscellaneous"  # fallback category

        # Ensure prediction is a valid label
        if pred not in labels:
            # try to map by fuzzy match
            matches = [lbl for lbl in labels if lbl.lower() in candidate.lower()]
            pred = matches[0] if matches else "Miscellaneous"

        y_true.append(row["category"])
        y_pred.append(pred)

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, labels=labels))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred, labels=labels))
    print("\nAccuracy:", accuracy_score(y_true, y_pred))


In [13]:
# -------------------------------
# 10. Trainer
# -------------------------------
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    data_collator=collator,
)

In [14]:
# -------------------------------
# 11. Train
# -------------------------------
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,0.4644,0.448071
2,0.3962,0.402424
3,0.3945,0.399548


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=1128, training_loss=0.5141635084828586, metrics={'train_runtime': 4870.0864, 'train_samples_per_second': 3.702, 'train_steps_per_second': 0.232, 'total_flos': 2.042263825931059e+16, 'train_loss': 0.5141635084828586, 'epoch': 3.0})

In [17]:
# -------------------------------
# 12. Final Evaluation
# -------------------------------
evaluate_model(test_df)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Classification Report:
                    precision    recall  f1-score   support

         Education       1.00      0.35      0.52       125
Travel & Transport       1.00      1.00      1.00       125
         Groceries       0.95      1.00      0.98       125
     Miscellaneous       0.42      0.90      0.58       125
 Bills & Utilities       1.00      1.00      1.00       125
  Health & Fitness       1.00      1.00      1.00       125
          Shopping       0.93      0.62      0.74       125
     Entertainment       0.83      0.77      0.80       125
       Investments       0.85      0.75      0.80       126
            Income       1.00      0.97      0.98       126
     Food & Drinks       0.99      0.97      0.98       126
       Withdrawals       0.95      1.00      0.97       125

          accuracy                           0.86      1503
         macro avg       0.91      0.86      0.86      1503
      weighted avg       0.91      0.86      0.86      1503


Confusion Ma

In [18]:
# -------------------------------
# 13. Save
# -------------------------------
model.save_pretrained("./llama-classifier")
tokenizer.save_pretrained("./llama-classifier")

('./llama-classifier/tokenizer_config.json',
 './llama-classifier/special_tokens_map.json',
 './llama-classifier/chat_template.jinja',
 './llama-classifier/tokenizer.json')

## Inferencing Finetuned-Llama3.2

In [37]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="./llama-classifier",
    tokenizer="./llama-classifier",
    device_map="auto"
)

def classify_transaction(text):
    prompt = (
        f"Classify the following bank transaction into one of these categories:\n"
        f"{', '.join(labels)}\n\n"
        f"Description: {text}\n\nCategory:"
    )
    output = pipe(prompt, max_new_tokens=20, do_sample=False)
    generated = output[0]["generated_text"].split("Category:")[-1].strip().split("\n")[0].strip()

    return generated  # fallback


print(classify_transaction("Sent Rs.510.00 From HDFC Bank A/C *0552 To Swiggy Limited On 20/09/25 Ref 111495595089 Not You? Call 18002586161/SMS BLOCK UPI to 7308080808"))
print(classify_transaction("UPI Payment to Amazon"))
print(classify_transaction("Salary credited from company"))
print(classify_transaction("Recharge of Airtel mobile"))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Food & Drinks
Investments
Income
Bills & Utilities


## Pushing into Huggingface Hub

In [19]:
!pip install -q huggingface_hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
from huggingface_hub import login
import os

login(os.getenv("HF_TOKEN"))

In [None]:
from huggingface_hub import HfApi
import os

api = HfApi(token=os.getenv("HF_TOKEN"))
api.upload_folder(
    folder_path="./llama-classifier",
    repo_id="karthiksagarn/llama3-3.2b-finetuned-financial",
    repo_type="model",
)


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/karthiksagarn/llama3-3.2b-finetuned-financial/commit/98dac881f092e17ee2dd44d05ea3bf0cf72cdab2', commit_message='Upload folder using huggingface_hub', commit_description='', oid='98dac881f092e17ee2dd44d05ea3bf0cf72cdab2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/karthiksagarn/llama3-3.2b-finetuned-financial', endpoint='https://huggingface.co', repo_type='model', repo_id='karthiksagarn/llama3-3.2b-finetuned-financial'), pr_revision=None, pr_num=None)