In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_from_disk, Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the Apple-silicon GPU backend (MPS) when PyTorch reports it as
# available; otherwise fall back to the CPU.
mps_available = torch.backends.mps.is_available()
device = torch.device("mps" if mps_available else "cpu")
print("Using MPS device." if mps_available else "MPS not available, using CPU.")

Using MPS device.


In [3]:
# Checkpoint location handed to from_pretrained() for both the tokenizer and
# the model (presumably a local directory of fine-tuned weights — confirm).
model_path = "models/my_llm_mail_classifier"

In [5]:
# Restore the tokenizer and the causal-LM weights from the same checkpoint,
# then place the model on the device selected earlier.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to(device)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


# Load the dataset

In [6]:
# Read the saved dataset from ./data/llm_mail_dataset; later cells use its
# 'test' split.
ds = load_from_disk("./data/llm_mail_dataset")

In [8]:
# Show the first test record's 'items' field (it holds the integer 'label'
# and the prompt 'text' that is fed to the model).
ds['test'][0]['items']

{'label': 2,
 'text': '<|im_start|>system\n\nYou are a helpful mail sorting assistant.\nYou will classify the email summary into one of the following categories:"India Bank", "India School", "US Bank", "US School"\nNo explanation is needed.\nThe output should only be one of the following: "India Bank", "India School", "US Bank", "US School"\n<|im_end|>\n<|im_start|>user\nCategorise: Investment update Your portfolio summary is ready Hi JENNIFER, Your investments gained 2.3% this month. Review your performance and rebalancing recommendations. Portfolio value $127,845.92 as of May 30, 2025<|im_end|>\n<|im_start|>assistant\n'}

# Evaluate the fine-tuned model

In [9]:
# Position in this list corresponds to the integer `label` stored with each
# dataset item (label_names[label] is the expected category string).
label_names = ["IN_Bank", "IN_School", "US_Bank", "US_School"]

In [10]:
def evaluate(model, tokenizer, data, max_new_tokens=10):
    """Generate a completion for one prompt and return the predicted category.

    Args:
        model: Causal language model exposing ``generate`` and ``device``
            (e.g. the ``AutoModelForCausalLM`` loaded in this notebook).
        tokenizer: Matching tokenizer; called on ``data`` and used to decode
            the generated ids.
        data: Prompt text for a single example.
        max_new_tokens: Upper bound on generated tokens (defaults to 10, the
            value that used to be hard-coded).

    Returns:
        The text that follows the first ``"Category: "`` marker in the
        generated continuation, with surrounding whitespace removed. When the
        marker is missing, the stripped continuation itself is returned so
        the caller can count it as a miss instead of crashing.
    """
    marker = "Category: "

    # Put the inputs wherever the model lives rather than relying on a
    # notebook-level `device` variable.
    tokenized_input = tokenizer(data, return_tensors="pt").to(model.device)
    response = model.generate(
        tokenized_input.input_ids,
        attention_mask=tokenized_input.attention_mask,
        max_new_tokens=max_new_tokens,
    )

    # For a decoder-only model generate() returns prompt + continuation.
    # Decode only the continuation so a "Category: " occurring inside the
    # email text can never be mistaken for the model's answer.
    prompt_length = tokenized_input.input_ids.shape[-1]
    continuation = tokenizer.batch_decode(
        response[:, prompt_length:], skip_special_tokens=True
    )[0]

    _, found, tail = continuation.partition(marker)
    if found:
        # Keep only the text up to any repeated marker, as the old
        # split(...)[1] did.
        continuation = tail.split(marker)[0]
    # Trailing newlines/spaces would otherwise cause false mismatches.
    return continuation.strip()

In [13]:
# Run the model over every test example, report each misclassification with
# its position in the split, then print the totals and the accuracy.
test_split = ds['test']
total_messages = len(test_split)
error_counter = 0
for index, data in enumerate(test_split):
    item = data['items']
    result = evaluate(model, tokenizer, item['text'])
    # The stored label is an integer index into label_names.
    actual = label_names[item['label']]
    if actual != result:
        print(f"Actual: {actual}, Result: {result}, Index: {index}")
        error_counter += 1
print(f"Total messages: {total_messages}")
print(f"Total Error: {error_counter}")
print("-"*50)
if total_messages:
    print(f"Accuracy: {((1-error_counter / total_messages)*100)}%")
else:
    # An empty split would otherwise raise ZeroDivisionError.
    print("Accuracy: n/a (test split is empty)")

Actual: US_Bank, Result: IN_Bank, Index: 11
Actual: US_Bank, Result: IN_Bank, Index: 15
Actual: US_Bank, Result: IN_Bank, Index: 31
Total messages: 43
Total Error: 3
--------------------------------------------------
Accuracy: 93.02325581395348%
