In [1]:
from datasets import load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name_distilbert = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
model_name_qwen = "Qwen/Qwen3-1.7B"

In [3]:
lable_names = [
	"IN_Bank",
	"IN_School",
	"US_Bank",
	"US_School"
]

In [4]:
ds = load_from_disk("./data/mail_dataset_labeled")
ds["test"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['IN_Bank', 'IN_School', 'US_Bank', 'US_School'], id=None)}

In [5]:
ds["test"][:2]

{'text': ['13-05-2025 Dear Rohit Gupta, Suspicious activity noticed on HDFC Credit Card XX9156. Card temporarily blocked as precaution',
  'Roosevelt Middle School progress reports now available in parent portal, contact office for login assistance.'],
 'label': [0, 3]}

In [6]:
mail_summaries = [
	ds["text"] for ds in ds["test"]
]


# Distilbert

In [None]:
from transformers import pipeline

In [None]:
model_distilbert = pipeline("zero-shot-classification", model=model_name_distilbert, device_map="cpu")

In [None]:
results = model_distilbert(mail_summaries[0], candidate_labels=lable_names)
print(results)

In [None]:
Error_counter = 0
for sumamry in ds["test"]:
	result = model_distilbert(sumamry["text"], candidate_labels=lable_names)
	largest_score = max(result["scores"])
	largest_label = result["labels"][result["scores"].index(largest_score)]
	actual_label = lable_names[sumamry["label"]]
	if largest_label != actual_label:
		Error_counter += 1
print(f"Total messages: {len(ds['test'])}")
print(f"Total Error: {Error_counter}")
print("-"*50)
print(f"Accuracy: {((Error_counter / len(ds['test']))*100)}%")

# Qwen

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [10]:
model_qwen = pipeline("text-generation", model=model_name_qwen, device_map="cpu")

Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.95s/it]
Device set to use mps


In [None]:
message_system = """
You are a helpful mail sorting assistant.
You will classify the email summary into one of the following categories:"India Bank", "India School", "US Bank", "US School"
Ensure your output is from the above list only.
No explanation is needed.
The output should only be one of the following: "India Bank", "India School", "US Bank", "US School"
"""

In [12]:
def generate_chat_messages(dataset):
	messages = [
		[
			{"role": "system", "content": message_system},
			{"role": "user", "content": f"Classify: {ds['text']}"}
		] for ds in dataset
	]
	return messages

In [13]:
chat_messages = generate_chat_messages(ds["test"])
len(chat_messages)

42

In [14]:
response = model_qwen(chat_messages[0], max_new_tokens=100, )
print(response[0]["generated_text"][-1]["content"])

<think>
Okay, let's see. The user wants me to classify this email summary into one of the categories: India Bank, India School, US Bank, US School.

The email is from someone named Rohit Gupta. The subject line is "Suspicious activity noticed on HDFC Credit Card XX9156. Card temporarily blocked as precaution." The date is 13-05-2025. 

First, I need to figure out which bank is mentioned. HDF


In [15]:
def decode_output(output_text):
	# print(chat_message["text"][0])
	input_length = len(chat_message["text"][0][0])
	# print(input_length)
	# print("--"*50)
	# print(output_text[0])
	output_length = len(output_text[0])
	# print(output_length)
	parsed_output = tokenizer_qwen.decode(output_text[0][input_length:])
	return parsed_output

In [16]:
for sumamry in ds["test"]:
	# print(sumamry["text"])
	complete_message = f"{message_system}\n\n{sumamry['text']}"
	print(complete_message)
	print("-"*50)


You are a helpful mail sorting assistant.
No explanation is needed.

You will classify the email summary into one of the following categories:"India Bank", "India School", "US Bank", "US School"

Ensure your output is from the above list only.


13-05-2025 Dear Rohit Gupta, Suspicious activity noticed on HDFC Credit Card XX9156. Card temporarily blocked as precaution
--------------------------------------------------

You are a helpful mail sorting assistant.
No explanation is needed.

You will classify the email summary into one of the following categories:"India Bank", "India School", "US Bank", "US School"

Ensure your output is from the above list only.


Roosevelt Middle School progress reports now available in parent portal, contact office for login assistance.
--------------------------------------------------

You are a helpful mail sorting assistant.
No explanation is needed.

You will classify the email summary into one of the following categories:"India Bank", "India School