In [30]:
from transformers import AutoTokenizer
from datasets import load_from_disk, Dataset, DatasetDict

# Model

In [2]:
# Hugging Face Hub identifier of the model whose tokenizer (and chat template)
# is loaded below.
model_id = "Qwen/Qwen2.5-0.5B-Instruct"

In [3]:
# Tokenizer for model_id; its chat template is what the conversion functions
# below use to render conversations into plain strings.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [4]:
# Load the labelled mail dataset from its on-disk directory.
dataset = load_from_disk("./data/mail_dataset_labeled")

# Dataset

In [5]:
# Display the splits, their features and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 335
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 42
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 43
    })
})

In [6]:
# Category name for each integer label: label_names[i] is the name used for
# records whose "label" value is i.
label_names = ["IN_Bank", "IN_School", "US_Bank", "US_School"]

In [7]:
# System prompt placed at the start of every training and test conversation.
# NOTE(review): the categories spelled out here ("India Bank", "India School",
# ...) differ from the strings in label_names ("IN_Bank", "IN_School", ...),
# and the assistant turns built by convert_to_training_messages are prefixed
# with "Category: " although this prompt asks for the bare category only.
# Confirm which form is intended before changing either side.
message_system = """
You are a helpful mail sorting assistant.
You will classify the email summary into one of the following categories:"India Bank", "India School", "US Bank", "US School"
No explanation is needed.
The output should only be one of the following: "India Bank", "India School", "US Bank", "US School"
"""

## Transforming the train and validation datasets

In [10]:
def convert_to_training_messages(dataset):
    """Render each labelled record as a complete chat transcript.

    Every record becomes a three-turn conversation -- the shared system
    prompt, a user turn carrying the record's ``text``, and an assistant turn
    naming the record's category -- which is then serialised to one string
    with the tokenizer's chat template.

    Relies on the notebook-level ``tokenizer``, ``message_system`` and
    ``label_names``.

    Args:
        dataset: Iterable of records exposing ``"text"`` and ``"label"``;
            ``label`` is used as an index into ``label_names``.

    Returns:
        dict: ``{"text": [...]}`` holding one rendered transcript per record,
        in input order.
    """
    rendered = []
    for example in dataset:
        conversation = [
            {"role": "system", "content": message_system},
            {"role": "user", "content": f"Categorise: {example['text']}"},
            {
                "role": "assistant",
                "content": f"Category: {label_names[example['label']]}",
            },
        ]
        # tokenize=False asks for the rendered string rather than token ids.
        rendered.append(
            tokenizer.apply_chat_template(conversation, tokenize=False)
        )
    return {"text": rendered}

In [11]:
# Render the train and validation splits as full chat transcripts
# (system + user + assistant turns). The function defined in this notebook is
# convert_to_training_messages; the previous name used here,
# convert_to_chat_messages, is not defined anywhere and raised NameError on a
# fresh kernel.
ds_train = convert_to_training_messages(dataset["train"])
ds_val = convert_to_training_messages(dataset["validation"])

## Transforming the test dataset

In [None]:
def convert_to_prompting_messages(dataset):
    """Render each record as an inference prompt, keeping its label alongside.

    Unlike the training conversion, the conversation stops after the user
    turn and the chat template is applied with ``add_generation_prompt=True``,
    so no assistant answer is included in the rendered text.

    Relies on the notebook-level ``tokenizer`` and ``message_system``.

    Args:
        dataset: Iterable of records exposing ``"text"`` and ``"label"``.

    Returns:
        dict: ``{"items": [...]}`` where each element is a dict with the
        rendered prompt under ``"text"`` and the record's original
        ``"label"`` value, in input order.
    """

    def render_prompt(example):
        # System + user turns only; the template appends the generation prompt.
        conversation = [
            {"role": "system", "content": message_system},
            {"role": "user", "content": f"Categorise: {example['text']}"},
        ]
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )

    return {
        "items": [
            {"text": render_prompt(example), "label": example["label"]}
            for example in dataset
        ]
    }

In [27]:
# Render the test split as prompts without an assistant turn; each item keeps
# its original integer label.
ds_test = convert_to_prompting_messages(dataset["test"])

# Create dataset

In [28]:
# Wrap each converted column dict in a Dataset. The three names are rebound
# from plain dicts to Dataset objects, so the conversion cells above must be
# re-run before this cell is executed again.
ds_train, ds_val, ds_test = (
    Dataset.from_dict(columns) for columns in (ds_train, ds_val, ds_test)
)

In [31]:
# Bundle the three splits under their conventional split names.
ds_combined = DatasetDict(train=ds_train, validation=ds_val, test=ds_test)

In [32]:
# Persist all three splits to a single on-disk directory.
ds_combined.save_to_disk("./data/llm_mail_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 335/335 [00:00<00:00, 26733.61 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 42/42 [00:00<00:00, 8143.90 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 43/43 [00:00<00:00, 15751.53 examples/s]


# Load and test

In [33]:
# Reload the directory written above and display it to check the saved
# splits, features and row counts.
ds_llm = load_from_disk("./data/llm_mail_dataset")
ds_llm

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 335
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 42
    })
    test: Dataset({
        features: ['items'],
        num_rows: 43
    })
})