# **Email Classifier**

## pip install required libraries

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

## Import required libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset as HFDataset

In [3]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

## Data Pre-processing

In [38]:
# Load Sample Data
data = {
    "Email Text": [
        "Hello, I want to know more about the features of Product A. Can you send me details?",
        "I need pricing information for Product B. Please share a quote.",
        "Can you provide specifications for Product C? I am interested in its capabilities.",
        "Does Product A come with a warranty? Please let me know.",
        "I'm considering buying Product B in bulk. Do you offer discounts?",
        "What are the delivery options for Product C?",
        "Can you compare Product A and B for me? I'm trying to decide which one to buy.",
        "I want to integrate Product C into our system. Does it support API access?"
    ],
    "Category": ["Product A",
                 "Product B",
                 "Product C",
                 "Product A",
                 "Product B",
                 "Product C",
                 "Product A / Product B",
                 "Product C"]
}
df = pd.DataFrame(data)

# Label Mapping
label_map = {"Product A": 0,
             "Product B": 1,
             "Product C": 2,
             "Product A / Product B": 3,
             "Product A / Product C": 4,
             "Product B / Product A": 5,
             "Product B / Product C": 6,
             "Product C / Product A": 7,
             "Product C / Product B": 8     # I have included extra labels for edge cases; in case the dataset gets updated
             }
df["Labels"] = df["Category"].map(label_map)

# Train-Test Split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Email Text"].tolist(), df["Labels"].tolist(), test_size=0.2, random_state=42
)

# Save data to csv file for easeier retrieval
df.to_csv("email_dataset.csv", index=False)

In [39]:
# Verify
print("\nFinal dataset:")
print(df[["Email Text", "Category", "Labels"]])


Final dataset:
                                          Email Text               Category  \
0  Hello, I want to know more about the features ...              Product A   
1  I need pricing information for Product B. Plea...              Product B   
2  Can you provide specifications for Product C? ...              Product C   
3  Does Product A come with a warranty? Please le...              Product A   
4  I'm considering buying Product B in bulk. Do y...              Product B   
5       What are the delivery options for Product C?              Product C   
6  Can you compare Product A and B for me? I'm tr...  Product A / Product B   
7  I want to integrate Product C into our system....              Product C   

   Labels  
0       0  
1       1  
2       2  
3       0  
4       1  
5       2  
6       3  
7       2  


## Tokenization and Training the model

In [None]:
model_name = "bert-base-uncased"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization
def tokenize_function(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Convert to dataset
class EmailDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = EmailDataset(train_encodings, train_labels)
val_dataset = EmailDataset(val_encodings, val_labels)

# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=9)

# Define Performance Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted', zero_division=1)
    return {
            "precision": precision,
            }

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    learning_rate = 1e-5,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Train model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train() # You will be asked your wandb API Key, make sure you have it before running the trainer.

# Save Model
model.save_pretrained(f"./email_classification_model_{model_name}")
tokenizer.save_pretrained(f"./email_classification_model_{model_name}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,2.376141,0.0,1.0,0.0,0.0
2,No log,2.35288,0.0,1.0,0.0,0.0
3,No log,2.329516,0.0,1.0,0.0,0.0
4,No log,2.316335,0.0,1.0,0.0,0.0
5,No log,2.311042,0.0,1.0,0.0,0.0


('./email_classification_model_bert-base-uncased/tokenizer_config.json',
 './email_classification_model_bert-base-uncased/special_tokens_map.json',
 './email_classification_model_bert-base-uncased/vocab.txt',
 './email_classification_model_bert-base-uncased/added_tokens.json',
 './email_classification_model_bert-base-uncased/tokenizer.json')

## Deployment & Inference

In [41]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Function to predict email category
def predict_email_category(text):
    model = AutoModelForSequenceClassification.from_pretrained(f"./email_classification_model_{model_name}")
    tokenizer = AutoTokenizer.from_pretrained(f"./email_classification_model_{model_name}")
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    category_map = {0: "Product A", 1: "Product B", 2: "Product C", 3: "Product A / Product B"}
    return category_map[predicted_class]

## TESTING

In [60]:
# Test predictions

email = ["I am interested in buying Product A.", "I want to know more about Product B", "I wish to know more about Product A and Product B."]
for i in email:
  print(predict_email_category(i))

Product A
Product B
Product A / Product B
