In [2]:
import os 
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
os.environ["PYTORCH_CUDA_ALLOC_CONF"]="max_split_size_mb:128"

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput
from datasets import Dataset, ClassLabel
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, precision_score, recall_score

  from .autonotebook import tqdm as notebook_tqdm
2024-05-19 11:02:22.655357: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
# Load the preprocessed e-mail corpus from the CSV sitting next to the notebook.
df = pd.read_csv(filepath_or_buffer='Training_data_preprocessed.csv')

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Text,Category,EmailType,clean_text,tokens
0,"Hi *******, Your payment to Uber India was App...",category_3,email_type_93,hi payment uber india approved paid amount ube...,"['hi', 'payment', 'uber', 'india', 'approved',..."
1,Your Zomato Online Ordering receipt Refund Pro...,category_3,email_type_84,zomato online ordering receipt refund processe...,"['zomato', 'online', 'ordering', 'receipt', 'r..."
2,Electricity Bill Payment Successful ‚Çπ 979 Fo...,category_3,email_type_3,electricity bill payment successful ‚çπ 979 ce...,"['electricity', 'bill', 'payment', 'successful..."
3,Payment requested by FINCFRIENDS PVT. LTD. Rec...,category_3,email_type_92,payment requested fincfriends pvt ltd receipt ...,"['payment', 'requested', 'fincfriends', 'pvt',..."
4,Greetings from Swiggy Your order was delivered...,category_3,email_type_86,greeting swiggy order delivered 29 minute rate...,"['greeting', 'swiggy', 'order', 'delivered', '..."


In [5]:
# Build one string<->int encoder per label column. Ids follow the order in
# which each label first appears in the DataFrame.
category_labels = ClassLabel(names=df['Category'].unique().tolist())
type_labels = ClassLabel(names=df['EmailType'].unique().tolist())

# Overwrite the string labels with their integer ids (this mutates `df`).
for column, encoder in (('Category', category_labels), ('EmailType', type_labels)):
    df[column] = df[column].map(encoder.str2int)

# No hold-out split is made here: the whole frame becomes the training set.
train_dataset = Dataset.from_pandas(df)

In [11]:

# Lookup table from integer category id back to the original label string.
# NOTE: despite its name, this dict is keyed by id (id -> label).
# It is defined here so the cell also works on a fresh kernel.
label_to_id = dict(enumerate(category_labels.names))
label_to_id

{0: 'category_3', 1: 'category_1', 2: 'category_2'}

In [41]:
# Tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

def tokenize_function(examples):
    """Tokenize the ``clean_text`` entries of ``examples`` with the global ``tokenizer``.

    Sequences are truncated to, and padded up to, a fixed length of 512.
    Returns whatever the tokenizer call returns.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(examples['clean_text'], **encode_options)

# Tokenize in batches, then drop the text columns that are no longer needed.
text_columns = ['Text', 'clean_text', 'tokens']
train_dataset = (
    train_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(text_columns)
)

# Expose only the model inputs and the two label columns, as torch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'Category', 'EmailType']
train_dataset.set_format(type='torch', columns=tensor_columns)

# # Move tensors to the MPS device
# train_dataset = train_dataset.with_transform(lambda examples: {k: v.to(device) for k, v in examples.items()})
# test_dataset = test_dataset.with_transform(lambda examples: {k: v.to(device) for k, v in examples.items()})


cuda


                                                                  

In [8]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


def _balanced_class_weights(label_column):
    """Return 'balanced' class weights for one label column of ``train_dataset``.

    The weights come from scikit-learn's ``compute_class_weight`` and are
    returned as a float tensor with one entry per distinct label value.
    """
    labels = train_dataset[label_column]
    weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(labels),
        y=list(labels.numpy()),
    )
    return torch.tensor(weights, dtype=torch.float)


category_class_weights = _balanced_class_weights('Category')
emailtype_class_weights = _balanced_class_weights('EmailType')

# Convert class weights to dictionary format
# category_class_weights_dict = dict(enumerate(category_class_weights))
# emailtype_class_weights_dict = dict(enumerate(emailtype_class_weights))


In [42]:
from sklearn.utils.class_weight import compute_class_weight
class MultiTaskBERT(BertPreTrainedModel):
    """BERT encoder with two linear classification heads on one pooled output.

    One head predicts the ``Category`` label and the other the ``EmailType``
    label. When both label tensors are passed to ``forward``, the returned
    loss is the sum of the two cross-entropy losses.

    Args:
        config: BERT configuration; ``hidden_size`` and ``hidden_dropout_prob``
            are read from it.
        Category: Number of output classes for the category head.
        EmailType: Number of output classes for the e-mail type head.
        category_class_weights: Optional 1-D sequence/tensor with one weight
            per category class, used to weight that head's loss. ``None``
            (default) keeps the loss unweighted.
        emailtype_class_weights: Same as above for the e-mail type head.
    """

    def __init__(self, config, Category, EmailType,
                 category_class_weights=None, emailtype_class_weights=None):
        super().__init__(config)
        self.Category = Category
        self.EmailType = EmailType
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier_category = nn.Linear(config.hidden_size, Category)
        self.classifier_type = nn.Linear(config.hidden_size, EmailType)

        # Stored as plain attributes (not parameters or buffers), so the
        # state_dict layout, and therefore saved checkpoints, stay unchanged.
        self.category_class_weights = self._as_weight_tensor(category_class_weights)
        self.emailtype_class_weights = self._as_weight_tensor(emailtype_class_weights)

        self.init_weights()

    @staticmethod
    def _as_weight_tensor(weights):
        """Convert optional class weights to a float tensor (``None`` passes through)."""
        if weights is None:
            return None
        return torch.as_tensor(weights, dtype=torch.float)

    @staticmethod
    def _cross_entropy(logits, targets, class_weights):
        """Cross-entropy of ``logits`` vs ``targets``, optionally class-weighted."""
        if class_weights is not None:
            # The weights live outside the module state, so align them with the logits here.
            class_weights = class_weights.to(device=logits.device, dtype=logits.dtype)
        return nn.CrossEntropyLoss(weight=class_weights)(logits, targets)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, Category=None, EmailType=None):
        """Encode the inputs and classify them with both heads.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded to the BERT encoder.
            Category: Optional target class ids for the category head.
            EmailType: Optional target class ids for the e-mail type head.

        Returns:
            SequenceClassifierOutput whose ``logits`` is the tuple
            ``(logits_category, logits_type)``. ``loss`` is the sum of both
            heads' losses when both label tensors are given, otherwise ``None``.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Index 1 of the encoder output is the pooled sequence representation.
        pooled_output = self.dropout(outputs[1])

        logits_category = self.classifier_category(pooled_output)
        logits_type = self.classifier_type(pooled_output)

        loss = None
        if Category is not None and EmailType is not None:
            loss_category = self._cross_entropy(logits_category, Category, self.category_class_weights)
            loss_type = self._cross_entropy(logits_type, EmailType, self.emailtype_class_weights)
            loss = loss_category + loss_type

        return SequenceClassifierOutput(
            loss=loss,
            logits=(logits_category, logits_type),
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [43]:
def compute_metrics(logits_category, logits_type, labels_category, labels_type):
    """Score both classification heads against their reference labels.

    Predictions are the arg-max over dimension 1 of each logits tensor.
    Everything is moved to the CPU before scoring. Precision, recall and F1
    use ``average='weighted'``.

    Returns:
        dict with ``category_*`` and ``emailtype_*`` entries for accuracy,
        f1, precision and recall.
    """
    # (key prefix, reference labels, predicted labels) for each head.
    heads = (
        ('category', labels_category.cpu(), logits_category.argmax(dim=1).cpu()),
        ('emailtype', labels_type.cpu(), logits_type.argmax(dim=1).cpu()),
    )
    weighted = {'average': 'weighted'}
    # The iteration order below fixes the key order of the returned dict.
    scorers = (
        ('accuracy', accuracy_score, {}),
        ('f1', f1_score, weighted),
        ('precision', precision_score, weighted),
        ('recall', recall_score, weighted),
    )

    results = {}
    for metric_name, scorer, options in scorers:
        for prefix, expected, predicted in heads:
            results[f'{prefix}_{metric_name}'] = scorer(expected, predicted, **options)
    return results


In [44]:
# Output size of each classification head: one unit per known label name.
num_labels_category, num_labels_type = (
    len(encoder.names) for encoder in (category_labels, type_labels)
)

def evaluate_model(model, eval_fold_dataset, device, batch_size=16, desc=None):
    """Run ``model`` over ``eval_fold_dataset`` and score both classification heads.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards. No gradients are tracked.

    Args:
        model: Callable as ``model(**batch)``; the result must expose ``.loss``
            and ``.logits`` as a ``(logits_category, logits_type)`` pair.
        eval_fold_dataset: Dataset yielding dicts of tensors that include the
            ``'Category'`` and ``'EmailType'`` labels.
        device: Device every batch tensor is moved to before the forward pass.
        batch_size: Evaluation batch size (default 16).
        desc: Progress-bar label. When omitted, the label is built from the
            cross-validation loop variables if they exist, otherwise a generic
            label is used.

    Returns:
        The dict produced by ``compute_metrics`` plus ``'eval_loss'``, the mean
        loss per batch.

    Raises:
        ValueError: If ``eval_fold_dataset`` yields no batches.
    """
    if desc is None:
        try:
            # Keep the original label when called from inside the CV loop.
            desc = f"Evaluating Fold {fold + 1}/{num_folds} Epoch {epoch + 1}/{num_train_epochs}"
        except NameError:
            # Called outside the loop: those globals do not exist.
            desc = "Evaluating"

    loader = DataLoader(eval_fold_dataset, batch_size=batch_size)
    total_eval_loss = 0.0
    all_logits_category = []
    all_logits_type = []
    all_labels_category = []
    all_labels_type = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in tqdm(loader, desc=desc):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                total_eval_loss += outputs.loss.item()
                logits_category, logits_type = outputs.logits
                # Collect results on the CPU so a large dataset does not pile up in GPU memory.
                all_logits_category.append(logits_category.cpu())
                all_logits_type.append(logits_type.cpu())
                all_labels_category.append(batch['Category'].cpu())
                all_labels_type.append(batch['EmailType'].cpu())
    finally:
        model.train(was_training)

    if not all_logits_category:
        raise ValueError("eval_fold_dataset is empty; nothing to evaluate")

    metrics = compute_metrics(
        torch.cat(all_logits_category, dim=0),
        torch.cat(all_logits_type, dim=0),
        torch.cat(all_labels_category, dim=0),
        torch.cat(all_labels_type, dim=0),
    )
    # Each per-batch loss is already a batch mean, so average over batches, not samples.
    metrics['eval_loss'] = total_eval_loss / len(loader)
    return metrics

def update_metrics(metrics, category_metrics, emailtype_metrics):
    """Append one evaluation's scores to the running per-task histories.

    Args:
        metrics: Dict holding ``category_*`` and ``emailtype_*`` entries for
            accuracy, precision, recall and f1.
        category_metrics: Dict of lists keyed by score name; extended in place.
        emailtype_metrics: Same structure for the e-mail type task.

    Returns:
        The two history dicts that were passed in, as a tuple.
    """
    histories = (('category', category_metrics), ('emailtype', emailtype_metrics))
    for prefix, history in histories:
        for score_name in ('accuracy', 'precision', 'recall', 'f1'):
            history[score_name].append(metrics[f'{prefix}_{score_name}'])
    return category_metrics, emailtype_metrics

In [None]:
from sklearn.model_selection import KFold
from tqdm import tqdm
from torch.utils.data import DataLoader
import numpy as np
# Define the number of folds for cross-validation
num_folds = 5
num_train_epochs = 3

# Initialize KFold
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)


def _new_metric_history():
    """Return an empty per-fold history for accuracy, precision, recall and F1."""
    return {"accuracy": [], "precision": [], "recall": [], "f1": []}


def _format_scores(scores, task):
    """Format one task's scores; accuracy is a fraction, so it is rendered as a percentage."""
    return (
        f"Accuracy: {scores[task + '_accuracy']:.2%}, "
        f"Precision: {scores[task + '_precision']:.2f}, "
        f"Recall: {scores[task + '_recall']:.2f}, "
        f"F1-Score: {scores[task + '_f1']:.2f}"
    )


# Per-fold scores on the held-out part (first pair) and on the training part (second pair).
category_metrics = _new_metric_history()
emailtype_metrics = _new_metric_history()
train_category_metrics = _new_metric_history()
train_emailtype_metrics = _new_metric_history()

# Legacy per-fold lists; nothing in this cell fills them. Kept so the names stay defined.
category_accuracy_list = []
type_accuracy_list = []
category_f1_list = []
type_f1_list = []
category_precision_list = []
type_precision_list = []

# torch.save does not create missing parent directories.
os.makedirs("models", exist_ok=True)

# The fold indices depend only on the number of samples, so split over a plain index range.
for fold, (train_index, eval_index) in enumerate(kf.split(np.arange(len(train_dataset)))):
    print(f"Fold {fold + 1}/{num_folds}")

    # Split the dataset into training and evaluation sets for this fold
    train_fold_dataset = train_dataset.select(train_index)
    eval_fold_dataset = train_dataset.select(eval_index)

    # Initialize a fresh model and optimizer so no weights carry over between folds
    model = MultiTaskBERT.from_pretrained('bert-base-uncased', Category=num_labels_category, EmailType=num_labels_type)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    # shuffle=True draws a new sample order every time the loader is iterated
    train_loader = DataLoader(train_fold_dataset, batch_size=8, shuffle=True)

    # Training loop for this fold
    for epoch in range(num_train_epochs):
        model.train()
        total_train_loss = 0.0

        for batch in tqdm(train_loader, desc=f"Training Fold {fold + 1}/{num_folds} Epoch {epoch + 1}/{num_train_epochs}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Each batch loss is already a batch mean, so average over batches, not samples.
        avg_train_loss = total_train_loss / len(train_loader)
        print(f"Fold {fold + 1}, Epoch {epoch + 1}/{num_train_epochs} - Average Training Loss: {avg_train_loss}")

    # Evaluate once per fold, after the last epoch, on the held-out and training parts
    model.eval()
    metrics = evaluate_model(model, eval_fold_dataset, device)
    category_metrics, emailtype_metrics = update_metrics(metrics, category_metrics, emailtype_metrics)
    train_metrics = evaluate_model(model, train_fold_dataset, device)
    train_category_metrics, train_emailtype_metrics = update_metrics(train_metrics, train_category_metrics, train_emailtype_metrics)

    print(f"Fold {fold + 1}, Category Train {_format_scores(train_metrics, 'category')}")
    print(f"Fold {fold + 1}, EmailType Train {_format_scores(train_metrics, 'emailtype')}")
    print(f"Fold {fold + 1}, Category {_format_scores(metrics, 'category')}")
    print(f"Fold {fold + 1}, EmailType {_format_scores(metrics, 'emailtype')}")

    # Name the checkpoint after the fold so later folds do not overwrite earlier ones.
    torch.save(model.state_dict(), f"models/bert_preprocessed_fold_{fold + 1}.bin")

# Cross-validated averages over all folds, for both tasks
for task_name, history in (("Category", category_metrics), ("EmailType", emailtype_metrics)):
    print(f"Average {task_name} Accuracy: {np.mean(history['accuracy']):.2%}")
    print(f"Average {task_name} Precision: {np.mean(history['precision']):.2f}")
    print(f"Average {task_name} Recall: {np.mean(history['recall']):.2f}")
    print(f"Average {task_name} F1-Score: {np.mean(history['f1']):.2f}")

In [16]:
from torch.utils.data import DataLoader
from tqdm import tqdm

# Carve a 20% hold-out set out of the tokenized data. No earlier cell defines
# `test_dataset`, so it is created here. `train_dataset` itself is left untouched.
holdout_splits = train_dataset.train_test_split(test_size=0.2, seed=42)
test_dataset = holdout_splits['test']

train_dataloader = DataLoader(holdout_splits['train'], batch_size=8, shuffle=True)
eval_dataloader = DataLoader(test_dataset, batch_size=16)

# Start from a fresh pretrained model. Reusing the last cross-validation model
# would evaluate on samples it has already been trained on.
model = MultiTaskBERT.from_pretrained('bert-base-uncased', Category=num_labels_category, EmailType=num_labels_type)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Function to compute accuracy
def compute_accuracy(predictions, labels):
    """Return the fraction of rows whose arg-max over dimension 1 equals the label."""
    hits = predictions.argmax(dim=1).eq(labels)
    return hits.sum().item() / labels.size(0)

# Function to compute metrics
def compute_metrics(logits_category, logits_type, labels_category, labels_type):
    """Score both classification heads against their reference labels.

    This definition replaces the earlier ``compute_metrics`` once the cell has
    run. It therefore returns every key either version's callers read: the
    e-mail type scores are available under both the ``type_*`` and the
    ``emailtype_*`` prefix, and precision and recall are included.

    Predictions are the arg-max over dimension 1 of each logits tensor.
    Precision, recall and F1 use ``average='weighted'``.

    Returns:
        dict with accuracy, f1, precision and recall under the ``category_``,
        ``type_`` and ``emailtype_`` prefixes.
    """
    heads = {
        'category': (labels_category.cpu().numpy(), logits_category.argmax(dim=1).cpu().numpy()),
        'type': (labels_type.cpu().numpy(), logits_type.argmax(dim=1).cpu().numpy()),
    }

    metrics = {}
    for prefix, (expected, predicted) in heads.items():
        metrics[f'{prefix}_accuracy'] = accuracy_score(expected, predicted)
        metrics[f'{prefix}_f1'] = f1_score(expected, predicted, average='weighted')
        metrics[f'{prefix}_precision'] = precision_score(expected, predicted, average='weighted')
        metrics[f'{prefix}_recall'] = recall_score(expected, predicted, average='weighted')

    # Mirror the e-mail type scores under the prefix that update_metrics reads.
    for score_name in ('accuracy', 'f1', 'precision', 'recall'):
        metrics[f'emailtype_{score_name}'] = metrics[f'type_{score_name}']
    return metrics

num_train_epochs = 3

# torch.save does not create missing parent directories. Create the folder up
# front so that a finished epoch is never lost when the checkpoint is written.
os.makedirs("models", exist_ok=True)

for epoch in range(num_train_epochs):
    # Training loop
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}/{num_train_epochs}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Mean of the per-batch losses
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_train_epochs} - Average Training Loss: {avg_train_loss}")

    # Evaluation loop
    model.eval()
    total_eval_loss = 0
    all_logits_category = []
    all_logits_type = []
    all_labels_category = []
    all_labels_type = []

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc=f"Evaluating Epoch {epoch + 1}/{num_train_epochs}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_eval_loss += loss.item()
            logits_category, logits_type = outputs.logits
            # Collect results on the CPU so the whole evaluation set does not stay in GPU memory.
            all_logits_category.append(logits_category.cpu())
            all_logits_type.append(logits_type.cpu())
            all_labels_category.append(batch['Category'].cpu())
            all_labels_type.append(batch['EmailType'].cpu())

    avg_eval_loss = total_eval_loss / len(eval_dataloader)
    all_logits_category = torch.cat(all_logits_category, dim=0)
    all_logits_type = torch.cat(all_logits_type, dim=0)
    all_labels_category = torch.cat(all_labels_category, dim=0)
    all_labels_type = torch.cat(all_labels_type, dim=0)

    metrics = compute_metrics(all_logits_category, all_logits_type, all_labels_category, all_labels_type)
    print(f"Epoch {epoch + 1}/{num_train_epochs} - Average Evaluation Loss: {avg_eval_loss}")
    print(f"Category Accuracy: {metrics['category_accuracy']}, Type Accuracy: {metrics['type_accuracy']}")
    print(f"Category F1 Score: {metrics['category_f1']}, Type F1 Score: {metrics['type_f1']}")
    # One checkpoint per epoch
    torch.save(model.state_dict(), f"models/bert_assertion_{epoch + 1}.bin")

# Save the model and tokenizer after training
model.save_pretrained('./multi-task-bert')
tokenizer.save_pretrained('./multi-task-bert')


Training Epoch 1/3: 100%|██████████| 3513/3513 [46:18<00:00,  1.26it/s]


Epoch 1/3 - Average Training Loss: 0.7380412492743083


Evaluating Epoch 1/3: 100%|██████████| 440/440 [04:01<00:00,  1.82it/s]


Epoch 1/3 - Average Evaluation Loss: 0.34802815979769963
Category Accuracy: 0.9914602903501281, Type Accuracy: 0.9329632792485055
Category F1 Score: 0.9917304538108238, Type F1 Score: 0.9229827690683152


Training Epoch 2/3: 100%|██████████| 3513/3513 [46:17<00:00,  1.26it/s]


Epoch 2/3 - Average Training Loss: 0.2581941508437506


Evaluating Epoch 2/3: 100%|██████████| 440/440 [04:01<00:00,  1.82it/s]


Epoch 2/3 - Average Evaluation Loss: 0.261272983845662
Category Accuracy: 0.9928835752917734, Type Accuracy: 0.9434955878166809
Category F1 Score: 0.99276757520957, Type F1 Score: 0.9365035182367243


Training Epoch 3/3: 100%|██████████| 3513/3513 [46:15<00:00,  1.27it/s]


Epoch 3/3 - Average Training Loss: 0.16841492507801067


Evaluating Epoch 3/3: 100%|██████████| 440/440 [04:00<00:00,  1.83it/s]


Epoch 3/3 - Average Evaluation Loss: 0.2273039766238071
Category Accuracy: 0.9921719328209507, Type Accuracy: 0.9560204953031597
Category F1 Score: 0.9920422716776524, Type F1 Score: 0.9523872022466164


('./multi-task-bert/tokenizer_config.json',
 './multi-task-bert/special_tokens_map.json',
 './multi-task-bert/vocab.txt',
 './multi-task-bert/added_tokens.json')