**Import Libraries**

In [None]:
import os  # Interact with the operating system for file and directory operations
import torch  # PyTorch library for tensor computations and deep learning
import pandas as pd  # Data manipulation and analysis, especially for tabular data
from sklearn.preprocessing import LabelEncoder  # Utility for encoding labels as integers
from transformers import DebertaTokenizer, DebertaForSequenceClassification  # Tokenizer and model for sequence classification tasks
from torch.utils.data import Dataset, DataLoader  # Utilities for handling datasets and creating data loaders
from transformers import AdamW  # Optimizer for training the model
import numpy as np  # Numerical operations
from sklearn.model_selection import StratifiedShuffleSplit  # Utility for splitting datasets into training and testing sets while preserving class distribution
import time  # Measuring and manipulating time

**Mount Drive**

In [None]:
# Mount Google Drive at /content/drive. Later cells read the dataset CSV from
# this mount and write training checkpoints under it.
# NOTE(review): this import lives here rather than in the imports cell,
# presumably because `google.colab` only exists inside a Colab runtime — confirm
# before moving it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Load The Dataset**

In [None]:
# Location of the preprocessed dataset CSV on the mounted Google Drive
file_path = '/content/drive/My Drive/dataset_after_preprocessing.csv'

# Read the CSV into a DataFrame; the following cells encode its 'Assignee'
# column and split the resulting frame into train/test sets.
dataset = pd.read_csv(filepath_or_buffer=file_path)

**Encode the labels**

In [None]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on the 'Assignee' column and store the resulting integer class
# ids. 'Assignee_Class' is the stratification key used when splitting the data.
dataset['Assignee_Class'] = label_encoder.fit_transform(dataset['Assignee'])

# TextDataset reads its training targets from a column named 'label', and the
# classifier head is sized from len(label_encoder.classes_). Expose the very same
# encoder ids under 'label' so the targets are guaranteed to come from this
# encoder on a fresh "Restart & Run All" (previously only 'Assignee_Class' was
# written, leaving 'label' missing or dependent on whatever the CSV contained).
dataset['label'] = dataset['Assignee_Class']

# LabelEncoder assigns ids to the unique categories in sorted (lexicographical)
# order: the first category in that order becomes 0, the second 1, and so on.

**Show the dataset**

In [None]:
# Display the DataFrame 'dataset' for a quick look at the data after labeling.
# As the last expression of the cell it is rendered as the cell output; for a
# frame this large pandas shows a truncated view (leading and trailing rows plus
# the column names and index) instead of every row.
dataset

Unnamed: 0,Summary_Stemmed,Assignee,label
0,"['scroll', 'scroll', 'mice', 'touchpad', 'etc'...",amit@chromium.org,151
1,"['proxi', 'caus', 'network', 'request', 'fail'...",jon@chromium.org,1326
2,"['web', 'inspector', 'button', 'dock', 'main',...",pfeldman@chromium.org,2039
3,"['habari', 'admin', 'interfac', 'render', 'cor...",jon@chromium.org,1326
4,"['maxim', 'second', 'larger', 'monitor', 'work...",pkasting@chromium.org,2061
...,...,...,...
197914,"['updat', 'gleanj', 'dashboard', 'ignor', 'gle...",brosa,350
197915,"['autocomplet', 'type', 'valid', 'valu', 'pass...",brosa,350
197916,"['intermitt', 'slow', 'see', 'ping', 'show', '...",brosa,350
197917,"['investig', 'string', 'metric', 'type', 'adeq...",pmcmanis,2070


**Split the dataset**

In [None]:
# One stratified shuffle split: 20% of the rows go to the test set, and the
# fixed random_state makes the split reproducible across runs.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Stratify on 'Assignee_Class' so train and test keep a similar class
# distribution. With n_splits=1 the splitter yields a single
# (train indices, test indices) pair, which is taken here.
train_idx, test_idx = list(sss.split(dataset, dataset['Assignee_Class']))[0]

# Materialize both partitions by positional selection and give each a fresh
# 0..n-1 index so they can be addressed by position later on.
train_df, test_df = (
    dataset.iloc[rows].reset_index(drop=True)
    for rows in (train_idx, test_idx)
)

**Initialize the tokenizer**

In [None]:
# Load the pre-trained tokenizer that belongs to the 'microsoft/deberta-base'
# checkpoint. It turns raw text into the token ids the DeBERTa model consumes.
tokenizer = DebertaTokenizer.from_pretrained(
    pretrained_model_name_or_path='microsoft/deberta-base'
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

**Create the Dataset class**

In [None]:
# Dataset class for text classification
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per requested item.

    Every item pairs the tokenized 'Summary_Stemmed' text of a row with the
    integer stored in that row's 'label' column.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """
        Args:
            dataframe (pd.DataFrame): Frame with 'Summary_Stemmed' and 'label' columns.
            tokenizer (DebertaTokenizer): Tokenizer exposing `encode_plus`.
            max_len (int): Length every encoded sequence is padded/truncated to.
        """
        self.dataframe, self.tokenizer, self.max_len = dataframe, tokenizer, max_len

    def __len__(self):
        """Return the number of rows (samples) in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """
        Build the model inputs for the row at position `idx`.

        Args:
            idx (int): Positional index of the row to encode.

        Returns:
            dict: 'input_ids' and 'attention_mask' as flattened tensors produced
                  by the tokenizer, and 'label' as a torch.long tensor.
        """
        # Fetch the row once and pull both fields from it
        row = self.dataframe.iloc[idx]
        text, label = row['Summary_Stemmed'], row['label']

        # Tokenizer settings: add special tokens, pad/truncate to max_len and
        # return PyTorch tensors together with the attention mask.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        # Flatten the tokenizer's tensors so each sample is 1-D
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(label, dtype=torch.long)
        return sample

**Create datasets and dataloaders for train and test sets**

In [None]:
# Wrap both partitions in TextDataset, tokenizing every sample to 128 tokens
train_dataset, test_dataset = (
    TextDataset(frame, tokenizer, max_len=128)
    for frame in (train_df, test_df)
)

# Batches of 16 samples: the training loader reshuffles the data every epoch,
# while the test loader keeps the original order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

**Initialize the DeBERTa model, move the model to the appropriate device and set up an optimizer**

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One output label per class known to the label encoder
num_labels = len(label_encoder.classes_)

# Load the pre-trained DeBERTa sequence-classification model with a head sized
# for `num_labels` classes and place it on the selected device.
model = DebertaForSequenceClassification.from_pretrained(
    'microsoft/deberta-base',
    num_labels=num_labels,
).to(device)

# AdamW updates the model parameters during training with a learning rate of 2e-5.
# A variant with explicit weight decay was tried as an alternative:
#   optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.001)
optimizer = AdamW(model.parameters(), lr=2e-5)

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Directory to save training checkpoints**

In [None]:
# Directory on the mounted Drive where per-epoch training checkpoints are saved
CHECKPOINT_DIR = '/content/drive/MyDrive/checkpoints'

# Create the directory (and any missing parents) if it does not exist yet.
# exist_ok=True makes this a single idempotent call with no check-then-create
# race, and it raises instead of silently continuing when the path exists but
# is not a directory.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

**Training function with checkpoint saving**

In [None]:
# Training function with checkpoint saving
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples, epoch):
    """Train `model` for one pass over `data_loader`, then save a checkpoint.

    Args:
        model: Model called as model(input_ids=..., attention_mask=..., labels=...)
            whose output exposes `.loss` and `.logits`.
        data_loader: Iterable of batches; each batch is a mapping holding
            'input_ids', 'attention_mask' and 'label' tensors.
        loss_fn: Not used here; the loss comes from the model's own output
            (`outputs.loss`). Kept so existing callers keep working.
        optimizer: Optimizer whose `step()`/`zero_grad()` are called per batch.
        device: Device the batch tensors are moved to.
        scheduler: Optional learning-rate scheduler. When not None it is stepped
            once after every optimizer step; pass None to train without one.
        n_examples (int): Number of examples used as the accuracy denominator.
        epoch (int): Epoch number stored in the checkpoint and its file name.

    Returns:
        tuple: (accuracy, mean_loss) where accuracy is a 0-dim float64 tensor
            (correct predictions / n_examples) and mean_loss is the mean of the
            per-batch losses (NaN when the loader produced no batches).

    Side effects:
        Writes `checkpoint_epoch_{epoch}.pt` into the module-level CHECKPOINT_DIR
        with the epoch number, model state, optimizer state and mean loss.
    """
    model = model.train()  # Set the model to training mode
    losses = []  # Per-batch loss values
    # Tensor accumulator so `.double()` below also works when the loader is empty
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Iterate over batches in the data loader
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['label'].to(device)

        # Forward pass; passing `labels` makes the model compute the loss itself
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        logits = outputs.logits

        # Predicted class = index of the largest logit per sample
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            # Previously the scheduler argument was accepted but never stepped
            scheduler.step()
        optimizer.zero_grad()  # Clear gradients for the next iteration

    # Avoid np.mean([]) (RuntimeWarning) when no batch was processed
    mean_loss = np.mean(losses) if losses else np.float64('nan')

    # Save checkpoint after each epoch
    checkpoint_path = os.path.join(CHECKPOINT_DIR, f'checkpoint_epoch_{epoch}.pt')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # Store a plain Python float rather than a numpy scalar so the file
        # contains only tensors and built-in types
        'loss': float(mean_loss),
    }, checkpoint_path)

    # Accuracy and average loss for the epoch
    return correct_predictions.double() / n_examples, mean_loss

**Evaluation function**

In [None]:
# Evaluation function
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate `model` on every batch of `data_loader` without updating it.

    Args:
        model: Model called as model(input_ids=..., attention_mask=..., labels=...)
            whose output exposes `.loss` and `.logits`.
        data_loader: Iterable of batches; each batch is a mapping holding
            'input_ids', 'attention_mask' and 'label' tensors.
        loss_fn: Not used here; the loss comes from the model's own output
            (`outputs.loss`). Kept so existing callers keep working.
        device: Device the batch tensors are moved to.
        n_examples (int): Number of examples used as the accuracy denominator.

    Returns:
        tuple: (accuracy, mean_loss) where accuracy is a 0-dim float64 tensor
            (correct predictions / n_examples) and mean_loss is the mean of the
            per-batch losses (NaN when the loader produced no batches).
    """
    model = model.eval()  # Set the model to evaluation mode
    losses = []  # Per-batch loss values
    # Tensor accumulator so `.double()` below also works when the loader is empty
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():  # Disable gradient computation for evaluation
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            # Fixed: this line previously called the non-existent attribute
            # `.model_state_dictto(device)` and raised AttributeError
            attention_mask = d['attention_mask'].to(device)
            labels = d['label'].to(device)

            # Forward pass; passing `labels` makes the model compute the loss itself
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            logits = outputs.logits  # Raw, unnormalized class scores

            # Predicted class = index of the largest logit per sample
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    # Avoid np.mean([]) (RuntimeWarning) when no batch was processed
    mean_loss = np.mean(losses) if losses else np.float64('nan')

    # Accuracy and average loss for the entire evaluation set
    return correct_predictions.double() / n_examples, mean_loss

**Loss function**

In [None]:
# Cross-entropy loss, the usual choice for multi-class classification, placed
# on the same device as the model.
# Note: train_epoch and eval_model accept this object as `loss_fn` but take the
# loss from the model's own output (`outputs.loss`) instead of calling it.
loss_fn = torch.nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)


**Load checkpoint if available**

In [None]:
# Load checkpoint if available
def load_checkpoint(model, optimizer, checkpoint_path):
    """Restore model and optimizer state from `checkpoint_path` if it exists.

    Args:
        model: Module whose `load_state_dict` receives the saved model state.
        optimizer: Optimizer whose `load_state_dict` receives the saved state.
        checkpoint_path (str): Path of a checkpoint holding the keys 'epoch',
            'model_state_dict', 'optimizer_state_dict' and 'loss'.

    Returns:
        int: The epoch number stored in the checkpoint, or 0 when no file
            exists at `checkpoint_path` (training then starts from scratch).
    """
    if not os.path.exists(checkpoint_path):
        print("No checkpoint found. Starting training from scratch.")
        return 0  # Nothing to resume from

    # Map the stored tensors onto the device the model currently lives on, so a
    # checkpoint written on a GPU runtime can still be resumed on a CPU-only one.
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else None
    checkpoint = torch.load(checkpoint_path, map_location=map_location)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']  # Epoch number recorded when the file was saved
    loss = checkpoint['loss']  # Mean training loss recorded for that epoch
    print(f"Checkpoint loaded. Resuming training from epoch {epoch} with loss {loss:.4f}.")
    return epoch

**Training and evaluating loop**

In [None]:
# Resume from the checkpoint file written for epoch 57 inside CHECKPOINT_DIR.
# load_checkpoint restores the model and optimizer states and returns the epoch
# stored in the file, or 0 when the file does not exist; that value is the
# epoch index from which the loop below continues.
last_checkpoint_path = os.path.join(CHECKPOINT_DIR, 'checkpoint_epoch_57.pt')
start_epoch = load_checkpoint(model, optimizer, last_checkpoint_path)

# Total number of epochs to train for
EPOCHS = 100

# Training loop: runs the remaining epochs from start_epoch up to EPOCHS
for epoch in range(start_epoch, EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # Train for one epoch and time it. No scheduler is used, and the 1-based
    # epoch number is what ends up in the checkpoint.
    start_time = time.time()
    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=None,
        n_examples=len(train_df),
        epoch=epoch + 1,
    )
    end_time = time.time()
    epoch_duration = end_time - start_time

    # Report training metrics and how long the epoch took
    print(f'Train loss {train_loss} accuracy {train_acc}')
    print(f'Time taken for epoch {epoch + 1}: {epoch_duration:.2f} seconds')

    # Evaluate on the held-out split (test_loader / test_df) and report it
    val_acc, val_loss = eval_model(
        model=model,
        data_loader=test_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(test_df),
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')