In [None]:
# Imports used

from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm  
import torch.nn as nn
import os
import torch
import buggy_cell_vector_evalualtion_clean
from torch.utils.data import Dataset
import gc
from torch.amp import autocast, GradScaler
import torch.cuda as cuda
import time
from transformers import T5EncoderModel, RobertaTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Custom dataset to wrap notebook data to easily use with DataLoader

class NotebookDataset(Dataset):
    """Thin Dataset wrapper around three parallel, index-aligned collections.

    Entry ``i`` of ``ids``, ``masks`` and ``labels`` is returned together as
    one sample. Presumably each entry holds all chunks of one notebook (the
    training loop slices the leading chunks of every sample) - confirm against
    the tokenization code.
    """

    def __init__(self, all_ids, all_masks, all_labels):
        # The three collections are stored as-is; nothing is copied or validated.
        self.ids, self.masks, self.labels = all_ids, all_masks, all_labels

    def __len__(self):
        # The dataset length is driven by the number of id entries.
        return len(self.ids)

    def __getitem__(self, i):
        # Bundle the i-th entry of every collection under the keys the
        # collate function and the training loop read.
        sample = dict(
            input_ids=self.ids[i],
            attention_mask=self.masks[i],
            labels=self.labels[i],
        )
        return sample

In [None]:
# Custom collate function that creates batches of notebooks.

def custom_collate_fn(batch):
    """Collate samples into a dict of plain Python lists (no stacking or padding).

    batch: sequence of dicts that each provide 'input_ids', 'attention_mask'
           and 'labels'.
    Returns a dict with the same three keys, each mapping to the list of the
    corresponding per-sample values in batch order. Any other keys on the
    samples are ignored.
    """
    wanted_keys = ('input_ids', 'attention_mask', 'labels')
    return {key: [sample[key] for sample in batch] for key in wanted_keys}

In [None]:
# This section prepares JupOtter for cell-level bug detection in Jupyter notebooks; more in-depth documentation is available in run_model.ipynb.

class CodeT5TokenClassifier(nn.Module):
    """T5 encoder with a linear head that emits one prediction per notebook cell.

    A cell is the token span between a start marker token and an end marker
    token (inclusive). The encoder hidden states of that span are averaged and
    passed through a linear layer to obtain the cell's logit(s).
    """

    def __init__(self, model_name, num_labels=1):
        """
        model_name: e.g., 'Salesforce/codet5-base'
        num_labels: output size of the linear head (default 1: a single logit per cell)
        """
        super(CodeT5TokenClassifier, self).__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)  # base encoder model
        hidden_size = self.encoder.config.d_model
        # linear layer that maps an averaged cell representation to its logit(s)
        self.classifier = nn.Linear(hidden_size, num_labels)

    @staticmethod
    def _token_mask(input_ids, token_ids):
        """Return a bool mask shaped like input_ids that is True where the id is in token_ids."""
        if not torch.is_tensor(token_ids):
            token_ids = list(token_ids)
        wanted = torch.as_tensor(token_ids, dtype=input_ids.dtype, device=input_ids.device)
        # A single vectorised membership test replaces one comparison per marker id.
        return torch.isin(input_ids, wanted.reshape(-1))

    @staticmethod
    def _align(logits, lbl):
        """Squeeze the trailing logit dimension and trim logits/labels of one chunk to a common length."""
        logits = logits.squeeze(-1)
        if len(logits) != len(lbl):
            n = min(len(logits), len(lbl))
            print(f"Trimming chunk from {len(logits)} logits / {len(lbl)} labels to {n}")
            logits, lbl = logits[:n], lbl[:n]
        return logits, lbl.float()

    def forward(self, input_ids, attention_mask, start_token_ids, end_token_ids, labels=None, calc_loss=1):
        """
        input_ids / attention_mask: one row per chunk, passed straight to the encoder.
        start_token_ids / end_token_ids: ids of the cell start / end marker tokens.
        labels: optional iterable with one label tensor per chunk (one entry per cell).

        Loss calculation:
        calc_loss: 0 for no loss, 1 for chunk weighted binary cross entropy loss, 2 for cell weighted binary cross entropy loss.

        Returns {"logits": [...]} when labels is None or calc_loss == 0, otherwise
        {"loss": ..., "logits": [...]}. "logits" holds one tensor per chunk with one
        row per detected cell (zero rows when the chunk contains no cell).
        With calc_loss == 1 the loss is None when no chunk produced a logit; with
        calc_loss == 2 it is a zero tensor attached to the graph in that case, so
        callers can still call backward() on it.

        Raises ValueError for any other calc_loss value when labels are given.
        """
        encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoder_outputs.last_hidden_state

        # Masks marking where the start / end marker tokens sit in every chunk.
        start_mask = self._token_mask(input_ids, start_token_ids)
        end_mask = self._token_mask(input_ids, end_token_ids)

        # will hold the prediction vector of each chunk
        logits_list = []
        for i in range(hidden_states.size(0)):
            hs = hidden_states[i]

            start_positions = start_mask[i].nonzero(as_tuple=True)[0].tolist()
            end_positions = end_mask[i].nonzero(as_tuple=True)[0].tolist()

            # The k-th start marker is paired with the k-th end marker; zip drops
            # unmatched extras.
            # NOTE(review): a chunk that opens with an end marker whose start lies in a
            # previous chunk would be mis-paired here - confirm the chunking never does that.
            cell_reps = [
                hs[start_pos:end_pos + 1].mean(dim=0)  # average the token vectors of the cell
                for start_pos, end_pos in zip(start_positions, end_positions)
            ]

            if cell_reps:
                # One classifier call for all cells of the chunk.
                logits_list.append(self.classifier(torch.stack(cell_reps)))
            else:
                # if no cell pairs are found, append an empty tensor
                logits_list.append(torch.empty(0, self.classifier.out_features, device=hs.device))

        if labels is None or calc_loss == 0:
            return {"logits": logits_list}
        if calc_loss not in (1, 2):
            raise ValueError(f"Unsupported calc_loss value: {calc_loss!r} (expected 0, 1 or 2)")

        # Binary cross entropy on raw logits, for both weighting schemes.
        loss_fct = nn.BCEWithLogitsLoss()

        # Align logits and labels chunk by chunk. Trimming only after concatenation
        # would shift the labels of every chunk that follows a mismatching one.
        aligned = [self._align(logits, lbl) for logits, lbl in zip(logits_list, labels)]
        aligned = [(logits, lbl) for logits, lbl in aligned if logits.numel() > 0]

        loss = None
        if calc_loss == 1:
            # chunk weighted: average the per-chunk mean losses
            if aligned:
                loss = torch.stack([loss_fct(logits, lbl) for logits, lbl in aligned]).mean()
        else:
            # cell weighted: every cell contributes equally
            if aligned:
                all_logits = torch.cat([logits for logits, _ in aligned])
                all_labels = torch.cat([lbl for _, lbl in aligned])
                loss = loss_fct(all_logits, all_labels)
            else:
                # No cell at all: BCE over an empty tensor is NaN and has no grad_fn.
                # Return a zero that is still connected to the parameters.
                loss = self.classifier.weight.sum() * 0.0

        return {"loss": loss, "logits": logits_list}

In [None]:
# Model loading and special token setup

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')


# Special marker tokens: <CELL_i> opens and <END_CELL_i> closes a cell, for i in 1..1023.
start_special_tokens = [f"<CELL_{i}>" for i in range(1, 1024)]
end_special_tokens = [f"<END_CELL_{i}>" for i in range(1, 1024)]
all_special_tokens = start_special_tokens + end_special_tokens

# Add tokens if not already in the vocabulary.
# get_vocab() builds the whole vocabulary dict, so it is read once instead of once per
# token, and the missing tokens are added in a single call (same order, hence same ids).
existing_vocab = tokenizer.get_vocab()
missing_tokens = [token for token in all_special_tokens if token not in existing_vocab]
if missing_tokens:
    tokenizer.add_tokens(missing_tokens)

# Get token IDs (convert_tokens_to_ids maps a list of tokens to a list of ids)
start_token_ids = tokenizer.convert_tokens_to_ids(start_special_tokens)
end_token_ids = tokenizer.convert_tokens_to_ids(end_special_tokens)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Instantiate model and resize embeddings to account for new tokens.
model = CodeT5TokenClassifier('Salesforce/codet5-base').to(device)

model.encoder.resize_token_embeddings(len(tokenizer))

print(f"device: {device}")

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


device: cuda


In [None]:
# Load the tokenized data; make sure the path below is correct. The tokenizer, as well as the code that saves the tokenized content, is in the run model file.

# Built with os.path.join so the path resolves on every OS; on Windows this yields the
# same backslash-separated path as before.
load_path = os.path.join("dataset", "tokenized_content", "name_of_file.pt")

# NOTE(review): torch.load unpickles the file, which can execute arbitrary code. Only load
# files you produced yourself; if the file holds nothing but tensors and plain containers,
# consider passing weights_only=True.
tokenized_data = torch.load(load_path)

# The saved dict is expected to provide these six entries (a missing one raises KeyError).
train_ids = tokenized_data['train_ids']
test_ids = tokenized_data['test_ids']
train_masks = tokenized_data['train_masks']
test_masks = tokenized_data['test_masks']
train_labels = tokenized_data['train_labels']
test_labels = tokenized_data['test_labels']

print("Tokenized data loaded successfully.")

  tokenized_data = torch.load(load_path)


Tokenized data loaded successfully.


In [None]:
# Training loop setup

# Set checkpoint directory. It is created up front so the first torch.save at the end of
# an epoch cannot fail on a missing folder; os.path.join keeps the path portable.
checkpoint_dir = os.path.join("models", "checkpoints")
os.makedirs(checkpoint_dir, exist_ok=True)


print(f"Using device: {device}")

train_dataset = NotebookDataset(train_ids, train_masks, train_labels)
test_dataset = NotebookDataset(test_ids, test_masks, test_labels)

batch_size = 4   # notebooks per optimizer step
chunk_size = 4   # only the first `chunk_size` chunks of every notebook are used
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn)


optimizer = AdamW(model.parameters(), lr=1e-5)

model_tester = buggy_cell_vector_evalualtion_clean.VectorEval()

print("---------------------- BASE LINE ----------------------")
model.eval()  # a freshly built module is in train mode; evaluate the baseline in eval mode
model_tester.eval_vector_batched(test_loader, model, start_token_ids, end_token_ids, device, chunk_size=chunk_size)
model_tester.print_results()
model_tester.reset()

# Mixed precision is only switched on when running on CUDA.
use_amp = device.type == "cuda"
scaler = GradScaler(enabled=use_amp)
epochs = 10
loss_fct = nn.BCEWithLogitsLoss()  # not used in this cell; kept in case other cells rely on it


def _notebook_loss(nb_ids, nb_masks, nb_labels):
    """Run the model on the first `chunk_size` chunks of one notebook and return its cell weighted loss."""
    nb_ids = nb_ids[:chunk_size].to(device)
    nb_masks = nb_masks[:chunk_size].to(device)
    nb_labels = [lbl.to(device) for lbl in nb_labels[:chunk_size]]

    # Only the forward pass (including the loss) runs under autocast.
    with autocast(device_type=device.type, dtype=torch.float16, enabled=use_amp):
        outputs = model(
            input_ids=nb_ids,
            attention_mask=nb_masks,
            start_token_ids=start_token_ids,
            end_token_ids=end_token_ids,
            labels=nb_labels,
            calc_loss=2
        )
    return outputs["loss"]


print("Starting training")

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    print(f"Epoch {epoch + 1}/{epochs}")
    train_bar = tqdm(train_loader, desc="Training", leave=False)
    train_start = time.time()

    for batch in train_bar: # looping through the batches of notebooks

        input_ids_batch = batch['input_ids']
        attention_mask_batch = batch['attention_mask']
        labels_batch = batch['labels']
        # The last batch may hold fewer notebooks than batch_size; average over the real count.
        n_notebooks = max(len(input_ids_batch), 1)

        optimizer.zero_grad()
        train_loss = 0.0 # initialize the loss for this batch
        accumulated = False  # True once at least one backward pass ran for this batch

        # going through each notebook in the batch (gradients accumulate across notebooks)
        for nb_ids, nb_masks, nb_labels in zip(input_ids_batch, attention_mask_batch, labels_batch):
            loss = _notebook_loss(nb_ids, nb_masks, nb_labels)

            # A notebook without any detected cell yields no differentiable loss; skip it
            # instead of crashing in backward().
            if loss is not None and loss.requires_grad:
                loss = loss / n_notebooks
                # backward runs outside the autocast context
                scaler.scale(loss).backward()
                train_loss += loss.item()
                accumulated = True

            del loss
            torch.cuda.empty_cache()
            gc.collect()

        # scaler.step expects gradients to exist, so only step after a backward pass.
        if accumulated:
            scaler.step(optimizer)
            scaler.update()

        total_loss += train_loss
        train_bar.set_postfix(loss=train_loss)

    epoch_time = time.time() - train_start
    print(f"Epoch {epoch + 1} finished. Total Loss: {total_loss:.4f}. Time: {epoch_time:.2f} sec")

    # validation step
    model.eval()
    total_eval_loss = 0.0
    test_bar = tqdm(test_loader, desc="Validating", leave=False)

    val_start = time.time()
    with torch.no_grad():
        for batch in test_bar: # looping through the batches of notebooks
            input_ids_batch = batch['input_ids']
            attention_mask_batch = batch['attention_mask']
            labels_batch = batch['labels']
            n_notebooks = max(len(input_ids_batch), 1)

            batch_loss = 0.0

            # go through each notebook in the batch
            for nb_ids, nb_masks, nb_labels in zip(input_ids_batch, attention_mask_batch, labels_batch):
                loss = _notebook_loss(nb_ids, nb_masks, nb_labels)

                # Skip notebooks without a usable loss so one NaN cannot poison the epoch total.
                if loss is not None and torch.isfinite(loss):
                    batch_loss += loss.item() / n_notebooks

                del loss
                torch.cuda.empty_cache()
                gc.collect()

            total_eval_loss += batch_loss
            test_bar.set_postfix(val_loss=batch_loss)

    avg_eval_loss = total_eval_loss / max(len(test_loader), 1)
    val_time = time.time() - val_start
    print(f"Validation Loss per batch: {avg_eval_loss:.4f}, Time: {val_time:.2f} seconds")

    # after validation in each epoch get the results:
    eval_start = time.time()
    model_tester.eval_vector_batched(test_loader, model, start_token_ids, end_token_ids, device, chunk_size=chunk_size)
    model_tester.print_results()
    model_tester.reset()
    eval_time = time.time() - eval_start
    print(f"Evaluation Time (F1/Recall/Precision): {eval_time:.2f} seconds")

    # save checkpoint at the end of each epoch to the checkpoint directory
    checkpoint_path = os.path.join(checkpoint_dir, f"model_epoch_{epoch + 1}.pt")
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scaler_state_dict': scaler.state_dict(),
    }, checkpoint_path)
    print(f"Checkpoint saved to {checkpoint_path}")


Using device: cuda
Starting training
Epoch 1/10


Training:   0%|          | 1/4113 [00:24<28:05:53, 24.60s/it, loss=0.619]