### Model Loading and Token Classification Setup

This section loads and prepares JupOtter based on **CodeT5** for **cell-level bug detection** in Jupyter notebooks.

- **Model**: A custom PyTorch model class 'CodeT5TokenClassifier' is used to set up JupOtter. This class uses a CodeT5 encoder and a linear classification layer to predict bugs. The CodeT5 model used can be configured through changing 'default_encoder_path' at the top of the file.
- **Input/Output**: For each input sample (chunk of notebook), the model extracts hidden states corresponding to special tokens `<CELL_i>` and `<END_CELL_i>`. It averages the token embeddings between these boundaries and applies a classifier to predict bugs.
- **Loss Modes**:
  - This setup contains 3 loss modes, chosen through the 'calc_loss' parameter. `calc_loss = 0`: No loss is calculated, only logits are returned. `calc_loss = 1`: Chunk-weighted binary cross-entropy loss. `calc_loss = 2`: Cell-weighted binary cross-entropy loss.
- **Special Tokens**: `2046` start/end tokens (`<CELL_1>`, ..., `<END_CELL_1023>`) are added to the tokenizer to delimit code cells.
- **Model Loading**: The model is loaded from a saved checkpoint configured through the 'saved_model_path' parameter. Tokenizer embeddings are resized to account for the new special tokens.


In [None]:
import torch
import torch.nn as nn
from transformers import T5EncoderModel, RobertaTokenizer

# --- Notebook configuration ---------------------------------------------------
# Checkpoint of the fine-tuned model that is restored further down.
saved_model_path = "models\\JupOtter-base_epoch_9.pt"

# Hugging Face identifiers of the encoder backbone and of the tokenizer.
default_encoder_path = "Salesforce/codet5-base"
tokenizer_path = "Salesforce/codet5-base"

# Exclusive upper bound for the cell index i used when generating the
# <CELL_i> / <END_CELL_i> marker tokens (indices run from 1 to num_max_cells - 1).
num_max_cells = 1024

class CodeT5TokenClassifier(nn.Module):
    """Cell-level classifier on top of a T5 encoder.

    For every sample, the hidden states between each start-marker token and
    the matching end-marker token (both inclusive) are averaged and passed
    through a linear layer, producing one logit vector per cell.
    """

    def __init__(self, model_name, num_labels=1):
        """
        model_name: e.g., 'Salesforce/codet5-base'
        num_labels: Number of labels (logits) per cell
        """
        super().__init__()
        self.encoder = T5EncoderModel.from_pretrained(model_name)  # base encoder model
        hidden_size = self.encoder.config.d_model
        # linear layer that maps an averaged cell representation to num_labels logits
        self.classifier = nn.Linear(hidden_size, num_labels)

    @staticmethod
    def _marker_mask(input_ids, token_ids):
        """Return a boolean mask, shaped like input_ids, that is True where the id is in token_ids."""
        marker_ids = torch.as_tensor(token_ids, dtype=input_ids.dtype, device=input_ids.device)
        # One vectorised membership test instead of one comparison per marker id.
        return torch.isin(input_ids, marker_ids)

    @staticmethod
    def _align_for_loss(logits, lbl):
        """Return (logits, float labels) for one sample, cut to their common length.

        The trailing singleton logit dimension is squeezed away and the labels
        are moved to the logits' device so both can be fed to the loss directly.
        """
        logits = logits.squeeze(-1)
        lbl = torch.as_tensor(lbl, device=logits.device)
        common = min(len(logits), len(lbl))
        if len(logits) != len(lbl):
            print(f"Trimmed logits ({len(logits)}) and labels ({len(lbl)}) to common length: {common}")
        return logits[:common], lbl[:common].float()

    def forward(self, input_ids, attention_mask, start_token_ids, end_token_ids, labels=None, calc_loss=1):
        """Predict one logit vector per cell and optionally compute a BCE loss.

        input_ids / attention_mask: batched encoder inputs.
        start_token_ids / end_token_ids: ids of the cell start / end marker tokens.
        labels: optional sequence with one label tensor per sample.
        calc_loss: 0 -> logits only; 1 -> mean of the per-sample BCE losses
            (chunk weighted); 2 -> one BCE over all cells of the batch
            (cell weighted).

        Returns a dict with "logits" (list with one (num_cells, num_labels)
        tensor per sample) and, when labels are given and calc_loss is 1 or 2,
        "loss" (a scalar tensor, or None if the batch contains no labelled cell).

        Raises ValueError for any other calc_loss value when labels are given.
        """
        hidden_states = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        # Positions of the start and end marker tokens in every sample.
        start_mask = self._marker_mask(input_ids, start_token_ids)
        end_mask = self._marker_mask(input_ids, end_token_ids)

        # will hold the prediction tensor of each sample (chunk)
        logits_list = []
        for hs, sample_starts, sample_ends in zip(hidden_states, start_mask, end_mask):
            start_positions = sample_starts.nonzero(as_tuple=True)[0]
            end_positions = sample_ends.nonzero(as_tuple=True)[0]

            # The k-th start marker is paired with the k-th end marker; unpaired
            # markers at the end of either list are ignored by zip.
            cell_logits = [
                # mean over the tokens of the cell (markers included), then classify
                self.classifier(hs[start_pos:end_pos + 1].mean(dim=0))
                for start_pos, end_pos in zip(start_positions, end_positions)
            ]

            if cell_logits:
                logits_list.append(torch.stack(cell_logits))
            else:
                # no cell pairs found: keep a (0, num_labels) placeholder so indices stay aligned
                logits_list.append(
                    torch.empty(0, self.classifier.out_features, device=hs.device, dtype=hs.dtype)
                )

        if labels is None or calc_loss == 0:  # no labels provided or no loss requested
            return {"logits": logits_list}
        if calc_loss not in (1, 2):
            raise ValueError(f"calc_loss must be 0, 1 or 2, got {calc_loss!r}")

        loss_fct = nn.BCEWithLogitsLoss()
        loss = None

        if calc_loss == 1:
            # chunk weighted: every sample with at least one cell contributes equally
            losses = []
            for logits, lbl in zip(logits_list, labels):
                logits, lbl = self._align_for_loss(logits, lbl)
                if logits.numel() > 0:  # only calculate loss if logits are not empty
                    losses.append(loss_fct(logits, lbl))
            if losses:
                loss = torch.stack(losses).mean()
        else:
            # cell weighted: every cell of the batch contributes equally
            if len(labels) == len(logits_list):
                # Align sample by sample BEFORE concatenating; trimming only the
                # tail of the concatenation would shift the labels of all later
                # samples onto the wrong cells.
                aligned = [self._align_for_loss(lg, lb) for lg, lb in zip(logits_list, labels)]
                flat_logits = torch.cat([pair[0] for pair in aligned])
                flat_labels = torch.cat([pair[1] for pair in aligned])
            else:
                # Labels are not grouped per sample: fall back to flat alignment.
                flat_logits, flat_labels = self._align_for_loss(
                    torch.cat(logits_list), torch.cat(list(labels))
                )
            if flat_logits.numel() > 0:  # BCE over an empty tensor would be NaN
                loss = loss_fct(flat_logits, flat_labels)

        return {"loss": loss, "logits": logits_list}
    
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)

# Marker tokens delimiting notebook cells: <CELL_i> / <END_CELL_i> for
# i = 1 .. num_max_cells - 1.
start_special_tokens = [f"<CELL_{i}>" for i in range(1, num_max_cells)]
end_special_tokens = [f"<END_CELL_{i}>" for i in range(1, num_max_cells)]
all_special_tokens = start_special_tokens + end_special_tokens

# Register all markers in a single call. add_tokens only adds tokens that are
# not in the vocabulary yet and assigns new ids in list order, so the resulting
# ids are the same as when adding the tokens one by one, without rebuilding the
# full vocabulary dict for every token.
tokenizer.add_tokens(all_special_tokens)

# Get token IDs (convert_tokens_to_ids maps a list of tokens to a list of ids)
start_token_ids = tokenizer.convert_tokens_to_ids(start_special_tokens)
end_token_ids = tokenizer.convert_tokens_to_ids(end_special_tokens)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Instantiate model and resize embeddings to account for new tokens.
model = CodeT5TokenClassifier(default_encoder_path).to(device)
model.encoder.resize_token_embeddings(len(tokenizer))

print(f"device: {device}")

# Load the model checkpoint. The file may either be a bare state dict or a
# dict that stores the state dict under 'model_state_dict'.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
try:
    checkpoint = torch.load(saved_model_path, map_location=device)
    model.load_state_dict(checkpoint.get('model_state_dict', checkpoint))
except Exception as e:
    print(f"Failed to load model from {saved_model_path}: {e}")
    # Do not continue: without the checkpoint the classifier head is randomly
    # initialised and every result computed afterwards would be meaningless.
    raise
else:
    print(f"Loaded model from {saved_model_path}")

device: cuda


  checkpoint = torch.load(saved_model_path, map_location=device)


Loaded model from models\JupOtter-base_epoch_9.pt


### Data Cleaning and Loading

This cell performs the following steps:

- **Data Cleaning**:  
  Uses the `cleanData_csv` module to process raw Jupyter notebooks stored in the directory `"test_notebooks"`.  
  The cleaned and parsed data is saved as a CSV file `"parsed_notebook_data.csv"` in the dataset directory.  

- **Data Loading**:  
  Opens the cleaned CSV file and reads the notebook code samples and their corresponding labels.  
  Code samples are stored in `codeSamples` and raw label strings in `raw_labels`.

- **Label Processing**:  
  After csv dataset is created, the string labels are converted from the CSV into Python lists of integers using `ast.literal_eval`.  
  The resulting list of integer labels is stored in `labels_ints` for later use in model training or evaluation. `labels_ints` are cell level
  labels, while codeSamples contains the corresponding notebook code cells.

'cleanData_csv.Data_manage()' can be used to retrieve different statistics about the dataset after the dataset has been labeled and stored in a CSV file. OtterDataset is already labeled and stored in a CSV file in the OtterDataset directory.


In [6]:
import cleanData_csv
import csv
import ast



# file_path_of_notebooks = "test_notebooks"
data_file_path = "dataset\\CodeParrot_Subset\\CodeParrot_dataset.csv" # path to where the cleaned data will be saved, must be a csv file
# data_file_path = "dataset\\OtterDataset\\OtterDataset.csv" 
# data_file_path = "dataset\\JupyterErrors_dataset\\JupyterError_dataset.csv"


clean = cleanData_csv.Data_Clean()
manage = cleanData_csv.Data_manage()

# clean.create_notebook_train_data(file_path_of_notebooks, data_file_path, 2) # uncomment this line to create dataset, csv file must be moved from the folder of notebooks
print(manage.ensureNoDuplicates(data_file_path)) # ensure no duplicates in the dataset

# print(f"Data cleaned and saved to {data_file_path}")


# codeSamples: notebook code (column 1); raw_labels: label string (last column);
# skipped_files: first column of rows that could not be used.
codeSamples, raw_labels, skipped_files = [], [], []

csv.field_size_limit(10000000)  # notebook code fields exceed the csv module's default field size limit
with open(data_file_path, 'r', newline='', encoding='utf-8') as csvfile:
    fileReader = csv.reader(csvfile)
    next(fileReader, None)  # skip the header row; the default avoids StopIteration on an empty file
    for row in fileReader:
        # A usable row needs at least the code column (index 1). Checking the
        # length up front also covers blank lines, which the reader yields as [].
        if len(row) < 2:
            print(f"Error processing row {row}: expected at least 2 columns (line {fileReader.line_num})")
            skipped_files.append(row[0] if row else "")
            continue
        codeSamples.append(row[1]) # rq2 filtered code samples based on column 4, ensuring the specific error types were present in the book
        raw_labels.append(row[-1])


# turn labels to a list of ints
labels_ints = [ast.literal_eval(label.strip()) for label in raw_labels]

True


### Tokenization and Chunking of Notebook Cells

This cell performs the following steps to prepare the data for model input:

- **Padding Functions**:  
  Defines helper functions `pad` and `pad_mask` to pad token sequences and attention masks to a fixed length (chunk token limit), ensuring consistent input sizes for the model.

- **Tokenization and Chunking**:  
  Iterates over each notebook’s code samples and their labels:  
  - Each notebook is split into individual cells using the regex pattern.  
  - Cells are tokenized individually using the pretrained tokenizer, with no padding or truncation beyond 2,500 tokens.  
  - Cells longer than 2,500 tokens are skipped to avoid excessively large inputs.  
  - Tokenized cells are grouped into chunks up to 2,500 tokens in length. If adding a new cell would exceed this limit, the current chunk is saved (with padding), and a new chunk is started.  
  - Corresponding attention masks and label tensors for the chunks are created and stored.

- **Data Structures for Tokenized Content**:  
  The tokenized and padded input IDs, attention masks, and labels for all notebooks are stored in:  
  - `tokenized_chunks_ids`  
  - `tokenized_chunks_attention_mask`  
  - `tokenized_chunk_labels`

This cell also contains optional (commented-out) code to create a train/test split. During our evaluation we used a random state of 42 to split OtterDataset.


In [7]:
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Matches one marked-up notebook cell: an opening <CELL_n> tag, the cell body
# (non-greedy; re.DOTALL lets it span newlines) and the next <END_CELL_m> tag.
# The pattern does not require the two indices n and m to be equal.
cell_pattern = re.compile(r"<CELL_\d+>(.*?)<END_CELL_\d+>", re.DOTALL)

def pad(seq, max_len=2500, pad_id=tokenizer.pad_token_id):
    """Right-pad the 1-D tensor `seq` with `pad_id` until it holds `max_len` entries."""
    missing = max_len - seq.size(0)
    filler = torch.full((missing,), pad_id, dtype=seq.dtype)
    return torch.cat([seq, filler])

def pad_mask(mask, max_len=2500):
    """Right-pad the 1-D attention mask `mask` with zeros until it holds `max_len` entries."""
    missing = max_len - mask.size(0)
    zeros = torch.zeros(missing, dtype=mask.dtype)
    return torch.cat([mask, zeros])

# Token budget shared by the per-cell length check, the chunk packing and the padding.
max_chunk_tokens = 2500


def _finalize_chunk(id_parts, mask_parts, chunk_labels, max_len):
    """Concatenate the per-cell tensors of one chunk and pad ids and mask to max_len."""
    return (
        pad(torch.cat(id_parts), max_len=max_len),
        pad_mask(torch.cat(mask_parts), max_len=max_len),
        torch.tensor(chunk_labels, dtype=torch.long),
    )


# to hold final tokenized data (one entry per notebook)
tokenized_chunks_ids = []
tokenized_chunks_attention_mask = []
tokenized_chunk_labels = []
skipped = 0  # number of cells dropped because they exceed the token budget on their own
for code, label in tqdm(zip(codeSamples, labels_ints), total=len(codeSamples)):
    cells = [match.group(0) for match in cell_pattern.finditer(code)]

    current_notebook_chunks = []  # finished chunks of the current notebook, each padded to max_chunk_tokens
    current_notebook_masks = []
    current_notebook_labels = []  # one label tensor per finished chunk

    current_chunk = []  # tokenized cells of the chunk currently being built
    current_mask_chunk = []
    current_labels = []
    current_length = 0

    if len(cells) != len(label):
        print(f"Label mismatch with cell num: {len(cells)} cells vs {len(label)} labels")

    # Cells and labels are paired positionally. zip stops at the shorter of the
    # two, so cells that have no label are dropped instead of raising IndexError.
    for cell, cell_label in zip(cells, label):
        # Allow one token more than the budget so that overly long cells can be
        # detected after truncation.
        encoding = tokenizer(cell, padding=False, truncation=True, max_length=max_chunk_tokens + 1, return_tensors="pt")
        next_cell = encoding["input_ids"].squeeze(0)
        next_mask = encoding["attention_mask"].squeeze(0)
        next_cell_len = next_cell.size(0)

        # skip overly long cells together with their labels to maintain consistency
        if next_cell_len > max_chunk_tokens:
            skipped += 1
            continue

        # if the cell does not fit into the current chunk, save that chunk and start a new one
        if current_length + next_cell_len > max_chunk_tokens:
            if current_chunk:
                chunk_ids, chunk_mask, chunk_lbls = _finalize_chunk(
                    current_chunk, current_mask_chunk, current_labels, max_chunk_tokens
                )
                current_notebook_chunks.append(chunk_ids)
                current_notebook_masks.append(chunk_mask)
                current_notebook_labels.append(chunk_lbls)

            current_chunk = [next_cell]  # the new chunk starts with the cell just tokenized
            current_mask_chunk = [next_mask]
            current_labels = [cell_label]
            current_length = next_cell_len
        else:
            current_chunk.append(next_cell)
            current_mask_chunk.append(next_mask)
            current_labels.append(cell_label)
            current_length += next_cell_len

    # save the final chunk of the current notebook if it exists
    if current_chunk:
        chunk_ids, chunk_mask, chunk_lbls = _finalize_chunk(
            current_chunk, current_mask_chunk, current_labels, max_chunk_tokens
        )
        current_notebook_chunks.append(chunk_ids)
        current_notebook_masks.append(chunk_mask)
        current_notebook_labels.append(chunk_lbls)

    if current_notebook_chunks:  # notebooks without any usable cell are left out entirely
        tokenized_chunks_ids.append(torch.stack(current_notebook_chunks))
        tokenized_chunks_attention_mask.append(torch.stack(current_notebook_masks))
        tokenized_chunk_labels.append(current_notebook_labels)



# uncomment for train test split
# train_ids, test_ids, train_masks, test_masks, train_labels, test_labels = train_test_split(
#     tokenized_chunks_ids,
#     tokenized_chunks_attention_mask,
#     tokenized_chunk_labels,
#     test_size=0.2,
#     random_state=42
# )

100%|██████████| 4892/4892 [00:24<00:00, 203.43it/s]


In [None]:
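# This cell loads previously saved tokenized data. As written it loads a train/test split. To load unsplit data instead, comment out the train/test assignments and uncomment the three `test_*` lines near the bottom of the cell (and adjust the final print, which reports `train_ids`).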

import torch

load_path = "dataset\\tokenized_content\\name_of_file.pt"

# Read the saved dictionary of tokenized data back from disk.
tokenized_data = torch.load(load_path)

# Unpack the train/test split, one (train, test) pair per kind of data.
train_ids, test_ids = tokenized_data['train_ids'], tokenized_data['test_ids']
train_masks, test_masks = tokenized_data['train_masks'], tokenized_data['test_masks']
train_labels, test_labels = tokenized_data['train_labels'], tokenized_data['test_labels']

# Uncomment the lines below to load unsplit data
# test_ids = tokenized_data['test_ids']
# test_masks = tokenized_data['test_masks']
# test_labels = tokenized_data['test_labels']

print("Tokenized data loaded successfully.")
print(f"Train IDs: {len(train_ids)}, Test IDs: {len(test_ids)}")



In [None]:
# This cell saves the tokenized data for later testing. As written it saves the train/test split, not the unsplit data. Unsplit data is used for testing on our
# CodeParrot Jupyter errors dataset and the Jupyter errors dataset. To save the unsplit data, uncomment the second `tokenized_data` dictionary below and comment out the first one.

# To load tokenized data, ensure the path is correct. Tokenizer as well as code to save tokenized content is in the run model file.

save_path = "dataset\\tokenized_content\\name_of_file.pt"

# Bundle the train/test split into a single dictionary so it can be written in one call.
tokenized_data = dict(
    train_ids=train_ids,
    test_ids=test_ids,
    train_masks=train_masks,
    test_masks=test_masks,
    train_labels=train_labels,
    test_labels=test_labels,
)

# to save unsplit data, uncomment the following lines and comment out the above lines
# tokenized_data = {
#     'test_ids': tokenized_chunks_ids,
#     'test_masks': tokenized_chunks_attention_mask,
#     'test_labels': tokenized_chunk_labels
# }
torch.save(tokenized_data, save_path)
print(f"Tokenized data saved to {save_path}")


### Model Evaluation

This cell prepares the tokenized data for batching and evaluates the trained model. We have included commented out code to perform cell-level, file-level, and single notebook analysis. Analysis is done using 'buggy_cell_vector_evalualtion_clean' which contains many functions to aid in evaluation.

In [8]:
import buggy_cell_vector_evalualtion_clean
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

class NotebookDataset(Dataset):
    """Dataset over three index-aligned sequences: input ids, attention masks and labels.

    Item i bundles the i-th entry of each sequence; in this notebook one entry
    holds all chunks of one notebook.
    """

    def __init__(self, all_ids, all_masks, all_labels):
        self.ids, self.masks, self.labels = all_ids, all_masks, all_labels

    def __len__(self):
        # the number of samples is taken from the ids sequence
        return len(self.ids)

    def __getitem__(self, i):
        # data of a single notebook, which may span several chunks
        return dict(
            input_ids=self.ids[i],
            attention_mask=self.masks[i],
            labels=self.labels[i],
        )

def custom_collate_fn(batch):
    """Regroup a batch of per-item dicts into one dict of lists.

    Nothing is stacked: every field ('input_ids', 'attention_mask', 'labels')
    becomes a plain Python list with one entry per batch item, so entries of
    different sizes can share a batch.
    """
    fields = ('input_ids', 'attention_mask', 'labels')
    return {field: [item[field] for item in batch] for field in fields}


# Create the dataset for testing from the whole tokenized dataset
# (one dataset item per notebook; each item holds all chunks of that notebook).
test_dataset = NotebookDataset(tokenized_chunks_ids, tokenized_chunks_attention_mask, tokenized_chunk_labels)


# batch_size=1 yields one notebook per batch; shuffle=False keeps the notebook order fixed.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=custom_collate_fn)


# NOTE(review): this cell never calls model.eval() or wraps inference in torch.no_grad();
# presumably VectorEval handles both internally - TODO confirm.
model_tester = buggy_cell_vector_evalualtion_clean.VectorEval()
print("---------------------- Evaluation Results ----------------------")
# the following is for evaluating the model on the test set at cell-level
model_tester.eval_vector_batched(test_loader, model, start_token_ids, end_token_ids, device, chunk_size=4)
model_tester.print_results()
model_tester.reset()

# the following is for evaluating the model on the test set at file-level
# model_tester.eval_vector_batched(test_loader, model, start_token_ids, end_token_ids, device, chunk_size=4, eval_type=2)
# model_tester.print_results_file_level()
# model_tester.reset()

# the following is for evaluating a single book at cell-level
# model_tester.eval_single_book(test_loader, model, start_token_ids, end_token_ids, device, chunk_size=4)
# model_tester.print_results()

---------------------- Evaluation Results ----------------------


Evaluating batches: 100%|██████████| 4770/4770 [29:27<00:00,  2.70batch/s]

Total 1: 4056, Total 0: 74352

===== Model Evaluation Metrics Totals =====
Total books: 4770, Total cells: 78408, Total buggy cells: 3816
True positives: 2981, True negatives: 73517, Total correct: 76498
False positives: 1075, False negatives: 835, Total incorrect: 1910

===== Cell-aggregated =====
Precision: 0.7350
Recall: 0.7812
F1 Score: 0.7574
Accuracy: 0.9756
Buggy Cell Ratio: 0.0487


===== File-aggregated =====
Precision Score: 0.8639
Recall Score: 0.9186
F1 Score: 0.8904
Accuracy Score: 0.9620



