In [None]:
import os

import torch
# Load the pre-tokenized data used for evaluation.
#
# The tokenizer, and the code that saves tokenized content, live in the run model file.
# Two file layouts are supported by this cell:
#   * train/test split files, which carry both 'train_*' and 'test_*' entries;
#   * unsplit files (used for testing on the code parrot jupyter errors dataset and
#     the jupyter errors dataset), which presumably carry only the 'test_*' entries.
# For unsplit files the train_* variables below are set to None, so no lines need
# to be commented or uncommented when switching between the two.

# To load tokenized data, ensure the path is correct. The path is built from its
# components so it resolves on every operating system, not only on Windows.
load_path = os.path.join("dataset", "tokenized_content", "file_name.pt")

# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
tokenized_data = torch.load(load_path)

# Evaluation split: required by the cells that follow.
test_ids = tokenized_data['test_ids']
test_masks = tokenized_data['test_masks']
test_labels = tokenized_data['test_labels']

# Training split: only present in train/test split files; None otherwise.
train_ids = tokenized_data.get('train_ids')
train_masks = tokenized_data.get('train_masks')
train_labels = tokenized_data.get('train_labels')

print("Tokenized data loaded successfully.")

  tokenized_data = torch.load(load_path)


Tokenized data loaded successfully.


The following cell configures Flake8 for file-level bug detection in Jupyter notebooks. It decodes the tokenized content of each notebook into a single Python file, removing special tokens, and then runs Flake8 on that file to predict whether the notebook contains a bug. The error codes selected in our Flake8 configuration were chosen to reduce false positives in bug detection by excluding checks such as stylistic recommendations.

In [None]:
import tempfile
import subprocess
import os
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
from transformers import RobertaTokenizer


# Tokenizer used to decode token ids back into source text.
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')

# Cell-boundary marker tokens (<CELL_i> / <END_CELL_i>) found in the tokenized content.
start_special_tokens = [f"<CELL_{i}>" for i in range(1, 1024)]
end_special_tokens = [f"<END_CELL_{i}>" for i in range(1, 1024)]
all_special_tokens = start_special_tokens + end_special_tokens

# Add the markers that are not already in the vocabulary. The vocabulary is
# looked up once instead of calling get_vocab() for every marker, and the missing
# markers are added in one call, in their original order, so they are assigned
# ids in the same sequence as adding them one by one.
existing_vocab = tokenizer.get_vocab()
missing_tokens = [token for token in all_special_tokens if token not in existing_vocab]
if missing_tokens:
    tokenizer.add_tokens(missing_tokens)

correctnessOfPredictions = []  # holds true or false for each prediction, true if the prediction is correct, false otherwise.


#### starting decoding notebooks
# Produces two parallel lists with one entry per notebook:
#   flat_codes  - the decoded source text with marker tokens stripped
#   flat_labels - 1 if any of the notebook's first four label chunks contains a 1, else 0
flat_codes, flat_labels = [], []

builtin_special_tokens = tokenizer.all_special_tokens
start_marker_re = re.compile(r"<CELL_\d+>")
end_marker_re = re.compile(r"<END_CELL_\d+>")

notebook_stream = tqdm(
    zip(test_ids, test_masks, test_labels),
    total=len(test_ids),
    desc="Decoding & cleaning notebooks",
    dynamic_ncols=True,
)
for notebook_ids, _notebook_masks, notebook_labels in notebook_stream:
    # use same chunk size as JupOtter-base and JupOtter-small
    leading_chunks = notebook_ids[:4]

    # A notebook counts as buggy as soon as one of its leading label chunks holds a 1.
    notebook_label = 0
    for chunk_labels in notebook_labels[:4]:
        if (chunk_labels == 1).any().item():
            notebook_label = 1
            break

    token_id_list = leading_chunks.reshape(-1).tolist()
    source_text = tokenizer.decode(token_id_list, skip_special_tokens=True)

    # Belt and braces: strip any special token text that survived decoding,
    # even though skip_special_tokens=True was requested above.
    for special in builtin_special_tokens:
        source_text = source_text.replace(special, "")

    # Remove the cell-boundary markers, start markers first and end markers second.
    source_text = start_marker_re.sub("", source_text)
    source_text = end_marker_re.sub("", source_text)

    flat_codes.append(source_text)
    flat_labels.append(notebook_label)
#### end of decoding notebooks

# Run Flake8 on every decoded notebook and record its file-level prediction.

results = []  # holds the results of the predictions, each element is a tuple for its notebook (is_buggy, label)

# Upper bound for a single Flake8 run. Without a timeout subprocess.run never
# raises TimeoutExpired, so the skip path below could not be reached.
FLAKE8_TIMEOUT_SECONDS = 60

# Error codes chosen to flag likely bugs rather than stylistic issues.
FLAKE8_SELECT = "--select=E9,F402,F405,F406,F407,F501,F502,F503,F505,F506,F507,F508,F509,F521,F524,F525,F621,F622,F633,F701,F702,F704,F706,F707,F821,F822,F823,F831,F901"

tq = tqdm(
    zip(flat_codes, flat_labels),
    total=len(flat_codes),
    desc="Static analysis Eval",
    dynamic_ncols=True,
    leave=True,
)
buggy_pred = 0
non_buggy_pred = 0
skipped = 0

# Running confusion counts so the live progress metrics cost O(1) per notebook
# instead of recomputing every score over all results so far.
true_pos = 0
false_pos = 0
false_neg = 0
num_correct = 0

for code, label in tq:
    # delete=False so the file can be closed before Flake8 opens it; it is removed
    # explicitly in the finally clause below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", encoding="utf-8", delete=False) as tmp_file:
        tmp_file.write(code)
        tmp_filename = tmp_file.name

    try:
        result = subprocess.run(
            ["flake8", FLAKE8_SELECT, tmp_filename],
            capture_output=True,
            text=True,
            encoding='utf-8',
            timeout=FLAKE8_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired:
        skipped += 1
        continue  # skip this file and move on
    finally:
        # Always clean up the temporary file, including on timeout or any other error.
        os.remove(tmp_filename)

    # Any non-zero exit status from Flake8 is treated as a "buggy" prediction.
    is_buggy = 0 if result.returncode == 0 else 1
    if is_buggy:
        buggy_pred += 1
    else:
        non_buggy_pred += 1

    prediction_correct = is_buggy == label
    correctnessOfPredictions.append(prediction_correct)
    results.append((is_buggy, label))

    # Live metrics, only used to display progress in the tqdm bar
    if prediction_correct:
        num_correct += 1
    if is_buggy and label:
        true_pos += 1
    elif is_buggy:
        false_pos += 1
    elif label:
        false_neg += 1

    f1_denominator = 2 * true_pos + false_pos + false_neg
    recall_denominator = true_pos + false_neg
    f1 = (2 * true_pos / f1_denominator) if f1_denominator else 0.0
    live_recall = (true_pos / recall_denominator) if recall_denominator else 0.0
    acc = num_correct / len(results)
    tq.set_postfix({'F1': f"{f1:.3f}", 'Acc': f"{acc:.3f}", 'Recall': f"{live_recall:.3f}"})
    tq.refresh()


# Evaluate: final file-level metrics over every notebook Flake8 finished analysing.
correct = sum(pred == true for pred, true in results)
total = len(results)

# Split predictions and labels
predictions = [pred for pred, _ in results]
labels = [true for _, true in results]

# Compute metrics
if total:
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 matches the live metrics and avoids warnings when a class
    # is never predicted or never present.
    precision = precision_score(labels, predictions, zero_division=0)
    recall = recall_score(labels, predictions, zero_division=0)
    f1 = f1_score(labels, predictions, zero_division=0)
else:
    # Nothing was evaluated (empty input or every file skipped): report zeros
    # instead of dividing by zero.
    accuracy = precision = recall = f1 = 0.0

print("\nFile-level Bug Detection via Flake8:")
print(f"Accuracy: {accuracy:.4f} ({correct}/{total})")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")
print(f"Skipped {skipped} files due to timeout.")
print(f"Buggy predictions: {buggy_pred}, Non-buggy predictions: {non_buggy_pred}")
