In [1]:
import pandas as pd

# Load the bug/fix pair dataset from the CSV that sits one directory above the notebook.
csv_path = "../code_bug_fix_pairs.csv"
df = pd.read_csv(csv_path)

# Report the (rows, columns) dimensions, then preview the first five rows.
print("Shape:", df.shape)
df.head()

Shape: (1000, 6)


Unnamed: 0,id,buggy_code,fixed_code,commit_message,commit_url,date
0,1,"x = [1, 2, 3]\nprint x\n# Sample ID: 1","x = [1, 2, 3]\nprint(x)\n# Sample ID: 1",Improved readability with proper indentation,https://github.com/open-source-repo/commit/a5a...,2024-12-16
1,2,"list = [1, 2, 3, 4]\nfor i in list\n print(...","lst = [1, 2, 3, 4]\nfor i in lst:\n print(i...",Corrected conditional operator mistake,https://github.com/open-source-repo/commit/f47...,2024-01-03
2,3,def factorial(n):\n if n == 1\n retu...,def factorial(n):\n if n == 1:\n ret...,Resolved off-by-one error in loop,https://github.com/open-source-repo/commit/e89...,2023-09-05
3,4,def foo()\n print('Missing colon in functio...,def foo():\n print('Fixed missing colon in ...,Added missing parentheses for print function,https://github.com/open-source-repo/commit/bd7...,2024-09-15
4,5,def factorial(n):\n if n == 1\n retu...,def factorial(n):\n if n == 1:\n ret...,Fixed bug in recursive function call,https://github.com/open-source-repo/commit/d66...,2024-01-24


In [50]:
from transformers import AutoTokenizer #tokenizer from pretrained model

# Load the pretrained tokenizer registered under the "Salesforce/codet5-base" name.
tok = AutoTokenizer.from_pretrained("Salesforce/codet5-base")

# Show which tokenizer class was resolved and the vocabulary size it reports.
tokenizer_class_name = type(tok).__name__
print(tokenizer_class_name, ",Vocabulary size:", tok.vocab_size)

RobertaTokenizerFast ,Vocabulary size: 32100


In [51]:
# Pull the two code columns out of the frame as lists of strings. Both lists come
# from the same rows, so index i in one list pairs with index i in the other.
texts_buggy, texts_clean = (
    df[column].astype(str).tolist()
    for column in ("buggy_code", "fixed_code")
)

In [52]:
# Tokenize every buggy snippet in one batch with `tok`, the tokenizer loaded earlier
# in this notebook. The name `tokenizer` is never assigned anywhere in the notebook,
# so calling it raises NameError after Restart Kernel -> Run All.
enc_buggy = tok(
    texts_buggy,
    padding=True,         # pad every row to the longest sequence in the batch
    truncation=True,      # cut anything longer than max_length
    max_length=512,       # chosen context-length cap
    return_tensors="pt",  # return tensors, not Python lists
)

# input_ids and attention_mask are expected to have the same
# (number of snippets, padded sequence length) shape.
print(enc_buggy["input_ids"].shape)
print(enc_buggy["attention_mask"].shape)

torch.Size([1000, 41])
torch.Size([1000, 41])


In [56]:
# Tokenize every fixed snippet with the same settings used for the buggy snippets.
# Uses `tok`, the tokenizer loaded earlier in this notebook; the name `tokenizer`
# is never assigned, so calling it raises NameError on a fresh kernel.
enc_clean = tok(
    texts_clean,
    padding=True,         # dynamic padding to the longest sequence in the batch
    truncation=True,      # enforce the max_length cap
    max_length=512,       # chosen context-length cap
    return_tensors="pt",  # return tensors, not Python lists
)

def compare_pair(i, max_tokens=80):
    """Print the tokens and the reconstructed text of pair ``i``, buggy then fixed.

    Reads the notebook-level ``enc_buggy`` and ``enc_clean`` encodings and the
    ``tok`` tokenizer. (The previous version called ``tokenizer``, a name that is
    never assigned in this notebook.)

    Args:
        i: Row index of the pair inside the encoded batches.
        max_tokens: Maximum number of leading tokens printed for each side.
            Defaults to 80, the limit that used to be hard-coded.

    Returns:
        None. Everything is written to standard output.
    """
    print("=" * 100)
    print(f"PAIR {i}")

    for label, enc in (("BUGGY", enc_buggy), ("FIXED", enc_clean)):
        ids = enc["input_ids"][i]
        mask = enc["attention_mask"][i]
        # Keep only the positions whose attention mask is 1, i.e. skip the
        # padding added when the batch was encoded with padding=True.
        tokens = tok.convert_ids_to_tokens(ids[mask == 1])
        shown = tokens[:max_tokens]

        print(f"\n--- {label} ---")
        print("TOKENS:")
        print(shown)
        print("\nRECONSTRUCTED:")
        print(tok.convert_tokens_to_string(shown))

# Inspect the first two buggy/fixed pairs.
for pair_index in (0, 1):
    compare_pair(pair_index)

PAIR 0

--- BUGGY ---
TOKENS:
['<s>', 'x', 'Ġ=', 'Ġ[', '1', ',', 'Ġ2', ',', 'Ġ3', ']', 'Ċ', 'print', 'Ġx', 'Ċ', '#', 'ĠSample', 'ĠID', ':', 'Ġ1', '</s>']

RECONSTRUCTED:
<s>x = [1, 2, 3]
print x
# Sample ID: 1</s>

--- FIXED ---
TOKENS:
['<s>', 'x', 'Ġ=', 'Ġ[', '1', ',', 'Ġ2', ',', 'Ġ3', ']', 'Ċ', 'print', '(', 'x', ')', 'Ċ', '#', 'ĠSample', 'ĠID', ':', 'Ġ1', '</s>']

RECONSTRUCTED:
<s>x = [1, 2, 3]
print(x)
# Sample ID: 1</s>
PAIR 1

--- BUGGY ---
TOKENS:
['<s>', 'list', 'Ġ=', 'Ġ[', '1', ',', 'Ġ2', ',', 'Ġ3', ',', 'Ġ4', ']', 'Ċ', 'for', 'Ġi', 'Ġin', 'Ġlist', 'Ċ', 'ĠĠĠ', 'Ġprint', '(', 'i', ')', 'Ċ', '#', 'ĠSample', 'ĠID', ':', 'Ġ2', '</s>']

RECONSTRUCTED:
<s>list = [1, 2, 3, 4]
for i in list
    print(i)
# Sample ID: 2</s>

--- FIXED ---
TOKENS:
['<s>', 'lst', 'Ġ=', 'Ġ[', '1', ',', 'Ġ2', ',', 'Ġ3', ',', 'Ġ4', ']', 'Ċ', 'for', 'Ġi', 'Ġin', 'Ġlst', ':', 'Ċ', 'ĠĠĠ', 'Ġprint', '(', 'i', ')', 'Ċ', '#', 'ĠSample', 'ĠID', ':', 'Ġ2', '</s>']

RECONSTRUCTED:
<s>lst = [1, 2, 3, 4]
for i in lst

In [55]:
# Measure the untruncated token count of every buggy snippet, to see how close the
# data comes to the max_length cap used when encoding.
# Uses `tok`, the tokenizer loaded earlier in this notebook; the name `tokenizer` is
# never assigned, so calling it raises NameError on a fresh kernel.
# All snippets go through the tokenizer in a single batched call; without padding,
# each returned id list keeps its own length.
lengths = [
    len(ids)
    for ids in tok(texts_buggy, truncation=False)["input_ids"]
]

print("Max tokens:", max(lengths))


Max tokens: 41
