### A. CodeT5 model
Note that a newer model family, CodeT5+, is also available.

In [1]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration

# Single source of truth for the Hugging Face checkpoint, so the model and the
# tokenizer can never be loaded from two different checkpoints by accident.
# Swap in 'Salesforce/codet5-base' for a smaller alternative.
MODEL_CHECKPOINT = 'Salesforce/codet5-large'

model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)
tokenizer = RobertaTokenizer.from_pretrained(MODEL_CHECKPOINT)



#### 1. Code completion

In [2]:
# Deliberately unfinished function: the return expression is cut off after '+'.
code_snippet = '''
def add(a, b):
    return a +
'''

# Build the prompt by prepending the task prefix, then encode it as PyTorch tensors.
# NOTE(review): the pretrained CodeT5 checkpoints are presumably span-denoising
# models prompted with <extra_id_N> sentinels rather than a "complete: " prefix;
# the recorded output of this cell looks degenerate. Confirm against the model card.
prompt = "complete: " + code_snippet
inputs = tokenizer(prompt, return_tensors="pt")

# Generate at most 50 tokens and decode only the first returned sequence.
outputs = model.generate(max_length=50, **inputs)
first_sequence = outputs[0]
completed_code = tokenizer.decode(first_sequence, skip_special_tokens=True)

print("Completed Code:\n", completed_code.strip())

Completed Code:
 + b















b):
  return a +

complete:


#### 2. Code summarization

In [3]:
# Small recursive function used as the summarization input.
code_snippet = '''
def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n - 1)
'''

# Build the prompt by prepending the task prefix, then encode it as PyTorch tensors.
# NOTE(review): a "summarize: " prefix is presumably only meaningful for a
# checkpoint fine-tuned on summarization; the recorded output of this cell
# looks degenerate. Confirm against the model card.
prompt = "summarize: " + code_snippet
inputs = tokenizer(prompt, return_tensors="pt")

# Generate at most 50 tokens and decode only the first returned sequence.
outputs = model.generate(max_length=50, **inputs)
first_sequence = outputs[0]
completed_code = tokenizer.decode(first_sequence, skip_special_tokens=True)

print("Code summary:\n", completed_code)

Code summary:
 






    return n * factorial(n - 1)


summarize: 






   factorial


#### 3. Bug fixing

In [4]:
# Input with a deliberate defect: the radius is multiplied three times
# instead of two in the returned expression.
buggy_code_snippet = """
def calculate_area(radius):
    pi = 3.14159
    return pi * radius * radius * radius
"""

# Prepend the "fix" task prefix and encode the prompt as PyTorch tensors.
# NOTE(review): a "fix: " prefix is presumably only meaningful for a checkpoint
# fine-tuned on program repair; the recorded output of this cell looks
# degenerate. Confirm against the model card.
prompt = "fix: " + buggy_code_snippet
inputs = tokenizer(prompt, return_tensors="pt")

# Generate at most 50 tokens and decode only the first returned sequence.
outputs = model.generate(max_length=50, **inputs)
first_sequence = outputs[0]
fixed_code = tokenizer.decode(first_sequence, skip_special_tokens=True)

print("Fixed Code:\n", fixed_code)

Fixed Code:
  






 calculate_area(radius)













calculate_area


### B. Tufano dataset
The Tufano et al. bug-fix dataset is included in CodeXGLUE (code refinement task); the small subset is used here.

In [5]:
# Directory containing the Tufano "small" split files, given relative to the
# notebook's working directory.
data_folder = "./data/Tufano_Small"

In [6]:
import os
from utils import find_buggy_token_positions

# The two files are line-aligned: line i of the buggy file pairs with line i
# of the fixed file.
file_buggy_path = os.path.join(data_folder, 'train.buggy-fixed.buggy')
file_fixed_path = os.path.join(data_folder, 'train.buggy-fixed.fixed')

# Explicit encoding so decoding does not depend on the platform's default
# locale. NOTE(review): assumes the dataset files are UTF-8 — confirm.
with open(file_buggy_path, 'r', encoding='utf-8') as f_buggy, \
        open(file_fixed_path, 'r', encoding='utf-8') as f_fixed:
    buggy_lines = f_buggy.readlines()
    fixed_lines = f_fixed.readlines()

assert len(buggy_lines) == len(fixed_lines), "Buggy and fixed files should have the same number of lines."

# One record per (buggy, fixed) pair for which a buggy span was located.
labeled_data = []

for buggy_code, fixed_code in zip(buggy_lines, fixed_lines):
    buggy_code = buggy_code.strip()
    fixed_code = fixed_code.strip()

    start_idx, end_idx = find_buggy_token_positions(buggy_code, fixed_code)

    # Pairs where the helper returns None for either bound are skipped.
    if start_idx is not None and end_idx is not None:
        # Split on whitespace once and reuse the result for all three fields.
        tokens = buggy_code.split()
        labeled_data.append({
            "buggy_code": buggy_code,
            "fixed_code": fixed_code,
            "start_pos": start_idx,
            "end_pos": end_idx,
            "start_token": tokens[start_idx],
            "end_token": tokens[end_idx],
            # end_idx is used as an inclusive bound, hence the +1 in the slice.
            "buggy_tokens": ' '.join(tokens[start_idx:end_idx + 1])
        })

In [7]:
import random 

# Choose up to 10 distinct, valid indices into labeled_data.
# random.sample draws without replacement from range(len(labeled_data)), so it
# cannot repeat an index and cannot return len(labeled_data) itself, which
# random.randint(0, len(labeled_data)) could because its upper bound is
# inclusive. min() keeps this safe when fewer than 10 records exist.
num_samples = min(10, len(labeled_data))
rand_list = sorted(random.sample(range(len(labeled_data)), k=num_samples))

# Index the chosen records directly instead of scanning the whole dataset;
# sorting above keeps the output in ascending dataset order.
for i in rand_list:
    entry = labeled_data[i]
    print(f"Sample {i+1}:")
    print("Buggy Code:", entry["buggy_code"])
    print("Fixed Code:", entry["fixed_code"])
    print("Buggy Token Start Position:", entry["start_pos"])
    print("Buggy Token End Position:", entry["end_pos"])
    print("Buggy Start Token:", entry["start_token"])
    print("Buggy End Token:", entry["end_token"])
    print("Buggy Tokens:", entry["buggy_tokens"])

Sample 114:
Buggy Code: public void METHOD_1 ( final android.content.Intent intent , int VAR_1 ) { VAR_2 . METHOD_2 ( new TYPE_1 ( ) { public void METHOD_3 ( ) { METHOD_4 ( intent , true ) ; } } , VAR_1 ) ; }
Fixed Code: public void METHOD_3 ( ) { METHOD_4 ( intent , true ) ; }
Buggy Token Start Position: 2
Buggy Token End Position: 37
Buggy Start Token: METHOD_1
Buggy End Token: VAR_1
Buggy Tokens: METHOD_1 ( final android.content.Intent intent , int VAR_1 ) { VAR_2 . METHOD_2 ( new TYPE_1 ( ) { public void METHOD_3 ( ) { METHOD_4 ( intent , true ) ; } } , VAR_1
Sample 2040:
Buggy Code: public void METHOD_1 ( TYPE_1 VAR_1 ) { if ( VAR_2 . getText ( ) . equals ( STRING_1 ) ) VAR_2 . setText ( STRING_2 ) ; }
Fixed Code: public void METHOD_1 ( TYPE_1 VAR_1 ) { java.lang.String text = VAR_2 . getText ( ) ; if ( ( text != null ) && ( text . equals ( STRING_1 ) ) ) VAR_2 . setText ( STRING_2 ) ; }
Buggy Token Start Position: 8
Buggy Token End Position: 18
Buggy Start Token: if
Buggy End Tok

In [8]:
import pandas as pd

# Destination of the exported table, relative to the working directory.
csv_file_path = "train_tufano_small.csv"

# One row per labeled record; the positional index is not written to the file.
df = pd.DataFrame(data=labeled_data)
df.to_csv(path_or_buf=csv_file_path, index=False)

print(f"Data saved to {csv_file_path}")

Data saved to train_tufano_small.csv
