### Tufano dataset
(Included in CodeXGLUE)
Downloaded from: https://huggingface.co/datasets/semeru/code-code-CodeRefinement-Java-Small/tree/main

TODO (other datasets still to be processed):
- https://huggingface.co/datasets/semeru/code-code-CodeRefinement-Java-Medium/tree/main
- https://huggingface.co/datasets/semeru/code-code-BugFixingSmall
- https://huggingface.co/datasets/semeru/code-code-BugFixingMed

In [10]:
import os
import random 
import pandas as pd
from utils import find_buggy_token_positions

In [11]:
# Directory holding the raw Tufano "small" files; the train/valid/test
# `*.buggy-fixed.buggy` / `*.buggy-fixed.fixed` pairs are read from here.
data_folder = './raw_data/Tufano_Small'

### Train data

In [3]:
# Build token-position labels for the training split.
# The buggy and fixed files are paired line by line; each pair is handed to
# the project helper `find_buggy_token_positions`, and pairs for which it
# reports a position are collected into `labeled_data`.
file_buggy_path = os.path.join(data_folder, 'train.buggy-fixed.buggy')
file_fixed_path = os.path.join(data_folder, 'train.buggy-fixed.fixed')

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(file_buggy_path, 'r', encoding='utf-8') as f_buggy, \
        open(file_fixed_path, 'r', encoding='utf-8') as f_fixed:
    buggy_lines = f_buggy.readlines()
    fixed_lines = f_fixed.readlines()

assert len(buggy_lines) == len(fixed_lines), "Buggy and fixed files should have the same number of lines."

labeled_data = []

for buggy_code, fixed_code in zip(buggy_lines, fixed_lines):
    buggy_code = buggy_code.strip()
    fixed_code = fixed_code.strip()

    start_idx, end_idx = find_buggy_token_positions(buggy_code, fixed_code)

    # Pairs for which the helper reports no position are skipped.
    if start_idx is None or end_idx is None:
        continue

    # Split on whitespace once and reuse the token list for all three lookups.
    tokens = buggy_code.split()

    # NOTE(review): executed output in this notebook shows the helper can
    # return start_idx > end_idx (e.g. 15 and 12); the slice below is then
    # empty and "buggy_tokens" is ''. Confirm whether such rows should be kept.
    labeled_data.append({
        "buggy_code": buggy_code,
        "fixed_code": fixed_code,
        "start_pos": start_idx,
        "end_pos": end_idx,
        "start_token": tokens[start_idx],
        "end_token": tokens[end_idx],
        "buggy_tokens": ' '.join(tokens[start_idx:end_idx + 1])
    })

In [4]:
# Print up to 10 randomly chosen labeled training examples for a visual check.
# random.sample over range(len(...)) yields distinct, in-range indices;
# random.randint(0, len(labeled_data)) is inclusive on both ends, so it could
# return an index one past the end (and duplicates), silently printing fewer
# than 10 samples.
sample_size = min(10, len(labeled_data))
rand_list = sorted(random.sample(range(len(labeled_data)), sample_size))

# Index directly instead of scanning the whole list; sorted() keeps the
# samples in ascending order.
for i in rand_list:
    entry = labeled_data[i]
    print(f"Sample {i+1}:")
    print("Buggy Code:", entry["buggy_code"])
    print("Fixed Code:", entry["fixed_code"])
    print("Buggy Token Start Position:", entry["start_pos"])
    print("Buggy Token End Position:", entry["end_pos"])
    print("Buggy Start Token:", entry["start_token"])
    print("Buggy End Token:", entry["end_token"])
    print("Buggy Tokens:", entry["buggy_tokens"])

Sample 8143:
Buggy Code: public void init ( ) { if ( ( VAR_1 ) == null ) { return ; } METHOD_1 ( ) ; METHOD_2 ( ) ; METHOD_3 ( ) ; VAR_2 = true ; }
Fixed Code: public boolean init ( ) { if ( ( VAR_1 ) == null ) { return false ; } METHOD_1 ( ) ; METHOD_2 ( ) ; if ( ! ( METHOD_3 ( ) ) ) { return false ; } VAR_2 = true ; return VAR_2 ; }
Buggy Token Start Position: 1
Buggy Token End Position: 32
Buggy Start Token: void
Buggy End Token: true
Buggy Tokens: void init ( ) { if ( ( VAR_1 ) == null ) { return ; } METHOD_1 ( ) ; METHOD_2 ( ) ; METHOD_3 ( ) ; VAR_2 = true
Sample 9018:
Buggy Code: public TYPE_1 < TYPE_2 > add ( TYPE_3 a , TYPE_1 < TYPE_2 > b ) { TYPE_4 c = new TYPE_4 ( ( ) - > new TYPE_5 ( a ) , b ) ; VAR_1 . append ( c ) ; return c ; }
Fixed Code: public TYPE_1 < TYPE_2 > add ( TYPE_3 a , TYPE_1 < TYPE_2 > b ) { TYPE_4 c = new TYPE_4 ( ( ) - > new TYPE_5 ( a ) , b ) ; return VAR_1 . append ( c ) ; }
Buggy Token Start Position: 36
Buggy Token End Position: 44
Buggy Start Token: VA

In [5]:
# Persist the labeled training examples as CSV (one row per example, no index column).
df = pd.DataFrame(labeled_data)
csv_file_path = "./data/train_tufano_small.csv"
# Create the output directory first: to_csv raises if the parent directory is missing.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
df.to_csv(csv_file_path, index=False)
print(f"Data saved to {csv_file_path}")

Data saved to train_tufano_small.csv


### Validation data

In [6]:
# Build token-position labels for the validation split.
# The buggy and fixed files are paired line by line; each pair is handed to
# the project helper `find_buggy_token_positions`, and pairs for which it
# reports a position are collected into `labeled_data`.
file_buggy_path = os.path.join(data_folder, 'valid.buggy-fixed.buggy')
file_fixed_path = os.path.join(data_folder, 'valid.buggy-fixed.fixed')

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(file_buggy_path, 'r', encoding='utf-8') as f_buggy, \
        open(file_fixed_path, 'r', encoding='utf-8') as f_fixed:
    buggy_lines = f_buggy.readlines()
    fixed_lines = f_fixed.readlines()

assert len(buggy_lines) == len(fixed_lines), "Buggy and fixed files should have the same number of lines."

labeled_data = []

for buggy_code, fixed_code in zip(buggy_lines, fixed_lines):
    buggy_code = buggy_code.strip()
    fixed_code = fixed_code.strip()

    start_idx, end_idx = find_buggy_token_positions(buggy_code, fixed_code)

    # Pairs for which the helper reports no position are skipped.
    if start_idx is None or end_idx is None:
        continue

    # Split on whitespace once and reuse the token list for all three lookups.
    tokens = buggy_code.split()

    # NOTE(review): executed output in this notebook shows the helper can
    # return start_idx > end_idx (e.g. 15 and 12); the slice below is then
    # empty and "buggy_tokens" is ''. Confirm whether such rows should be kept.
    labeled_data.append({
        "buggy_code": buggy_code,
        "fixed_code": fixed_code,
        "start_pos": start_idx,
        "end_pos": end_idx,
        "start_token": tokens[start_idx],
        "end_token": tokens[end_idx],
        "buggy_tokens": ' '.join(tokens[start_idx:end_idx + 1])
    })

In [7]:
# Print up to 10 randomly chosen labeled validation examples for a visual check.
# random.sample over range(len(...)) yields distinct, in-range indices;
# random.randint(0, len(labeled_data)) is inclusive on both ends, so it could
# return an index one past the end (and duplicates), silently printing fewer
# than 10 samples.
sample_size = min(10, len(labeled_data))
rand_list = sorted(random.sample(range(len(labeled_data)), sample_size))

# Index directly instead of scanning the whole list; sorted() keeps the
# samples in ascending order.
for i in rand_list:
    entry = labeled_data[i]
    print(f"Sample {i+1}:")
    print("Buggy Code:", entry["buggy_code"])
    print("Fixed Code:", entry["fixed_code"])
    print("Buggy Token Start Position:", entry["start_pos"])
    print("Buggy Token End Position:", entry["end_pos"])
    print("Buggy Start Token:", entry["start_token"])
    print("Buggy End Token:", entry["end_token"])
    print("Buggy Tokens:", entry["buggy_tokens"])

Sample 208:
Buggy Code: public static void METHOD_1 ( ) { VAR_1 = new TYPE_1 ( VAR_2 ) ; java.lang.String VAR_3 = VAR_1 . METHOD_2 ( VAR_4 . file ) ; java.lang.System.out.println ( VAR_3 ) ; TYPE_2 VAR_5 = TYPE_3 . METHOD_3 ( VAR_3 ) ; VAR_1 . METHOD_4 ( VAR_5 ) ; }
Fixed Code: public static void METHOD_1 ( ) { VAR_1 = new TYPE_1 ( ) ; java.lang.String VAR_3 = VAR_1 . METHOD_2 ( VAR_4 . file ) ; java.lang.System.out.println ( VAR_3 ) ; TYPE_2 VAR_5 = TYPE_3 . METHOD_3 ( VAR_3 ) ; VAR_1 . METHOD_4 ( VAR_5 ) ; }
Buggy Token Start Position: 12
Buggy Token End Position: 12
Buggy Start Token: VAR_2
Buggy End Token: VAR_2
Buggy Tokens: VAR_2
Sample 484:
Buggy Code: public static void i ( java.lang.String msg , TYPE_1 ... args ) { if ( TYPE_2 . METHOD_1 ( ) ) TYPE_3 . i ( msg , args ) ; }
Fixed Code: public static void i ( java.lang.String msg ) { if ( TYPE_2 . METHOD_1 ( ) ) TYPE_3 . i ( msg ) ; }
Buggy Token Start Position: 7
Buggy Token End Position: 27
Buggy Start Token: ,
Buggy End Token

In [8]:
# Persist the labeled validation examples as CSV (one row per example, no index column).
df = pd.DataFrame(labeled_data)
csv_file_path = "./data/valid_tufano_small.csv"
# Create the output directory first: to_csv raises if the parent directory is missing.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
df.to_csv(csv_file_path, index=False)
print(f"Data saved to {csv_file_path}")

Data saved to valid_tufano_small.csv


### Test data

In [12]:
# Build token-position labels for the test split.
# The buggy and fixed files are paired line by line; each pair is handed to
# the project helper `find_buggy_token_positions`, and pairs for which it
# reports a position are collected into `labeled_data`.
file_buggy_path = os.path.join(data_folder, 'test.buggy-fixed.buggy')
file_fixed_path = os.path.join(data_folder, 'test.buggy-fixed.fixed')

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(file_buggy_path, 'r', encoding='utf-8') as f_buggy, \
        open(file_fixed_path, 'r', encoding='utf-8') as f_fixed:
    buggy_lines = f_buggy.readlines()
    fixed_lines = f_fixed.readlines()

assert len(buggy_lines) == len(fixed_lines), "Buggy and fixed files should have the same number of lines."

labeled_data = []

for buggy_code, fixed_code in zip(buggy_lines, fixed_lines):
    buggy_code = buggy_code.strip()
    fixed_code = fixed_code.strip()

    start_idx, end_idx = find_buggy_token_positions(buggy_code, fixed_code)

    # Pairs for which the helper reports no position are skipped.
    if start_idx is None or end_idx is None:
        continue

    # Split on whitespace once and reuse the token list for all three lookups.
    tokens = buggy_code.split()

    # NOTE(review): executed output in this notebook shows the helper can
    # return start_idx > end_idx (e.g. 15 and 12); the slice below is then
    # empty and "buggy_tokens" is ''. Confirm whether such rows should be kept.
    labeled_data.append({
        "buggy_code": buggy_code,
        "fixed_code": fixed_code,
        "start_pos": start_idx,
        "end_pos": end_idx,
        "start_token": tokens[start_idx],
        "end_token": tokens[end_idx],
        "buggy_tokens": ' '.join(tokens[start_idx:end_idx + 1])
    })

In [14]:
# Print up to 10 randomly chosen labeled test examples for a visual check.
# random.sample over range(len(...)) yields distinct, in-range indices;
# random.randint(0, len(labeled_data)) is inclusive on both ends, so it could
# return an index one past the end (and duplicates), silently printing fewer
# than 10 samples.
sample_size = min(10, len(labeled_data))
rand_list = sorted(random.sample(range(len(labeled_data)), sample_size))

# Index directly instead of scanning the whole list; sorted() keeps the
# samples in ascending order.
for i in rand_list:
    entry = labeled_data[i]
    print(f"Sample {i+1}:")
    print("Buggy Code:", entry["buggy_code"])
    print("Fixed Code:", entry["fixed_code"])
    print("Buggy Token Start Position:", entry["start_pos"])
    print("Buggy Token End Position:", entry["end_pos"])
    print("Buggy Start Token:", entry["start_token"])
    print("Buggy End Token:", entry["end_token"])
    print("Buggy Tokens:", entry["buggy_tokens"])

Sample 450:
Buggy Code: public static void METHOD_1 ( final TYPE_1 VAR_1 , final java.lang.Class < ? > VAR_2 ) { TYPE_2 . METHOD_1 ( VAR_1 , VAR_2 . getName ( ) . replace ( CHAR_1 , CHAR_2 ) , VAR_3 ) ; }
Fixed Code: public static void METHOD_1 ( final TYPE_1 VAR_1 , final java.lang.Class < ? > VAR_2 ) { TYPE_2 . METHOD_1 ( VAR_1 , VAR_2 . getName ( ) . replace ( CHAR_1 , CHAR_2 ) ) ; }
Buggy Token Start Position: 35
Buggy Token End Position: 36
Buggy Start Token: ,
Buggy End Token: VAR_3
Buggy Tokens: , VAR_3
Sample 1549:
Buggy Code: public void METHOD_1 ( int VAR_1 ) throws TYPE_1 { METHOD_1 ( METHOD_2 ( ) , VAR_1 ) ; }
Fixed Code: public void METHOD_1 ( int VAR_1 ) throws TYPE_1 { METHOD_1 ( METHOD_2 ( ) . getName ( ) , VAR_1 ) ; }
Buggy Token Start Position: 15
Buggy Token End Position: 12
Buggy Start Token: ,
Buggy End Token: METHOD_2
Buggy Tokens: 
Sample 2650:
Buggy Code: public static TYPE_1 METHOD_1 ( java.util.ArrayList < TYPE_2 > VAR_1 ) { TYPE_3 args = new TYPE_3 ( ) ; args

In [15]:
# Persist the labeled test examples as CSV (one row per example, no index column).
df = pd.DataFrame(labeled_data)
csv_file_path = "./data/test_tufano_small.csv"
# Create the output directory first: to_csv raises if the parent directory is missing.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
df.to_csv(csv_file_path, index=False)
print(f"Data saved to {csv_file_path}")

Data saved to ./data/test_tufano_small.csv
