In [None]:
import pandas as pd
import json
import numpy as np
from transformers import RobertaTokenizer
from tqdm.notebook import tqdm


In [None]:
# Load the Python dataset, normalise each `sstub_pattern` value (spaces become
# underscores, then lower-cased) and keep only the three columns used by the
# later cells, as a plain array of rows: [line_before, line_after, sstub_pattern].
data_py = (
    pd.read_csv('./Data/pysstubs.csv')
    .assign(
        sstub_pattern=lambda frame: frame['sstub_pattern'].apply(
            lambda pattern: pattern.replace(' ', '_').lower()
        )
    )[['line_before', 'line_after', 'sstub_pattern']]
    .values
)


In [None]:
# Lookup table from upper-case bug-type identifiers to human-readable names.
# Insertion order matches the order in which the entries are written here.
dif_label = dict(
    OVERLOAD_METHOD_MORE_ARGS='Same Function More Args',
    DIFFERENT_METHOD_SAME_ARGS='Wrong Method/Function Name',
    CHANGE_IDENTIFIER='Change Identifier Used',
    CHANGE_NUMERAL='Change Numeric Literal',
    CHANGE_OPERAND='Change Operand',
    OVERLOAD_METHOD_DELETED_ARGS='Same Function Less Args',
    MORE_SPECIFIC_IF='More Specific If',
    CHANGE_UNARY_OPERATOR='Change Unary Operator',
    SWAP_BOOLEAN_LITERAL='Change Boolean Literal',
    CHANGE_CALLER_IN_FUNCTION_CALL='Same Function Wrong Caller',
    CHANGE_OPERATOR='Change Binary Operator',
    LESS_SPECIFIC_IF='Less Specific If',
    SWAP_ARGUMENTS='Same Function Swap Args',
)


def changeLabel(x):
    """Return the normalised label for the bug-type identifier ``x``.

    Identifiers present in ``dif_label`` are translated to their readable
    name, with spaces replaced by underscores and the result lower-cased.
    Any other identifier is returned lower-cased and otherwise unchanged.
    """
    readable = dif_label.get(x)
    if readable is None:
        return x.lower()
    return readable.replace(' ', '_').lower()
# Read the JSON dataset into a DataFrame; the file handle is only needed for
# the parse itself, so the remaining steps run after it has been closed.
with open('./Data/sstubsLarge.json') as json_file:
    data_j = pd.DataFrame(json.load(json_file))

# Normalise every `bugType` with changeLabel, then keep just the three columns
# used by the later cells as an array of rows: [before, after, bugType].
data_j['bugType'] = data_j['bugType'].apply(changeLabel)
data_j = data_j[['before', 'after', 'bugType']].values

In [None]:
# Tokenizer loaded from the "microsoft/graphcodebert-base" checkpoint; every
# later cell uses it both to split code lines into tokens and to map tokens to ids.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
# Fixed length of every encoded sequence, counting the cls/sep tokens. The
# encoding cells pad shorter sequences up to this length and skip any pair in
# which either sequence is longer.
max_len = 100

In [None]:
# Tokenize every row of the Python dataset.
#   X_py  - tokens of column 0 (`line_before`)
#   Y_py  - tokens of column 1 (`line_after`)
#   Y2_py - the row's label string (column 2)
# The three lists stay index-aligned: a row is appended to all of them or to none.
X_py = []
Y_py = []
Y2_py = []
n_skipped_py = 0  # rows dropped because tokenization raised
for i in tqdm(range(len(data_py))):
    try:
        t1 = tokenizer.tokenize(data_py[i][0])
        t2 = tokenizer.tokenize(data_py[i][1])
    except Exception:
        # Best effort: rows the tokenizer cannot handle are skipped. Catching
        # `Exception` rather than using a bare `except` lets KeyboardInterrupt
        # and SystemExit propagate, so interrupting the kernel stops the loop
        # instead of silently dropping a single row.
        n_skipped_py += 1
        continue
    X_py.append(t1)
    Y_py.append(t2)
    Y2_py.append(data_py[i][2])

# Report how many rows were dropped so data loss is visible to the reader.
print(f'Skipped {n_skipped_py} of {len(data_py)} Python rows that could not be tokenized')

In [None]:
# Tokenize every row of the Java dataset. X_j holds the tokens of column 0
# (`before`), Y_j the tokens of column 1 (`after`), and Y2_j the label string
# from column 2; the three lists are index-aligned.
X_j, Y_j, Y2_j = [], [], []
for row_idx in tqdm(range(len(data_j))):
    before_line, after_line, bug_type = data_j[row_idx]
    before_tokens = tokenizer.tokenize(before_line)
    after_tokens = tokenizer.tokenize(after_line)
    X_j.append(before_tokens)
    Y_j.append(after_tokens)
    Y2_j.append(bug_type)


In [None]:
# Map every normalised label string to an integer id. The thirteen known
# labels get ids 1..13 in the order listed; their keys are normalised the same
# way as the dataset labels (spaces -> underscores, lower-cased).
label_tok = {
    name.replace(' ', '_').lower(): idx
    for idx, name in enumerate(
        [
            'Same Function More Args', 'Wrong Method/Function Name', 'Change Identifier Used',
            'Change Numeric Literal', 'Change Operand', 'Same Function Less Args', 'More Specific If',
            'Change Unary Operator', 'Change Boolean Literal', 'Same Function Wrong Caller',
            'Change Binary Operator', 'Less Specific If', 'Same Function Swap Args',
        ],
        start=1,
    )
}

# Any label seen in the data but not listed above gets the next free id,
# scanning the Java labels first and the Python labels second.
for observed_labels in (Y2_j, Y2_py):
    for observed in observed_labels:
        label_tok.setdefault(observed, len(label_tok) + 1)
# label_tok

In [None]:
# Encode the Python token sequences as fixed-length id lists.
# Each sequence is wrapped in cls/sep tokens; a pair is dropped when either
# wrapped sequence exceeds max_len, otherwise both are padded to max_len with
# the pad token and converted to ids. Y2_py_ receives the label's integer id.
X_py_, Y_py_, Y2_py_ = [], [], []
for idx in tqdm(range(len(X_py))):
    src = [tokenizer.cls_token, *X_py[idx], tokenizer.sep_token]
    tgt = [tokenizer.cls_token, *Y_py[idx], tokenizer.sep_token]
    if max(len(src), len(tgt)) > max_len:
        continue
    src_padded = src + [tokenizer.pad_token] * (max_len - len(src))
    tgt_padded = tgt + [tokenizer.pad_token] * (max_len - len(tgt))
    X_py_.append(tokenizer.convert_tokens_to_ids(src_padded))
    Y_py_.append(tokenizer.convert_tokens_to_ids(tgt_padded))
    Y2_py_.append(label_tok[Y2_py[idx]])

In [None]:
# Encode the Java token sequences as fixed-length id lists, mirroring the
# Python cell: wrap in cls/sep, skip the pair if either side is longer than
# max_len, pad to max_len with the pad token, convert to ids, and store the
# label's integer id alongside.
X_j_, Y_j_, Y2_j_ = [], [], []
for before_tokens, after_tokens, label in tqdm(zip(X_j, Y_j, Y2_j), total=len(X_j)):
    wrapped_before = [tokenizer.cls_token] + before_tokens + [tokenizer.sep_token]
    wrapped_after = [tokenizer.cls_token] + after_tokens + [tokenizer.sep_token]
    too_long = len(wrapped_before) > max_len or len(wrapped_after) > max_len
    if not too_long:
        pad_before = [tokenizer.pad_token] * (max_len - len(wrapped_before))
        pad_after = [tokenizer.pad_token] * (max_len - len(wrapped_after))
        X_j_.append(tokenizer.convert_tokens_to_ids(wrapped_before + pad_before))
        Y_j_.append(tokenizer.convert_tokens_to_ids(wrapped_after + pad_after))
        Y2_j_.append(label_tok[label])

In [None]:
# Convert the encoded Java and Python lists to NumPy arrays, then show all six
# shapes (Java X/Y/Y2 followed by Python X/Y/Y2) as the cell's output.
X_j_, Y_j_, Y2_j_ = (np.array(seq) for seq in (X_j_, Y_j_, Y2_j_))
X_py_, Y_py_, Y2_py_ = (np.array(seq) for seq in (X_py_, Y_py_, Y2_py_))
(
    X_j_.shape, Y_j_.shape, Y2_j_.shape,
    X_py_.shape, Y_py_.shape, Y2_py_.shape,
)

In [None]:
# Persist the encoded Java arrays as one compressed archive with keys
# X, Y and Y2.
np.savez_compressed(
    './Data/java_processed_sstubs',
    X=X_j_,
    Y=Y_j_,
    Y2=Y2_j_,
)

In [None]:
# Persist the encoded Python arrays as one compressed archive with keys
# X, Y and Y2.
np.savez_compressed(
    './Data/python_processed_sstubs',
    X=X_py_,
    Y=Y_py_,
    Y2=Y2_py_,
)