In [None]:
import os
from transformers import RobertaTokenizer
from tqdm.notebook import tqdm

In [None]:
# Show the kernel's working directory: the relative "GraphCodeBERT/..." paths
# used in later cells are resolved against it.
os.getcwd()

# Data Loading

In [None]:
class Example(object):
    """One source/target pair together with the language tag of the source.

    Attributes:
        source: the source-side text of the pair.
        target: the target-side text of the pair.
        lang: language name attached to the pair by the loader.
    """

    def __init__(self, source, target, lang):
        # Plain value holder: store the three fields unchanged.
        self.source, self.target, self.lang = source, target, lang

In [None]:
def read_examples(filename, max_examples=6):
    """Read line-aligned source/target examples from a pair of text files.

    Args:
        filename: two paths joined by a comma, "<source_path>,<target_path>".
            Whitespace around each path is ignored.
        max_examples: maximum number of line pairs to read. Defaults to 6,
            the cap that used to be hard-coded in the loop; pass None to read
            every pair.

    Returns:
        A list of Example objects, one per line pair, with both lines stripped
        of surrounding whitespace. Reading stops at the shorter file.

    Raises:
        ValueError: if `filename` is not exactly two comma-separated paths.
    """
    paths=[p.strip() for p in filename.split(',')]
    if len(paths)!=2:
        raise ValueError(
            "filename must be '<source_path>,<target_path>', got %r" % (filename,)
        )
    source,target=paths

    # The language is inferred from the last character of the source path:
    # a trailing 's' (as in "....cs") selects C#, anything else selects Java.
    lang='c_sharp' if source.endswith('s') else 'java'

    examples=[]
    with open(source,encoding="utf-8") as f1,open(target,encoding="utf-8") as f2:
        for idx, (line1,line2) in enumerate(zip(f1,f2)):
            if max_examples is not None and idx>=max_examples:
                break
            examples.append(
                Example(
                    source=line1.strip(),
                    target=line2.strip(),
                    lang=lang
                )
            )

    return examples

In [None]:
# "<source file>,<target file>" in a single string; read_examples splits it on the comma.
path_to_samples = "GraphCodeBERT/translation/data/train.java-cs.txt.java,GraphCodeBERT/translation/data/train.java-cs.txt.cs"
# Only the first example is kept — presumably to keep the walkthrough output small.
examples = read_examples(path_to_samples)[:1]

# Data Processing

In [None]:
from GraphCodeBERT.translation.parser import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
from GraphCodeBERT.translation.parser import (remove_comments_and_docstrings,
                   tree_to_token_index,
                   index_to_code_token,
                   tree_to_variable_index)
from tree_sitter import Language, Parser

In [None]:
# Filled by the loop below: language name -> [tree-sitter Parser, DFG extraction function].
parsers={}        
# DFG extraction function for each supported language name.
dfg_function={
    'python':DFG_python,
    'java':DFG_java,
    'ruby':DFG_ruby,
    'go':DFG_go,
    'php':DFG_php,
    'javascript':DFG_javascript,
    'c_sharp':DFG_csharp,
}

# Build one parser per language from the shared library my-languages.so.
# NOTE(review): Language(path, name) / Parser.set_language look like the older
# py-tree-sitter API — confirm the installed tree_sitter version supports them.
for lang in dfg_function:
    LANGUAGE = Language('GraphCodeBERT/translation/parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(LANGUAGE) 
    # `parser` is rebound here from the Parser object to the
    # [Parser, DFG function] pair that is stored in `parsers`.
    parser = [parser,dfg_function[lang]]    
    parsers[lang]= parser

In [None]:
# Tokenizer for the "microsoft/graphcodebert-base" checkpoint; used below to
# sub-tokenize code tokens and to map tokens to ids.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")

In [None]:
#remove comments, tokenize code and extract dataflow
def extract_dataflow(code, parser,lang):
    """Tokenize `code` with tree-sitter and extract its data-flow graph (DFG).

    Every stage is best-effort: a failure never propagates, the affected
    output is simply returned empty.

    Args:
        code: source code as a string.
        parser: pair `[tree_sitter_parser, dfg_function]`; element 0 must
            provide `.parse(bytes)`, element 1 is called as
            `dfg_function(root_node, index_to_code, {})`.
        lang: language name passed to the comment remover; "php" additionally
            wraps the code in "<?php ... ?>" before parsing.

    Returns:
        `(code_tokens, dfg)`. `code_tokens` is the list of code tokens (empty
        if parsing or token indexing failed). `dfg` is the list of DFG entries
        sorted by their element 1, restricted to entries that take part in at
        least one edge (empty on any failure).
    """
    #remove comments (best-effort: keep the original text if stripping fails)
    try:
        code=remove_comments_and_docstrings(code,lang)
    except Exception:
        pass
    #obtain dataflow
    if lang=="php":
        code="<?php"+code+"?>"
    # Defaults so the return below is always well-defined; previously a failure
    # before tokenization left `code_tokens` unbound and raised UnboundLocalError.
    code_tokens=[]
    dfg=[]
    try:
        tree = parser[0].parse(bytes(code,'utf8'))
        root_node = tree.root_node
        tokens_index=tree_to_token_index(root_node)
        code_lines=code.split('\n')
        code_tokens=[index_to_code_token(x,code_lines) for x in tokens_index]
        index_to_code={}
        for idx,(index,token) in enumerate(zip(tokens_index,code_tokens)):
            index_to_code[index]=(idx,token)
        # index_to_code = {
        # ((0, 0), (0, 6)): (0, 'public'),
        # ((0, 7), (0, 37)): (1, 'ListSpeechSynthesisTasksResult'),
        # ((0, 38), (0, 62)): (2, 'listSpeechSynthesisTasks'), ...
        # }
        try:
            DFG,_=parser[1](root_node,index_to_code,{})
        except Exception:
            DFG=[]
        DFG=sorted(DFG,key=lambda x:x[1])
        # Collect every position that appears in an edge: an entry's own
        # position (element 1) when its last element is non-empty, plus every
        # position listed in any entry's last element.
        indexs=set()
        for d in DFG:
            if len(d[-1])!=0:
                indexs.add(d[1])
            for x in d[-1]:
                indexs.add(x)
        # Drop entries that are not connected to anything.
        dfg=[d for d in DFG if d[1] in indexs]
    except Exception:
        # Keep whatever tokens were produced; only the graph is discarded.
        dfg=[]
    return code_tokens,dfg

In [None]:
class dotdict(dict):
    """Dictionary whose keys can also be read, set and deleted as attributes."""

    def __getattr__(self, name):
        # Missing keys yield None rather than raising, exactly like dict.get.
        return self.get(name)

    def __setattr__(self, name, value):
        # Attribute assignment stores the value under the same-named key.
        self[name] = value

    def __delattr__(self, name):
        # Deleting a missing attribute raises KeyError, like `del d[name]`.
        del self[name]

class InputFeatures(object):
    """Container for the model-ready features built from one example.

    All constructor arguments are stored unchanged under attributes of the
    same name.
    """

    def __init__(self, example_id, source_ids, position_idx, dfg_to_code,
                 dfg_to_dfg, target_ids, source_mask, target_mask):
        self.example_id = example_id
        # Source side: token ids, position indices and attention mask.
        self.source_ids, self.position_idx = source_ids, position_idx
        # Data-flow side: node-to-code spans and node-to-node links.
        self.dfg_to_code, self.dfg_to_dfg = dfg_to_code, dfg_to_dfg
        # Target side ids, followed by the two masks.
        self.target_ids = target_ids
        self.source_mask, self.target_mask = source_mask, target_mask

def convert_examples_to_features(examples, tokenizer, args,stage=None,verbose=True):
    """Turn examples into padded, fixed-length InputFeatures.

    For each example the source code is tokenized, its data-flow graph (DFG)
    is extracted, and both are packed into one source sequence laid out as
    `[CLS] code sub-tokens [SEP] DFG nodes padding`.

    Args:
        examples: sequence of objects with `.source` and `.target` strings.
        tokenizer: tokenizer providing `tokenize`, `convert_tokens_to_ids`,
            `cls_token`, `sep_token`, `pad_token_id` and `unk_token_id`.
        args: settings object with `source_lang` ("cs" selects the C# parser,
            any other value the Java parser), `max_source_length` and
            `max_target_length`.
        stage: when "test", the target is the tokenization of the literal
            string "None" instead of `example.target`.
        verbose: when True (default, the previous unconditional behavior),
            print the sub-tokenized code tokens and the DFG of every example.

    Returns:
        A list with one InputFeatures per example, where
        - `source_ids` has length `max_source_length`; DFG nodes are encoded
          with `unk_token_id`, padding with `pad_token_id`;
        - `position_idx` is `i + pad_token_id + 1` for the i-th code position,
          0 for DFG nodes and `pad_token_id` for padding;
        - `dfg_to_code[k]` is the (start, end) sub-token span, shifted past
          [CLS], of the code token that DFG node k refers to;
        - `dfg_to_dfg[k]` lists the indices of the kept DFG nodes that node k
          links to;
        - `target_ids` / `target_mask` have length `max_target_length`.
    """
    # The source language is identical for every example: resolve it once.
    source_lang="c_sharp" if args.source_lang == "cs" else "java"
    features = []
    for example_index, example in enumerate(tqdm(examples,total=len(examples))):
        ##extract data flow
        code_tokens,dfg=extract_dataflow(example.source,
                                         parsers[source_lang],
                                         source_lang)
        # Every token but the first is tokenized behind a dummy '@ ' prefix
        # that is then dropped ([1:]) — presumably so it is tokenized as if
        # preceded by a space; TODO confirm.
        code_tokens=[tokenizer.tokenize('@ '+x)[1:] if idx!=0 else tokenizer.tokenize(x) for idx,x in enumerate(code_tokens)]
        if verbose:
            print("After calling extract_dataflow function, the output is code_tokens and dfg")
            print("-"*20)
            print("code_tokens")
            print(code_tokens)
            print("\n")
            print("dfg")
            print(dfg)
            print()

        # ori2cur_pos[i] = (start, end) span of original token i in the
        # flattened sub-token list; the -1 entry seeds the running offset.
        ori2cur_pos={}
        ori2cur_pos[-1]=(0,0)
        for i in range(len(code_tokens)):
            ori2cur_pos[i]=(ori2cur_pos[i-1][1],ori2cur_pos[i-1][1]+len(code_tokens[i]))
        code_tokens=[y for x in code_tokens for y in x]
        #truncating
        # NOTE(review): 512 is presumably the encoder's maximum sequence
        # length — confirm against the model configuration.
        code_tokens=code_tokens[:args.max_source_length-3][:512-3]
        source_tokens =[tokenizer.cls_token]+code_tokens+[tokenizer.sep_token]
        source_ids =  tokenizer.convert_tokens_to_ids(source_tokens)
        position_idx = [i+tokenizer.pad_token_id + 1 for i in range(len(source_tokens))]
        # DFG nodes fill whatever room is left after the code tokens.
        dfg=dfg[:args.max_source_length-len(source_tokens)]
        source_tokens+=[x[0] for x in dfg]
        position_idx+=[0 for x in dfg]
        source_ids+=[tokenizer.unk_token_id for x in dfg]
        padding_length=args.max_source_length-len(source_ids)
        position_idx+=[tokenizer.pad_token_id]*padding_length
        source_ids+=[tokenizer.pad_token_id]*padding_length
        source_mask = [1] * (len(source_tokens))
        source_mask+=[0]*padding_length
        #reindex
        # Map each kept node's original token index to its position in `dfg`,
        # then rewrite every node's link list in terms of those positions,
        # dropping links to nodes that were truncated away.
        reverse_index={}
        for idx,x in enumerate(dfg):
            reverse_index[x[1]]=idx
        for idx,x in enumerate(dfg):
            dfg[idx]=x[:-1]+([reverse_index[i] for i in x[-1] if i in reverse_index],)
        dfg_to_dfg=[x[-1] for x in dfg]
        dfg_to_code=[ori2cur_pos[x[1]] for x in dfg]
        # Shift spans by the number of tokens placed before the code ([CLS]).
        length=len([tokenizer.cls_token])
        dfg_to_code=[(x[0]+length,x[1]+length) for x in dfg_to_code]


        #target
        if stage=="test":
            target_tokens = tokenizer.tokenize("None")
        else:
            target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length-2]
        target_tokens = [tokenizer.cls_token]+target_tokens+[tokenizer.sep_token]
        target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
        target_mask = [1] *len(target_ids)
        padding_length = args.max_target_length - len(target_ids)
        target_ids+=[tokenizer.pad_token_id]*padding_length
        target_mask+=[0]*padding_length

        features.append(
            InputFeatures(
                 example_index,
                 source_ids,
                 position_idx,
                 dfg_to_code,
                 dfg_to_dfg,
                 target_ids,
                 source_mask,
                 target_mask,
            )
        )
    return features


In [None]:
# Settings read (via attribute access) by convert_examples_to_features.
args = dotdict({
    # "cs" selects the C# parser there; any other value selects the Java parser.
    'source_lang': 'java',
    # Length every source sequence is padded to (code tokens + DFG nodes + padding).
    'max_source_length': 200,
    # Length every target sequence is padded to.
    'max_target_length': 200
})

In [None]:
args.source_lang

In [None]:
examples[0].source

In [None]:
features = convert_examples_to_features(examples, tokenizer, args)

In [None]:
features

In [None]:
features[0]

In [None]:
print(features[0].example_id)


In [None]:
print(features[0].source_ids)


In [None]:
print(features[0].position_idx)

In [None]:
print(features[0].dfg_to_code)

In [None]:
print(features[0].dfg_to_dfg)

In [None]:
print(features[0].target_ids)

In [None]:
print(features[0].source_mask)

In [None]:
# NOTE(review): target_ids is already printed by an earlier cell; this cell
# looks like an accidental duplicate.
print(features[0].target_ids)

In [None]:
print(features[0].target_mask)