In [1]:
# Translation direction; these suffixes are interpolated into the data file names below.
source="java"
target="cs"
# Training hyper-parameters. `lr`, `batch_size` and `epochs` are copied onto `args`
# in the "Prepare fine-tuning parameters" cell; `beam_size` is not copied there.
lr=1e-4
batch_size=64
beam_size=10
# Lengths that the encoder input and decoder output are padded/truncated to
# (become args.max_source_length / args.max_target_length).
source_length=320
target_length=256
output_dir=f"saved_models/{source}-{target}/"
# Each *_file value is "<source path>,<target path>"; read_examples() splits on the comma.
train_file=f"data/train.java-cs.txt.{source},data/train.java-cs.txt.{target}"
dev_file=f"data/valid.java-cs.txt.{source},data/valid.java-cs.txt.{target}"
epochs=2
# Checkpoint name assigned to args.model_name_or_path further down.
pretrained_model="microsoft/graphcodebert-base"

In [2]:
from __future__ import absolute_import
import os
import sys
import pickle
import torch
import json
import random
import logging
import argparse
import numpy as np
from io import open
from itertools import cycle
import torch.nn as nn
from model import Seq2Seq
from tqdm import tqdm, trange
from bleu import _bleu
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaModel, RobertaTokenizer)
MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
from parser import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
from parser import (remove_comments_and_docstrings,
                   tree_to_token_index,
                   index_to_code_token,
                   tree_to_variable_index)
from tree_sitter import Language, Parser

In [3]:
# Grammar name -> data-flow extractor imported from the local `parser` package.
dfg_function = {
    'python': DFG_python,
    'java': DFG_java,
    'ruby': DFG_ruby,
    'go': DFG_go,
    'php': DFG_php,
    'javascript': DFG_javascript,
    'c_sharp': DFG_csharp,
}

logger = logging.getLogger(__name__)

# Build one [tree-sitter Parser, DFG extractor] pair per grammar in the compiled library.
parsers = {}
for lang, dfg_extractor in dfg_function.items():
    LANGUAGE = Language('parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    parser = [parser, dfg_extractor]
    parsers[lang] = parser



### Helper functions

In [4]:
class Example(object):
    """One source/target code pair together with the language tag of the source."""

    def __init__(self, source, target, lang):
        # The three values are stored as given; nothing is validated or copied.
        self.source, self.target, self.lang = source, target, lang

In [5]:
def extract_dataflow(code, parser,lang):
    """Tokenize `code` with tree-sitter and extract its data-flow graph (DFG).

    Args:
        code: Source code as a string.
        parser: Pair ``[tree_sitter_parser, dfg_extractor]`` as stored in `parsers`.
        lang: Language name passed to `remove_comments_and_docstrings`; the value
            "php" additionally wraps the code in ``<?php ... ?>``.

    Returns:
        ``(code_tokens, dfg)``. Both are empty lists when parsing or tokenization
        fails. `dfg` keeps only the edges whose node either has sources or is
        itself a source of another edge.
    """
    # Defaults guarantee the return statement always has values, even when the
    # parse step below fails before any token has been produced.
    code_tokens = []
    dfg = []

    # Comment removal is best-effort: on failure the raw code is analysed instead.
    try:
        code = remove_comments_and_docstrings(code, lang)
    except Exception:
        pass

    if lang == "php":
        code = "<?php" + code + "?>"

    try:
        tree = parser[0].parse(bytes(code, 'utf8'))
        root_node = tree.root_node
        tokens_index = tree_to_token_index(root_node)
        code_lines = code.split('\n')
        code_tokens = [index_to_code_token(x, code_lines) for x in tokens_index]

        # token span -> (token position, token text), the shape the DFG extractors receive.
        index_to_code = {}
        for idx, (index, token) in enumerate(zip(tokens_index, code_tokens)):
            index_to_code[index] = (idx, token)

        try:
            DFG, _ = parser[1](root_node, index_to_code, {})
        except Exception:
            DFG = []
        DFG = sorted(DFG, key=lambda x: x[1])

        # Collect every node position that takes part in at least one edge.
        indexs = set()
        for d in DFG:
            if len(d[-1]) != 0:
                indexs.add(d[1])
            for x in d[-1]:
                indexs.add(x)

        dfg = [d for d in DFG if d[1] in indexs]
    except Exception:
        # Tokens gathered before the failure are still returned; the graph is dropped.
        dfg = []
    return code_tokens, dfg

In [6]:
def extract_dataflow_and_cfg(code, parser, lang):
    """Extract code tokens, data-flow edges (DFG) and control-flow edges (CFG).

    Args:
        code: Source code as a string.
        parser: Pair ``[tree_sitter_parser, dfg_extractor]`` as stored in `parsers`.
        lang: Language name passed to `remove_comments_and_docstrings`; the value
            "php" additionally wraps the code in ``<?php ... ?>``.

    Returns:
        ``(code_tokens, dfg, cfg)``. Any stage that fails prints a message and
        leaves its result (and the results of later stages) as an empty list.
    """
    # Comment removal is best-effort: on failure the raw code is analysed instead.
    try:
        code = remove_comments_and_docstrings(code, lang)
    except Exception as e:
        print(f"Error removing comments: {e}")

    if lang == "php":
        code = "<?php" + code + "?>"

    code_tokens, dfg, cfg = [], [], []

    try:
        # Parse the code to obtain the AST
        tree = parser[0].parse(bytes(code, 'utf8'))
        root_node = tree.root_node

        # Extract tokens
        tokens_index = tree_to_token_index(root_node)
        code_lines = code.split('\n')
        code_tokens = [index_to_code_token(x, code_lines) for x in tokens_index]

        # Map each token span to (token position, token text). This is the same
        # orientation extract_dataflow() hands to the DFG extractors; keying the
        # dict by position instead would make every span lookup miss.
        index_to_code = {
            index: (position, token)
            for position, (index, token) in enumerate(zip(tokens_index, code_tokens))
        }

        # Extract dataflow
        try:
            DFG, _ = parser[1](root_node, index_to_code, {})
        except Exception as e:
            print(f"Error extracting DFG: {e}")
            DFG = []

        DFG = sorted(DFG, key=lambda x: x[1])

        # Keep only nodes that have sources or are a source of another node.
        index_set = set()
        for d in DFG:
            if len(d[-1]) != 0:
                index_set.add(d[1])
            index_set.update(d[-1])
        dfg = [d for d in DFG if d[1] in index_set]

        # Extract control flow (CFG)
        cfg = extract_control_flow_edges(root_node)

    except Exception as e:
        print(f"Error parsing tree or extracting flows: {e}")

    return code_tokens, dfg, cfg


def extract_control_flow_edges(root_node):
    """Collect simple control-flow edges from a tree-sitter AST.

    The tree is walked in pre-order. Edges are emitted for:
      * ``if_statement``: condition -> "then" block, and condition -> else branch;
      * ``for_statement`` / ``while_statement``: condition -> body block;
      * ``return_statement``: start of the statement -> end of the statement.

    Args:
        root_node: Node exposing ``type``, ``children``, ``start_point`` and
            ``end_point`` (a tree-sitter node).

    Returns:
        List of ``(point, point)`` pairs in traversal order.
    """
    edges = []

    def first_child_of_type(node, node_type):
        """Return the first direct child with the given type, or None."""
        return next((child for child in node.children if child.type == node_type), None)

    def find_else_branch(node):
        """Return the node executed on the else path of an if statement, or None."""
        # Kept for grammars that wrap the alternative in an "else_body" node.
        explicit = first_child_of_type(node, "else_body")
        if explicit is not None:
            return explicit
        # NOTE(review): in tree-sitter's Java grammar the alternative is the
        # statement that follows the anonymous "else" token -- confirm for any
        # other grammar this is used with.
        children = node.children
        for position, child in enumerate(children):
            if child.type == "else":
                for candidate in children[position + 1:]:
                    if not candidate.type.endswith("comment"):
                        return candidate
        return None

    def traverse(node):
        if node.type == "if_statement":
            condition_node = first_child_of_type(node, "parenthesized_expression")
            then_node = first_child_of_type(node, "block")
            # condition -> then
            if condition_node is not None and then_node is not None:
                edges.append((condition_node.start_point, then_node.start_point))
            # condition -> else, if present
            else_node = find_else_branch(node)
            if condition_node is not None and else_node is not None:
                edges.append((condition_node.start_point, else_node.start_point))
        elif node.type in ("for_statement", "while_statement"):
            condition_node = first_child_of_type(node, "parenthesized_expression")
            body_node = first_child_of_type(node, "block")
            if condition_node is not None and body_node is not None:
                edges.append((condition_node.start_point, body_node.start_point))
        elif node.type == "return_statement":
            # A return is recorded as an edge spanning the statement itself.
            edges.append((node.start_point, node.end_point))

        # Recurse for all children
        for child in node.children:
            traverse(child)

    traverse(root_node)
    return edges

In [None]:
# Small Java sample used to sanity-check the data-flow extractor.
code = """
public class Factorial {
    public static long factorial(int n) {
        if (n <= 1) {
            return 1;
        }
        return n * factorial(n - 1);
    }
}
"""

# Combined variant, currently disabled. It would need parsers["java"] rather than
# the leftover `parser` variable:
# code_tokens, dfg, cfg = extract_dataflow_and_cfg(code, parsers["java"], "java")
code_tokens, dfg = extract_dataflow(code, parsers["java"], "java")
print("Code Tokens:", code_tokens)
print("Data Flow Graph (DFG):", dfg)

# print("Control Flow Graph (CFG):", cfg)


Code Tokens: ['public', 'class', 'Factorial', '{', 'public', 'static', 'long', 'factorial', '(', 'int', 'n', ')', '{', 'if', '(', 'n', '<=', '1', ')', '{', 'return', '1', ';', '}', 'return', 'n', '*', 'factorial', '(', 'n', '-', '1', ')', ';', '}', '}']
Data Flow Graph (DFG): [('factorial', 7, 'comesFrom', [], []), ('n', 10, 'comesFrom', [], []), ('n', 15, 'comesFrom', ['n'], [10]), ('n', 25, 'comesFrom', ['n'], [10]), ('factorial', 27, 'comesFrom', ['factorial'], [7]), ('n', 29, 'comesFrom', ['n'], [10])]


In [8]:
def extract_controlflow(code, parser, lang):
    """Parse `code` and return its control-flow edges.

    Args:
        code: Source code as a string.
        parser: Pair whose first element is the tree-sitter parser to use.
        lang: Accepted for symmetry with the other extractors; not read here.

    Returns:
        The edge list produced by `extract_control_flow_edges`, or an empty list
        (after printing the error) when parsing or extraction raises.
    """
    try:
        ast_root = parser[0].parse(bytes(code, 'utf8')).root_node
        return extract_control_flow_edges(ast_root)
    except Exception as e:
        print(f"Error extracting CFG: {e}")
    return []


def extract_control_flow_edges(root_node):
    """Collect simple control-flow edges from a tree-sitter AST.

    The tree is walked in pre-order. Edges are emitted for:
      * ``if_statement``: condition -> "then" block, and condition -> else branch;
      * ``for_statement`` / ``while_statement``: condition -> body block;
      * ``return_statement``: start of the statement -> end of the statement.

    Args:
        root_node: Node exposing ``type``, ``children``, ``start_point`` and
            ``end_point`` (a tree-sitter node).

    Returns:
        List of ``(point, point)`` pairs in traversal order.
    """
    edges = []

    def first_child_of_type(node, node_type):
        """Return the first direct child with the given type, or None."""
        return next((child for child in node.children if child.type == node_type), None)

    def find_else_branch(node):
        """Return the node executed on the else path of an if statement, or None."""
        # Kept for grammars that wrap the alternative in an "else_body" node.
        explicit = first_child_of_type(node, "else_body")
        if explicit is not None:
            return explicit
        # NOTE(review): in tree-sitter's Java grammar the alternative is the
        # statement that follows the anonymous "else" token -- confirm for any
        # other grammar this is used with.
        children = node.children
        for position, child in enumerate(children):
            if child.type == "else":
                for candidate in children[position + 1:]:
                    if not candidate.type.endswith("comment"):
                        return candidate
        return None

    def traverse(node):
        if node.type == "if_statement":
            condition_node = first_child_of_type(node, "parenthesized_expression")
            then_node = first_child_of_type(node, "block")
            # condition -> then
            if condition_node is not None and then_node is not None:
                edges.append((condition_node.start_point, then_node.start_point))
            # condition -> else, if present
            else_node = find_else_branch(node)
            if condition_node is not None and else_node is not None:
                edges.append((condition_node.start_point, else_node.start_point))
        elif node.type in ("for_statement", "while_statement"):
            condition_node = first_child_of_type(node, "parenthesized_expression")
            body_node = first_child_of_type(node, "block")
            if condition_node is not None and body_node is not None:
                edges.append((condition_node.start_point, body_node.start_point))
        elif node.type == "return_statement":
            # A return is recorded as an edge spanning the statement itself.
            edges.append((node.start_point, node.end_point))

        # Recurse for all children
        for child in node.children:
            traverse(child)

    traverse(root_node)
    return edges


In [9]:
# Same Java sample as the data-flow demo; here it exercises the control-flow extractor.
code = """
public class Factorial {
    public static long factorial(int n) {
        if (n <= 1) {
            return 1;
        }
        return n * factorial(n - 1);
    }
}
"""

# Extract CFG. Each edge is a pair of (row, column) points, as the recorded output shows.
cfg = extract_controlflow(code, parsers["java"], "java")

print("Control Flow Graph (CFG):", cfg)

Control Flow Graph (CFG): [((3, 11), (3, 20)), ((4, 12), (4, 21)), ((6, 8), (6, 36))]


In [10]:
def read_examples(filename):
    """Load parallel examples from a ``"<source path>,<target path>"`` string.

    The two files are read line by line in lock-step (stopping at the shorter
    one) and every pair of stripped lines becomes one `Example`. The language
    tag is 'c_sharp' when the source path ends with the letter 's', otherwise
    'java'.
    """
    source, target = filename.split(',')
    lang = 'c_sharp' if source[-1] == 's' else 'java'

    with open(source, encoding="utf-8") as src_file, open(target, encoding="utf-8") as tgt_file:
        return [
            Example(source=src_line.strip(), target=tgt_line.strip(), lang=lang)
            for src_line, tgt_line in zip(src_file, tgt_file)
        ]

**Define the input for BERT model**

In [11]:
class InputFeatures(object):
    """Model-ready tensors (as plain lists) for a single example."""

    def __init__(self, example_id, source_ids, position_idx, dfg_to_code,
                 dfg_to_dfg, target_ids, source_mask, target_mask):
        # Every argument is stored unchanged under an attribute of the same name.
        stored_fields = (
            ("example_id", example_id),
            ("source_ids", source_ids),
            ("position_idx", position_idx),
            ("dfg_to_code", dfg_to_code),
            ("dfg_to_dfg", dfg_to_dfg),
            ("target_ids", target_ids),
            ("source_mask", source_mask),
            ("target_mask", target_mask),
        )
        for field_name, field_value in stored_fields:
            setattr(self, field_name, field_value)
        

In [12]:
def convert_examples_to_features(examples, tokenizer, args,stage=None):
    """Turn `Example` objects into padded `InputFeatures`.

    For every example the source code is tokenized via `extract_dataflow`,
    sub-tokenized with `tokenizer`, truncated, extended with one placeholder per
    data-flow node and padded to ``args.max_source_length``. The target is
    tokenized and padded to ``args.max_target_length``.

    Args:
        examples: Sequence of objects with ``source`` and ``target`` strings.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids`` and
            the ``cls``/``sep``/``pad``/``unk`` token attributes used below.
        args: Reads ``source_lang`` ("cs" selects the C# parser, anything else
            Java), ``max_source_length`` and ``max_target_length``.
        stage: "test" replaces the target with the tokenization of the literal
            string "None"; "train" logs the first five examples.

    Returns:
        List with one `InputFeatures` per example, in input order.
    """
    features = []
    for example_index, example in enumerate(tqdm(examples,total=len(examples))):
        ##extract data flow
        code_tokens,dfg = extract_dataflow(example.source,
                                         parsers["c_sharp" if args.source_lang == "cs" else "java"],
                                         "c_sharp" if args.source_lang == "cs" else "java")
        # Control-flow edges come back as pairs of tree-sitter points, computed on the raw source.
        cfg = extract_controlflow(example.source,
                                    parsers["c_sharp" if args.source_lang == "cs" else "java"],
                                    "c_sharp" if args.source_lang == "cs" else "java")
        # print("Code Tokens:", code_tokens)
        # Sub-tokenize each code token. All but the first are tokenized behind a
        # dummy '@ ' prefix whose own sub-token is then dropped.
        code_tokens=[tokenizer.tokenize('@ '+x)[1:] if idx!=0 else tokenizer.tokenize(x) for idx,x in enumerate(code_tokens)]
        # ori2cur_pos[i] = (start, end) sub-token span of original code token i;
        # the -1 entry seeds the running offset.
        ori2cur_pos={}
        ori2cur_pos[-1]=(0,0)
        for i in range(len(code_tokens)):
            ori2cur_pos[i]=(ori2cur_pos[i-1][1],ori2cur_pos[i-1][1]+len(code_tokens[i]))    
        code_tokens=[y for x in code_tokens for y in x]  
        
        #truncating
        code_tokens=code_tokens[:args.max_source_length-3][:512-3]
        source_tokens =[tokenizer.cls_token]+code_tokens+[tokenizer.sep_token]
        source_ids =  tokenizer.convert_tokens_to_ids(source_tokens)
        # Code tokens get consecutive positions starting at pad_token_id + 1.
        position_idx = [i+tokenizer.pad_token_id + 1 for i in range(len(source_tokens))]
        
        # process dfg
        # Keep only as many DFG nodes as still fit; each node gets position 0.
        dfg=dfg[:args.max_source_length-len(source_tokens)]
        source_tokens+=[x[0] for x in dfg]
        position_idx+=[0 for x in dfg]
        
        # process cfg
        # Map CFG to token positions
        # NOTE(review): `cfg` holds (row, col) point pairs while `ori2cur_pos` is
        # keyed by integer token indices, so this membership test appears never
        # to succeed and `cfg_mapped` stays empty -- confirm the intended mapping.
        cfg_mapped = []
        for start, end in cfg:
            if start in ori2cur_pos and end in ori2cur_pos:
                start_pos = ori2cur_pos[start][0]
                end_pos = ori2cur_pos[end][1]
                cfg_mapped.append((start_pos, end_pos))

        # Flatten CFG into the source structure
        # NOTE(review): CFG placeholders are not limited by max_source_length, and
        # in source_ids they land before the DFG placeholders added below, whereas
        # source_tokens/position_idx list DFG first -- revisit if the mapping
        # above is ever made to produce edges.
        for start_pos, end_pos in cfg_mapped:
            source_tokens.append("[CFG_EDGE]")
            position_idx.append(0)
            source_ids.append(tokenizer.unk_token_id)  # Add an unknown token ID for CFG edge
        
        # DFG nodes are represented by the unknown-token id.
        source_ids+=[tokenizer.unk_token_id for x in dfg]
        padding_length=args.max_source_length-len(source_ids)
        
        position_idx+=[tokenizer.pad_token_id]*padding_length
        source_ids+=[tokenizer.pad_token_id]*padding_length   
           
        source_mask = [1] * (len(source_tokens))
        source_mask+=[0]*padding_length        
        
        #reindex
        # Rewrite each node's source list from original token positions to
        # indices into the (possibly truncated) dfg list; dropped sources vanish.
        reverse_index={}
        for idx,x in enumerate(dfg):
            reverse_index[x[1]]=idx
        for idx,x in enumerate(dfg):
            dfg[idx]=x[:-1]+([reverse_index[i] for i in x[-1] if i in reverse_index],)    
        dfg_to_dfg=[x[-1] for x in dfg]
        dfg_to_code=[ori2cur_pos[x[1]] for x in dfg]
        # Shift spans by one to account for the leading CLS token.
        length=len([tokenizer.cls_token])
        
        dfg_to_code=[(x[0]+length,x[1]+length) for x in dfg_to_code]        
      

        #target
        if stage=="test":
            target_tokens = tokenizer.tokenize("None")
        else:
            target_tokens = tokenizer.tokenize(example.target)[:args.max_target_length-2]
        target_tokens = [tokenizer.cls_token]+target_tokens+[tokenizer.sep_token]            
        target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
        target_mask = [1] *len(target_ids)
        padding_length = args.max_target_length - len(target_ids)
        target_ids+=[tokenizer.pad_token_id]*padding_length
        target_mask+=[0]*padding_length   
   
        # Log the first five training examples for manual inspection.
        if example_index < 5:
            if stage=='train':
                logger.info("*** Example ***")
                logger.info("source_tokens: {}".format([x.replace('\u0120','_') for x in source_tokens]))
                logger.info("source_ids: {}".format(' '.join(map(str, source_ids))))
                logger.info("source_mask: {}".format(' '.join(map(str, source_mask))))
                logger.info("position_idx: {}".format(position_idx))
                logger.info("dfg_to_code: {}".format(' '.join(map(str, dfg_to_code))))
                logger.info("dfg_to_dfg: {}".format(' '.join(map(str, dfg_to_dfg))))
                
                logger.info("target_tokens: {}".format([x.replace('\u0120','_') for x in target_tokens]))
                logger.info("target_ids: {}".format(' '.join(map(str, target_ids))))
                logger.info("target_mask: {}".format(' '.join(map(str, target_mask))))
       
        features.append(
            InputFeatures(
                 example_index,
                 source_ids,
                 position_idx,
                 dfg_to_code,
                 dfg_to_dfg,
                 target_ids,
                 source_mask,
                 target_mask,
            )
        )
    return features

### Incorporate Semantic Information

### Prepare fine-tuning parameters

In [13]:
import argparse

# Simulate argparse for Jupyter
class Args:
    """Plain attribute container standing in for an argparse namespace.

    Defaults live on the class; the notebook overrides or adds values by
    assigning attributes on the instance.
    """
    # Model / tokenizer selection
    model_type = "roberta"
    model_name_or_path = "roberta-base"
    output_dir = "./output"
    load_model_path = None
    # Data files, each given as "<source path>,<target path>"
    train_filename = None
    dev_filename = None
    test_filename = None
    source_lang = "en"
    config_name = ""
    tokenizer_name = ""
    # Sequence lengths used for padding and truncation
    max_source_length = 64
    max_target_length = 32
    do_train = True
    do_eval = True
    do_test = False
    do_lower_case = False
    no_cuda = False
    # Optimisation settings
    train_batch_size = 16
    eval_batch_size = 64
    gradient_accumulation_steps = 1
    learning_rate = 5e-5
    beam_size = 10
    weight_decay = 0.0
    adam_epsilon = 1e-8
    max_grad_norm = 1.0
    num_train_epochs = 3
    max_steps = -1
    eval_steps = -1
    train_steps = -1
    warmup_steps = 0
    local_rank = -1
    seed = 42

    def __repr__(self):
        """List every public setting (class defaults and instance overrides)."""
        # dir() merges class-level defaults with attributes set on the instance
        # and returns the names sorted, so the output is stable.
        settings = (
            (name, getattr(self, name))
            for name in dir(self)
            if not name.startswith("_")
        )
        rendered = ", ".join(
            f"{name}={value!r}" for name, value in settings if not callable(value)
        )
        return f"Args({rendered})"

# Create an instance holding the class-level defaults
args = Args()

# Print the arguments for verification.
# NOTE: this runs before the overrides below, so it cannot reflect them.
print(f"Arguments: {args}")
# Override the defaults with the values from the notebook's first configuration cell.
args.model_type = "roberta"
args.source_lang = source
args.target_lang = target
args.model_name_or_path = pretrained_model
args.tokenizer_name = "microsoft/graphcodebert-base"
args.config_name = "microsoft/graphcodebert-base"
args.train_filename = train_file
args.dev_filename = dev_file
args.output_dir = output_dir
args.learning_rate = lr
args.num_train_epochs = epochs
args.train_batch_size = batch_size
args.eval_batch_size = batch_size
args.max_source_length = source_length
args.max_target_length = target_length

Arguments: <__main__.Args object at 0x7f79bc49bcd0>


In [14]:
# Rebuild the per-language [Parser, DFG extractor] pairs. This is the same
# construction as the loop that follows the `dfg_function` definition;
# re-running it replaces `parsers` with fresh objects.
parsers = {}
for lang, dfg_extractor in dfg_function.items():
    LANGUAGE = Language('parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(LANGUAGE)
    parser = [parser, dfg_extractor]
    parsers[lang] = parser

In [15]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Correctly spelled variable name. It is read at interpreter start-up, so
    # this affects Python processes launched afterwards that inherit the
    # environment, not the hashing of the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
# Apply the configured seed before any dataset or model is built.
set_seed(args.seed)

Define dataset

In [16]:
class TextDataset(Dataset):
    """Dataset over `InputFeatures` that builds the graph-guided attention mask on access."""

    def __init__(self, examples, args):
        self.examples = examples
        self.args = args

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        """Return (source_ids, source_mask, position_idx, attn_mask, target_ids, target_mask) tensors.

        `attn_mask` is a square boolean matrix of side ``args.max_source_length``;
        entry [i, j] is True when position i may attend to position j.
        """
        feature = self.examples[item]
        side = self.args.max_source_length
        attn_mask = np.zeros((side, side), dtype=np.bool_)

        # Positions > 1 are counted as code tokens, so this is the index of the
        # first node; positions != 1 are counted as non-padding input.
        node_index = sum(1 for position in feature.position_idx if position > 1)
        max_length = sum(1 for position in feature.position_idx if position != 1)

        # Code tokens attend to each other.
        attn_mask[:node_index, :node_index] = True

        # Tokens with id 0 or 2 attend to every non-padding position.
        for row, token_id in enumerate(feature.source_ids):
            if token_id == 0 or token_id == 2:
                attn_mask[row, :max_length] = True

        # Each node and the code span it was identified from attend to one another.
        for offset, (span_start, span_end) in enumerate(feature.dfg_to_code):
            if span_start < node_index and span_end < node_index:
                node_row = offset + node_index
                attn_mask[node_row, span_start:span_end] = True
                attn_mask[span_start:span_end, node_row] = True

        # Each node attends to the nodes its value comes from.
        total_positions = len(feature.position_idx)
        for offset, neighbours in enumerate(feature.dfg_to_dfg):
            for neighbour in neighbours:
                if neighbour + node_index < total_positions:
                    attn_mask[offset + node_index, neighbour + node_index] = True

        return (
            torch.tensor(feature.source_ids),
            torch.tensor(feature.source_mask),
            torch.tensor(feature.position_idx),
            torch.tensor(attn_mask),
            torch.tensor(feature.target_ids),
            torch.tensor(feature.target_mask),
        )
    


In [17]:
# Setup CUDA, GPU: use the GPU when one is available, otherwise the CPU, and
# record the choice on `args` alongside the number of visible CUDA devices.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args.n_gpu = torch.cuda.device_count()
args.device = device

In [18]:
# Create the output directory (including parents) when it does not exist yet.
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

**Training helpers**

In [19]:
# Resolve the config/model/tokenizer classes for the configured model type, then
# load the config and tokenizer by the names stored on `args`.
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name )



In [20]:
def save_model(model, output_dir, step):
    """Write the model's state dict to ``<output_dir>/model.<step>.bin``.

    The directory is created when missing. If `model` has a ``module``
    attribute (i.e. it is a wrapper), the wrapped module's weights are saved.
    """
    if os.path.exists(output_dir) is False:
        os.makedirs(output_dir)
    unwrapped = getattr(model, 'module', model)
    checkpoint_path = os.path.join(output_dir, "model.{}.bin".format(step))
    torch.save(unwrapped.state_dict(), checkpoint_path)

In [21]:
def calculate_bleu(model, dev_dataset):
    """Generate translations for a dev sample and score them.

    Up to 1000 examples are sampled from ``args.dev_filename`` on the first call
    and cached in `dev_dataset` under the key 'dev_bleu'. Predictions and
    references are written to ``dev.output`` / ``dev.gold`` inside
    ``args.output_dir``.

    Args:
        model: Model that returns predictions when called with
            (source_ids, source_mask, position_idx, att_mask).
        dev_dataset: Dict used as a cache across calls.

    Returns:
        ``(dev_bleu, xmatch)``: BLEU rounded to 2 decimals and the exact-match
        percentage rounded to 4 decimals.
    """
    # Note: a later cell defines another function with this same name.
    if 'dev_bleu' not in dev_dataset:
        sampled_examples = read_examples(args.dev_filename)
        sampled_examples = random.sample(sampled_examples, min(1000, len(sampled_examples)))
        sampled_features = convert_examples_to_features(sampled_examples, tokenizer, args, stage='test')
        dev_dataset['dev_bleu'] = sampled_examples, TextDataset(sampled_features, args)
    eval_examples, eval_data = dev_dataset['dev_bleu']

    eval_dataloader = DataLoader(
        eval_data,
        sampler=SequentialSampler(eval_data),
        batch_size=args.eval_batch_size,
        num_workers=4,
    )

    model.eval()
    hypotheses = []
    for batch in eval_dataloader:
        source_ids, source_mask, position_idx, att_mask, target_ids, target_mask = tuple(
            t.to(device) for t in batch
        )
        with torch.no_grad():
            preds = model(source_ids, source_mask, position_idx, att_mask)
            for pred in preds:
                # Use the first candidate and cut it at the first 0 id, if any.
                token_ids = list(pred[0].cpu().numpy())
                if 0 in token_ids:
                    token_ids = token_ids[:token_ids.index(0)]
                hypotheses.append(tokenizer.decode(token_ids, clean_up_tokenization_spaces=False))
    model.train()

    output_path = os.path.join(args.output_dir, "dev.output")
    gold_path = os.path.join(args.output_dir, "dev.gold")
    accs = []
    with open(output_path, 'w') as f, open(gold_path, 'w') as f1:
        for hypothesis, gold in zip(hypotheses, eval_examples):
            f.write(hypothesis + '\n')
            f1.write(gold.target + '\n')
            accs.append(hypothesis == gold.target)

    dev_bleu = round(_bleu(gold_path, output_path), 2)
    xmatch = round(np.mean(accs) * 100, 4)
    return dev_bleu, xmatch
    

In [22]:
def calculate_bleu(model, eval_examples, eval_dataloader):
    """Generate translations for `eval_dataloader` and score them against `eval_examples`.

    Predictions and references are written to ``dev.output`` / ``dev.gold``
    inside ``args.output_dir``.

    Args:
        model: Model called once per batch; its result is iterated as predictions.
        eval_examples: Examples in the same order as the loader; ``target`` is the reference.
        eval_dataloader: Loader yielding the six tensors produced by `TextDataset`.

    Returns:
        ``(dev_bleu, xmatch)``: BLEU rounded to 2 decimals and the exact-match
        percentage rounded to 4 decimals.
    """
    # This definition replaces the earlier calculate_bleu(model, dev_dataset).
    model.eval()
    hypotheses = []
    for batch in eval_dataloader:
        source_ids, source_mask, position_idx, att_mask, target_ids, target_mask = tuple(
            t.to(device) for t in batch
        )
        with torch.no_grad():
            # NOTE(review): unlike the earlier definition, targets are passed to the
            # model here -- confirm the model still returns predictions (not a loss)
            # when target_ids is given.
            preds = model(source_ids, source_mask, position_idx, att_mask, target_ids, target_mask)
            for pred in preds:
                # Use the first candidate and cut it at the first 0 id, if any.
                token_ids = list(pred[0].cpu().numpy())
                if 0 in token_ids:
                    token_ids = token_ids[:token_ids.index(0)]
                hypotheses.append(tokenizer.decode(token_ids, clean_up_tokenization_spaces=False))
    model.train()

    output_path = os.path.join(args.output_dir, "dev.output")
    gold_path = os.path.join(args.output_dir, "dev.gold")
    accs = []
    with open(output_path, 'w') as f, open(gold_path, 'w') as f1:
        for hypothesis, gold in zip(hypotheses, eval_examples):
            f.write(hypothesis + '\n')
            f1.write(gold.target + '\n')
            accs.append(hypothesis == gold.target)

    dev_bleu = round(_bleu(gold_path, output_path), 2)
    xmatch = round(np.mean(accs) * 100, 4)

    return dev_bleu, xmatch

In [23]:
# # push to huggingface
# !huggingface-cli login
# repo_url = "https://huggingface.co/judynguyen16/graphcodebert--code-translation-java-cs"

## Training with PyTorch Lightning

### Prepare the dataloaders

In [24]:
# # ---------*****---------
# # Prepare training loader and fine-tuning

# config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
# config = config_class.from_pretrained(args.config_name)
# tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name )

# train_examples = read_examples(args.train_filename)
# train_features = convert_examples_to_features(train_examples, tokenizer, args,stage='train')
# train_data = TextDataset(train_features,args)
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size//args.gradient_accumulation_steps,num_workers=54)
# num_train_optimization_steps =  args.train_steps

# #Start training
# logger.info("***** Running training *****")
# logger.info("  Num examples = %d", len(train_examples))
# logger.info("  Batch size = %d", args.train_batch_size)
# logger.info("  Num epoch = %d", args.num_train_epochs)

In [25]:
# Validation loader: featurize the whole dev file and cache the result under
# 'dev_loss' in a freshly created `dev_dataset` dict.
dev_dataset = {}
eval_examples = read_examples(args.dev_filename)
eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='dev')
eval_data = TextDataset(eval_features, args)
dev_dataset['dev_loss'] = eval_examples, eval_data

eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(
    eval_data,
    sampler=eval_sampler,
    batch_size=args.eval_batch_size,
    num_workers=54,
)

logger.info("\n***** Running evaluation *****")
logger.info("  Num examples = %d", len(eval_examples))
logger.info("  Batch size = %d", args.eval_batch_size)

100%|██████████| 500/500 [00:01<00:00, 381.97it/s]
11/27/2024 21:23:07 - INFO - __main__ -   
***** Running evaluation *****
11/27/2024 21:23:07 - INFO - __main__ -     Num examples = 500
11/27/2024 21:23:07 - INFO - __main__ -     Batch size = 64


In [26]:
# test loader
# Collect whichever of the dev/test file specs are configured.
files=[]
if args.dev_filename is not None:
    files.append(args.dev_filename)
if args.test_filename is not None:
    files.append(args.test_filename)
    
# NOTE: every iteration rebinds eval_examples/eval_data/eval_sampler/test_dataloader,
# so after the loop only the objects built for the last file remain.
for idx,file in enumerate(files):   
    logger.info("Test file: {}".format(file))
    eval_examples = read_examples(file)
    # Only the first 50 examples of each file are kept.
    eval_examples = eval_examples[:50]
    eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
    eval_data = TextDataset(eval_features,args) 

    # Calculate bleu
    eval_sampler = SequentialSampler(eval_data)
    test_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size,num_workers=16)

11/27/2024 21:23:07 - INFO - __main__ -   Test file: data/valid.java-cs.txt.java,data/valid.java-cs.txt.cs
100%|██████████| 50/50 [00:00<00:00, 546.64it/s]


In [27]:
# Show the first example from the test-loader cell (displayed as the default object repr).
eval_examples[0]

<__main__.Example at 0x7f78ec6212d0>

In [28]:
# Hand-written evaluation pairs, given in the "<source path>,<target path>" form read_examples() expects.
custom_file = "custom_data/valid.source.txt.java,custom_data/valid.target.txt.cs"
custom_eval_examples = read_examples(custom_file)

In [29]:
# Show the first custom pair, then featurize all custom examples.
eval_example = custom_eval_examples[0]
print(eval_example.source)
print(eval_example.target)
eval_features = convert_examples_to_features(custom_eval_examples, tokenizer, args,stage='test')
eval_data = TextDataset(eval_features,args)
# Build the sampler from *this* dataset before handing it to the loader.
# Creating it afterwards left the loader with the sampler from the previous
# cell, whose indices belong to a different (larger) dataset.
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=2,num_workers=16)

public static long factorial(int n) { if (n <= 1) { return 1; } return n * factorial(n - 1); }
public static long Factorial(int n) => n <= 1 ? 1 : n * Factorial(n - 1);


100%|██████████| 7/7 [00:00<00:00, 691.18it/s]


### Define the LightningModule model for training

In [30]:
# Resolve the classes and load config/tokenizer (the same three calls as the
# cell under "Training helpers").
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name )

# Build the model: an encoder loaded from args.model_name_or_path plus a 6-layer
# nn.TransformerDecoder constructed here, wrapped in Seq2Seq with beam search
# settings and the tokenizer's CLS/SEP ids as start/end ids.
encoder = model_class.from_pretrained(args.model_name_or_path,config=config)    
decoder_layer = nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
model=Seq2Seq(encoder=encoder,decoder=decoder,config=config,
                beam_size=args.beam_size,max_length=args.max_target_length,
                sos_id=tokenizer.cls_token_id,eos_id=tokenizer.sep_token_id)

# Optional: reload fine-tuned weights (currently disabled).
# if args.load_model_path is not None:
# pretrained_model_path = "saved_models/java-cs/checkpoint-best-ppl/pytorch_model.bin"
# print("reload model from {}".format(pretrained_model_path))
# model.load_state_dict(torch.load(pretrained_model_path))
    
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Seq2Seq(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

### Training

In [None]:
if args.do_train:
    # Training cell: fine-tunes `model` on the training file and, at the end of
    # selected epochs, evaluates on the dev file (perplexity, then BLEU + exact
    # match), saving `checkpoint-last`, `checkpoint-best-ppl` and
    # `checkpoint-best-bleu` under args.output_dir.

    # Prepare training data loader. Each loader batch holds
    # train_batch_size // gradient_accumulation_steps examples, so one optimizer
    # update (after accumulation) covers roughly train_batch_size examples.
    train_examples = read_examples(args.train_filename)
    train_features = convert_examples_to_features(train_examples, tokenizer,args,stage='train')
    train_data = TextDataset(train_features,args)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size//args.gradient_accumulation_steps,num_workers=4)

    # Not read anywhere else in this cell; the schedule length is derived from
    # the data loader below.
    num_train_optimization_steps =  args.train_steps

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    # scheduler.step() is called once per optimizer update (not once per batch),
    # so the schedule length is counted in updates. With
    # gradient_accumulation_steps == 1 this equals the number of batches.
    num_update_steps = len(train_dataloader)*args.num_train_epochs//args.gradient_accumulation_steps
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_update_steps*0.1,num_training_steps=num_update_steps)

    #Start training
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num epoch = %d", args.num_train_epochs)

    model.train()
    # Cache of featurised dev data, keyed by 'dev_loss' / 'dev_bleu', so the dev
    # file is only parsed once per key across epochs.
    dev_dataset={}
    nb_tr_examples, nb_tr_steps,tr_loss,global_step,best_bleu,best_loss = 0, 0,0,0,0,1e6
    for epoch in range(args.num_train_epochs):
        bar = tqdm(train_dataloader,total=len(train_dataloader))
        for batch in bar:
            batch = tuple(t.to(device) for t in batch)
            source_ids,source_mask,position_idx,att_mask,target_ids,target_mask = batch
            # With targets supplied the model returns a 3-tuple; the first item
            # is the loss used for back-propagation.
            loss,_,_ = model(source_ids,source_mask,position_idx,att_mask,target_ids,target_mask)

            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                # Scale so the accumulated gradient is an average over the group.
                loss = loss / args.gradient_accumulation_steps

            tr_loss += loss.item()
            # Running mean loss per batch (un-scaled) since the last evaluation.
            train_loss=round(tr_loss*args.gradient_accumulation_steps/(nb_tr_steps+1),4)
            bar.set_description("epoch {} loss {}".format(epoch,train_loss))
            nb_tr_examples += source_ids.size(0)
            nb_tr_steps += 1
            loss.backward()

            # nb_tr_steps already counts the batch just processed, so update
            # exactly every `gradient_accumulation_steps` batches. (Testing
            # `nb_tr_steps + 1` here fired one batch too early.)
            if nb_tr_steps % args.gradient_accumulation_steps == 0:
                #Update parameters
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1

        # Evaluate at the end of the epochs that fall on (up to) 20 evenly
        # spaced checkpoints over the run.
        if args.do_eval and epoch in [ int(args.num_train_epochs*(i+1)//20) for i in range(20)]:
            #Eval model with dev dataset
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            eval_flag=False
            if 'dev_loss' in dev_dataset:
                eval_examples,eval_data=dev_dataset['dev_loss']
            else:
                eval_examples = read_examples(args.dev_filename)
                eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='dev')
                eval_data = TextDataset(eval_features,args)
                dev_dataset['dev_loss']=eval_examples,eval_data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size,num_workers=4)

            logger.info("\n***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)

            #Start Evaling model
            model.eval()
            eval_loss,tokens_num = 0,0
            for batch in eval_dataloader:
                batch = tuple(t.to(device) for t in batch)
                source_ids,source_mask,position_idx,att_mask,target_ids,target_mask = batch
                with torch.no_grad():
                    # Second and third outputs are accumulated as a summed loss
                    # and a token count respectively.
                    _,loss,num = model(source_ids,source_mask,position_idx,att_mask,target_ids,target_mask)
                eval_loss += loss.sum().item()
                tokens_num += num.sum().item()
            #Print loss of dev dataset
            model.train()
            # Per-token loss; exp() of it is reported as perplexity.
            eval_loss = eval_loss / tokens_num
            result = {'eval_ppl': round(np.exp(eval_loss),5),
                        'global_step': global_step+1,
                        'train_loss': round(train_loss,5)}
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
            logger.info("  "+"*"*20)

            #save last checkpoint
            last_output_dir = os.path.join(args.output_dir, 'checkpoint-last')
            if not os.path.exists(last_output_dir):
                os.makedirs(last_output_dir)
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(last_output_dir, "pytorch_model.bin")
            torch.save(model_to_save.state_dict(), output_model_file)
            if eval_loss<best_loss:
                logger.info("  Best ppl:%s",round(np.exp(eval_loss),5))
                logger.info("  "+"*"*20)
                best_loss=eval_loss
                # Save best checkpoint for best ppl
                output_dir = os.path.join(args.output_dir, 'checkpoint-best-ppl')
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(output_dir, "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)


            #Calculate bleu
            if 'dev_bleu' in dev_dataset:
                eval_examples,eval_data=dev_dataset['dev_bleu']
            else:
                eval_examples = read_examples(args.dev_filename)
                # Generation is slow, so score at most 1000 randomly chosen dev examples.
                eval_examples = random.sample(eval_examples,min(1000,len(eval_examples)))
                eval_features = convert_examples_to_features(eval_examples, tokenizer, args,stage='test')
                eval_data = TextDataset(eval_features,args)
                dev_dataset['dev_bleu']=eval_examples,eval_data

            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size,num_workers=4)
            model.eval()
            p=[]
            for batch in eval_dataloader:
                batch = tuple(t.to(device) for t in batch)
                source_ids,source_mask,position_idx,att_mask,target_ids,target_mask = batch
                with torch.no_grad():
                    # Without targets the model returns generated sequences.
                    preds = model(source_ids,source_mask,position_idx,att_mask)
                    for pred in preds:
                        # Take the first candidate and cut it at the first id 0.
                        t=pred[0].cpu().numpy()
                        t=list(t)
                        if 0 in t:
                            t=t[:t.index(0)]
                        text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
                        p.append(text)
            model.train()
            predictions=[]
            accs=[]
            # One prediction / gold target per line, in matching order, for the BLEU script.
            with open(os.path.join(args.output_dir,"dev.output"),'w') as f, open(os.path.join(args.output_dir,"dev.gold"),'w') as f1:
                for ref,gold in zip(p,eval_examples):
                    predictions.append(ref)
                    f.write(ref+'\n')
                    f1.write(gold.target+'\n')
                    accs.append(ref==gold.target)

            dev_bleu=round(_bleu(os.path.join(args.output_dir, "dev.gold"), os.path.join(args.output_dir, "dev.output")),2)
            xmatch=round(np.mean(accs)*100,4)
            logger.info("  %s = %s "%("bleu-4",str(dev_bleu)))
            logger.info("  %s = %s "%("xMatch",str(xmatch)))
            logger.info("  "+"*"*20)
            # Model selection uses the sum of BLEU and exact-match.
            if dev_bleu+xmatch>best_bleu:
                logger.info("  Best BLEU+xMatch:%s",dev_bleu+xmatch)
                logger.info("  "+"*"*20)
                best_bleu=dev_bleu+xmatch
                # Save best checkpoint for best bleu
                output_dir = os.path.join(args.output_dir, 'checkpoint-best-bleu')
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(output_dir, "pytorch_model_ast.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
        

  0%|          | 0/10300 [00:00<?, ?it/s]11/27/2024 21:23:08 - INFO - __main__ -   *** Example ***
11/27/2024 21:23:08 - INFO - __main__ -   source_tokens: ['<s>', 'public', '_List', 'Spe', 'ech', 'Sy', 'nt', 'hesis', 'T', 'asks', 'Result', '_list', 'Spe', 'ech', 'Sy', 'nt', 'hesis', 'T', 'asks', '_', '_(', '_List', 'Spe', 'ech', 'Sy', 'nt', 'hesis', 'T', 'asks', 'Request', '_request', '_)', '_{', '_request', '_=', '_before', 'Client', 'Exec', 'ution', '_(', '_request', '_)', '_;', '_return', '_execute', 'List', 'Spe', 'ech', 'Sy', 'nt', 'hesis', 'T', 'asks', '_(', '_request', '_)', '_;', '_}', '</s>', 'request', 'request', 'request', 'beforeClientExecution', 'request', 'request']
11/27/2024 21:23:08 - INFO - __main__ -   source_ids: 0 15110 9527 29235 7529 35615 3999 35571 565 40981 48136 889 29235 7529 35615 3999 35571 565 40981 1437 36 9527 29235 7529 35615 3999 35571 565 40981 45589 2069 4839 25522 2069 5457 137 47952 46891 15175 36 2069 4839 25606 671 11189 36583 29235 7529 35615 

OutOfMemoryError: CUDA out of memory. Tried to allocate 300.00 MiB. GPU 0 has a total capacty of 21.99 GiB of which 274.44 MiB is free. Process 1804376 has 1.66 GiB memory in use. Process 1805973 has 1.45 GiB memory in use. Including non-PyTorch memory, this process has 18.59 GiB memory in use. Of the allocated memory 18.20 GiB is allocated by PyTorch, and 98.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

: 

### Evaluation Starts

In [None]:
def get_predictions(model, dataloader, max_batches=1):
    """Generate and decode predictions for batches drawn from ``dataloader``.

    Args:
        model: Object that is put in eval mode, moved to the notebook-level
            ``device`` and called as ``model(source_ids, source_mask,
            position_idx, att_mask, None, None)``. Each item of the result is
            indexed with ``[0]`` to obtain a 1-D tensor of token ids.
        dataloader: Iterable of batches; each batch is a sequence of six
            tensors (source ids, source mask, position idx, attention mask,
            target ids, target mask). The two target tensors are not used.
        max_batches: Number of batches to decode. Defaults to ``1``, i.e. only
            the first batch (the historical behaviour of this helper). Pass
            ``None`` to decode every batch.

    Returns:
        list[str]: One decoded string per example, in dataloader order. Each
        token sequence is cut at the first id ``0`` before decoding with the
        notebook-level ``tokenizer``.
    """
    model.eval()
    model.to(device)
    p=[]
    if max_batches is not None and max_batches <= 0:
        return p

    with torch.no_grad():
        for batch_count, batch in enumerate(dataloader, start=1):
            batch = tuple(t.to(device) for t in batch)
            source_ids,source_mask,position_idx,att_mask,target_ids,target_mask = batch

            preds = model(source_ids,source_mask,position_idx,att_mask, None, None)
            for pred in preds:
                t=pred[0].cpu().numpy()
                t=list(t)
                if 0 in t:
                    t=t[:t.index(0)]
                text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
                p.append(text)

            # Stop after processing (not before fetching) the last wanted batch
            # so no extra batch is loaded just to be discarded.
            if max_batches is not None and batch_count >= max_batches:
                break
    return p
    

In [None]:
# Decode every batch of the test loader with the model in inference mode.
model.eval()
# Re-create the `np.bool` alias (absent from recent NumPy releases);
# presumably a dependency used during decoding still refers to it.
np.bool = np.bool_
p=[]
with torch.no_grad():
    for batch in tqdm(test_dataloader,total=len(test_dataloader)):
        batch = tuple(tensor.to(device) for tensor in batch)
        source_ids,source_mask,position_idx,att_mask,target_ids,target_mask = batch
        # No targets are passed, so the model returns generated sequences.
        preds = model(source_ids,source_mask,position_idx,att_mask, None, None)
        for pred in preds:
            # First candidate only, truncated at the first id 0 (if any).
            t = list(pred[0].cpu().numpy())
            cut = t.index(0) if 0 in t else len(t)
            t = t[:cut]
            text = tokenizer.decode(t,clean_up_tokenization_spaces=False)
            p.append(text)
# model.train()

In [None]:
# Write predictions and gold targets to parallel files (one per line, same
# order) for the BLEU script, and record exact-match hits.
#
# `p` and `eval_examples` come from different cells, and `eval_examples` is
# rebound by several of them. zip() would silently truncate to the shorter
# sequence and score mismatched pairs, so fail loudly instead.
if len(p) != len(eval_examples):
    raise ValueError(
        "Got {} predictions but {} gold examples; re-run the test-loader cell so "
        "`eval_examples` matches `test_dataloader` before scoring.".format(len(p), len(eval_examples))
    )

predictions=[]
accs = []
with open(os.path.join(args.output_dir,"dev.output"),'w') as f, open(os.path.join(args.output_dir,"dev.gold"),'w') as f1:
    # `ref` is the model prediction, `gold` the example holding the reference target.
    for ref, gold in zip(p, eval_examples):
        predictions.append(ref)
        f.write(ref+'\n')
        f1.write(gold.target+'\n')
        accs.append(ref==gold.target)

In [None]:
# Display the decoded predictions collected by the previous cell.
predictions

In [None]:
# Score the two files written above: BLEU (2 decimals) and the percentage of
# predictions that equal their gold target exactly (4 decimals).
gold_path = os.path.join(args.output_dir, "dev.gold")
output_path = os.path.join(args.output_dir, "dev.output")
dev_bleu = round(_bleu(gold_path, output_path), 2)
xmatch = round(np.mean(accs) * 100, 4)

In [None]:
# Report the two scores computed in the previous cell.
print(f"BLEU: {dev_bleu}, X-Match: {xmatch}")

In [None]:
# Spot-check a single decoded prediction (the second one, index 1).
print(p[1])

In [None]:
# import wandb
# wandb.login()

In [None]:
# Log in to Hugging Face
# !huggingface-cli login

In [None]:
# repo_id = "judynguyen16/graphcodebert-code-translation-java-cs"
# !huggingface-cli repo create "judynguyen16/graphcodebert-code-translation-java-cs"

In [None]:
from transformers import AutoModel, AutoTokenizer
# Push the fine-tuned model weights to the Hugging Face Hub
from huggingface_hub import upload_file

repo_id = "judynguyen16/graphcodebert-code-translation-java-cs"

# Map "path inside the repo" -> "local file". The local path is built from
# args.output_dir, i.e. exactly where the training cell saves the best-perplexity
# checkpoint, instead of a machine-specific absolute path.
files_to_upload = {
    "pytorch_model.bin": os.path.join(args.output_dir, "checkpoint-best-ppl", "pytorch_model.bin"),
}

for dest_path, local_path in files_to_upload.items():
    upload_file(
        path_or_fileobj=local_path,
        path_in_repo=dest_path,
        repo_id=repo_id,
        repo_type="model",
    )

### Load model from Hugging Face

In [None]:
import requests

# URL of the file
url = "https://huggingface.co/judynguyen16/graphcodebert-code-translation-java-cs/resolve/main/pytorch_model.bin"

# Path to save the file
save_path = "pytorch_model.bin"

# Download the file.
# With stream=True the connection stays open until the body is fully read or the
# response is closed, so the response is used as a context manager to guarantee
# it is released. The timeout stops the cell from hanging forever on a stalled
# connection (it bounds connect time and the wait between received bytes, not
# the total download time).
with requests.get(url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(save_path, "wb") as f:
            # Stream to disk in 8 KiB chunks rather than holding the file in memory.
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"File downloaded and saved to {save_path}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")