In [None]:
# Mount the user's Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
%%capture
!pip install datasets
!pip install transformers
!pip install javalang
!pip install pyminifier

In [None]:
import os
import re
import subprocess
import tokenize

import javalang
import numpy as np
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, EncoderDecoderModel

In [None]:
# Locations on the mounted Drive, relative to the working directory.
# NOTE(review): neither path is referenced by the cells visible in this notebook.
BPE_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/preprocessed_files/BPE"
OUTPUT_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/output_files"

# Language codes used throughout the notebook: "ja" is Java, "pn" is Python.
SRC_LANGUAGE = "pn"
TGT_LANGUAGE = "ja"
# In this notebook the values below are only used to build the checkpoint name
# (TEST_MODEL); presumably they are the settings the checkpoint was trained
# with - TODO confirm.
NUM_EPOCHS = 100
LEARNING_RATE = 2e-5
NUM_LAYERS = 6
BATCH_SIZE = 16
# Program to translate.
SRC_FILE = "2.py"

# Output files: the pre-tokenized source and the CodeBERT sub-word tokens of it.
SRC_TOK_FILE = f"test_tok.{SRC_LANGUAGE}"
SRC_CB_FILE = f"test.{SRC_LANGUAGE}"

# Name of the checkpoint to load. Note that LEARNING_RATE is rendered as
# "2e-05" inside the f-string, so the name contains "_2e-05LR_".
TEST_MODEL = f"codebert_sourcecode_nmt_{SRC_LANGUAGE}2{TGT_LANGUAGE}_{NUM_EPOCHS}E_{LEARNING_RATE}LR_{BATCH_SIZE}B_{NUM_LAYERS}E_{NUM_LAYERS}D"

In [None]:
def minify(file):
    """Minify a Python source file with the ``pyminifier`` command-line tool.

    The minified code is written next to the input file as ``mini_<name>``.

    Args:
        file: Path of the Python file to minify.

    Returns:
        Path of the minified file.

    Raises:
        FileNotFoundError: if the ``pyminifier`` executable is not installed.
        subprocess.CalledProcessError: if ``pyminifier`` exits with an error.
    """
    # Prefix the file name, not the whole path, so inputs inside a directory
    # work too ("dir/a.py" -> "dir/mini_a.py"). Bare names are unaffected.
    directory, name = os.path.split(file)
    mini_filepath = os.path.join(directory, "mini_" + name)
    # Run synchronously and without a shell: os.popen returned immediately, so
    # the caller could read the output before pyminifier had written it, and
    # the path was interpolated into a shell command line.
    with open(mini_filepath, "w") as mini_file:
        subprocess.run(["pyminifier", file], stdout=mini_file, check=True)
    return mini_filepath

In [None]:
def tokenize_java(filepath):
    """Lex a Java source file and return its tokens joined by single spaces.

    Args:
        filepath: Path of the Java file; it is decoded as ISO-8859-1.

    Returns:
        A string with the value of every token produced by
        ``javalang.tokenizer.tokenize``, separated by one space.
    """
    # The context manager closes the handle; it used to be left open.
    with open(filepath, "r", encoding="ISO-8859-1") as source_file:
        source = source_file.read()
    return " ".join(token.value for token in javalang.tokenizer.tokenize(source))

In [None]:
def detokenize_java(s):
    """Re-format a string of Java tokens with ``javalang``.

    The string is lexed with ``javalang.tokenizer.tokenize`` and the tokens are
    passed to ``javalang.tokenizer.reformat_tokens``.

    Args:
        s: Java code or token string.

    Returns:
        The re-formatted code, or ``s`` unchanged when it cannot be processed.
    """
    try:
        tokens = javalang.tokenizer.tokenize(s)
        return javalang.tokenizer.reformat_tokens(tokens)
    except Exception:
        # Deliberately best effort: text that javalang cannot handle is passed
        # through. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return s

In [None]:
def tokenize_python(filepath):
    """Flatten a Python file into a single token string with layout markers.

    Comments, and the NL token that directly follows a comment, are dropped.
    NEWLINE, NL, INDENT, DEDENT and ENDMARKER tokens are written as marker
    words; an all-whitespace INDENT becomes one ``INDENT`` word per whitespace
    character. Any other token keeps its text and gets a leading space when
    the character before it in the source line is a space or when it follows
    a NEWLINE token.

    Args:
        filepath: Path of the Python file to tokenize.

    Returns:
        The concatenated token string.
    """
    pieces = []
    previous = None
    with tokenize.open(filepath) as source:
        for current in tokenize.generate_tokens(source.readline):
            kind = current.type
            follows_comment = previous is not None and previous.type == tokenize.COMMENT
            if kind == tokenize.COMMENT or (follows_comment and kind == tokenize.NL):
                # Skipped tokens still count as the "previous" token.
                previous = current
                continue

            if kind == tokenize.NEWLINE:
                piece = " " + current.string.replace("\n", "NEWLINE")
            elif kind == tokenize.NL:
                piece = " NL"
            elif kind == tokenize.INDENT:
                if current.string.isspace():
                    marker = "INDENT" * len(current.string)
                else:
                    marker = current.string.replace("\t", "INDENT")
                piece = " " + marker
            elif kind == tokenize.DEDENT:
                piece = " DEDENT "
            elif kind == tokenize.ENDMARKER:
                piece = " ENDMARKER"
            else:
                # Column 0 gives index -1, i.e. the last character of the line.
                char_before = current.line[current.start[1] - 1]
                after_newline = previous is not None and previous.type == tokenize.NEWLINE
                if char_before == " " or after_newline:
                    piece = " " + current.string
                else:
                    piece = current.string

            previous = current
            pieces.append(piece)
    return "".join(pieces)

In [None]:
def tokenize_code(filepath, lang):
  """Pre-tokenize a source file for the translation model.

  Args:
    filepath: Path of the source file.
    lang: Language code; "ja" selects Java, any other value is handled as Python.

  Returns:
    The pre-tokenized program as a single string.
  """
  if lang != "ja":
    # Python: minify the file first, then tokenize the minified copy.
    return tokenize_python(minify(filepath))
  # Java: tokenize, re-format, then collapse all whitespace to single spaces.
  java_code = detokenize_java(tokenize_java(filepath))
  return " ".join(java_code.split())

In [None]:
# Pre-tokenize the source program, show it, and keep a copy of the token
# string on disk.
src = tokenize_code(SRC_FILE, SRC_LANGUAGE)
print(src)
with open(SRC_TOK_FILE, "w") as token_file:
  token_file.write(src)

In [None]:
def save_codebert_tokens(input_ids, tokenizer):
  """Write the tokens of the first sequence in ``input_ids`` to SRC_CB_FILE.

  Args:
    input_ids: Indexable batch of id sequences; only element 0 is used and it
      must provide ``tolist()``.
    tokenizer: Object providing ``convert_ids_to_tokens``.

  The ids are converted with ``skip_special_tokens`` enabled and the resulting
  tokens are written separated by single spaces.
  """
  with open(SRC_CB_FILE, "w") as out_file:
    first_sequence = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(first_sequence, skip_special_tokens=True)
    out_file.write(" ".join(tokens))

In [None]:
%%capture
# Load the CodeBERT tokenizer and the translation checkpoint named by TEST_MODEL.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = EncoderDecoderModel.from_pretrained(f"joshanashakya/{TEST_MODEL}")
# Prefer the GPU, but fall back to the CPU so that loading does not fail on a
# runtime without CUDA.
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def translate(src):
  """Translate a pre-tokenized program with the loaded encoder-decoder model.

  Uses the module-level ``tokenizer`` and ``model``. As a side effect the
  sub-word tokens of the encoded input are written to disk through
  ``save_codebert_tokens``.

  Args:
    src: Pre-tokenized source program.

  Returns:
    The decoded text of the first generated sequence, without special tokens.
  """
  # The input is padded or truncated to exactly 450 tokens.
  inputs = tokenizer(src, padding="max_length", truncation=True, max_length=450, return_tensors="pt")
  # Put the inputs on whatever device the model lives on instead of assuming
  # "cuda", so the function also works when the model was loaded on the CPU.
  device = model.device
  input_ids = inputs.input_ids.to(device)
  save_codebert_tokens(input_ids, tokenizer)
  attention_mask = inputs.attention_mask.to(device)

  outputs = model.generate(input_ids, attention_mask=attention_mask)

  # Special tokens are removed while decoding.
  output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
  return output_str[0]

In [None]:
def detokenize_java(s):
  """Re-format a string of Java tokens with ``javalang``.

  A helper with the same name is also defined earlier in this notebook; once
  this cell has run, this definition is the one in effect.

  Args:
    s: Java code or token string.

  Returns:
    The output of ``javalang.tokenizer.reformat_tokens`` for the tokens of
    ``s``, or ``s`` unchanged when it cannot be processed.
  """
  try:
    tokens = javalang.tokenizer.tokenize(s)
    return javalang.tokenizer.reformat_tokens(tokens)
  except Exception:
    # Deliberately best effort: text that javalang cannot handle is passed
    # through. `Exception` (not a bare except) lets KeyboardInterrupt and
    # SystemExit propagate.
    return s

In [None]:
def detokenize_python(s):
    """Rebuild Python source from a token string that uses layout markers.

    ``NEWLINE`` ends a logical line. A run of ``INDENT`` markers at the start
    of a line opens a block and ``DEDENT`` closes the most recent one; every
    line is indented according to the blocks that are open at that point, four
    spaces per INDENT. ``NL`` and ``ENDMARKER`` are dropped, and spaces around
    dots are removed ("os . path" -> "os.path").

    Previously only the line carrying the INDENT markers was indented, with
    one stray extra space, and DEDENT was ignored, so every block with more
    than one statement came out as invalid Python.

    Known limitation, unchanged: marker words and ". " / " ." are also
    rewritten when they occur inside identifiers or string literals.

    Args:
        s: Token string, e.g. "if a: NEWLINE INDENT b = 1 NEWLINE DEDENT c = 2".

    Returns:
        The reconstructed source code, lines separated by "\n".
    """
    indent_unit = "    "
    # INDENT widths of the currently open blocks; 0 is the module level.
    open_blocks = [0]
    cleaned_lines = []
    for line in s.split("NEWLINE"):
        line = line.strip()
        # Layout markers sit at the start of a logical line: consume them in
        # order and update the block stack.
        while True:
            if line.startswith("DEDENT"):
                line = line[len("DEDENT"):].lstrip()
                if len(open_blocks) > 1:  # tolerate unbalanced model output
                    open_blocks.pop()
            elif line.startswith("INDENT"):
                width = 0
                while line.startswith("INDENT"):
                    width += 1
                    line = line[len("INDENT"):].lstrip()
                # A new block must be deeper than its parent even if the
                # marker count in the model output says otherwise.
                open_blocks.append(max(width, open_blocks[-1] + 1))
            elif line.startswith("NL"):
                line = line[len("NL"):].lstrip()
            else:
                break
        # Drop markers that are left inside the line.
        line = line.replace("INDENT", "")
        line = line.replace("DEDENT ", "")
        line = line.replace("DEDENT", "")
        line = line.replace("NL", "")
        line = line.replace("ENDMARKER", "")
        # Normalise dots before indenting so indentation is never eaten.
        line = line.replace(". ", ".").replace(" .", ".").strip()
        if line:
            line = indent_unit * open_blocks[-1] + line
        cleaned_lines.append(line)
    return "\n".join(cleaned_lines)

In [None]:
def detokenize(s, lang):
  """Detokenize model output for the given target language.

  Args:
    s: Token string to convert back to source code.
    lang: Language code, "ja" for Java or "pn" for Python.

  Returns:
    The detokenized source code.

  Raises:
    ValueError: if ``lang`` is not a supported code. The function used to
      fall through and return None, which only failed later and far from
      the actual mistake.
  """
  if lang == "ja":
    return detokenize_java(s)
  if lang == "pn":
    return detokenize_python(s)
  raise ValueError(f"Unsupported language code {lang!r}; expected 'ja' or 'pn'")

In [None]:
def cleanup(s):
  """Normalise the spelling of layout markers in model output.

  Every case-insensitive occurrence of "newline", "new line", "indent" and
  "dedent" is replaced by the upper-case marker word (NEWLINE, NEWLINE,
  INDENT, DEDENT).

  Args:
    s: Raw text produced by the model.

  Returns:
    The text with normalised markers.
  """
  # Applied in this order; each substitution sees the result of the previous one.
  replacements = (
    ("newline", "NEWLINE"),
    ("new line", "NEWLINE"),
    ("indent", "INDENT"),
    ("dedent", "DEDENT"),
  )
  for pattern, marker in replacements:
    s = re.sub(pattern, marker, s, flags=re.IGNORECASE)
  return s

In [None]:
# Display names for the two languages: "ja" is Java, any other code is shown
# as Python.
SRC_LANG = {"ja": "Java"}.get(SRC_LANGUAGE, "Python")
TGT_LANG = {"ja": "Java"}.get(TGT_LANGUAGE, "Python")

# Show the program that is going to be translated.
with open(SRC_FILE, "r") as source_file:
  src_code = source_file.read()
print(f'Program in "{SRC_LANG}":', src_code, "\n\n", sep="\n")

# Translate the pre-tokenized source and show the raw model output.
translated_code = translate(src)
print(f'Translated program in the target language "{TGT_LANG}":', translated_code, "\n\n", sep="\n")

# Normalise the layout markers and turn the token string back into source code.
detokenized_code = detokenize(cleanup(translated_code), TGT_LANGUAGE)
print(f'Detokenized program in the target language "{TGT_LANG}":', detokenized_code, sep="\n")

# Store the result as translate.java or translate.py.
TGT_EXT = {"ja": "java"}.get(TGT_LANGUAGE, "py")
TGT_FILE = "translate." + TGT_EXT
with open(TGT_FILE, "w") as target_file:
  target_file.write(detokenized_code)

if TGT_LANGUAGE == "pn":
  # construct minified file and store
  !pip install pyminifier
  os.popen("autopep8 --in-place --aggressive --aggressive translate.py")
  os.popen("pyminifier translate.py > mini_translate.py")
      

In [None]:
# Write the source program, the raw translation and the detokenized
# translation (plus the minified file for Python targets) to details.txt.
# Each entry is (heading, body, blank lines that follow the body).
report_parts = [
  (f'Program in "{SRC_LANG}":\n', src_code, "\n\n"),
  (f'Translated program in the target language "{TGT_LANG}":\n', translated_code, "\n\n\n"),
  (f'Detokenized program in the target language "{TGT_LANG}":\n', detokenized_code, "\n\n\n"),
]

if TGT_LANGUAGE == "pn":
  with open("mini_translate.py", "r") as mini_file:
    mini = mini_file.read()
  report_parts.append(("Minified program:\n", mini, "\n\n\n"))

# The "=" underline is as long as the heading including its newline character.
txt = "".join(
  f"{heading}{'=' * len(heading)}\n{body}{gap}"
  for heading, body, gap in report_parts
)

with open("details.txt", "w") as report_file:
  report_file.write(txt)
