In [None]:
# Mount Google Drive at /content/drive; the data and output paths configured
# later in this notebook live under drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
%%capture
!pip install datasets
!pip install transformers
!pip install javalang

In [None]:
import json
import os
import re
import subprocess

import javalang
import numpy as np
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, EncoderDecoderModel

In [None]:
# Experiment configuration and test data.
# NOTE(review): both paths are relative, so they presumably rely on the working
# directory being the parent of the mounted "drive" folder — confirm.
OUTPUT_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/codebert_output_files"
PREPROCESSED_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/preprocessed_files/codebert"

# Language codes match the test-file extensions read below: "pn" -> test.pn, "ja" -> test.ja.
SRC_LANGUAGE = "pn"
TGT_LANGUAGE = "ja"
NUM_EPOCHS = 100
LEARNING_RATE = 2e-5
NUM_LAYERS = 12
BATCH_SIZE = 16
# The model name encodes the settings above; it is used both as the checkpoint
# identifier and as the name of the output directory.
MODEL_NAME = f"codebert_sourcecode_nmt_{SRC_LANGUAGE}2{TGT_LANGUAGE}_{NUM_EPOCHS}E_{LEARNING_RATE}LR_{BATCH_SIZE}B_{NUM_LAYERS}E_{NUM_LAYERS}D"
TEST_MODEL_OUTPUT_PATH = f"{OUTPUT_FILEPATH}/{MODEL_NAME}"

# Testing data: one example per line (trailing newlines are kept by readlines()).
# The encoding is stated explicitly so decoding does not depend on the locale.
with open(f"{PREPROCESSED_FILEPATH}/test.ja", "r", encoding="utf-8") as f:
  java_codes = f.readlines()

with open(f"{PREPROCESSED_FILEPATH}/test.pn", "r", encoding="utf-8") as f:
  python_codes = f.readlines()

In [None]:
# Create the per-model output directory; exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of a separate exists() test.
os.makedirs(TEST_MODEL_OUTPUT_PATH, exist_ok=True)

In [None]:
# Model inputs come from the source-language file; any code other than "ja"
# selects the Python lines.
if SRC_LANGUAGE == "ja":
  test_codes = java_codes
else:
  test_codes = python_codes

# References come from the target-language file, chosen by the same rule.
if TGT_LANGUAGE == "ja":
  reference_codes = java_codes
else:
  reference_codes = python_codes

In [None]:
# NOTE(review): prints a fixed message and defines nothing that later cells use;
# looks like a leftover sanity check — consider removing this cell.
print("Hello world how is the world going?")

In [None]:
%%capture
# Load the CodeBERT tokenizer and the encoder-decoder checkpoint identified by
# MODEL_NAME, then move the model to the GPU. %%capture silences the cell output.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = EncoderDecoderModel.from_pretrained(f"joshanashakya/{MODEL_NAME}")
# NOTE(review): the device is hard-coded to "cuda" here and in the inference
# cells, so a GPU runtime is presumably required.
model.to("cuda")

In [None]:
# Translate the test set one example at a time, keeping the first decoded
# sequence for each input.
predicted_codes = []
# The loop variable keeps the name `test_code` because a later cell reads it.
for test_code in test_codes:
  # Every input is padded/truncated to a fixed window of 512 tokens.
  encoded = tokenizer(
      test_code,
      padding="max_length",
      truncation=True,
      max_length=512,
      return_tensors="pt",
  )
  source_ids = encoded.input_ids.to("cuda")
  source_mask = encoded.attention_mask.to("cuda")

  generated_ids = model.generate(source_ids, attention_mask=source_mask)

  # skip_special_tokens=True drops the tokenizer's special tokens from the text
  decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
  predicted_codes.append(decoded[0])

In [None]:
# Ad-hoc check: translate one hand-written snippet and print the decoded output.
s = "System.out.println()"

# Tokenize the sample defined above (not a variable left over from an earlier
# cell), using the same padding/truncation settings as the main inference loop.
inputs = tokenizer(s, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
input_ids = inputs.input_ids.to("cuda")
attention_mask = inputs.attention_mask.to("cuda")

outputs = model.generate(input_ids, attention_mask=attention_mask)

# skip_special_tokens=True drops the tokenizer's special tokens from the text
output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(output_str)


In [None]:
def detokenize_java(s):
  """Reformat a Java token string with javalang, best effort.

  Args:
    s: Java code as a single string.

  Returns:
    The output of ``javalang.tokenizer.reformat_tokens`` for the tokens of
    ``s``, or ``s`` unchanged when tokenizing or reformatting raises.
  """
  try:
    tokens = javalang.tokenizer.tokenize(s)
    return javalang.tokenizer.reformat_tokens(tokens)
  except Exception:
    # Model output is often not valid Java, so fall back to the raw text.
    # Only Exception is caught: KeyboardInterrupt/SystemExit must still be
    # able to stop a long-running loop over predictions.
    return s

In [None]:
def detokenize_python(s):
    """Turn a marker-annotated Python token string back into source text.

    The input is split on ``NEWLINE``; each piece becomes one output line with
    the ``INDENT``, ``DEDENT``, ``NL`` and ``ENDMARKER`` markers removed.

    Args:
        s: Token string containing the markers listed above.

    Returns:
        The reconstructed text, lines joined with ``"\\n"``, with spaces around
        ``.`` collapsed.
    """
    cleaned_lines = []
    for line in s.split("NEWLINE"):
        line = line.strip()
        if line.startswith("INDENT"):
            # A single replace handles every INDENT marker on the line.
            # NOTE(review): only lines that themselves start with INDENT are
            # indented; no indentation level is carried over to following
            # lines — confirm this matches the tokenizer that produced the data.
            line = line.replace("INDENT", "    ")
        line = line.replace("INDENT", "")
        line = line.replace("DEDENT ", "")
        line = line.replace("DEDENT", "")
        # Remove NL only as a standalone token so identifiers that merely
        # contain the letters (e.g. ONLINE) are left intact.
        line = re.sub(r"\bNL\b", "", line)
        line = line.replace("ENDMARKER", "")
        cleaned_lines.append(line)
    code = "\n".join(cleaned_lines)
    # NOTE(review): this also rewrites ". " / " ." inside string literals.
    return code.replace(". ", ".").replace(" .", ".")

In [None]:
def detokenize(s, lang):
  """Detokenize ``s`` with the routine that matches the language code.

  Args:
    s: Token string to convert.
    lang: ``"ja"`` selects ``detokenize_java``, ``"pn"`` selects
      ``detokenize_python``.

  Returns:
    The detokenized text, or ``None`` for any other language code.
  """
  if lang == "ja":
    return detokenize_java(s)
  if lang == "pn":
    return detokenize_python(s)
  return None

In [None]:
def prepare_eval(s):
    """Normalize a space-separated Python token string for evaluation.

    Args:
        s: Token string that may contain NEWLINE/INDENT/DEDENT/NL/ENDMARKER
            markers and ``<unk>``.

    Returns:
        The kept tokens joined by single spaces. ``NEWLINE`` becomes the two
        characters backslash-n, and each ``INDENT`` occurrence in a token that
        starts with ``INDENT`` becomes backslash-t.
    """
    # Tokens that never reach the output (the empty string covers double spaces).
    ignored = {"NL", "DEDENT", "ENDMARKER", "ENDMARKER\n", "<unk>", ""}
    kept = []
    for piece in s.split(" "):
        if piece in ignored:
            continue
        if piece == "NEWLINE":
            kept.append("\\n")
        elif piece.startswith("NEWLINE"):
            # NEWLINE fused with other characters (e.g. a trailing "\n") is dropped.
            continue
        elif piece.startswith("INDENT"):
            kept.extend(["\\t"] * piece.count("INDENT"))
        else:
            kept.append(piece)
    return " ".join(kept)


In [None]:
def cleanup(s):
  """Normalize the case of structural markers in a predicted token string.

  Case-insensitive occurrences of "newline", "new line", "indent" and
  "dedent" are rewritten, in that order, to NEWLINE, NEWLINE, INDENT and
  DEDENT. Matching is by substring, not by whole token.

  Args:
    s: Predicted token string.

  Returns:
    The string with the markers upper-cased.
  """
  substitutions = (
      ("newline", "NEWLINE"),
      ("new line", "NEWLINE"),
      ("indent", "INDENT"),
      ("dedent", "DEDENT"),
  )
  text = s
  for pattern, marker in substitutions:
    text = re.sub(pattern, marker, text, flags=re.IGNORECASE)
  return text

In [None]:
# format predicted codes
predicted_lines = []
for code in predicted_codes:
  predicted_lines.append(code + "\n")

In [None]:
# Detokenize every prediction and build the reference/hypothesis records used
# for evaluation.
detokenized_codes = []
# NOTE: this name shadows the builtin eval(); it is kept because the export
# cell reads `eval` when writing output_translates.json.
eval = []

for idx, reference_code in enumerate(reference_codes):
  # Indexing (rather than zip) still raises if there are fewer predictions
  # than references.
  predicted_code = predicted_codes[idx]
  # Marker-case normalization only applies to Python targets.
  cleaned_predicted_code = cleanup(predicted_code) if TGT_LANGUAGE == "pn" else predicted_code
  detokenized_code = detokenize(cleaned_predicted_code, TGT_LANGUAGE)
  detokenized_codes.append(detokenized_code + "\n")

  # prepare text for evaluation
  if TGT_LANGUAGE == "pn":
    ref = prepare_eval(reference_code)
    tgt = prepare_eval(cleaned_predicted_code)
  else:
    # NOTE(review): the reference is passed through as read by readlines(),
    # including any trailing newline — confirm the scorer tolerates this.
    ref = reference_code
    tgt = cleaned_predicted_code
  # Record ids are 1-based.
  eval.append({"id": idx + 1, "ref": ref, "hyp": tgt})

In [None]:
# Spot-check the first test example: print the reference and the raw model output.
print("Reference code:")
print(reference_codes[0])
print("\n")

print("Predicted code:")
print(predicted_codes[0])
print("\n")

In [None]:
# Write the results into the per-model output directory. Each file is opened in
# a `with` block so the handle is closed even if a write fails.

# raw decoded predictions, one per line
with open(f"{TEST_MODEL_OUTPUT_PATH}/translates.txt", "w", encoding="utf-8") as translate_file:
  translate_file.writelines(predicted_lines)

# detokenized predictions, each under a numbered "Solution N" header
with open(f"{TEST_MODEL_OUTPUT_PATH}/detokenized_translates.txt", "w", encoding="utf-8") as de_translate_file:
  for count, translated in enumerate(detokenized_codes, start=1):
    de_translate_file.writelines([f"Solution {count}\n", "---" * 30, "\n"])
    de_translate_file.write(translated)
    de_translate_file.write("\n\n\n")

# reference/hypothesis records in a single JSON file
with open(f"{TEST_MODEL_OUTPUT_PATH}/output_translates.json", "w", encoding="utf-8") as output_file:
  json.dump({"output": eval}, output_file)

In [None]:
if TGT_LANGUAGE == "pn":
  # construct minified file and store
  !pip install pyminifier
  of = open(f"{TEST_MODEL_OUTPUT_PATH}/mini_translates.txt", "w")
  count = 1

  for s in detokenized_codes:
      testfile = open("test.py", "w")
      testfile.writelines(detokenize(s, TGT_LANGUAGE))
      testfile.close()
      os.popen("autopep8 --in-place --aggressive --aggressive test.py")
      os.popen("pyminifier test.py > mini_testfile.py")
      
      mini_testfile = open("mini_testfile.py", "r")
      of.writelines([f"Solution {count}\n", "---" * 30, "\n"])
      of.writelines(mini_testfile.readlines())
      of.writelines(["\n\n\n"])
      count += 1
  of.close()