In [None]:
# Mount Google Drive (Colab only) at /content/drive. The data and checkpoint
# paths configured below are relative ("drive/MyDrive/..."), so this presumably
# assumes the working directory is /content -- TODO confirm.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# javalang supplies the Java tokenizer used by tokenize_java/detokenize_java;
# pyminifier is the command-line tool that minify() shells out to.
!pip install javalang
!pip install pyminifier

In [None]:
# install fastBPE
# Clone the repository and compile its command-line tool inside the clone.
# The binary is written to ./fastBPE/fast, which the BPE step below invokes.
!git clone https://github.com/glample/fastBPE.git
%cd fastBPE
!g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
%cd ..

In [None]:
import io
import math
import os
import re
import subprocess
import tokenize
from collections import OrderedDict

import javalang
import torch
import torch.nn as nn
import torchtext
from torch import Tensor
from torch.nn import (TransformerEncoder, TransformerDecoder,TransformerEncoderLayer, TransformerDecoderLayer)
from torchtext.vocab import vocab

In [None]:
# Locations (relative to the working directory) of the BPE artefacts and of the
# trained checkpoints on the mounted Drive.
BPE_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/preprocessed_files/BPE"
OUTPUT_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/output_files"

# Language codes: "ja" is Java, anything else is treated as Python by this notebook.
SRC_LANGUAGE = "pn"
TGT_LANGUAGE = "ja"

# The values below are only used to locate files: MAX_COUNT appears in the
# vocabulary file names, and all of them appear in the checkpoint file name.
MAX_COUNT = 10000
NUM_EPOCHS = 100
LEARNING_RATE = 2e-5
ACTIVATION = "gelu"
BATCH_SIZE = 16
NUM_ENCODER_LAYERS = 6
# Previously NUM_ENCODER_LAYERS was assigned twice and the decoder count was
# never defined, so the "D" part of the checkpoint name silently reused the
# encoder value.
NUM_DECODER_LAYERS = 6

# Program to translate.
SRC_FILE = "2_650.py"

# Intermediate files: pre-tokenized source, and its BPE-encoded form.
SRC_TOK_FILE = f"test_tok.{SRC_LANGUAGE}"
SRC_BPE_FILE = f"test.{SRC_LANGUAGE}"

# Checkpoint to load for translation.
TEST_MODEL = f"{OUTPUT_FILEPATH}/sourcecode_nmt_{SRC_LANGUAGE}2{TGT_LANGUAGE}_{MAX_COUNT}C_{NUM_EPOCHS}E_{LEARNING_RATE}LR_{ACTIVATION}_{BATCH_SIZE}B_{NUM_ENCODER_LAYERS}E_{NUM_DECODER_LAYERS}D.pth"

In [None]:
def minify(file):
    """Minify a Python source file with the ``pyminifier`` command-line tool.

    The minified code is written next to the input as ``mini_<basename>``.
    The call blocks until ``pyminifier`` has finished, so the returned file is
    complete when this function returns. The previous ``os.popen`` call
    returned immediately and the file could be read while still empty.

    Args:
        file: Path of the Python file to minify.

    Returns:
        Path of the minified file. For a bare file name this is
        ``"mini_" + file``, exactly as before.

    Raises:
        FileNotFoundError: If the ``pyminifier`` executable is not on PATH.
        subprocess.CalledProcessError: If ``pyminifier`` exits with an error.
    """
    directory, basename = os.path.split(file)
    # Prefix the file name, not the whole path, so inputs inside a directory work.
    mini_filepath = os.path.join(directory, "mini_" + basename)
    with open(mini_filepath, "w") as out:
        # Argument list (no shell): paths with spaces or shell metacharacters are safe.
        subprocess.run(["pyminifier", file], stdout=out, check=True)
    return mini_filepath

In [None]:
def tokenize_java(filepath):
    """Tokenize a Java source file into a single space-separated string.

    Args:
        filepath: Path of the Java file. It is decoded as ISO-8859-1.

    Returns:
        The ``value`` of every token produced by ``javalang.tokenizer.tokenize``,
        joined with single spaces.
    """
    # Read inside a context manager so the handle is closed (it used to leak).
    with open(filepath, "r", encoding="ISO-8859-1") as file:
        source = file.read()
    return " ".join(token.value for token in javalang.tokenizer.tokenize(source))

In [None]:
def tokenize_python(filepath):
    """Tokenize a Python source file into a single space-separated string.

    Comments are dropped, together with the NL token that directly follows a
    comment. Structural tokens are spelled out as words: NEWLINE, NL, DEDENT
    and ENDMARKER, and an INDENT token becomes the word INDENT repeated once
    per whitespace character of the indentation.

    Args:
        filepath: Path of the Python file (opened with ``tokenize.open``).

    Returns:
        All emitted token strings joined with single spaces.
    """
    fixed_words = {
        tokenize.NL: "NL",
        tokenize.DEDENT: "DEDENT",
        tokenize.ENDMARKER: "ENDMARKER",
    }
    pieces = []
    with tokenize.open(filepath) as source:
        previous = None
        for tok in tokenize.generate_tokens(source.readline):
            kind, text = tok.type, tok.string
            after_comment = previous is not None and previous.type == tokenize.COMMENT
            # The "previous token" is updated for skipped tokens as well.
            previous = tok
            if kind == tokenize.COMMENT or (after_comment and kind == tokenize.NL):
                continue
            if kind == tokenize.NEWLINE:
                text = text.replace("\n", "NEWLINE")
            elif kind == tokenize.INDENT:
                # NOTE(review): INDENT strings look like they are always
                # whitespace, so the tab fallback is probably never reached -- confirm.
                if text.isspace():
                    text = "INDENT" * len(text)
                else:
                    text = text.replace("\t", "INDENT")
            elif kind in fixed_words:
                text = fixed_words[kind]
            pieces.append(text)
    return " ".join(pieces)

In [None]:
def tokenize_code(filepath, lang):
    """Pre-tokenize a source file according to its language code.

    Args:
        filepath: Path of the source file.
        lang: ``"ja"`` selects the Java tokenizer; any other value is handled
            as Python, which is minified first and then tokenized.

    Returns:
        The space-separated token string produced by the chosen tokenizer.
    """
    if lang != "ja":
        return tokenize_python(minify(filepath))
    return tokenize_java(filepath)

In [None]:
# pre-tokenization of source language code
# Tokenize SRC_FILE with the tokenizer for SRC_LANGUAGE, show the result, and
# save the space-separated token string to SRC_TOK_FILE, which is the input of
# the fastBPE step.
src = tokenize_code(SRC_FILE, SRC_LANGUAGE)
print(src)
with open(SRC_TOK_FILE, "w") as f:
  f.write(src)

In [None]:
# BPE tokenization of source language code
# Arguments passed to applybpe here, in order: output file (SRC_BPE_FILE, read
# back later), input file (SRC_TOK_FILE, written above), the BPE codes file and
# the source-language vocabulary file.
SRC_VOCAB_FILE = f"{BPE_FILEPATH}/vocab.{SRC_LANGUAGE}.{MAX_COUNT}"
!./fastBPE/fast applybpe $SRC_BPE_FILE $SRC_TOK_FILE $BPE_FILEPATH/codes $SRC_VOCAB_FILE

In [None]:
# Place-holders
# Maps a language code (SRC_LANGUAGE / TGT_LANGUAGE) to its vocabulary object;
# it is filled in further down, once build_vocab has been defined.
vocab_transform = {}

In [None]:
def build_vocab(filename):
    """Build a torchtext vocabulary from a ``<token> <count>`` file.

    Each non-blank line must hold a token and an integer count separated by a
    single space. Tokens keep the order in which they appear in the file.
    Special tokens are then inserted at fixed positions: ``<unk>`` at 0
    (only if the file does not already contain it), ``<pad>`` at 1, ``<bos>``
    at 2 and ``<eos>`` at 3. Unknown tokens resolve to the ``<unk>`` index.

    Args:
        filename: Path of the vocabulary file.

    Returns:
        The vocabulary object created by ``torchtext.vocab.vocab``.

    Raises:
        IndexError / ValueError: If a non-blank line is not ``<token> <count>``.
    """
    ordered_dict = OrderedDict()
    with io.open(filename) as file:
        for string_ in file:
            entry = string_.rstrip("\n")
            # Blank lines (e.g. a trailing empty line) used to raise IndexError.
            if not entry.strip():
                continue
            word_feq = entry.split(" ")
            ordered_dict[word_feq[0]] = int(word_feq[1])
    vocabulary = vocab(ordered_dict)
    unk_token = "<unk>"
    if unk_token not in vocabulary:
        vocabulary.insert_token(unk_token, 0)
    vocabulary.insert_token("<pad>", 1)
    vocabulary.insert_token("<bos>", 2)
    vocabulary.insert_token("<eos>", 3)
    vocabulary.set_default_index(vocabulary[unk_token])
    return vocabulary

In [None]:
# Build one vocabulary per language from the files named
# vocab.<language>.<MAX_COUNT> in BPE_FILEPATH.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln] = build_vocab(f"{BPE_FILEPATH}/vocab.{ln}.{MAX_COUNT}")

In [None]:
# Indices of the special tokens, looked up in the source vocabulary.
# build_vocab inserts <pad>, <bos> and <eos> at positions 1, 2 and 3 in every
# vocabulary, so the same indices are used for the target side as well.
PAD_IDX = vocab_transform[SRC_LANGUAGE]["<pad>"]
BOS_IDX = vocab_transform[SRC_LANGUAGE]["<bos>"]
EOS_IDX = vocab_transform[SRC_LANGUAGE]["<eos>"]

In [None]:
# transformer
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer with token embeddings and an output projection.

    Source and target tokens are embedded, a positional encoding is added, and
    the result is run through ``TransformerEncoder`` / ``TransformerDecoder``.
    ``generator`` projects decoder states to target-vocabulary logits.

    Inputs are indexed sequence-first: the positional encoding is selected by
    ``size(0)`` of the embedded tensor.

    Args:
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        emb_size: Embedding size, used as ``d_model``.
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary.
        dim_feedforward: Hidden size of the feed-forward sublayers.
        activation: Activation name passed to the encoder/decoder layers.
        layer_norm_eps: Epsilon passed to the layer-norm of each layer.
        dropout: Dropout rate of the positional encoding. It is not forwarded
            to the encoder/decoder layers.
        nhead: Number of attention heads. When omitted, the module-level
            ``NHEAD`` is used, which keeps existing call sites working.

    Raises:
        NameError: If ``nhead`` is omitted and no global ``NHEAD`` is defined.
    """

    def __init__(self, 
                 num_encoder_layers: int, 
                 num_decoder_layers: int,
                 emb_size: int, 
                 src_vocab_size: int, 
                 tgt_vocab_size: int,
                 dim_feedforward:int, 
                 activation:str,
                 layer_norm_eps:float,
                 dropout:float = 0.1,
                 nhead: int = None):
        super(Seq2SeqTransformer, self).__init__()
        if nhead is None:
            # Legacy behaviour: the head count came from a global constant that
            # this notebook never defines, so constructing the class failed.
            nhead = NHEAD
        encoder_layer = TransformerEncoderLayer(d_model = emb_size, 
                                                nhead = nhead,
                                                dim_feedforward = dim_feedforward, 
                                                activation = activation, 
                                                layer_norm_eps = layer_norm_eps)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers = num_encoder_layers)

        decoder_layer = TransformerDecoderLayer(d_model = emb_size, 
                                                nhead = nhead, 
                                                dim_feedforward = dim_feedforward,
                                                activation = activation, 
                                                layer_norm_eps = layer_norm_eps)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout = dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits.

        No memory (cross-attention) mask is applied; ``None`` is passed for it.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        outs = self.transformer_decoder(tgt_emb, memory, tgt_mask, None, tgt_padding_mask, memory_key_padding_mask)

        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and return the encoder output (no padding mask is used)."""
        return self.transformer_encoder(self.positional_encoding(self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and return the decoder states, before the ``generator`` projection."""
        return self.transformer_decoder(self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask)

In [None]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sine/cosine position table to token embeddings, then applies dropout.

    The table is stored as the buffer ``pos_embedding`` with shape
    ``(maxlen, 1, emb_size)``: even feature columns hold sines, odd columns
    hold cosines.

    Args:
        emb_size: Embedding size.
        dropout: Dropout probability applied to the sum.
        maxlen: Number of positions in the table.
    """

    def __init__(self, emb_size: int, dropout, maxlen: int = 450):
        super().__init__()
        positions = torch.arange(0, maxlen).reshape(maxlen, 1)
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        angles = positions * den

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        self.dropout = nn.Dropout(dropout)
        # Singleton middle axis so the table broadcasts over dimension 1 of the input.
        self.register_buffer("pos_embedding", table.unsqueeze(-2))

    def forward(self, token_embedding: Tensor):
        """Add the first ``token_embedding.size(0)`` table rows and apply dropout."""
        length = token_embedding.size(0)
        with_positions = token_embedding + self.pos_embedding[:length, :]
        return self.dropout(with_positions)

class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``.

    Args:
        vocab_size: Number of rows of the embedding table.
        emb_size: Embedding size.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens: Tensor):
        """Cast ``tokens`` to long, look them up and scale the result."""
        scale = math.sqrt(self.emb_size)
        indices = tokens.long()
        return self.embedding(indices) * scale

In [None]:
def generate_square_subsequent_mask(sz):
    """Build an additive causal mask of shape ``(sz, sz)`` on ``DEVICE``.

    Entry ``[i, j]`` is ``0.0`` when ``j <= i`` and ``-inf`` otherwise.

    Args:
        sz: Side length of the square mask.

    Returns:
        A float32 tensor on ``DEVICE``.
    """
    on_or_below_diagonal = torch.tril(torch.ones((sz, sz), device=DEVICE)) == 1
    blocked = torch.full((sz, sz), float("-inf"), dtype=torch.float32, device=DEVICE)
    return blocked.masked_fill(on_or_below_diagonal, 0.0)

In [None]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the trained model onto DEVICE. Without map_location a checkpoint saved
# on a GPU cannot be loaded on a CPU-only runtime, and the model could end up
# on a different device than the tensors greedy_decode moves to DEVICE.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust. The
# result is used directly as the model, so the model classes defined above must
# exist before this cell runs.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects pickled model objects -- confirm the installed version.
transformer = torch.load(TEST_MODEL, map_location=DEVICE)

In [None]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate a target sequence one token at a time, always taking the arg-max.

    Decoding starts from ``start_symbol`` and stops after ``EOS_IDX`` has been
    produced or once the sequence holds ``max_len`` tokens.

    Args:
        model: Object exposing ``encode``, ``decode`` and ``generator``.
        src: Source token ids; moved to ``DEVICE`` before encoding.
        src_mask: Source attention mask; moved to ``DEVICE``.
        max_len: Maximum length of the returned sequence, start symbol included.
        start_symbol: Token id that starts the target sequence.

    Returns:
        A long tensor of shape ``(length, 1)`` holding the generated ids,
        beginning with ``start_symbol``.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)
    # Inference only: skip autograd bookkeeping, which otherwise grows with
    # every decoding step.
    with torch.no_grad():
        # The encoder output does not change while decoding, so it is computed
        # and placed on DEVICE once instead of on every step.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
        for _ in range(max_len - 1):
            # Boolean causal mask: True marks positions that may not be attended.
            tgt_mask = (generate_square_subsequent_mask(ys.size(0)).type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Only the state of the last generated position is projected.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()
            ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys

In [None]:
def translate(model, src, src_vocab, tgt_vocab):
    """Translate a BPE-encoded source string into a BPE-encoded target string.

    Args:
        model: Trained model; switched to evaluation mode.
        src: Whitespace-separated BPE tokens of the source program.
        src_vocab: Source vocabulary (must provide ``get_stoi``).
        tgt_vocab: Target vocabulary (must provide ``get_itos``).

    Returns:
        The generated target tokens joined with spaces, with the ``<bos>`` and
        ``<eos>`` markers removed.
    """
    model.eval()
    # Build the lookup tables once; they used to be rebuilt for every token.
    stoi = src_vocab.get_stoi()
    itos = tgt_vocab.get_itos()
    # split() rather than split(" "): text read from a file ends with "\n",
    # which used to stay glued to the last token and turn it into <unk>.
    src_tokens = src.split()
    # Tokens missing from the vocabulary map to index 0, as before.
    tokens = [BOS_IDX] + [stoi.get(tok, 0) for tok in src_tokens] + [EOS_IDX]
    num_tokens = len(tokens)
    src = torch.LongTensor(tokens).reshape(num_tokens, 1)
    # All-False mask: every source position may attend to every other one.
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)
    tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    out = " ".join(itos[tok] for tok in tgt_tokens.tolist()).replace("<bos>", "").replace("<eos>", "")
    return out

In [None]:
def decode_bpe(x):
    """Undo BPE segmentation by removing the ``@@`` continuation markers.

    A marker is removed together with the space that follows it. A marker at
    the very end of the text (for example when generation was cut off) has no
    following space and is removed as well; it used to be left in the output.

    Args:
        x: BPE-encoded text.

    Returns:
        The text with sub-word units merged back together.
    """
    return re.sub(r"@@( |$)", "", x)

In [None]:
def detokenize_java(s):
    """Reformat a space-separated Java token string into source text.

    This is best effort: when the text cannot be tokenized or reformatted by
    javalang, it is returned unchanged.

    Args:
        s: Space-separated Java tokens.

    Returns:
        The reformatted source, or ``s`` itself if reformatting failed.
    """
    try:
        tokens = javalang.tokenizer.tokenize(s)
        return javalang.tokenizer.reformat_tokens(tokens)
    except Exception:
        # Not a bare "except:", so KeyboardInterrupt and SystemExit still propagate.
        return s


In [None]:
def _consume_leading_markers(line, indent_stack):
    """Strip block markers from the front of ``line`` and update ``indent_stack``."""
    while True:
        head, _, rest = line.partition(" ")
        if head == "DEDENT":
            # One DEDENT closes one block; extra ones are ignored.
            if indent_stack:
                indent_stack.pop()
        elif head == "NL":
            pass
        elif head and not head.replace("INDENT", ""):
            # "INDENT" repeated n times opens a block indented by n units.
            indent_stack.append(head.count("INDENT"))
        else:
            return line
        line = rest.lstrip()


def detokenize_python(s):
    """Rebuild Python source text from the token string of ``tokenize_python``.

    The text is split into lines at ``NEWLINE``. ``tokenize_python`` emits
    ``INDENT`` only on the first line of a block and ``DEDENT`` only on the
    line that follows it, so the current indentation is tracked across lines
    and applied to every line of the block, at four spaces per ``INDENT``
    unit. Previously only the first line of a block was indented, and it got
    one space too many because the ``i == idn_count`` branch could never run.

    Args:
        s: Token string containing NEWLINE / INDENT / DEDENT / NL / ENDMARKER.

    Returns:
        The reconstructed source with spaces around ``.`` removed.
    """
    indent_stack = []
    cleaned_lines = []
    for line in s.split("NEWLINE"):
        line = _consume_leading_markers(line.strip(), indent_stack)
        # Remove markers left elsewhere in the line (same substring removals as before).
        # NOTE(review): these are substring matches, so an identifier that
        # contains e.g. "NL" is shortened too -- confirm whether that matters.
        line = line.replace("INDENT", "")
        line = line.replace("DEDENT ", "")
        line = line.replace("DEDENT", "")
        line = line.replace("NL", "")
        line = line.replace("ENDMARKER", "")
        if indent_stack and line.strip():
            line = "    " * indent_stack[-1] + line
        cleaned_lines.append(line)
    code = "\n".join(cleaned_lines)
    code = code.replace(". ", ".").replace(" .", ".")
    return code

In [None]:
def detokenize_code(s, lang):
    """Turn a token string back into source text for the given language code.

    Args:
        s: Token string to detokenize.
        lang: ``"ja"`` selects Java; any other value is handled as Python,
            matching ``tokenize_code``. Unknown codes used to return ``None``.

    Returns:
        The detokenized source text.
    """
    if lang == "ja":
        return detokenize_java(s)
    return detokenize_python(s)

In [None]:
def cleanup(s):
    """Normalize the spelling of structural marker words to upper case.

    Case-insensitive matches of "newline", "new line", "indent" and "dedent"
    are rewritten, in that order, to NEWLINE, NEWLINE, INDENT and DEDENT.

    Args:
        s: Text produced by the model after BPE decoding.

    Returns:
        The text with the marker words normalized.
    """
    rewrites = (
        ("newline", "NEWLINE"),
        ("new line", "NEWLINE"),
        ("indent", "INDENT"),
        ("dedent", "DEDENT"),
    )
    text = s
    for pattern, marker in rewrites:
        text = re.sub(pattern, marker, text, flags=re.IGNORECASE)
    return text

In [None]:
# Display names of the two languages, derived from their short codes.
SRC_LANG = "Python" if SRC_LANGUAGE != "ja" else "Java"
TGT_LANG = "Python" if TGT_LANGUAGE != "ja" else "Java"


def _show_stage(heading, body, spacer=True):
    """Print a stage heading and its text, optionally followed by a blank spacer."""
    print(heading)
    print(body)
    if spacer:
        print("\n\n")


# try on single file
# Stage 1: the original program.
with open(SRC_FILE, "r") as f:
  src_code = f.read()
_show_stage(f"Program in \"{SRC_LANG}\":", src_code)

# Stage 2: its BPE encoding, as written by the fastBPE step.
with open(SRC_BPE_FILE, "r") as f:
  bpe_code = f.read()
_show_stage(f"BPE of program in \"{SRC_LANG}\":", bpe_code)

# Stage 3: model output, still BPE-encoded.
translated_bpe = translate(
    transformer,
    bpe_code,
    vocab_transform[SRC_LANGUAGE],
    vocab_transform[TGT_LANGUAGE],
)
_show_stage(f"Translated BPE of program in the target language \"{TGT_LANG}\":", translated_bpe)

# Stage 4: sub-word units merged back into tokens.
decoded_code = decode_bpe(translated_bpe)
_show_stage(f"BPE decoded program in the target language \"{TGT_LANG}\":", decoded_code)

# Stage 5: tokens turned back into source text.
detokenized_code = detokenize_code(cleanup(decoded_code), TGT_LANGUAGE)
_show_stage(f"Detokenized program in the target language \"{TGT_LANG}\":", detokenized_code, spacer=False)

# Save the translated program under the extension of the target language.
TGT_EXT = "py" if TGT_LANGUAGE != "ja" else "java"
TGT_FILE = f"translate.{TGT_EXT}"
with open(TGT_FILE, "w") as f:
  f.write(detokenized_code)

if TGT_LANGUAGE == "pn":
  # construct minified file and store
  !pip install pyminifier
  os.popen("autopep8 --in-place --aggressive --aggressive translate.py")
  os.popen("pyminifier translate.py > mini_translate.py")
      

In [None]:
# write everything to a file
def _report_section(title, body, trailing_newlines=3):
    """Format one report section: title, an '=' underline of the same length, body.

    The underline used to be one character too long because the title's
    trailing newline was counted in its length.
    """
    return f"{title}\n{'=' * len(title)}\n{body}" + "\n" * trailing_newlines


txt = ""
txt += _report_section(f"Program in \"{SRC_LANG}\":", src_code, trailing_newlines=2)
txt += _report_section(f"BPE of program in \"{SRC_LANG}\":", bpe_code)
txt += _report_section(f"Translated BPE of program in the target language \"{TGT_LANG}\":", translated_bpe)
txt += _report_section(f"BPE decoded program in the target language \"{TGT_LANG}\":", decoded_code)
txt += _report_section(f"Detokenized program in the target language \"{TGT_LANG}\":", detokenized_code)

if TGT_LANGUAGE == "pn":
  # The minified file only exists when the target language is Python.
  with open("mini_translate.py", "r") as f:
    mini = f.read()
  txt += _report_section("Minified program:", mini)

with open("details.txt", "w") as f:
  f.write(txt)
