In [None]:
# Mount Google Drive at /content/drive. The dataset and output paths used
# later in this notebook are relative ("drive/MyDrive/...").
# NOTE(review): those relative paths presumably resolve from /content, Colab's
# default working directory — confirm if the notebook is run elsewhere.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
import json
import os

In [None]:
# Target system the data is prepared for. It selects the input JSON and the
# output directory in the loading cell; when set to "codebert" the notebook
# stops before the fastBPE cells.
# Alternative value:
# SYS = "transformer"
SYS = "codebert"

In [None]:
def read_json(filepath):
  """Load the parallel Java/Python snippets stored in a JSON dataset file.

  The file must contain an object with a 'codes' list; every item of that
  list must provide a 'java_code' and a 'python_code' string.

  Args:
    filepath: Path of the JSON file to read.

  Returns:
    A tuple (java_codes, python_codes) of two equally long lists of strings.
    Entry i of both lists comes from the same item, and every entry has a
    trailing newline appended so the lists can be passed to writelines().

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file is not valid JSON.
    KeyError: If 'codes', 'java_code' or 'python_code' is missing.
  """
  # The context manager closes the file even when parsing fails; JSON text is
  # UTF-8, so do not depend on the platform's default encoding.
  with open(filepath, 'r', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

  java_codes, python_codes = [], []

  # NOTE(review): one output line per sample assumes snippets contain no
  # embedded newline characters — confirm against the dataset.
  for item in json_data['codes']:
    java_codes.append(item['java_code'] + "\n")
    python_codes.append(item['python_code'] + "\n")

  return java_codes, python_codes

In [None]:
def write(filepath, data):
  """Write an iterable of strings to a text file, replacing any old content.

  Args:
    filepath: Destination path; its parent directory must already exist.
    data: Iterable of strings written back to back with no separator added,
      so each string should carry its own line terminator.

  Raises:
    OSError: If the file cannot be opened or written.
  """
  # The context manager closes (and flushes) the file even if writing fails;
  # UTF-8 keeps non-ASCII source code independent of the platform locale.
  with open(filepath, "w", encoding="utf-8") as f:
    f.writelines(data)

In [None]:
# Select the dataset file and output directory for the configured system.
# Any SYS value other than "transformer" falls through to the CodeBERT paths.
if SYS == "transformer":
  JSON_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/dataset/code450.json"
  OUTPUT_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/preprocessed_files"
else:
  JSON_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/dataset/codebert/codebert_code450.json"
  OUTPUT_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/preprocessed_files/codebert"

# Number of leading samples that go into the training set.
# 80:20 split of 3133 samples -> 2506 train / 627 test.
# NOTE(review): the constant is tied to a 3133-sample dataset; if a JSON file
# holds a different number of entries the ratio silently changes — confirm.
TRAIN_SIZE = 2506
java_codes, python_codes = read_json(JSON_FILEPATH)

# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before any split file is written.
os.makedirs(OUTPUT_FILEPATH, exist_ok=True)

# Write the splits; line i of a .ja file and line i of the matching .pn file
# come from the same dataset item.
# Training data: the first TRAIN_SIZE samples
write(f"{OUTPUT_FILEPATH}/train.ja", java_codes[:TRAIN_SIZE])
write(f"{OUTPUT_FILEPATH}/train.pn", python_codes[:TRAIN_SIZE])

# Test data: everything after the first TRAIN_SIZE samples
write(f"{OUTPUT_FILEPATH}/test.ja", java_codes[TRAIN_SIZE:])
write(f"{OUTPUT_FILEPATH}/test.pn", python_codes[TRAIN_SIZE:])

In [None]:
# Display the total number of loaded samples (train + test) as a sanity check.
len(java_codes)

In [None]:
# Raise SystemExit when preparing data for CodeBERT: per the message below, the
# remaining cells (fastBPE build and BPE encoding) are not needed in that case.
if SYS == "codebert":
  raise SystemExit("Stop right there! No further processing is required for CodeBERT.")

In [None]:
# install fastBPE
# Clone the fastBPE sources into ./fastBPE.
!git clone https://github.com/glample/fastBPE.git
# Build inside the checkout; the resulting binary is ./fastBPE/fast.
%cd fastBPE
!g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
# Return to the previous working directory so later relative paths still resolve.
%cd ..

In [None]:
BPE_FILEPATH = f"{OUTPUT_FILEPATH}/BPE"
MAX_COUNT = 10000

# learn bpe codes
!./fastBPE/fast learnbpe $MAX_COUNT $OUTPUT_FILEPATH/train.ja $OUTPUT_FILEPATH/train.pn > $BPE_FILEPATH/codes

# apply bpe 
!./fastBPE/fast applybpe $BPE_FILEPATH/train.ja.$MAX_COUNT $OUTPUT_FILEPATH/train.ja $BPE_FILEPATH/codes
!./fastBPE/fast applybpe $BPE_FILEPATH/train.pn.$MAX_COUNT $OUTPUT_FILEPATH/train.pn $BPE_FILEPATH/codes

# construct vocabulary
!./fastBPE/fast getvocab $BPE_FILEPATH/train.ja.$MAX_COUNT > $BPE_FILEPATH/vocab.ja.$MAX_COUNT
!./fastBPE/fast getvocab $BPE_FILEPATH/train.pn.$MAX_COUNT > $BPE_FILEPATH/vocab.pn.$MAX_COUNT

# apply codes to test
!./fastBPE/fast applybpe $BPE_FILEPATH/test.ja.$MAX_COUNT  $OUTPUT_FILEPATH/test.ja  $BPE_FILEPATH/codes $BPE_FILEPATH/vocab.ja.$MAX_COUNT
!./fastBPE/fast applybpe $BPE_FILEPATH/test.pn.$MAX_COUNT  $OUTPUT_FILEPATH/test.pn  $BPE_FILEPATH/codes $BPE_FILEPATH/vocab.pn.$MAX_COUNT