In [None]:
# from google.colab import drive
# drive.mount("/content/drive")

In [None]:
%%capture
!pip install datasets
!pip install transformers

In [None]:
from transformers import RobertaTokenizer, EncoderDecoderModel, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import numpy as np
from datasets import Dataset
from torch.utils.data import DataLoader
import csv

In [None]:
# load datasets

##### local device path #####
# PREPROCESSED_FILEPATH = "code_translation/preprocessed_files/codebert"
# OUTPUT_FILEPATH = "code_translation/codebert_output_files"

##### drive path #####
# PREPROCESSED_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/preprocessed_files/codebert"
# OUTPUT_FILEPATH = "drive/MyDrive/dissertation_workplace/code_translation/codebert_output_files"

##### kaggle path #####
# Directory of the preprocessed train.* files, and the output directory.
OUTPUT_FILEPATH = "./"
PREPROCESSED_FILEPATH = "../input/code-translation-v3/preprocessed_files/codebert"

# Training hyper-parameters.
LEARNING_RATE = 5e-5
NUM_EPOCHS = 50

# Translation direction; these values are also the keys of each translation record.
TGT_LANGUAGE = "ja"
SRC_LANGUAGE = "pn"

# Repository name built from the direction, the epoch count and the learning rate.
REPOSITORY_ID = f"mini_codebert_sourcecode_nmt_{SRC_LANGUAGE}2{TGT_LANGUAGE}_{NUM_EPOCHS}E_{LEARNING_RATE}LR"

# Training data: both files are read line by line; line i of train.ja is later
# paired with line i of train.pn. The encoding is passed explicitly so decoding
# does not depend on the platform's default locale (assumes the corpus is
# stored as UTF-8 -- TODO confirm).
with open(f"{PREPROCESSED_FILEPATH}/train.ja", "r", encoding="utf-8") as f:
  java_codes = f.readlines()

with open(f"{PREPROCESSED_FILEPATH}/train.pn", "r", encoding="utf-8") as f:
  python_codes = f.readlines()

In [None]:
# Pair the two corpora line by line into {"ja": ..., "pn": ...} records,
# stripping trailing whitespace (including the newline kept by readlines()).
translation = [
  {"ja": java_line.rstrip(), "pn": python_line.rstrip()}
  for java_line, python_line in zip(java_codes, python_codes)
]

# Single-column dataset whose "translation" column holds the records.
datasets = Dataset.from_dict({"translation": translation})

In [None]:
%%capture
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

In [None]:
# Token limits used when truncating source and target sequences.
max_input_length = 450
max_target_length = 450

def preprocess_function(batch):
    """Tokenize a batch of translation records into model inputs and labels.

    Args:
        batch: Mapping with a "translation" list whose items are dicts keyed
            by SRC_LANGUAGE and TGT_LANGUAGE.

    Returns:
        The tokenizer output for the source texts, extended with a "labels"
        entry holding the token ids of the target texts.
    """
    inputs = [ex[SRC_LANGUAGE] for ex in batch["translation"]]
    targets = [ex[TGT_LANGUAGE] for ex in batch["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Targets are encoded with a plain tokenizer call. The as_target_tokenizer()
    # context manager is deprecated in transformers, and RoBERTa tokenizers
    # define no separate target mode, so the resulting ids are the same.
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize in batches and drop the original columns so that only the
# outputs of preprocess_function remain.
tokenized_datasets = datasets.map(
    function=preprocess_function,
    batched=True,
    remove_columns=datasets.column_names,
)

In [None]:
# Build the encoder-decoder model from the CodeBERT checkpoint on both sides,
# with encoder/decoder tying switched on.
codebert_shared = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path="microsoft/codebert-base",
    decoder_pretrained_model_name_or_path="microsoft/codebert-base",
    tie_encoder_decoder=True,
)

In [None]:
# Special tokens are taken from the tokenizer.
model_config = codebert_shared.config
model_config.decoder_start_token_id = tokenizer.bos_token_id
model_config.eos_token_id = tokenizer.eos_token_id
model_config.pad_token_id = tokenizer.pad_token_id

# Generation-related settings.
model_config.max_length = max_target_length  # maximum length of the sequence to be generated
model_config.no_repeat_ngram_size = 3  # if > 0, every n-gram of that size can only occur once
model_config.vocab_size = model_config.encoder.vocab_size

# Per-device batch size used for training.
BATCH_SIZE = 4

In [None]:
# Seq2seq collator built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=codebert_shared,
)

In [None]:
# Sanity check: collate the examples at indices 1 and 2 and inspect the batch.
sample_rows = [tokenized_datasets[row] for row in (1, 2)]
example = data_collator(sample_rows)
print(example.keys())
for field in ("labels", "decoder_input_ids"):
    print(example[field])

In [None]:
# Training configuration: one checkpoint and one log entry per epoch, with at
# most three checkpoints kept.
args = Seq2SeqTrainingArguments(
    "model",  # output directory
    # The evaluation strategy is deliberately not passed: its default is "no"
    # (no evaluation during training), and the keyword was renamed from
    # `evaluation_strategy` to `eval_strategy` in newer transformers releases,
    # so omitting it works with either spelling.
    do_train=True,
    save_strategy = "epoch",
    learning_rate = LEARNING_RATE,
    per_device_train_batch_size = BATCH_SIZE,
    save_total_limit = 3,
    num_train_epochs = NUM_EPOCHS,
    predict_with_generate=True,
    # Mixed-precision training is only enabled when a CUDA device is present,
    # so the notebook still runs (in full precision) without a GPU.
    fp16 = torch.cuda.is_available(),
    logging_strategy = "epoch"
)

In [None]:
# Trainer for the tied CodeBERT encoder-decoder. The model variable created
# earlier in the notebook is `codebert_shared`.
trainer = Seq2SeqTrainer(
    codebert_shared,
    args,
    train_dataset = tokenized_datasets,
    data_collator = data_collator,
    tokenizer = tokenizer
)

In [None]:
# Run training and record timer readings immediately before and after it;
# the difference is turned into a duration message later in the notebook.
from timeit import default_timer as timer
start_training_time = timer()
trainer.train()
end_training_time = timer()

In [None]:
# Save the trained model into the "model" directory.
trainer.save_model(output_dir="model")

In [None]:
# Convert the measured duration to minutes, switching to hours and minutes
# once it reaches a full hour. `train_time` keeps the summary string.
time_taken_to_train = end_training_time - start_training_time
mins = time_taken_to_train / 60
if mins < 60:
    train_time = f"Training time: {mins} mins"
    message = f"Training time taken: {mins} mins"
else:
    hrs = int(mins / 60)
    mins -= hrs * 60
    train_time = message = f"Training time taken: {hrs} hrs {mins} mins"
print(message)

In [None]:
# Write the logged training loss to a CSV file.
LOSS_FILE = "loss_data.csv"
log_history = trainer.state.log_history
field_names = ["epoch", "loss", "learning_rate", "step"]

# Keep only log entries that carry every exported field. Entries without
# them (for example a run summary that has no "loss" key) are skipped by
# content rather than by position, so they cannot raise a KeyError.
loss_data = [
    {name: entry[name] for name in field_names}
    for entry in log_history
    if all(name in entry for name in field_names)
]

# newline="" is what the csv module requires for files it writes to;
# without it, extra blank rows can appear on platforms using \r\n.
with open(f"./{LOSS_FILE}", "w", newline="") as outfile:
  writer = csv.DictWriter(outfile, fieldnames = field_names)
  writer.writeheader()
  writer.writerows(loss_data)

In [None]:
# save model to HuggingFace hub

In [None]:
# Prompt for Hugging Face credentials from inside the notebook.
import huggingface_hub
huggingface_hub.notebook_login()

In [None]:
# Shell command: set git's global credential.helper option to "store".
!git config --global credential.helper store

In [None]:
from huggingface_hub import HfApi
api = HfApi()
# exist_ok=True keeps this cell re-runnable: a repository that already exists
# (e.g. from an earlier run) is reused instead of raising an error.
api.create_repo(repo_id = REPOSITORY_ID, private = False, repo_type = "model", exist_ok = True)

In [None]:
import os

# Upload every entry of the "model" directory to the hub repository, except
# those whose names start with "checkpoint" or "run".
skipped_prefixes = ("checkpoint", "run")
for entry_name in os.listdir("model"):
    if not entry_name.startswith(skipped_prefixes):
        url = api.upload_file(
            path_or_fileobj = f"model/{entry_name}",
            path_in_repo = f"{entry_name}",
            repo_id = f"joshanashakya/{REPOSITORY_ID}",
        )

In [None]:
# Upload the loss CSV to the hub repository under the same file name.
url = api.upload_file(
    repo_id = f"joshanashakya/{REPOSITORY_ID}",
    path_in_repo = f"{LOSS_FILE}",
    path_or_fileobj = f"./{LOSS_FILE}",
)

In [None]:
# Write the training-time summary (`train_time`, built after training) to a
# text file, then upload it to the hub repository under the same file name.
TIME_FILE = "training_time.txt"
with open(f"./{TIME_FILE}", "w") as outfile:
    outfile.write(train_time + "\n")

url = api.upload_file(
        path_or_fileobj = f"./{TIME_FILE}", 
        path_in_repo = f"{TIME_FILE}", 
        repo_id = f"joshanashakya/{REPOSITORY_ID}",
    )