### Download data

In [None]:
# Fetch the Google Drive file with this ID using gdown.
# NOTE(review): presumably this is ted_raw.tar.gz, which the gunzip cell below
# expects at /content/ted_raw.tar.gz — confirm.
! gdown 1L1v_wwa8GwEGUy39Xls1JF2VoU0hSDJ7

In [2]:
# Copy the project folder from Google Drive into /content/.
# NOTE(review): this needs Drive mounted at /content/drive; no mount cell is
# visible in this part of the notebook — confirm it runs earlier.
! cp -r /content/drive/MyDrive/RuEnImageCaptioning /content/

In [6]:
import os

# Use the absolute path of the folder copied into /content/ so that re-running
# this cell is harmless; a relative chdir raises FileNotFoundError once the
# working directory has already been changed.
os.chdir("/content/RuEnImageCaptioning")

In [None]:
# Sanity check: print the working directory after the chdir above.
! pwd

In [None]:
! pip install -r requirements.txt

In [None]:
! pip install transformers

In [None]:
! pip install adapter-transformers

In [12]:
! gunzip /content/ted_raw.tar.gz

In [None]:
# Unpack the archive into the current working directory (no -C is given);
# later cells read it through relative paths such as ted_raw/rus_eng/...
! tar -xvf /content/ted_raw.tar

### Russian Data Creation

In [14]:
import json
import os

def convert_to_jsonl(source_filepath, target_filepath, source_lang_code, target_lang_code, limit=-1):
    """Convert a ``left ||| right`` parallel-text file into JSON Lines records.

    Every input line must contain exactly one `` ||| `` separator. The record
    written for a line is
    ``{"translation": {source_lang_code: <right>, target_lang_code: <left>}}``:
    the text *after* the separator is stored under ``source_lang_code`` and the
    text *before* it under ``target_lang_code``.
    NOTE(review): this pairing looks swapped; presumably the input files are
    laid out as ``<target language> ||| <source language>`` — confirm.

    Records are appended to ``target_filepath`` (the file is created when it
    does not exist), so converting into the same target twice duplicates data.

    Args:
        source_filepath: Path of the UTF-8 input file.
        target_filepath: Path of the JSON Lines output file.
        source_lang_code: Key under which the text after the separator is stored.
        target_lang_code: Key under which the text before the separator is stored.
        limit: Maximum number of leading lines to convert. Any negative value
            (the default is -1) or ``None`` converts the whole file.

    Raises:
        ValueError: If a line does not contain exactly one `` ||| `` separator.
    """
    with open(source_filepath, "r", encoding="utf-8") as stream:
        lines = stream.readlines()

    # Slicing with the default limit of -1 (lines[:-1]) would silently drop the
    # last line, so only slice when a real, non-negative limit was requested.
    if limit is not None and limit >= 0:
        lines = lines[:limit]

    jsonl_dicts = []
    for line_no, line in enumerate(lines, start=1):
        parts = line.split(" ||| ")
        if len(parts) != 2:
            raise ValueError(
                f"{source_filepath}:{line_no}: expected exactly one ' ||| ' separator, "
                f"found {len(parts) - 1}"
            )
        source_sent, target_sent = parts
        target_sent = target_sent.replace("\n", "")
        jsonl_dicts.append(
            {"translation": {source_lang_code: target_sent, target_lang_code: source_sent}}
        )

    # Mode "a" appends to an existing file and creates a missing one, which
    # covers both branches of the former exists()/else split.
    with open(target_filepath, "a", encoding="utf-8") as stream:
        for jsonl_dict in jsonl_dicts:
            stream.write(json.dumps(jsonl_dict, ensure_ascii=False) + "\n")

# Keys used inside each "translation" record.
source_lang_code, target_lang_code = "en", "ru"

# Input (" ||| "-separated text) and output (JSON Lines) path for each split.
train_source_filepath, train_target_filepath = "ted_raw/rus_eng/ted-train.orig.rus-eng", "ru_en_train.json"
dev_source_filepath, dev_target_filepath = "ted_raw/rus_eng/ted-dev.orig.rus-eng", "ru_en_dev.json"
test_source_filepath, test_target_filepath = "ted_raw/rus_eng/ted-test.orig.rus-eng", "ru_en_test.json"

# Convert the splits in the order train, dev, test.
for _split_source, _split_target in (
    (train_source_filepath, train_target_filepath),
    (dev_source_filepath, dev_target_filepath),
    (test_source_filepath, test_target_filepath),
):
    convert_to_jsonl(_split_source, _split_target, source_lang_code, target_lang_code)

In [15]:
import json

# Load the English caption predictions. Read as UTF-8 explicitly so the result
# does not depend on the platform's default encoding; the cells below iterate
# over `data` and read each record's "hypothesis" field.
with open("eng_predicted_captions.json", "r", encoding="utf-8") as stream:
    data = json.load(stream)

In [None]:
# Number of caption records loaded from eng_predicted_captions.json.
len(data)

In [None]:
# Peek at the first record; the next cell reads its "hypothesis" field.
print(data[0])

In [18]:
# Wrap every predicted caption in the "translation" record layout. The same
# caption text is stored under both "en" and "ru".
# NOTE(review): presumably "ru" is only a placeholder because no Russian
# reference exists for these captions — confirm.
eval_data = [
    {"translation": {"en": record["hypothesis"], "ru": record["hypothesis"]}}
    for record in data
]

In [None]:
# Inspect the first converted record.
eval_data[0]

In [20]:
# Write one JSON object per line. ensure_ascii=False keeps non-ASCII text
# readable; the former .encode('utf8').decode() round-trip was a no-op because
# the file itself is opened as UTF-8.
with open("en-to-ru-test.json", "w", encoding="utf-8") as stream:
    for jsonl_dict in eval_data:
        stream.write(json.dumps(jsonl_dict, ensure_ascii=False) + "\n")

In [21]:
# NOTE(review): not used by any visible cell; kept in case later code relies on it.
combined_mr_data = []

# Files concatenated, in this order, into en-to-ru-train.json.
# NOTE(review): "en-to-ru.json" is not produced by any visible cell —
# presumably it ships with the project folder; confirm.
train_part_filepaths = [
    "ru_en_train.json",
    "ru_en_dev.json",
    "ru_en_test.json",
    "en-to-ru.json",
]

# Opening the output once in "w" mode rebuilds it from scratch on every run.
with open("en-to-ru-train.json", "w", encoding="utf-8") as stream_out:
    for part_filepath in train_part_filepaths:
        # Read as UTF-8 explicitly, matching how the parts written by this
        # notebook are encoded.
        with open(part_filepath, "r", encoding="utf-8") as stream_in:
            last_line = ""
            for line in stream_in:
                stream_out.write(line)
                last_line = line
        # If a part lacks a trailing newline, add one so its last record is not
        # glued to the first record of the next part.
        if last_line and not last_line.endswith("\n"):
            stream_out.write("\n")

### Adapter fine-tuning

In [None]:
# Run finetune_adapter.py on facebook/mbart-large-50-many-to-many-mmt with
# --train_adapter for en_XX -> ru_RU, training on en-to-ru-train.json.
# en-to-ru-test.json is passed as both the validation and the test file.
# NOTE(review): if that is the file written earlier in this notebook, its "ru"
# side is a copy of the English caption, so evaluation metrics computed against
# it are presumably not meaningful and only the generated predictions matter —
# confirm.
# Outputs go to the absolute directory /content/mbart/en-ru.
! python finetune_adapter.py \
    --model_name_or_path "facebook/mbart-large-50-many-to-many-mmt" \
    --train_adapter \
    --do_train \
    --do_eval \
    --do_predict \
    --evaluation_strategy "steps" \
    --save_total_limit 1 \
    --fp16 \
    --eval_steps 5000 \
    --train_file en-to-ru-train.json \
    --validation_file en-to-ru-test.json \
    --test_file en-to-ru-test.json \
    --source_lang en_XX \
    --target_lang ru_RU \
    --output_dir /content/mbart/en-ru \
    --per_device_train_batch_size=8 \
    --per_device_eval_batch_size=8 \
    --overwrite_output_dir \
    --num_train_epochs 1 \
    --predict_with_generate

In [None]:
from google.colab import files

# The fine-tuning run uses the absolute --output_dir /content/mbart/en-ru, while
# the notebook's working directory is the project folder, so the former relative
# path "mbart/en-ru/..." pointed at a different location.
# NOTE(review): assumes finetune_adapter.py writes generated_predictions.txt
# directly into its output directory — confirm.
files.download("/content/mbart/en-ru/generated_predictions.txt")

### Formatting Predictions

In [None]:
# Load the Russian caption records whose "hypothesis" field is replaced below.
with open("ru20k_predicted_captions.json", "r", encoding="utf-8") as stream:
  ru_preds = json.load(stream)

# Read one prediction per line. The absolute path matches the --output_dir of
# the fine-tuning run; the former relative "mbart/en-ru/..." path resolved
# against the project folder instead.
with open("/content/mbart/en-ru/generated_predictions.txt", "r", encoding="utf-8") as stream:
  ru_test = [line.strip() for line in stream]

# Records and predictions are paired by position, so the counts must match.
assert len(ru_preds) == len(ru_test), (
  f"{len(ru_preds)} caption records but {len(ru_test)} generated predictions"
)

# Overwrite each record's hypothesis with the prediction at the same index.
for idx, pred in enumerate(ru_test):
  ru_preds[idx]["hypothesis"] = pred

# ensure_ascii=False keeps non-ASCII text readable in the output file.
with open("mBART50_ru_preds.json", "w", encoding="utf-8") as stream:
  json.dump(ru_preds, stream, ensure_ascii=False)

files.download("mBART50_ru_preds.json")