In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input mount and print the full path of every file.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)


In [None]:
# All three competition files sit in the same directory of Kaggle's input
# mount. Keeping that directory in one constant means a relocated or renamed
# dataset needs a single edit instead of three.
DATA_DIR = "/kaggle/input/deep-past-initiative-machine-translation"

# `train` supplies the "transliteration"/"translation" pairs used for
# fine-tuning, `test` the "transliteration" rows to translate, and `sample`
# the template the submission is copied from.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")


In [None]:
# Keep only the source/target text columns and hand them to the `datasets`
# library so they can be tokenized with a batched `map` later on.
train_ds = Dataset.from_pandas(
    train.loc[:, ["transliteration", "translation"]]
)


In [None]:
MAX_LEN = 256  # token budget applied to both the source and the target text

# `tokenizer` and `model` are used by this and every later cell but were never
# created anywhere in the notebook, so a fresh "Restart & Run All" failed with
# NameError. Load them here, before first use.
# NOTE(review): the checkpoint is inferred from output_dir="./byt5" in the
# training cell -- confirm it is the intended one. For an offline Kaggle
# session, point MODEL_NAME at a local copy of the checkpoint instead.
MODEL_NAME = "google/byt5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)


def preprocess(batch):
    """Tokenize one batch of transliteration/translation pairs.

    Args:
        batch: mapping with a "transliteration" list (source texts) and a
            "translation" list (target texts) of equal length.

    Returns:
        The tokenizer output for the prefixed source texts, with the tokenized
        targets attached as labels via ``text_target``. Sequences are
        truncated to ``MAX_LEN`` but deliberately NOT padded.
    """
    src = ["translate Akkadian to English: " + x for x in batch["transliteration"]]
    # No padding here: padding to max_length at this stage fills the labels
    # with the tokenizer's real pad id, so the loss would also be computed on
    # padding positions. DataCollatorForSeq2Seq pads each batch at collation
    # time instead and fills label padding with -100, which the loss ignores.
    return tokenizer(
        src,
        text_target=batch["translation"],
        truncation=True,
        max_length=MAX_LEN,
    )


# Replace the raw text columns with the tokenized features.
train_ds = train_ds.map(
    preprocess,
    batched=True,
    remove_columns=train_ds.column_names
)


In [None]:
# Mixed precision needs a CUDA device; requesting fp16 on a CPU-only session
# makes TrainingArguments raise. Gate it so this cell agrees with the
# inference cell, which already falls back to CPU.
# NOTE(review): T5-family checkpoints have been reported to overflow to
# NaN/inf losses under fp16 -- watch the logged loss and consider bf16 where
# the hardware supports it.
use_fp16 = torch.cuda.is_available()

args = TrainingArguments(
    output_dir="./byt5",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch size of 16 per device
    learning_rate=1e-4,
    num_train_epochs=6,     # use 8 if time allows
    fp16=use_fp16,
    logging_steps=100,
    save_total_limit=1,     # keep only the most recent checkpoint on disk
    report_to="none",       # no external experiment trackers
    remove_unused_columns=False
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    # Pads inputs and labels per batch (see the note above `use_fp16` for
    # precision; padding itself is handled entirely by this collator).
    data_collator=DataCollatorForSeq2Seq(tokenizer, model),
)

trainer.train()


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # inference mode: disables dropout

def generate(text):
    """Translate a single transliteration string with beam search.

    Args:
        text: source string; the same task prefix used during training is
            prepended before tokenization.

    Returns:
        The decoded top beam as a plain string with special tokens removed.
    """
    inputs = tokenizer(
        "translate Akkadian to English: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LEN
    ).to(device)

    with torch.no_grad():
        # Forward the whole encoding (input_ids AND attention_mask) rather
        # than input_ids alone, so generate() does not have to infer the mask.
        out = model.generate(
            **inputs,
            num_beams=5,
            max_length=MAX_LEN
        )

    return tokenizer.decode(out[0], skip_special_tokens=True)


In [None]:
# Written for every row that has no usable translation. A single space is used
# rather than "" because an empty string is written as an empty CSV field,
# which reads back as a missing value.
PLACEHOLDER = " "

translations = []
n_test = len(test)

# One output row per row of the sample submission, matched to `test` by position.
for i in range(len(sample)):
    # Sample rows beyond the end of `test` have no source text.
    source = test.loc[i, "transliteration"] if i < n_test else None

    if isinstance(source, str):
        # An empty prediction would become a null in the written file even
        # though the in-memory frame holds no NaN, so fall back to the placeholder.
        translations.append(generate(source) or PLACEHOLDER)
    else:
        # Missing (NaN) source text cannot be concatenated with the task
        # prefix inside generate(); emit the placeholder instead of crashing.
        translations.append(PLACEHOLDER)

submission = sample.copy()
submission["translation"] = translations

submission.to_csv("submission.csv", index=False)


In [None]:
# Preview the first rows and compare dimensions against the template.
print(submission.head())
print(submission.shape, sample.shape)

# The submission must have the template's exact shape and no missing cells.
assert sample.shape == submission.shape
assert not submission.isna().any().any()
