In [None]:
!pip install datasets
!pip install tokenizers
!pip install rouge_score
!pip install transformers==4.28.0
!pip install pytorch-transformers
!pip install accelerate -U
!pip install accelerate
!pip install accelerate==0.17.1
!pip install pytorch-accelerated
!pip install sacrebleu
!pip install evaluate

In [None]:
import moxing as mox

# Remote (OBS) location of the dataset CSV and the local path it is copied to.
# Despite the "folder" in their names, both variables point at a single file.
obs_folder_path = 'obs://hakan/mt-dataset.csv'
local_folder_path = '/home/ma-user/work/mt-dataset.csv'

# Copy the CSV from OBS to the local path so it can be read from disk.
mox.file.copy(obs_folder_path, local_folder_path)

In [None]:
import os
import torch
import evaluate
import numpy as np
import pandas as pd

from datasets import Dataset
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq

In [None]:
## Load Dataset
# The "csv" loader exposes the whole file under the single split name "train".
dataset = load_dataset("csv", data_files=local_folder_path)

## Split Dataset
## 80% train, 20% validation
# shuffle=False keeps the file order, so the validation set is the tail of the CSV.
train_valid = dataset["train"].train_test_split(test_size=0.2, shuffle=False)

train_data, val_data = train_valid["train"], train_valid["test"]
train_valid_dataset = DatasetDict({"train": train_data, "valid": val_data})

# Quick sanity check of the training split (columns / row count) and its type.
print(train_data)
print(type(train_data))

In [None]:
# Pretrained checkpoint that is fine-tuned below; the model and its tokenizer
# are loaded from the same identifier so their vocabularies match.
checkpoint = "google-t5/t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
## Preprocess ##

# Keys used to pull the source and target sentences out of each batch.
source_lang, target_lang = "en", "de"
# Task prefix prepended to every source sentence before tokenization.
prefix = "translate English to German: "


def preprocess_function(examples):
    """Tokenize one batch of sentence pairs for seq2seq training.

    Args:
        examples: batch mapping that is indexed with ``source_lang`` and
            ``target_lang`` to obtain the source and target sentences.

    Returns:
        The tokenizer output for the prefixed sources, with the targets
        passed as ``text_target``; both are truncated to 128 tokens.
    """
    prefixed_sources = [prefix + sentence for sentence in examples[source_lang]]
    target_sentences = list(examples[target_lang])
    return tokenizer(
        prefixed_sources,
        text_target=target_sentences,
        max_length=128,
        truncation=True,
    )


# Tokenize both splits batch-wise with preprocess_function.
train_tokenized_datasets = train_data.map(preprocess_function, batched=True)
val_tokenized_datasets = val_data.map(preprocess_function, batched=True)

# Pads inputs and labels dynamically per batch.
# `model` must be the model object, not the checkpoint name: the collator only
# uses it when it exposes `prepare_decoder_input_ids_from_labels`, which a plain
# string never does (so passing `checkpoint` was silently ignored).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
## Evaluate ##

# Load SacreBLEU through the `evaluate` library (already imported above).
# `datasets.load_metric` is deprecated and no longer exists in datasets >= 3.0.
metric = evaluate.load("sacrebleu")


def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` where ``preds`` is a list of stripped strings
        and ``labels`` is a list of one-element lists, each holding the
        stripped reference for the prediction at the same position.
    """
    stripped_preds = [text.strip() for text in preds]
    # Each reference is wrapped in its own list: one reference per prediction.
    wrapped_labels = [[text.strip()] for text in labels]
    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Decode predictions and references and score them with ``metric``.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        Dict with ``"bleu"`` (the metric's ``"score"`` field) and ``"gen_len"``
        (mean number of non-pad tokens per prediction), rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Map it to the pad token in the predictions as well as the labels;
    # this also keeps -100 positions out of the gen_len count below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
## Training ##

# Directory that receives the checkpoints written during training.
model_dir = "/home/ma-user/work/model"

training_args = Seq2SeqTrainingArguments(
    # Output / checkpointing
    output_dir=model_dir,
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_total_limit=2,
    # Evaluation and best-model selection
    evaluation_strategy="epoch",
    predict_with_generate=True,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    # Optimisation
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Mixed precision is enabled only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # Logging
    logging_steps=10,
    disable_tqdm=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=val_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
text = "We do not know what is happening."

# Checkpoint to load for inference, defined once so the tokenizer and the model
# always come from the same directory.
# NOTE(review): the step number is hard-coded; confirm this checkpoint exists
# after training (save_total_limit=2 prunes older checkpoints).
checkpoint_dir = "/home/ma-user/work/model/checkpoint-1500"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)

# The model was fine-tuned on inputs that start with the task prefix
# (see preprocess_function), so the same prefix is required at inference time.
inputs = tokenizer(prefix + text, return_tensors="pt").input_ids
outputs = model.generate(inputs, max_new_tokens=60, do_sample=True, top_k=10, top_p=0.95)
tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Upload the training output directory to OBS.
# obs_folder_path is the dataset CSV object, not a folder, so the model is sent
# to a sibling "model" location in the same bucket instead of under the CSV key.
# NOTE(review): confirm this is the intended destination.
obs_model_path = obs_folder_path.rsplit('/', 1)[0] + '/model'
mox.file.copy_parallel(model_dir, obs_model_path)