In [1]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install sentencepiece
!pip install transformers[sentencepiece]
!pip install sacrebleu
!pip install sacremoses

import os
import numpy as np
from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, Trainer, TrainingArguments
from datasets import load_metric
import torch

# Translation direction, expressed as the language codes used in the checkpoint name.
source_language, target_language = 'es', 'en'

# Hub identifier of the pretrained checkpoint for this language pair.
model_checkpoint = "Helsinki-NLP/opus-mt-" + source_language + "-" + target_language

# Tokenizer and seq2seq model are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Disabled Colab setup (authentication and Drive mount), kept for reference:
# auth.authenticate_user()
# drive.mount('/content/drive')
# %cd /MyDrive/final_project

# Set WANDB_DISABLED so the wandb logging integration stays off during training.
os.environ.update({"WANDB_DISABLED": "true"})

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# List the contents of the UFAL data directory.
# NOTE(review): this path sits under /content/drive, but the `drive.mount(...)` call
# in the first cell is commented out — confirm how the mount is established on a
# fresh kernel.
!ls /content/drive/MyDrive/final_project/data/UFAL

extra  medical.all  medical.test  medical.train_val  UFAL_CORPUS_INFO


In [3]:
# Tab-separated parallel corpus: each line holds "<source>\t<target>".
CORPUS_PATH = '/content/medical.all.es-en'
# Split boundaries, counted in sentence pairs: [0, TRAIN_END) is the training set,
# [TRAIN_END, VAL_END) is the validation set and everything from VAL_END on is test.
TRAIN_END = 630000
VAL_END = 710000

source_text, target_text = [], []
skipped_lines = 0
# Decode explicitly instead of relying on the host's default encoding
# (assumes the corpus is UTF-8 — TODO confirm). The file is streamed so the raw
# lines are not all held in memory next to the parsed columns.
with open(CORPUS_PATH, encoding='utf-8') as f:
    for line in f:
        # Split each line once; only the first two columns are used.
        fields = line.strip().split('\t')
        if len(fields) < 2:
            # Blank lines or lines without a target column cannot form a pair;
            # count them instead of failing with an opaque IndexError.
            skipped_lines += 1
            continue
        source_text.append(fields[0])
        target_text.append(fields[1])

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed line(s) in {CORPUS_PATH}")

train_source = source_text[:TRAIN_END]
train_target = target_text[:TRAIN_END]

val_source = source_text[TRAIN_END:VAL_END]
val_target = target_text[TRAIN_END:VAL_END]

test_source = source_text[VAL_END:]
test_target = target_text[VAL_END:]

In [4]:
class BatchData(torch.utils.data.Dataset):
    """Dataset of tokenized source/target text pairs.

    Tokenization is not performed in ``__init__``. Call ``convert_data()``
    explicitly, or let the first ``len()`` / indexing access trigger it.

    Args:
        source_texts: Source-side strings.
        target_texts: Target-side strings, parallel to ``source_texts``.
        tokenizer: Callable tokenizer that also provides ``as_target_tokenizer()``.
        max_length: Passed to the tokenizer as ``max_length`` together with
            ``truncation=True``.
    """

    def __init__(self, source_texts, target_texts, tokenizer, max_length=128):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_token_length = max_length
        # Populated by convert_data(); None means "not tokenized yet".
        self.model_inputs = None
        self.labels = None

    def convert_data(self):
        """Tokenize all source and target texts and store the encodings on the instance.

        No padding is requested here (presumably left to the data collator).
        """
        # Use the tokenizer given to this instance, not a notebook-level global,
        # so the dataset works with whichever tokenizer it was constructed with.
        tok = self.tokenizer
        model_inputs = tok(self.source_texts, max_length=self.max_token_length, truncation=True)
        # NOTE(review): newer transformers releases appear to prefer
        # `tokenizer(text_target=...)` over this context manager — confirm the
        # installed version before switching.
        with tok.as_target_tokenizer():
            labels = tok(self.target_texts, max_length=self.max_token_length, truncation=True)
        model_inputs['labels'] = labels['input_ids']
        # Publish only after both passes succeeded, so a failure cannot leave a
        # half-initialised dataset behind.
        self.labels = labels
        self.model_inputs = model_inputs

    def _ensure_converted(self):
        """Tokenize on first use if ``convert_data()`` has not been called yet."""
        if self.model_inputs is None:
            self.convert_data()

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict mapping each encoding key to a tensor."""
        self._ensure_converted()
        return {key: torch.tensor(val[idx]) for key, val in self.model_inputs.items()}

    def __len__(self):
        """Return the number of tokenized target sequences."""
        self._ensure_converted()
        return len(self.model_inputs['labels'])

# Build one dataset per split (train / validation / test), all sharing the same tokenizer.
TRAIN_DATASET, VAL_DATASET, TEST_DATASET = (
    BatchData(split_source, split_target, tokenizer=tokenizer)
    for split_source, split_target in (
        (train_source, train_target),
        (val_source, val_target),
        (test_source, test_target),
    )
)

# Tokenize each split up front, in train -> validation -> test order.
for _split in (TRAIN_DATASET, VAL_DATASET, TEST_DATASET):
    _split.convert_data()

In [5]:
BATCH_SIZE = 32
# Evaluation and checkpointing share one schedule, measured in optimizer steps.
EVAL_SAVE_STEPS = 3000

model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    # First positional argument: directory name for this run's outputs.
    f"{model_name}-finetuned-{source_language}-to-{target_language}",
    evaluation_strategy='steps',
    eval_steps=EVAL_SAVE_STEPS,
    optim='adafactor',
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    #weight_decay=0.01,
    save_total_limit=3,
    save_steps=EVAL_SAVE_STEPS,
    num_train_epochs=1,
    # Evaluation generates translations so they can be decoded and scored.
    # NOTE(review): generating for the whole validation set at every evaluation
    # step may dominate runtime — confirm this is intended.
    predict_with_generate=True,
    # Disable logging integrations explicitly; the library warns that the
    # WANDB_DISABLED environment variable is deprecated in favour of `report_to`.
    report_to="none",
)

#https://huggingface.co/docs/transformers/main_classes/trainer

# Collator built from the tokenizer and model; batches are assembled at training time.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# NOTE(review): `load_metric` comes from `datasets`; presumably it is unavailable in
# newer releases of that package — confirm the installed version provides it.
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for metric computation.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A pair ``(hypotheses, references)``: ``hypotheses`` is a flat list of
        stripped predictions and ``references`` wraps each stripped label in its
        own single-element list.
    """
    hypotheses = [hypothesis.strip() for hypothesis in preds]
    references = [[reference.strip()] for reference in labels]
    return hypotheses, references

def compute_metrics(eval_preds):
    """Score generated translations against the reference labels.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays (presumably NumPy
            arrays, as supplied by the trainer). ``preds`` may also be a tuple, in
            which case its first element is used.

    Returns:
        Dict with ``"bleu"`` (the metric's ``score``) and ``"gen_len"`` (mean number
        of non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding sentinel that cannot be decoded. Predictions may carry it
    # as well as labels, so both are mapped back to the pad token before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    # Generated length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [6]:
# Wire the model, training arguments, datasets, collator and metric function into
# one trainer. Every argument is passed by keyword so the mapping is explicit.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=TRAIN_DATASET,
    eval_dataset=VAL_DATASET,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning. Per `args`, evaluation on VAL_DATASET and checkpoint saving
# both happen every 3000 steps, and evaluation generates translations
# (predict_with_generate=True) before scoring them with compute_metrics.
trainer.train()

***** Running training *****
  Num examples = 630000
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 19688


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 80000
  Batch size = 32


Step,Training Loss,Validation Loss
