In [1]:
# !pip install transformers 
# !pip install torch 
# !pip install datasets 
# !pip install sentencepiece 
# !pip install transformers[sentencepiece] 
# !pip install sacrebleu 
# !pip install sacremoses 



In [11]:
# from google.colab import auth, drive
import os
import numpy as np

from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, Trainer, TrainingArguments
from datasets import load_metric

import torch


In [16]:
!wc -l data/UFAL/medical.all
!wc -l data/UFAL/medical.train_val
!wc -l data/UFAL/medical.test

790915 data/UFAL/medical.all
711824 data/UFAL/medical.train_val
79091 data/UFAL/medical.test


In [18]:
SOURCE_LANGUAGE = 'es'
TARGET_LANGUAGE = 'en'

In [117]:
def load_model(source_language, target_language):
  model_checkpoint = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
  print(f'Model Checkpoint Name: {model_checkpoint}')
  lang = {'es':'Spanish', 'en':'English'}
  print(f'Translation: {lang[source_language]} to {lang[target_language]}')
  return tokenizer, model, model_checkpoint


In [118]:
tokenizer, model, model_checkpoint = load_model(source_language = SOURCE_LANGUAGE, target_language = TARGET_LANGUAGE)

Model Checkpoint Name: Helsinki-NLP/opus-mt-es-en
Translation: Spanish to English


In [152]:
def load_data(source_language, corpus = 'data/UFAL/medical.train_val'):

  with open(corpus) as f:
    l = f.readlines()
  
  pairs = []
  for i in l:
      pairs.append(i.strip().split('\t'))

  total_lines = len(l)
  split = int(total_lines * .8)

  if source_language == 'es':
    train_sources= [i[0] for i in pairs[0:split]] 
    train_targets = [i[1] for i in pairs[0:split]]
    val_sources = [i[0] for i in pairs[split:]]
    val_targets = [i[1] for i in pairs[split:]]
  else:
    train_sources= [i[1] for i in pairs[-1:-1* split:-1]] 
    train_targets = [i[0] for i in pairs[-1:-1* split:-1]]
    val_sources = [i[1] for i in pairs[-1*split::-1]]
    val_targets = [i[0] for i in pairs[-1*split::-1]]

  print('Sample Sources:\n----------')
  for i in train_sources[:2]:
    print(i)
  print('\nSample Targets:\n----------')
  for i in train_targets[:2]:
    print(i)
  return train_sources, train_targets, val_sources, val_targets


In [153]:
train_sources, train_targets, val_sources, val_targets = load_data(source_language = SOURCE_LANGUAGE)

Sample Sources:
----------
Poco frecuentes
Lietuva Merck Serono Atstovybė C/ o Ares Trading SA Baltic States Zamenhofo 11-3, LT-44287 Kaunas Tel: +370 37320603

Sample Targets:
----------
Uncommon
256 Lietuva Merck Serono Atstovybė C/ o Ares Trading SA Baltic States Zamenhofo 11-3, LT-44287 Kaunas Tel: +370 37320603


In [132]:
assert len(train_targets)==len(train_sources)
assert len(val_targets)==len(val_sources)

In [121]:
class BatchData(torch.utils.data.Dataset):
    def __init__(self, source_texts, target_texts, tokenizer, max_length= 512):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_token_length = max_length


    def convert_data(self):
        self.model_inputs = self.tokenizer(self.source_texts, max_length = self.max_token_length, truncation = True) #, padding = True)
        with tokenizer.as_target_tokenizer():
            self.labels = tokenizer(self.target_texts, max_length= self.max_token_length, truncation = True) #, padding = True)
        self.model_inputs['labels'] = self.labels['input_ids']


    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.model_inputs.items()}
        return item

    def __len__(self):
        return len(self.model_inputs['labels'])


In [135]:
train_dataset = BatchData(train_sources, train_targets, tokenizer = tokenizer)
val_dataset = BatchData(val_sources, val_targets, tokenizer = tokenizer)


In [136]:
train_dataset.convert_data()
val_dataset.convert_data()

In [124]:
BATCH_SIZE = 8

In [137]:
assert len(train_dataset) == len(train_sources)
assert len(val_dataset) == len(val_sources)

In [141]:
model_name = model_checkpoint.split("/")[-1]



#https://huggingface.co/docs/transformers/main_classes/trainer
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{SOURCE_LANGUAGE}-to-{TARGET_LANGUAGE}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    save_steps = 50,
    num_train_epochs=1,
    predict_with_generate=True   
)

print('\nFine-tuned Model Directory:')
print(f"{model_name}-finetuned-{SOURCE_LANGUAGE}-to-{TARGET_LANGUAGE}/")



Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



Fine-tuned Model Directory:
opus-mt-es-en-finetuned-es-to-en/


In [143]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = load_metric("sacrebleu")

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [144]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [145]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [None]:
trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 100


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,3.123492,13.9121,28.075


Saving model checkpoint to opus-mt-en-es-finetuned-en-to-es/checkpoint-50
Configuration saved in opus-mt-en-es-finetuned-en-to-es/checkpoint-50/config.json
Model weights saved in opus-mt-en-es-finetuned-en-to-es/checkpoint-50/pytorch_model.bin
tokenizer config file saved in opus-mt-en-es-finetuned-en-to-es/checkpoint-50/tokenizer_config.json
Special tokens file saved in opus-mt-en-es-finetuned-en-to-es/checkpoint-50/special_tokens_map.json
Saving model checkpoint to opus-mt-en-es-finetuned-en-to-es/checkpoint-100
Configuration saved in opus-mt-en-es-finetuned-en-to-es/checkpoint-100/config.json
Model weights saved in opus-mt-en-es-finetuned-en-to-es/checkpoint-100/pytorch_model.bin
tokenizer config file saved in opus-mt-en-es-finetuned-en-to-es/checkpoint-100/tokenizer_config.json
Special tokens file saved in opus-mt-en-es-finetuned-en-to-es/checkpoint-100/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8


Training completed. Do not forget to

TrainOutput(global_step=100, training_loss=3.4800796508789062, metrics={'train_runtime': 1259.7788, 'train_samples_per_second': 0.635, 'train_steps_per_second': 0.079, 'total_flos': 18654693949440.0, 'train_loss': 3.4800796508789062, 'epoch': 1.0})

## Import Fine-Tuned Model and Compare with Baseline

Fine-Tuned Model:

In [None]:
last_epoch = 100

tuned_model_name = f'opus-mt-en-es-finetuned-en-to-es/checkpoint-{last_epoch}'
tokenizer = MarianTokenizer.from_pretrained(tuned_model_name)
model = MarianMTModel.from_pretrained(tuned_model_name)

Didn't find file opus-mt-en-es-finetuned-en-to-es/checkpoint-100/target_vocab.json. We won't load it.
Didn't find file opus-mt-en-es-finetuned-en-to-es/checkpoint-100/added_tokens.json. We won't load it.
loading file opus-mt-en-es-finetuned-en-to-es/checkpoint-100/source.spm
loading file opus-mt-en-es-finetuned-en-to-es/checkpoint-100/target.spm
loading file opus-mt-en-es-finetuned-en-to-es/checkpoint-100/vocab.json
loading file None
loading file opus-mt-en-es-finetuned-en-to-es/checkpoint-100/tokenizer_config.json
loading file None
loading file opus-mt-en-es-finetuned-en-to-es/checkpoint-100/special_tokens_map.json
loading configuration file opus-mt-en-es-finetuned-en-to-es/checkpoint-100/config.json
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-en-es",
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_i

In [None]:
src_txt = ['Of these, 51% had generalised peritonitis at baseline.']

#correct translation: De estos, el 51% presentaba peritonitis generalizada en el momento basal.

In [None]:
translated = model.generate(**tokenizer(src_txt, return_tensors="pt", padding=True))
[tokenizer.decode(t, skip_special_tokens=True) for t in translated]

['De ellos, el 51% tenía peritonitis generalizada al inicio.']

Compare to Base Model Results

In [None]:
base_model_name = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
base_tokenizer = MarianTokenizer.from_pretrained(base_model_name)
base_model = MarianMTModel.from_pretrained(base_model_name)

In [None]:
translated = base_model.generate(**tokenizer(src_txt, return_tensors="pt", padding=True))
[base_tokenizer.decode(t, skip_special_tokens=True) for t in translated]

['De ellos, el 51% tenía peritonitis generalizada al inicio.']