# Translation notebook

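This notebook fine-tunes, evaluates and compares pretrained sequence-to-sequence models (BART and mBART) for English-to-Spanish translation on the IWSLT 2014 TED talks dataset, using PyTorch Lightning.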

(More descriptions)

#### (Only for Google Colab execution)

If you are running the notebook in Google Colab, run the cell below to download the repository with the files required to run the models, including the requirements file.

In [None]:
!git clone https://github.com/ijauregiCMCRC/ALTA2021_tutorial.git
%cd /ALTA2021_tutorial/translation

#### Install requirements

In [None]:
!pip install -r requirements.txt

### 1. Import packages

In [3]:
import os
os.getcwd()
import random
import numpy as np

import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TestTubeLogger
from pytorch_lightning.callbacks import ModelCheckpoint

import gdown

from src.translation_lightning_model import LmForTranslation

### 2. Define parameters

In [4]:
# Experiment configuration, read by the cells below and passed to the Lightning module.
args = dict(
    # --- Data ---
    train_data='./my_datasets/IWSLT_2014_TEDtalks/es-en/train_pr',        # Training data path
    validation_data='./my_datasets/IWSLT_2014_TEDtalks/es-en/dev2010',    # Validation data path
    test_data='./my_datasets/IWSLT_2014_TEDtalks/es-en/test_joined',      # Test data path
    src='en',                # Source language prefix
    tgt='es',                # Target language prefix
    max_src_len=170,         # Token limit for a source sentence
    max_tgt_len=170,         # Token limit for a target sentence
    # --- Model ---
    save_dir='../models/iwslt_2014/es-en/mBART_large_pretrained_mt_plus_adapter',  # Where the model and logs are written
    tokenizer='facebook/mbart-large-cc25',                                # Pretrained tokenizer
    model='mrm8488/mbart-large-finetuned-opus-en-es-translation',         # Pretrained model
    add_adapter=False,       # Whether to include adapter training
    reduction_factor=1,      # Reduction factor of the adapter (>= 1)
    # --- Optimisation ---
    label_smoothing=0.1,     # Label smoothing
    epochs=1,                # Number of training epochs
    batch_size=8,            # Batch size
    grad_accum=1,            # Gradient accumulation
    lr=0.00003,              # Learning rate used for training
    warmup=500,              # Number of warmup steps
    weight_decay=0.00003,    # Adam weight decay
    # --- Hardware ---
    gpus=1,                  # Number of gpus. 0 for CPU
    precision=32,            # Double (64), full (32) or half (16) precision.
                             # Can be used on CPU, GPU or TPUs.
)

### 3. Initialize Lightning module

In [5]:
# Seed every random number generator in use so that runs are repeatable
seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Dataset size = number of lines in the source-side training file.
# Needed to compute the number of steps for the lr scheduler.
# The context manager guarantees the file handle is closed after counting.
with open(args['train_data'] + '.' + args['src'], encoding='utf-8') as train_src_file:
    args['dataset_size'] = sum(1 for _ in train_src_file)

# Define PyTorch Lightning model
model = LmForTranslation(args)
# Show the dataset paths the model will use
print(model.hf_datasets)

# Define logger
logger = TestTubeLogger(
    save_dir=args['save_dir'],
    name='training',
    version=0  # always use version=0
)

# Define checkpoint saver: keep only the best checkpoint according to validation BLEU.
# The deprecated `period=1` argument is omitted; it equals the default (save every epoch).
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(args['save_dir'], "training", "checkpoints"),  # Dir path
    filename='check-{epoch:02d}-{BLEU:.2f}',  # Filename
    save_top_k=1,  # Maximum number of checkpoints to be saved
    verbose=True,  # Verbose
    monitor='BLEU',  # Checkpointing measurement (BLEU validation)
    mode='max',      # Maximize measurement over the validation
)

print(args)


# Define lightning trainer.
# `accelerator` replaces the deprecated `distributed_backend` argument.
trainer = pl.Trainer(gpus=args['gpus'], accelerator='dp' if torch.cuda.is_available() else None,
                     track_grad_norm=-1,
                     max_epochs=args['epochs'],
                     max_steps=None,
                     replace_sampler_ddp=False,
                     accumulate_grad_batches=args['grad_accum'],
                     gradient_clip_val=1.0,
                     val_check_interval=1.0,
                     num_sanity_val_steps=2,
                     check_val_every_n_epoch=1,
                     logger=logger,
                     callbacks=[checkpoint_callback],
                     progress_bar_refresh_rate=10,
                     precision=args['precision'],
                     amp_backend='native', amp_level='O2',
                     )

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
  "Argument `period` in `ModelCheckpoint` is deprecated in v1.3 and will be removed in v1.5."
  f"`Trainer(distributed_backend={distributed_backend})` has been deprecated and will be removed in v1.5."
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


{'train': './my_datasets/IWSLT_2014_TEDtalks/es-en/train_pr', 'validation': './my_datasets/IWSLT_2014_TEDtalks/es-en/dev2010', 'test': './my_datasets/IWSLT_2014_TEDtalks/es-en/test_joined'}
{'train_data': './my_datasets/IWSLT_2014_TEDtalks/es-en/train_pr', 'validation_data': './my_datasets/IWSLT_2014_TEDtalks/es-en/dev2010', 'test_data': './my_datasets/IWSLT_2014_TEDtalks/es-en/test_joined', 'src': 'en', 'tgt': 'es', 'max_src_len': 170, 'max_tgt_len': 170, 'save_dir': '../models/iwslt_2014/es-en/mBART_large_pretrained_mt_plus_adapter', 'tokenizer': 'facebook/mbart-large-cc25', 'model': 'mrm8488/mbart-large-finetuned-opus-en-es-translation', 'add_adapter': False, 'reduction_factor': 1, 'label_smoothing': 0.1, 'epochs': 1, 'batch_size': 8, 'grad_accum': 1, 'lr': 3e-05, 'warmup': 500, 'weight_decay': 3e-05, 'gpus': 1, 'precision': 32, 'dataset_size': 180850}


### 4. Train model

In [10]:
import time

# Fit the model and report the wall-clock duration of the run in seconds
start_time = time.time()
trainer.fit(model)
elapsed_seconds = time.time() - start_time
print(elapsed_seconds)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | MBartForConditionalGeneration | 661 M 
--------------------------------------------------------
50.4 M    Trainable params
610 M     Non-trainable params
661 M     Total params
2,644.931 Total estimated model params size (MB)


{'train_data': './my_datasets/IWSLT_2014_TEDtalks/es-en/train_pr', 'validation_data': './my_datasets/IWSLT_2014_TEDtalks/es-en/dev2010', 'test_data': './my_datasets/IWSLT_2014_TEDtalks/es-en/test_joined', 'src': 'en', 'tgt': 'es', 'max_src_len': 170, 'max_tgt_len': 170, 'save_dir': '../models/iwslt_2014/es-en/mBART_large_pretrained_mt_plus_adapter', 'tokenizer': 'mrm8488/mbart-large-finetuned-opus-en-es-translation', 'model': 'mrm8488/mbart-large-finetuned-opus-en-es-translation', 'add_adapter': True, 'reduction_factor': 1, 'label_smoothing': 0.1, 'epochs': 1, 'batch_size': 8, 'grad_accum': 1, 'lr': 3e-05, 'warmup': 500, 'weight_decay': 3e-05, 'gpus': 1, 'precision': 32, 'dataset_size': 180850}


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 22594: BLEU reached 0.00000 (best 0.00000), saving model to "/data/injaureg/Desktop/CMCRC/ALTA/ALTA2021_tutorial/models/iwslt_2014/es-en/mBART_large_pretrained_mt_plus_adapter/training/checkpoints/check-epoch=00-BLEU=0.00.ckpt" as top 1


4490.29577255249


### 5. Test model

In [5]:
# Evaluate the model with the trainer's test loop; the value returned by
# `trainer.test` (a list of metric dicts) is displayed as the cell output.
trainer.test(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 0it [00:00, ?it/s]

{'avg_val_loss': tensor(2.5985, device='cuda:0'), 'avg_accuracy': tensor(0.3291, device='cuda:0'), 'log': {'vloss': tensor(2.5985, device='cuda:0'), 'vaccuracy': tensor(0.3291, device='cuda:0')}, 'progress_bar': {'vloss': tensor(2.5985, device='cuda:0'), 'vaccuracy': tensor(0.3291, device='cuda:0')}}
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'BLEU': 29.53417205810547}
--------------------------------------------------------------------------------


[{'BLEU': 29.53417205810547}]

### 6. Inference

In [2]:
# Restore a trained model from a saved Lightning checkpoint
checkpoint_path = ('../models/iwslt_2014/es-en/es_en_translation_bart_en_jupyter/'
                   'training/checkpoints/check-epoch=00-BLEU=30.43.ckpt')
model = LmForTranslation.load_from_checkpoint(checkpoint_path)

# Translate one example sentence; the final expression is shown as the cell output
sentence = 'Hello my name is Inigo.'
translation = model.translate_example(sentence)
translation

['Hola, mi nombre es Inigo.']

## Comparing models

In [16]:
# Create models folder
!mkdir models
# Download them from google drive
# BART_base
bart_base_url = 'https://drive.google.com/uc?id=1g3uNMlfEO6IsOxQ_KIVnAJ5E73jAe8cN'
bart_base_out = './models/bart_base.zip'
gdown.download(bart_base_url, bart_base_out, quiet=False)
!unzip './models/bart_base_url.zip' -d './models/'
!rm './models/bart_base_url.zip'
# mBART_large
mbart_large_url = 'https://drive.google.com/uc?id=1mHS7n7og00ZD3u9TD-CpyKefvYjgvPxn'
mbart_large_out = './models/mbart_large.zip'
gdown.download(mbart_large_url, mbart_large_out, quiet=False)
!unzip './models/mbart_large.zip' -d './models/'
!rm './models/mbart_large.zip'
# mBART_large_with_adapter
mbart_large_wa_url = 'https://drive.google.com/uc?id=1kTrcD-9J8XWP-jpSwuVPN8B94XtRCZVZ'
mbart_large_wa_out = './models/mbart_large_wa.zip'
gdown.download(mbart_large_wa_url, mbart_large_wa_out, quiet=False)
!unzip './models/mbart_large_wa.zip' -d './models/'
!rm './models/mbart_large_wa.zip'

In [17]:
import time

# Load the BART_base checkpoint
bart_base_ckpt = ('models/BART_base/training/checkpoints/'
                  'check-epoch=00-BLEU=20.97.ckpt')
model = LmForTranslation.load_from_checkpoint(bart_base_ckpt)

# Training time in seconds (hard-coded constant, not measured in this cell)
training_time_bart_base = 1880

# Time the test run and keep its BLEU score
start_time = time.time()
bart_base_results = trainer.test(model)
test_bleu_bart_base = bart_base_results[0]['BLEU']
inference_time_bart_base = time.time() - start_time

# Summary report (times converted from seconds to minutes)
print('BART_base:', '-----------------', sep='\n')
print('Test BLEU: ', test_bleu_bart_base)
print('Training time: ', training_time_bart_base / 60, ' mins')
print('Inference time: ', inference_time_bart_base / 60, ' mins')

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'BLEU': 20.49509048461914}
--------------------------------------------------------------------------------
BART_base:
-----------------
Test BLEU:  20.49509048461914
Training time:  31.333333333333332  mins
Inference time:  6.744376571973165  mins


In [5]:
# Load the mBART_large checkpoint
mbart_large_ckpt = ('../models/iwslt_2014/es-en/mBART_large/training/checkpoints/'
                    'check-epoch=00-BLEU=36.47.ckpt')
model = LmForTranslation.load_from_checkpoint(mbart_large_ckpt)

# Training time in seconds (hard-coded constant, not measured in this cell)
training_time_mbart_large = 1880

# Time the test run and keep its BLEU score
start_time = time.time()
mbart_large_results = trainer.test(model)
test_bleu_mbart_large = mbart_large_results[0]['BLEU']
inference_time_mbart_large = time.time() - start_time

# Summary report (times converted from seconds to minutes)
print('mBART_large:', '-----------------', sep='\n')
print('Test BLEU: ', test_bleu_mbart_large)
print('Training time: ', training_time_mbart_large / 60, ' mins')
print('Inference time: ', inference_time_mbart_large / 60, ' mins')

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'BLEU': 34.681053161621094}
--------------------------------------------------------------------------------
mBART_large:
-----------------
Test BLEU:  34.681053161621094
Training time:  31.333333333333332  mins
Inference time:  8.115819756189982  mins


In [7]:
# Load the mBART_large_plus_adapter checkpoint
mbart_large_plus_adapter_ckpt = ('../models/iwslt_2014/es-en/mBART_large_plus_adapter/training/'
                                 'checkpoints/check-epoch=00-BLEU=34.48.ckpt')
model = LmForTranslation.load_from_checkpoint(mbart_large_plus_adapter_ckpt)

# Training time in seconds (hard-coded constant, not measured in this cell)
training_time_mbart_large_plus_adapter = 4258

# Time the test run and keep its BLEU score
start_time = time.time()
mbart_large_plus_adapter_results = trainer.test(model)
test_bleu_mbart_large_plus_adapter = mbart_large_plus_adapter_results[0]['BLEU']
inference_time_mbart_large_plus_adapter = time.time() - start_time

# Summary report (times converted from seconds to minutes)
print('mBART_large_plus_adapter:', '-----------------', sep='\n')
print('Test BLEU: ', test_bleu_mbart_large_plus_adapter)
print('Training time: ', training_time_mbart_large_plus_adapter / 60, ' mins')
print('Inference time: ', inference_time_mbart_large_plus_adapter / 60, ' mins')

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'BLEU': 32.785400390625}
--------------------------------------------------------------------------------
mBART_large_plus_adapter:
-----------------
Test BLEU:  32.785400390625
Training time:  31.333333333333332  mins
Inference time:  8.449832185109456  mins
