# Translation notebook

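This notebook fine-tunes a pretrained sequence-to-sequence model for English–Spanish translation on the IWSLT 2014 dataset using PyTorch Lightning, and then compares several trained checkpoints (BART and mBART, with and without adapters).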

(More descriptions)

#### (Only for Google Colab execution)

If you are running the notebook in Google Colab, run the cell below to download the repository with the files required to run the models, including the requirements file.

In [None]:
!git clone https://github.com/ijauregiCMCRC/ALTA2021_tutorial.git
%cd /ALTA2021_tutorial/translation

#### Install requirements

In [None]:
!pip install -r requirements.txt

### 1. Import packages

In [1]:
import os
import time
os.getcwd()
import random
import numpy as np

import torch
import pytorch_lightning as pl
from pytorch_lightning.loggers import TestTubeLogger
from pytorch_lightning.callbacks import ModelCheckpoint

# For dataset and pretrained model download
import gdown

# For plotting
%matplotlib inline
import matplotlib.pyplot as plt

from src.translation_lightning_model import LmForTranslation

### 2. Download dataset

In [None]:
# Create models folder
!mkdir translation_dataset
# Download dataset from google drive
dataset_link_drive = 'https://drive.google.com/uc?id=1MxrReEXbJPWa3OobANwfzak5rbs5kyNz'
dataset_path = './translation_dataset/IWSLT_2014.zip'
gdown.download(dataset_link_drive, dataset_path, quiet=False)
!unzip './translation_dataset/IWSLT_2014.zip' -d './translation_dataset/'
!rm './translation_dataset/IWSLT_2014.zip'

### 3. Define parameters

In [12]:
# Experiment configuration, grouped by purpose. The resulting dictionary is
# passed to the Lightning module and read when building the trainer.
args = dict(
    # --- Data ---
    train_data='./translation_dataset/IWSLT_2014/es-en/train',  # Path to training data
    validation_data='./translation_dataset/IWSLT_2014/es-en/dev',  # Path to validation data
    test_data='./translation_dataset/IWSLT_2014/es-en/test',  # Path to test data
    src='en',  # Source language prefix
    tgt='es',  # Target language prefix
    max_src_len=170,  # Maximum number of tokens in the source sentence
    max_tgt_len=170,  # Maximum number of tokens in the target sentence
    # --- Model ---
    save_dir='../models/iwslt_2014/es-en/sshleifer_tiny-mbart',  # Path to save the model and logs
    tokenizer='sshleifer/tiny-mbart',  # Pretrained tokenizer
    model='sshleifer/tiny-mbart',  # Pretrained model
    add_adapter=False,  # Include adapter training
    reduction_factor=1,  # Adapter's reduction factor (>= 1)
    # --- Optimisation ---
    label_smoothing=0.1,  # Label smoothing
    epochs=1,  # Number of epochs during training
    batch_size=8,  # Batch size
    grad_accum=1,  # Gradient accumulation
    lr=3e-6,  # Training learning rate
    warmup=500,  # Number of warmup steps
    weight_decay=3e-5,  # Adam weight decay
    # --- Hardware ---
    gpus=1,  # Number of gpus. 0 for CPU
    precision=32,  # Double precision (64), full precision (32)
                   # or half precision (16). Can be used on CPU, GPU or TPUs.
)

### 4. Initialize Lightning module

In [None]:
# Seed every random number generator in use (Python, NumPy, PyTorch CPU and CUDA)
seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Dataset size = number of lines in the source-side training file.
# Needed to compute the number of steps for the lr scheduler.
# The `with` block closes the file handle as soon as the lines have been counted.
with open(args['train_data'] + '.' + args['src'], encoding='utf-8') as train_src_file:
    args['dataset_size'] = sum(1 for _ in train_src_file)

# Define PyTorch Lightning model
model = LmForTranslation(args)
print(model.hf_datasets)

# Define logger
logger = TestTubeLogger(
    save_dir=args['save_dir'],
    name='training',
    version=0  # always use version=0
)

# Define checkpoint saver
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(args['save_dir'], "training", "checkpoints"),  # Dir path
    filename='check-{epoch:02d}-{BLEU:.2f}',  # Filename
    save_top_k=1,    # Maximum number of checkpoints to be saved
    verbose=True,    # Verbose
    monitor='BLEU',  # Checkpointing measurement (BLEU validation)
    mode='max',      # Maximize measurement over the validation
    period=1         # Save every epoch
)

print(args)

# Only request GPUs when CUDA is actually available; 0 means CPU (see args['gpus']).
# This keeps the GPU count consistent with the conditional backend selection below.
cuda_available = torch.cuda.is_available()

# Define lightning trainer
# NOTE(review): several arguments used here (distributed_backend, progress_bar_refresh_rate,
# ModelCheckpoint's `period`, TestTubeLogger) are specific to older pytorch-lightning
# releases - confirm against the version pinned in requirements.txt.
trainer = pl.Trainer(gpus=args['gpus'] if cuda_available else 0,
                     distributed_backend='dp' if cuda_available else None,
                     track_grad_norm=-1,
                     max_epochs=args['epochs'],
                     max_steps=None,
                     replace_sampler_ddp=False,
                     accumulate_grad_batches=args['grad_accum'],
                     gradient_clip_val=1.0,
                     val_check_interval=1.0,
                     num_sanity_val_steps=2,
                     check_val_every_n_epoch=1,
                     logger=logger,
                     callbacks=[checkpoint_callback],  # a list works with every Trainer version
                     progress_bar_refresh_rate=10,
                     precision=args['precision'],
                     amp_backend='native', amp_level='O2',
                     )

### 5. Train model

In [15]:
# Fit the model and report how long the call took, in minutes
start_time = time.time()
trainer.fit(model)
elapsed_minutes = (time.time() - start_time) / 60
print(elapsed_minutes, ' mins')

### 6. Test model

In [16]:
# Run the trainer's test loop on the model; the returned value is shown as the cell output
test_results = trainer.test(model)
test_results

## Comparing models

In [None]:
# Create models folder
!mkdir models
# Download them from google drive
# BART_base
bart_base_url = 'https://drive.google.com/uc?id=1_VA85J5OOf3PltRqhbVLDuDNBtJ-9vOR'
bart_base_out = './models/bart_base.zip'
gdown.download(bart_base_url, bart_base_out, quiet=False)
!unzip './models/bart_base.zip' -d './models/'
!rm './models/bart_base.zip'
# BART_base_with_adapter
bart_base_with_adapter_url = 'https://drive.google.com/uc?id=1Rojznogzr6cMGmi3wt1BeTaputmf6jv4'
bart_base_with_adapter_out = './models/bart_base_with_adapter.zip'
gdown.download(bart_base_with_adapter_url, bart_base_with_adapter_out, quiet=False)
!unzip './models/bart_base_with_adapter.zip' -d './models/'
!rm './models/bart_base_with_adapter.zip'
# mBART_large
mbart_large_url = 'https://drive.google.com/uc?id=115hpgCILR5FTVD-A972xVR3PdqkT9QZ7'
mbart_large_out = './models/mbart_large.zip'
gdown.download(mbart_large_url, mbart_large_out, quiet=False)
!unzip './models/mbart_large.zip' -d './models/'
!rm './models/mbart_large.zip'
# mBART_large_with_adapter
mbart_large_wa_url = 'https://drive.google.com/uc?id=1tCGSk021m8aMkYEd7tm_j-Hp2fYwY5e_'
mbart_large_wa_out = './models/mbart_large_wa.zip'
gdown.download(mbart_large_wa_url, mbart_large_wa_out, quiet=False)
!unzip './models/mbart_large_wa.zip' -d './models/'
!rm './models/mbart_large_wa.zip'

#### English sentence example

In [None]:
# Example English sentence to translate (quote attributed to Alan Turing)
sentence = (
    'Sometimes it is the people no one can imagine anything of '
    'who do the things no one can imagine.'
)

In [None]:
# BART_base: load the checkpoint, then collect parameter counts, test BLEU,
# evaluation time and a translation of the example sentence.
print('Loading model...')
model = LmForTranslation.load_from_checkpoint(
    '../models/iwslt_2014/es-en/BART_base/training/checkpoints/'
    'check-epoch=00-BLEU=32.51.ckpt'
)
tp_bart_base, ntp_bart_base = model.num_parameters()

# Hard-coded value (reported below in minutes); it is not measured in this notebook
training_time_bart_base = 2972

# Time the test run; elapsed seconds are converted to minutes
start_time = time.time()
test_bleu_bart_base = trainer.test(model)[0]['BLEU']
inference_time_bart_base = (time.time() - start_time) / 60

translation_example_bart_base = model.translate_example(sentence)

# Summary report
print('BART_base:')
print('-----------------')
for label, value in (
    ('Trainable parameters: ', tp_bart_base),
    ('Non-trainable parameters: ', ntp_bart_base),
    ('Total parameters: ', tp_bart_base + ntp_bart_base),
):
    print(label, value)
print('-----------------')
print('Test BLEU: ', test_bleu_bart_base)
for label, minutes in (
    ('Training time: ', training_time_bart_base),
    ('Inference time: ', inference_time_bart_base),
):
    print(label, minutes, ' mins')
print('Translation example-> ', translation_example_bart_base)

In [None]:
# BART_base with adapter: load the checkpoint, then collect parameter counts,
# test BLEU, evaluation time and a translation of the example sentence.
print('Loading model...')
# The directory part must end with '/': the two string literals are concatenated as-is,
# so without it the path would read '.../checkpointscheck-epoch=...'.
model = LmForTranslation.load_from_checkpoint('../models/iwslt_2014/es-en/BART_base_with_adapter/training/checkpoints/'
                                              'check-epoch=00-BLEU=20.97.ckpt')
tp_bart_base_plus_adapter, ntp_bart_base_plus_adapter = model.num_parameters()
# Time the test run; elapsed seconds are converted to minutes
start_time = time.time()
test_bleu_bart_base_plus_adapter = trainer.test(model)[0]['BLEU']
# Hard-coded value (reported below in minutes); it is not measured in this notebook
training_time_bart_base_plus_adapter = 1880
inference_time_bart_base_plus_adapter = (time.time() - start_time) / 60
translation_example_bart_base_plus_adapter = model.translate_example(sentence)
# Summary report
print('BART_base_plus_adapter:')
print('-----------------')
print('Trainable parameters: ', tp_bart_base_plus_adapter)
print('Non-trainable parameters: ', ntp_bart_base_plus_adapter)
print('Total parameters: ', tp_bart_base_plus_adapter + ntp_bart_base_plus_adapter)
print('-----------------')
print('Test BLEU: ', test_bleu_bart_base_plus_adapter)
print('Training time: ', training_time_bart_base_plus_adapter, ' mins')
print('Inference time: ', inference_time_bart_base_plus_adapter, ' mins')
print('Translation example-> ', translation_example_bart_base_plus_adapter)

In [None]:
# mBART_large: load the checkpoint, then collect parameter counts, test BLEU,
# evaluation time and a translation of the example sentence.
print('Loading model...')
model = LmForTranslation.load_from_checkpoint(
    '../models/iwslt_2014/es-en/mBART_large/training/checkpoints/'
    'check-epoch=00-BLEU=36.47.ckpt'
)
tp_mbart_large, ntp_mbart_large = model.num_parameters()

# Hard-coded value (reported below in minutes); it is not measured in this notebook
training_time_mbart_large = 8923

# Time the test run; elapsed seconds are converted to minutes
start_time = time.time()
test_bleu_mbart_large = trainer.test(model)[0]['BLEU']
inference_time_mbart_large = (time.time() - start_time) / 60

translation_example_mbart_large = model.translate_example(sentence)

# Summary report
print('mBART_large:')
print('-----------------')
for label, value in (
    ('Trainable parameters: ', tp_mbart_large),
    ('Non-trainable parameters: ', ntp_mbart_large),
    ('Total parameters: ', tp_mbart_large + ntp_mbart_large),
):
    print(label, value)
print('-----------------')
print('Test BLEU: ', test_bleu_mbart_large)
for label, minutes in (
    ('Training time: ', training_time_mbart_large),
    ('Inference time: ', inference_time_mbart_large),
):
    print(label, minutes, ' mins')
print('Translation example-> ', translation_example_mbart_large)

In [None]:
# mBART_large with adapter: load the checkpoint, then collect parameter counts,
# test BLEU, evaluation time and a translation of the example sentence.
print('Loading model...')
model = LmForTranslation.load_from_checkpoint(
    '../models/iwslt_2014/es-en/mBART_large_plus_adapter/training/'
    'checkpoints/check-epoch=00-BLEU=34.48.ckpt'
)
tp_mbart_large_plus_adapter, ntp_mbart_large_plus_adapter = model.num_parameters()

# Hard-coded value (reported below in minutes); it is not measured in this notebook
training_time_mbart_large_plus_adapter = 4258

# Time the test run; elapsed seconds are converted to minutes
start_time = time.time()
test_bleu_mbart_large_plus_adapter = trainer.test(model)[0]['BLEU']
inference_time_mbart_large_plus_adapter = (time.time() - start_time) / 60

translation_example_mbart_large_plus_adapter = model.translate_example(sentence)

# Summary report
print('mBART_large_plus_adapter:')
print('-----------------')
for label, value in (
    ('Trainable parameters: ', tp_mbart_large_plus_adapter),
    ('Non-trainable parameters: ', ntp_mbart_large_plus_adapter),
    ('Total parameters: ', tp_mbart_large_plus_adapter + ntp_mbart_large_plus_adapter),
):
    print(label, value)
print('-----------------')
print('Test BLEU: ', test_bleu_mbart_large_plus_adapter)
for label, minutes in (
    ('Training time: ', training_time_mbart_large_plus_adapter),
    ('Inference time: ', inference_time_mbart_large_plus_adapter),
):
    print(label, minutes, ' mins')
print('Translation example-> ', translation_example_mbart_large_plus_adapter)

#### Plotting results

In [None]:
# Bar label -> bar colour, in the order the models are plotted
bar_color_by_model = {
    'BART_base': 'black',
    'BART_base (wa)': 'red',
    'mBART_large': 'blue',
    'mBART_large (wa)': 'green',
}
x_names = list(bar_color_by_model)
colors = list(bar_color_by_model.values())

In [None]:
# BLEU scores on the test set, one bar per model
bleu_scores = [
    test_bleu_bart_base,
    test_bleu_bart_base_plus_adapter,
    test_bleu_mbart_large,
    test_bleu_mbart_large_plus_adapter,
]
plt.bar(x_names, bleu_scores, color=colors)
# Scale the y-axis to the data instead of a fixed (30, 36) window: a fixed window clips
# every bar outside it (the checkpoint file names loaded in this notebook report
# BLEU=20.97 and BLEU=36.47, both outside that range).
plt.ylim(0, max(bleu_scores) * 1.1)
plt.ylabel('BLEU')
plt.xticks(rotation=45)
plt.title('Test set evaluation')
# Render the figure without echoing the repr of the last matplotlib call
plt.show()

In [None]:
# Training time vs inference time, one panel each
training_times = [
    training_time_bart_base,
    training_time_bart_base_plus_adapter,
    training_time_mbart_large,
    training_time_mbart_large_plus_adapter,
]
inference_times = [
    inference_time_bart_base,
    inference_time_bart_base_plus_adapter,
    inference_time_mbart_large,
    inference_time_mbart_large_plus_adapter,
]

fig, axs = plt.subplots(1, 2, figsize=(15, 5))
panels = zip(axs, (training_times, inference_times), ('training time', 'inference time'))
for ax, times, title in panels:
    ax.bar(x_names, times, color=colors)
    ax.set_ylabel('mins')
    # set_xticklabels() requires the labels themselves and raises a TypeError when
    # called with only `rotation`; tick_params rotates the existing tick labels.
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_title(title)
plt.show()

In [None]:
# Model size: trainable, non-trainable and total parameter counts, one panel each
trainable_counts = [
    tp_bart_base,
    tp_bart_base_plus_adapter,
    tp_mbart_large,
    tp_mbart_large_plus_adapter,
]
non_trainable_counts = [
    ntp_bart_base,
    ntp_bart_base_plus_adapter,
    ntp_mbart_large,
    ntp_mbart_large_plus_adapter,
]
# Total = trainable + non-trainable, per model
total_counts = [tp + ntp for tp, ntp in zip(trainable_counts, non_trainable_counts)]

fig, axs = plt.subplots(1, 3, figsize=(15, 5))
panels = zip(
    axs,
    (trainable_counts, non_trainable_counts, total_counts),
    ('# trainable parameters', '# non-trainable parameters', '# total parameters'),
)
for ax, counts, title in panels:
    ax.bar(x_names, counts, color=colors)
    # NOTE(review): the axis is labelled 'Million' but the values are plotted exactly as
    # returned by model.num_parameters() - confirm that method reports millions.
    ax.set_ylabel('Million')
    # set_xticklabels() requires the labels themselves and raises a TypeError when
    # called with only `rotation`; tick_params rotates the existing tick labels.
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_title(title)
plt.show()

In [None]:
# Compare the translations of the example sentence produced by each model.
# Labels match the ones printed in the per-model evaluation cells
# (the adapter label previously read 'BART_base_plus_adaptr:').
print('BART_base:')
print(' -> ', translation_example_bart_base)
print('BART_base_plus_adapter:')
print(' -> ', translation_example_bart_base_plus_adapter)
print('mBART_large:')
print(' -> ', translation_example_mbart_large)
print('mBART_large_plus_adapter:')
print(' -> ', translation_example_mbart_large_plus_adapter)