# Translation notebook

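This notebook fine-tunes and evaluates a language model for translation with PyTorch Lightning: it defines the run parameters, builds the model, trains it, and then runs the test set.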

(More descriptions)

### 1. Import packages

In [1]:
import argparse
import os
import random

import numpy as np
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TestTubeLogger
# from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel

from src.translation_lightning_model import LmForTranslation
from src.translation_metrics import translation_accuracy

### 2. Define parameters

In [None]:
# Declare every hyperparameter of the run on an argparse parser, then parse the
# notebook-specific values below into `args` (an argparse.Namespace), so the
# following cells can read them as attributes (args.seed, args.save_dir, ...).
parser = argparse.ArgumentParser(description="Translation notebook parameters")
parser.add_argument("--train_data", type=str, required=True, help='Path to training data')
parser.add_argument("--validation_data", type=str, required=True, help='Path to validation data')
parser.add_argument("--test_data", type=str, required=True, help='Path to testing data')
parser.add_argument("--src", type=str, required=True, help='Source language.')
parser.add_argument("--tgt", type=str, required=True, help='Target language.')
parser.add_argument("--save_dir", type=str, default='translation')
parser.add_argument("--save_prefix", type=str, default='test')
parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
parser.add_argument("--grad_accum", type=int, default=1, help="number of gradient accumulation steps")
parser.add_argument("--max_grad_norm", type=float, default=1.0, help="Maximum gradient norm used for gradient clipping")
parser.add_argument("--gpus", type=int, default=0,
                    help="Number of gpus. 0 for CPU")
parser.add_argument("--warmup", type=int, default=500, help="Number of warmup steps")
parser.add_argument("--lr", type=float, default=0.00003, help="Maximum learning rate")
parser.add_argument("--weight_decay", type=float, default=0.01, help="Adam weight decay")
parser.add_argument("--val_every", type=float, default=1.0,
                    help="How often to run validation; passed to the Trainer as val_check_interval")
parser.add_argument("--val_percent_check", default=1.00, type=float, help='Percent of validation data used')
parser.add_argument("--num_workers", type=int, default=0, help="Number of data loader workers")
parser.add_argument("--seed", type=int, default=1234, help="Seed")
parser.add_argument("--epochs", type=int, default=1, help="Number of epochs")
parser.add_argument("--disable_checkpointing", action='store_true', help="No logging or checkpointing")
parser.add_argument("--max_input_len", type=int, default=170,
                    help="maximum num of wordpieces/summary. Used for training and testing")
parser.add_argument("--max_output_len", type=int, default=170,
                    help="maximum num of wordpieces/summary. Used for training and testing")
parser.add_argument("--test", action='store_true', help="Test only, no training")
parser.add_argument("--model_lm_path", type=str, default='../pretrained_lms/sshleifer-tiny-mbart',
                    help="Path to the checkpoint directory or model name")
parser.add_argument("--tokenizer", type=str, default='../pretrained_lms/sshleifer-tiny-mbart')
parser.add_argument("--add_adapter", action='store_true', help="Add an adapter.")
parser.add_argument("--progress_bar", type=int, default=10, help="Progress bar. Good for printing")
parser.add_argument("--precision", type=int, default=32, help="Double precision (64), full precision (32) "
                                                              "or half precision (16). Can be used on CPU, "
                                                              "GPU or TPUs.")
parser.add_argument("--amp_backend", type=str, default='native', help="The mixed precision backend to "
                                                                      "use ('native' or 'apex')")
parser.add_argument("--debug", action='store_true', help="debug run")
parser.add_argument("--resume_ckpt", type=str, help="Path of a checkpoint to resume from")
parser.add_argument("--from_pretrained", type=str, default=None,
                    help="Path to a checkpoint to load model weights but not training state")
parser.add_argument('--grad_ckpt', action='store_true', help='Enable gradient checkpointing to save memory')
parser.add_argument("--attention_dropout", type=float, default=0.1, help="attention dropout")
parser.add_argument("--attention_mode", type=str, default='sliding_chunks', help="Longformer attention mode")
parser.add_argument("--attention_window", type=int, default=512, help="Attention window")
parser.add_argument("--label_smoothing", type=float, default=0.0, required=False)
parser.add_argument("--adafactor", action='store_true', help="Use adafactor optimizer")


# Values to fill in for this notebook run. Keys are option names without the
# leading "--"; any option not listed here keeps the parser default above.
notebook_args = {
    'train_data': '',
    'validation_data': '',
    'test_data': '',
    'src': '',
    'tgt': '',
    'save_dir': '',
}

# Parse an explicit argument list instead of sys.argv (which belongs to the
# notebook kernel). The "--name=value" form keeps empty values and values that
# start with "-" attached to their option.
args = parser.parse_args([f"--{name}={value}" for name, value in notebook_args.items()])

In [None]:
# Seed the Python, NumPy and PyTorch random number generators with args.seed.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(args.seed)
# When CUDA is available, seed every CUDA device as well.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(args.seed)

# Build the translation model from the run parameters.
model = LmForTranslation(args)

# Hand the model the train/validation/test data locations taken from args.
model.hf_datasets = {'train': args.train_data,
                     'validation': args.validation_data,
                     'test': args.test_data}
print(model.hf_datasets)

logger = TestTubeLogger(
    save_dir=args.save_dir,
    name=args.save_prefix,
    version=0  # always use version=0
)

# Keep only the single best checkpoint according to the 'BLEU' metric (higher is better).
# NOTE(review): the effect of period=0 depends on the installed pytorch-lightning
# version (some 1.x releases may treat period < 1 as "never save") - confirm.
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(args.save_dir, args.save_prefix, "checkpoints"),
    save_top_k=1,
    verbose=True,
    monitor='BLEU',
    mode='max',
    period=0
)

print(args)

args.dataset_size = 203037  # hardcode dataset size. Needed to compute number of steps for the lr scheduler

# `callbacks` takes a list of callbacks. When checkpointing is disabled, pass an
# empty list and also switch off the Trainer's `checkpoint_callback` flag so that
# no default checkpoint callback is created (pytorch-lightning 1.x Trainer API).
enable_checkpointing = not args.disable_checkpointing

trainer = pl.Trainer(gpus=args.gpus, distributed_backend='ddp' if torch.cuda.is_available() else None,
                     track_grad_norm=-1,
                     # Debug runs stop after a single training step (max_steps=1).
                     max_epochs=args.epochs if not args.debug else 100,
                     max_steps=None if not args.debug else 1,
                     replace_sampler_ddp=False,
                     accumulate_grad_batches=args.grad_accum,
                     gradient_clip_val=args.max_grad_norm,
                     val_check_interval=args.val_every if not args.debug else 1,
                     num_sanity_val_steps=2 if not args.debug else 0,
                     check_val_every_n_epoch=1,
                     logger=logger,
                     callbacks=[checkpoint_callback] if enable_checkpointing else [],
                     checkpoint_callback=enable_checkpointing,
                     progress_bar_refresh_rate=args.progress_bar,
                     precision=args.precision,
                     amp_backend=args.amp_backend, amp_level='O2',
                     resume_from_checkpoint=args.resume_ckpt,
                     )
# Train unless --test was given; the test set is evaluated in either case.
if not args.test:
    trainer.fit(model)
trainer.test(model)