In [None]:
# Install the libraries this notebook depends on.
# transformers and datasets are built from the `master` branch of their GitHub
# repositories; sentencepiece is installed by package name.
# NOTE(review): none of these installs is pinned to a release, so a later run may
# pull different library versions than the ones this notebook was executed with.
!pip install git+https://github.com/huggingface/transformers.git@master
!pip install git+https://github.com/huggingface/datasets.git@master
!pip install sentencepiece

In [1]:
from datasets import load_dataset
from transformers import (
    MBartForConditionalGeneration, MBartTokenizer, 
    Seq2SeqTrainingArguments, Seq2SeqTrainer
  )

import torch
from torch.utils.data import random_split

## Preparing data

We will be using the IITB parallel corpus (hi-en) for this example.

In [2]:
## let's download and load the dataset using 🤗 datasets

# data = load_dataset("iitb_en_hi")
# data = data["train"]
# data

In [3]:
# 🤗 datasets had not yet merged the PR adding the IITB parallel corpus,
# so the archive is downloaded directly from the corpus site instead.
# `-c` lets wget resume a partially downloaded file rather than starting over.
!wget -c "http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.tgz"

--2020-12-25 05:45:37--  http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.tgz
Resolving www.cfilt.iitb.ac.in (www.cfilt.iitb.ac.in)... 103.21.127.130
Connecting to www.cfilt.iitb.ac.in (www.cfilt.iitb.ac.in)|103.21.127.130|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 100235015 (96M) [application/x-gzip]
Saving to: ‘parallel.tgz’


2020-12-25 05:46:04 (3.65 MB/s) - ‘parallel.tgz’ saved [100235015/100235015]



In [10]:
# Extract the downloaded corpus archive (.tgz) into the current directory.
import tarfile

# The `with` block closes the archive even if extraction raises part-way through.
# NOTE(review): extractall() trusts the member paths stored inside the archive,
# so only run it on archives that come from a trusted source.
with tarfile.open('parallel.tgz') as my_tar:
    my_tar.extractall('.')  # specify which folder to extract to

In [11]:
# Build one {"translation": {"hi": ..., "en": ...}} record per aligned line pair
# of the two corpus files.
# The files are opened as UTF-8 explicitly: the Hindi side is non-ASCII text, and
# relying on the platform's default encoding can fail or mis-decode it.
with open("parallel/IITB.en-hi.en", encoding="utf-8") as en_file, \
     open("parallel/IITB.en-hi.hi", encoding="utf-8") as hi_file:
    # zip() pairs line i of the Hindi file with line i of the English file and
    # stops at the end of the shorter file.
    data = [
        {"translation": {"hi": hi_line.strip(), "en": en_line.strip()}}
        for hi_line, en_line in zip(hi_file, en_file)
    ]
print(f'total size of data is {len(data)}')

total size of data is 1609682


In [17]:
# Split the records into train and validation subsets.
split = 0.2  # fraction of the records held out for validation

# Derive the train size by subtraction so the two lengths always add up to
# len(data). Computing it as int((1 - split) * len(data)) + 1 overshoots whenever
# split * len(data) is a whole number, and random_split then raises ValueError.
eval_size = int(split * len(data))
train_size = len(data) - eval_size

# A seeded generator makes the split identical on every fresh run.
train_dataset, eval_dataset = random_split(
    data,
    lengths=[train_size, eval_size],
    generator=torch.Generator().manual_seed(42),
)

In [18]:
# defining collator function for preparing batches on the fly ..

def data_collator(features: list, max_length: int = 32, max_target_length: int = 32):
  """Tokenize a batch of Hindi -> English translation records into tensors.

  Args:
    features: records shaped like {"translation": {"hi": <source>, "en": <target>}}.
    max_length: maximum source length forwarded to the tokenizer
      (defaults to 32, the value that used to be hard-coded).
    max_target_length: maximum target length forwarded to the tokenizer
      (defaults to 32, the value that used to be hard-coded).

  Returns:
    The object returned by ``tokenizer.prepare_seq2seq_batch`` with every entry
    converted to a ``torch.Tensor``.

  Note:
    Reads the module-level ``tokenizer``, which must be defined before the
    collator is first called.
  """
  inputs = [f["translation"]["hi"] for f in features]
  labels = [f["translation"]["en"] for f in features]

  batch = tokenizer.prepare_seq2seq_batch(
      src_texts=inputs,
      src_lang="hi_IN",
      tgt_texts=labels,
      tgt_lang="en_XX",
      max_length=max_length,
      max_target_length=max_target_length,
  )

  # NOTE(review): torch.tensor() needs rows of equal length, so this assumes the
  # tokenizer pads every sequence in the batch to a common length — confirm.
  for key in batch:
    batch[key] = torch.tensor(batch[key])

  return batch

## Initializing the model and trainer

In [19]:
# Load the pretrained mBART checkpoint and the tokenizer that belongs to it.
checkpoint_name = "facebook/mbart-large-cc25"
model = MBartForConditionalGeneration.from_pretrained(checkpoint_name)
tokenizer = MBartTokenizer.from_pretrained(checkpoint_name)

# Training configuration: train and evaluate, with evaluation run once per epoch.
# NOTE(review): logging_dir is an absolute path at the filesystem root — confirm
# that this location is intended and writable in the target environment.
args = Seq2SeqTrainingArguments(
    output_dir="indic-mbart",
    logging_dir="/logs",
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    num_train_epochs=2,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1184.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2444517405.0, style=ProgressStyle(descr…




Some weights of MBartForConditionalGeneration were not initialized from the model checkpoint at facebook/mbart-large-cc25 and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [21]:
# Wire the model, the training arguments, the collator and both dataset splits
# into a 🤗 Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

## Training time

In [None]:
# Start training with the trainer built in the previous cell.
trainer.train()
# It will take hours to train this model on this dataset

In [27]:
## let's save the model

# trainer.save_model("mbart-iitb-hin-eng")

## remember to link Google Drive before saving, otherwise the session may crash


**Voila :)** The model is finally trained. Let's push the weights to the 🤗 hub.



# Inference time. 
Let's load the model from the hub and use it for inference with the 🤗 pipeline.

In [None]:
from transformers import pipeline

# Hub repository id of the fine-tuned checkpoint; the same id is used to load
# both the model weights and the tokenizer.
model_id = "vasudevgupta/mbart-iitb-hin-eng"
translator = pipeline(
    task="translation_hi_to_en",
    model=model_id,
    tokenizer=model_id,
)

In [2]:
# let's see how our model performs on a sample Hindi sentence
# (the literal is stored as real Devanagari text, not as mis-decoded bytes, so
# the model receives valid Hindi input)
inputs = "अंतिम प्रविष्ट घटना को हाइलाइट करो"

# The comprehension expects a list of dicts from the pipeline and keeps only the
# translated strings.
translation = translator(inputs, return_text=True)
translation = [t["translation_text"] for t in translation]
print(translation)

['Highlight last visited event']
