In [1]:
!pip install translate-toolkit transformers sentencepiece


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting translate-toolkit
  Downloading translate_toolkit-3.7.1-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 5.3 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 52.7 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 58.2 MB/s 
[?25hCollecting lxml>=4.6.3
  Downloading lxml-4.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 35.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>

In [2]:
!wget -O dataset.txt.zip https://opus.nlpl.eu/download.php?f=TEP/v1/moses/en-fa.txt.zip
!unzip dataset.txt.zip


--2022-07-14 00:08:33--  https://opus.nlpl.eu/download.php?f=TEP/v1/moses/en-fa.txt.zip
Resolving opus.nlpl.eu (opus.nlpl.eu)... 193.166.25.9
Connecting to opus.nlpl.eu (opus.nlpl.eu)|193.166.25.9|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://object.pouta.csc.fi/OPUS-TEP/v1/moses/en-fa.txt.zip [following]
--2022-07-14 00:08:35--  https://object.pouta.csc.fi/OPUS-TEP/v1/moses/en-fa.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16353318 (16M) [application/zip]
Saving to: ‘dataset.txt.zip’


2022-07-14 00:08:37 (10.3 MB/s) - ‘dataset.txt.zip’ saved [16353318/16353318]

Archive:  dataset.txt.zip
  inflating: TEP.en-fa.en            
  inflating: TEP.en-fa.fa            
  inflating: README                  


In [3]:
from google.colab import drive
from IPython.display import display
from IPython.html import widgets
import matplotlib.pyplot as plt
import numpy as np
import random
import seaborn as sns
import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm_notebook

sns.set()



In [4]:

with open("TEP.en-fa.en", 'rb') as enfile:
  en_lines_list = [line.decode("UTF-8").rstrip() for line in enfile]

with open('TEP.en-fa.fa') as fafile:
  fa_lines_list = [line.rstrip() for line in fafile]

# { sentence_id: 1, translation: {"en": "hello", "fa": "سلام"}
sentences = []
for i in range(0, len(en_lines_list)):
  sentences.append({
        "en": en_lines_list[i],
        "fa": fa_lines_list[i]
  })

print("len en lines: ", len(en_lines_list))
print("len fa lines: ", len(fa_lines_list))
print("first en sentence: ", en_lines_list[0])
print("first fa sentence: ", fa_lines_list[0])

len en lines:  612086
len fa lines:  612086
first en sentence:  raspy breathing .
first fa sentence:  صداي خر خر .


In [5]:
sentences[2:6]

[{'en': 'maybe its the wind .', 'fa': 'شايد صداي باد باشه .'},
 {'en': 'no .', 'fa': 'نه .'},
 {'en': 'stop please stop .',
  'fa': 'دست نگه داريد خواهش ميکنم دست نگه داريد .'},
 {'en': 'you have a week , evans then well burn the house .',
  'fa': 'اوانز تو فقط يک هفته وقت داري وگرنه خونتو خواهيم سوزوند .'}]

In [6]:
random.shuffle(sentences)
train_slice = int(len(sentences) * 0.8)
train_dataset = sentences[:train_slice]
test_dataset = sentences[train_slice:]

In [7]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
model_repo = 'google/mt5-small'
model_path = "/content/gdrive/MyDrive/mt5_translation.pt"
max_seq_len = 20
# model.config.maxlength = 40

# Load Tokenizer & Model


In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=False)

Downloading:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [10]:
# Model description: https://huggingface.co/google/mt5-base
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo)
model = model.cuda()

Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [11]:
LANG_TOKEN_MAPPING = {
    'en': '<en>',
    'fa': '<fa>',
}

In [12]:
special_tokens_dict = {'additional_special_tokens': list(LANG_TOKEN_MAPPING.values())}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(250102, 512)

In [13]:
def encode_input_str(text, target_lang, tokenizer, seq_len,
                     lang_token_map=LANG_TOKEN_MAPPING):
  target_lang_token = lang_token_map[target_lang]

  # Tokenize and add special tokens
  input_ids = tokenizer.encode(
      text = target_lang_token + text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)

  return input_ids[0]
  
def encode_target_str(text, tokenizer, seq_len,
                      lang_token_map=LANG_TOKEN_MAPPING):
  token_ids = tokenizer.encode(
      text = text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)
  
  return token_ids[0]

def format_translation_data(translations, input_lang, target_lang,
                            tokenizer, seq_len=128):

  # Get the translations for the batch
  input_text = translations[input_lang]
  target_text = translations[target_lang]

  if input_text is None or target_text is None:
    return None

  input_token_ids = encode_input_str(
      input_text, target_lang, tokenizer, seq_len, LANG_TOKEN_MAPPING)
  
  target_token_ids = encode_target_str(
      target_text, tokenizer, seq_len, LANG_TOKEN_MAPPING)

  return input_token_ids, target_token_ids

def transform_batch(batch, input_lang, target_lang, tokenizer):
  inputs = []
  targets = []
  for translation_set in batch:
    formatted_data = format_translation_data(
        translation_set, input_lang, target_lang, tokenizer, max_seq_len)
    
    if formatted_data is None:
      continue
    
    input_ids, target_ids = formatted_data
    inputs.append(input_ids.unsqueeze(0))
    targets.append(target_ids.unsqueeze(0))
    
  batch_input_ids = torch.cat(inputs).cuda()
  batch_target_ids = torch.cat(targets).cuda()

  return batch_input_ids, batch_target_ids

def get_data_generator(dataset, input_lang, target_lang, tokenizer, batch_size=32):
  random.shuffle(dataset)
  for i in range(0, len(dataset), batch_size):
    raw_batch = dataset[i:i+batch_size]
    yield transform_batch(raw_batch, input_lang, target_lang, tokenizer)

In [None]:
# Testing `data_transform`
in_ids, out_ids = format_translation_data(
    train_dataset[0], "en", "fa", tokenizer)

print(' '.join(tokenizer.convert_ids_to_tokens(in_ids)))
print(' '.join(tokenizer.convert_ids_to_tokens(out_ids)))

# Testing data generator
data_gen = get_data_generator(train_dataset, "en", "fa", tokenizer, 8)
data_batch = next(data_gen)
print('Input shape:', data_batch[0].shape)
print('Output shape:', data_batch[1].shape)

<fa> ▁we ▁know ▁just ▁what ▁you ▁need ▁ . </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
▁ما ▁ دقيق ا ▁مي دوني م ▁تو ▁چي ▁مي خوا ي ▁ . </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <p

In [14]:
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [15]:
# Constants
n_epochs = 4
batch_size = 64
print_freq = 50
checkpoint_freq = 1000
lr = 5e-3
n_batches = int(np.ceil(len(train_dataset) / batch_size))
total_steps = n_epochs * n_batches
n_warmup_steps = int(total_steps * 0.01)

In [16]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer, n_warmup_steps, total_steps)



In [17]:
losses = []

In [18]:
def eval_model(model, gdataset, max_iters=8):
  test_generator = get_data_generator(gdataset, "en", "fa",
                                      tokenizer, batch_size)
  eval_losses = []
  for i, (input_batch, label_batch) in enumerate(test_generator):
    if i >= max_iters:
      break

    model_out = model.forward(
        input_ids = input_batch,
        labels = label_batch)
    eval_losses.append(model_out.loss.item())

  return np.mean(eval_losses)

In [None]:
for epoch_idx in range(n_epochs):
  # Randomize data order
  data_generator = get_data_generator(train_dataset, "en", "fa",
                                      tokenizer, batch_size)
                
  for batch_idx, (input_batch, label_batch) \
      in tqdm_notebook(enumerate(data_generator), total=n_batches):
    optimizer.zero_grad()

    # Forward pass
    model_out = model.forward(
        input_ids = input_batch,
        labels = label_batch)

    # Calculate loss and update weights
    loss = model_out.loss
    losses.append(loss.item())
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Print training update info
    if (batch_idx + 1) % print_freq == 0:
      avg_loss = np.mean(losses[-print_freq:])
      print('Epoch: {} | Step: {} | Avg. loss: {:.3f} | lr: {}'.format(
          epoch_idx+1, batch_idx+1, avg_loss, scheduler.get_last_lr()[0]))
      
    if (batch_idx + 1) % checkpoint_freq == 0:
      test_loss = eval_model(model, test_dataset)
      print('Saving model with test loss of {:.3f}'.format(test_loss))
      if abs(test_loss - avg_loss) < 2:
        torch.save(model.state_dict(), model_path)

torch.save(model.state_dict(), '/content/gdrive/MyDrive/final.pt')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/7652 [00:00<?, ?it/s]

Epoch: 1 | Step: 50 | Avg. loss: 1.078 | lr: 0.0008169934640522876
Epoch: 1 | Step: 100 | Avg. loss: 1.130 | lr: 0.0016339869281045752
Epoch: 1 | Step: 150 | Avg. loss: 1.326 | lr: 0.0024509803921568627
Epoch: 1 | Step: 200 | Avg. loss: 1.449 | lr: 0.0032679738562091504
Epoch: 1 | Step: 250 | Avg. loss: 1.561 | lr: 0.004084967320261438
Epoch: 1 | Step: 300 | Avg. loss: 1.607 | lr: 0.004901960784313725
Epoch: 1 | Step: 350 | Avg. loss: 1.620 | lr: 0.004992739753151607
Epoch: 1 | Step: 400 | Avg. loss: 1.606 | lr: 0.0049844894726420695
Epoch: 1 | Step: 450 | Avg. loss: 1.611 | lr: 0.0049762391921325325
Epoch: 1 | Step: 500 | Avg. loss: 1.571 | lr: 0.004967988911622995
Epoch: 1 | Step: 550 | Avg. loss: 1.552 | lr: 0.0049597386311134575
Epoch: 1 | Step: 600 | Avg. loss: 1.567 | lr: 0.0049514883506039204
Epoch: 1 | Step: 650 | Avg. loss: 1.562 | lr: 0.004943238070094383
Epoch: 1 | Step: 700 | Avg. loss: 1.533 | lr: 0.004934987789584846
Epoch: 1 | Step: 750 | Avg. loss: 1.561 | lr: 0.0049267

  0%|          | 0/7652 [00:00<?, ?it/s]

Epoch: 2 | Step: 50 | Avg. loss: 1.872 | lr: 0.0037796185070292387
Epoch: 2 | Step: 100 | Avg. loss: 1.842 | lr: 0.003771368226519702
Epoch: 2 | Step: 150 | Avg. loss: 1.837 | lr: 0.0037631179460101646
Epoch: 2 | Step: 200 | Avg. loss: 1.808 | lr: 0.003754867665500627
Epoch: 2 | Step: 250 | Avg. loss: 1.786 | lr: 0.0037466173849910896
Epoch: 2 | Step: 300 | Avg. loss: 1.801 | lr: 0.0037383671044815526
Epoch: 2 | Step: 350 | Avg. loss: 1.798 | lr: 0.003730116823972015
Epoch: 2 | Step: 400 | Avg. loss: 1.770 | lr: 0.0037218665434624776
Epoch: 2 | Step: 450 | Avg. loss: 1.747 | lr: 0.0037136162629529406
Epoch: 2 | Step: 500 | Avg. loss: 1.782 | lr: 0.003705365982443403
Epoch: 2 | Step: 550 | Avg. loss: 1.745 | lr: 0.003697115701933866
Epoch: 2 | Step: 600 | Avg. loss: 1.717 | lr: 0.0036888654214243285
Epoch: 2 | Step: 650 | Avg. loss: 1.732 | lr: 0.0036806151409147915
Epoch: 2 | Step: 700 | Avg. loss: 1.696 | lr: 0.003672364860405254
Epoch: 2 | Step: 750 | Avg. loss: 1.669 | lr: 0.0036641

  0%|          | 0/7652 [00:00<?, ?it/s]

Epoch: 3 | Step: 50 | Avg. loss: 1.158 | lr: 0.0025169955778496468
Epoch: 3 | Step: 100 | Avg. loss: 1.143 | lr: 0.0025087452973401097
Epoch: 3 | Step: 150 | Avg. loss: 1.153 | lr: 0.0025004950168305722
Epoch: 3 | Step: 200 | Avg. loss: 1.152 | lr: 0.0024922447363210348
Epoch: 3 | Step: 250 | Avg. loss: 1.149 | lr: 0.0024839944558114977
Epoch: 3 | Step: 300 | Avg. loss: 1.157 | lr: 0.0024757441753019602
Epoch: 3 | Step: 350 | Avg. loss: 1.135 | lr: 0.002467493894792423
Epoch: 3 | Step: 400 | Avg. loss: 1.149 | lr: 0.0024592436142828857
Epoch: 3 | Step: 450 | Avg. loss: 1.156 | lr: 0.002450993333773348
Epoch: 3 | Step: 500 | Avg. loss: 1.139 | lr: 0.0024427430532638107
Epoch: 3 | Step: 550 | Avg. loss: 1.139 | lr: 0.0024344927727542737
Epoch: 3 | Step: 600 | Avg. loss: 1.157 | lr: 0.002426242492244736
Epoch: 3 | Step: 650 | Avg. loss: 1.144 | lr: 0.002417992211735199
Epoch: 3 | Step: 700 | Avg. loss: 1.145 | lr: 0.0024097419312256617
Epoch: 3 | Step: 750 | Avg. loss: 1.145 | lr: 0.00240