### Prerequisite installations (RESTART RUNTIME AFTER THESE STEPS)

In [None]:
pip install transformers[sentencepiece] datasets evaluate sacrebleu

In [2]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3


### After restarting the runtime, log in to Hugging Face

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Training models for 3 datasets

1.   Sample dataset (English to codemixed, 5 sentence pairs)
2.   English to codemixed, 1m+ sentence pairs
3.   Hindi to codemixed, 1m+ sentence pairs




### Common helper functions

In [2]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer, the data collator, and the model load below.
checkpoint = "google/mt5-small"
# Tokenizer for the checkpoint; later cells use it both to tokenize the
# sentence pairs and to decode predictions/labels inside compute_metrics.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]



In [3]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to every Seq2SeqTrainer in this notebook.
# NOTE(review): `model` receives the checkpoint *name* (a str), not a loaded
# model object; the model itself is only instantiated in a later cell.
# Presumably the collator then skips any model-specific preparation of
# decoder inputs — confirm whether passing the loaded model is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [4]:
import evaluate

# SacreBLEU metric object; compute_metrics calls metric.compute(...) on the
# decoded predictions and references and reports its "score" as "bleu".
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [5]:
import numpy as np


def postprocess_text(preds, labels):
    """Normalize decoded text before it is scored.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A tuple ``(preds, labels)`` where ``preds`` is a list of
        whitespace-stripped strings and ``labels`` is a list of
        single-element lists, each holding one whitespace-stripped reference.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    # Each reference is wrapped in its own list (one reference per prediction).
    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return stripped_preds, wrapped_refs


def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            instead be a tuple, in which case its first element is used.

    Returns:
        dict with ``"bleu"`` (the sacrebleu ``score``) and ``"gen_len"`` (mean
        count of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. It is replaced by the pad token id in the predictions as well
    # as in the labels; this also keeps such positions out of gen_len below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad token ids in each prediction row.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [6]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the seq2seq model weights for the shared checkpoint; this object is
# what the sample-dataset Seq2SeqTrainer below fine-tunes.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
model

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

### 1. Sample dataset (English to codemixed, 5 sentence pairs)

In [8]:
arr = {'id': ['0', '1', '2', '3', '4'],
 'translation': [{'en': 'She takes a job of teacher in the town .',
   'en-hi': 'She the town में teacher की नौकरी करती है।'},
  {'en': 'The scheduled tribes communities are concentrated in two belts .',
   'en-hi': 'अनुसूचित जनजातियां in two belts ही केंद्रित हैं।'},
  {'en': 'They will declare the names in next few days .',
   'en-hi': 'will declare the इनके नाम सार्वजनिक in next कुछ दिनों'},
  {'en': 'It is a pleasure working with him .',
   'en-hi': 'उनके साथ काम करके a pleasure It है।'},
  {'en': 'Committees have been formed in every district of the state .',
   'en-hi': 'वार्ड have been formed in सभी जिलों of the प्रदेश .'}]}

In [9]:
import pandas as pd
dt = pd.DataFrame(arr)
dt

Unnamed: 0,id,translation
0,0,{'en': 'She takes a job of teacher in the town...
1,1,{'en': 'The scheduled tribes communities are c...
2,2,{'en': 'They will declare the names in next fe...
3,3,"{'en': 'It is a pleasure working with him .', ..."
4,4,{'en': 'Committees have been formed in every d...


In [10]:
from datasets import Dataset

dataset = Dataset.from_pandas(dt)
dataset

Dataset({
    features: ['id', 'translation'],
    num_rows: 5
})

In [11]:
# Fixed seed so the 80/20 train/test split is the same on every run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 4
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 1
    })
})

In [13]:
dataset["train"][0]

{'id': '4',
 'translation': {'en': 'Committees have been formed in every district of the state .',
  'en-hi': 'वार्ड have been formed in सभी जिलों of the प्रदेश .'}}

In [14]:
source_lang = "en"
target_lang = "en-hi"
prefix = ""

def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Reads the module-level ``source_lang``, ``target_lang``, ``prefix`` and
    ``tokenizer`` at call time.

    Args:
        examples: batch with a ``"translation"`` entry holding one dict per
            sentence pair, keyed by language code.

    Returns:
        The tokenizer output for the prefixed source texts, with the target
        texts passed as ``text_target``; both truncated to 128 tokens.
    """
    pairs = examples["translation"]
    source_texts = [prefix + pair[source_lang] for pair in pairs]
    target_texts = [pair[target_lang] for pair in pairs]
    return tokenizer(source_texts, text_target=target_texts, max_length=128, truncation=True)

In [15]:
tkdataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
tkdataset["train"][:5]

In [17]:
# Training configuration for the 5-sentence sample run (a pipeline smoke test).
training_args = Seq2SeqTrainingArguments(
    # Local output directory; the recorded run also cloned a hub repository
    # with this name because push_to_hub is enabled.
    output_dir="sample_en_enhi_mt_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    # compute_metrics decodes predictions as token ids, so evaluation is
    # expected to produce generated sequences rather than raw logits.
    predict_with_generate=True,
    fp16=False,
    push_to_hub=True,
)

# Wire together the shared model, tokenizer, collator and metric function
# with the tokenized sample splits.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tkdataset["train"],
    eval_dataset=tkdataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tunes `model` in place; the returned TrainOutput is the cell's output.
trainer.train()

Cloning https://huggingface.co/kapilrk04/sample_en_enhi_mt_model into local empty directory.
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,29.286041,0.0,2.0
2,No log,28.89543,0.0,2.0


TrainOutput(global_step=2, training_loss=26.426279067993164, metrics={'train_runtime': 24.2326, 'train_samples_per_second': 0.33, 'train_steps_per_second': 0.083, 'total_flos': 140449136640.0, 'train_loss': 26.426279067993164, 'epoch': 2.0})

### 2. English to codemixed, 1m+ sentence pairs

In [19]:
from datasets import load_dataset

en_dataset = load_dataset("kapilrk04/codemix-en_enhi", use_auth_token=True)

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/kapilrk04___parquet/kapilrk04--codemix-en_enhi-8fa8773b3690eaa4/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/37.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1253785 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/kapilrk04___parquet/kapilrk04--codemix-en_enhi-8fa8773b3690eaa4/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
en_dataset["train"][0]

In [20]:
# Fixed seed so the 80/20 train/test split is the same on every run.
en_dataset = en_dataset["train"].train_test_split(test_size=0.2, seed=42)

In [21]:
en_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 1003028
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 250757
    })
})

In [22]:
source_lang = "en"
target_lang = "en-hi"
prefix = ""


def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Reads the module-level ``source_lang``, ``target_lang``, ``prefix`` and
    ``tokenizer`` at call time.

    Args:
        examples: batch with a ``"translation"`` entry holding one dict per
            sentence pair, keyed by language code.

    Returns:
        The tokenizer output for the prefixed source texts, with the target
        texts passed as ``text_target``; both truncated to 128 tokens.
    """
    pairs = examples["translation"]
    source_texts = [prefix + pair[source_lang] for pair in pairs]
    target_texts = [pair[target_lang] for pair in pairs]
    return tokenizer(source_texts, text_target=target_texts, max_length=128, truncation=True)

In [23]:
tokenized_en_dataset = en_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1003028 [00:00<?, ? examples/s]

Map:   0%|          | 0/250757 [00:00<?, ? examples/s]

In [24]:
tokenized_en_dataset["train"][:5]

{'id': ['171259', '397185', '474547', '800965', '186531'],
 'translation': [{'en': 'He was apparently referring to the Congress and left parties .',
   'en-hi': 'वह संभवत: referring to the कांग्रेस and वामपंथी parties थे।'},
  {'en': 'Sister , daughter of Farooq Abdullah released on bail',
   'en-hi': 'Farooq अब्दुल्ला की Sister और daughter bail पर रिहा'},
  {'en': 'This so-called Gandhi family can never think beyond benefits .',
   'en-hi': 'यह कथित गाँधी family can नहीं think beyond फायदों'},
  {'en': 'Kerala has been facing the worst flooding in a century .',
   'en-hi': 'केरल a century में सबसे flooding जूझ रहा है।'},
  {'en': 'Now a nation-wide campaign for LED lights is being run .',
   'en-hi': 'अभी a पूरे देश में campaign for एलईडी lights is being चल'}],
 'input_ids': [[1669,
   639,
   259,
   56725,
   484,
   259,
   108564,
   288,
   287,
   26483,
   305,
   12255,
   259,
   48051,
   259,
   260,
   1],
  [92871,
   259,
   261,
   46378,
   304,
   76119,
   26082,
   

In [None]:
# Start from freshly loaded checkpoint weights: the previous `model` object
# has already been updated in place by the sample-dataset training run, and
# this section trains its own English -> code-mixed model.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Training configuration for the full English -> code-mixed dataset.
training_args = Seq2SeqTrainingArguments(
    output_dir="mt5_based_en_enhi_mt_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    # compute_metrics decodes predictions as token ids.
    predict_with_generate=True,
    fp16=False,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_en_dataset["train"],
    eval_dataset=tokenized_en_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

### 3. Hindi to codemixed, 1m+ sentence pairs

In [26]:
from datasets import load_dataset

hi_dataset = load_dataset("kapilrk04/codemix-hi_enhi", use_auth_token=True)

Downloading readme:   0%|          | 0.00/470 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/kapilrk04___parquet/kapilrk04--codemix-hi_enhi-f28f98485267408c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1254032 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/kapilrk04___parquet/kapilrk04--codemix-hi_enhi-f28f98485267408c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
hi_dataset["train"][0]

In [27]:
# Fixed seed so the 80/20 train/test split is the same on every run.
hi_dataset = hi_dataset["train"].train_test_split(test_size=0.2, seed=42)

In [28]:
hi_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 1003225
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 250807
    })
})

In [29]:
source_lang = "hi"
target_lang = "en-hi"
prefix = ""


def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Reads the module-level ``source_lang``, ``target_lang``, ``prefix`` and
    ``tokenizer`` at call time.

    Args:
        examples: batch with a ``"translation"`` entry holding one dict per
            sentence pair, keyed by language code.

    Returns:
        The tokenizer output for the prefixed source texts, with the target
        texts passed as ``text_target``; both truncated to 128 tokens.
    """
    pairs = examples["translation"]
    source_texts = [prefix + pair[source_lang] for pair in pairs]
    target_texts = [pair[target_lang] for pair in pairs]
    return tokenizer(source_texts, text_target=target_texts, max_length=128, truncation=True)

In [30]:
tokenized_hi_dataset = hi_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1003225 [00:00<?, ? examples/s]

Map:   0%|          | 0/250807 [00:00<?, ? examples/s]

In [None]:
tokenized_hi_dataset["train"][:5]

In [None]:
# Start from freshly loaded checkpoint weights: the previous `model` object
# has already been fine-tuned in place by the earlier training cells, and
# this section trains a separate Hindi -> code-mixed model.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Training configuration for the full Hindi -> code-mixed dataset.
training_args = Seq2SeqTrainingArguments(
    # Distinct directory / hub repo name so this run does not overwrite the
    # English -> code-mixed model saved under "mt5_based_en_enhi_mt_model".
    output_dir="mt5_based_hi_enhi_mt_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    # compute_metrics decodes predictions as token ids.
    predict_with_generate=True,
    fp16=False,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_hi_dataset["train"],
    eval_dataset=tokenized_hi_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()