### Pre-requisite installations (RESTART RUNTIME AFTER THESE STEPS)

In [None]:
import os

# Send both the Transformers cache and the Datasets cache to the same scratch
# directory. This cell is the first one in the notebook, ahead of any
# Hugging Face import.
os.environ.update({
    'TRANSFORMERS_CACHE': '/scratch/prashantk/cache',
    'HF_DATASETS_CACHE': "/scratch/prashantk/cache",
})


In [None]:
conda install --yes pytorch torchvision -c pytorch

In [None]:
pip install transformers sentencepiece datasets evaluate sacrebleu

In [None]:
pip install protobuf

In [None]:
pip install accelerate -U

### After restarting the runtime, log in to Hugging Face

In [2]:
from huggingface_hub import notebook_login

# Shows the Hugging Face login widget. Later cells load datasets with
# use_auth_token=True and create trainers with push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
%env WANDB_NOTEBOOK_NAME = "mt_train_model.ipynb"

env: WANDB_NOTEBOOK_NAME="mt_train_model.ipynb"


In [6]:
import wandb
# Authenticate with Weights & Biases; the sample training run below reports to "wandb".
wandb.login()

True

## Training models for 3 datasets

1.   Sample dataset (English to codemixed, 5 sentence pairs)
2.   English to codemixed, 1m+ sentence pairs
3.   Hindi to codemixed, 1m+ sentence pairs




### Common helper functions

In [1]:
from transformers import AutoTokenizer

# Base checkpoint name; reused below for the data collator and the initial model load.
checkpoint = "google/mt5-small"
# Tokenizer shared by every preprocessing, collation and metric step in this notebook.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator handed to each Seq2SeqTrainer below.
# NOTE(review): `checkpoint` is the checkpoint-name string, not a model
# instance — confirm whether the collator is meant to receive the model object.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
import evaluate

# sacreBLEU metric object; compute_metrics() calls metric.compute() on decoded text.
metric = evaluate.load("sacrebleu")

In [None]:
import numpy as np


def postprocess_text(preds, labels):
    """Normalise decoded strings into the layout passed to ``metric.compute``.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: predictions with surrounding whitespace
        removed, and each stripped reference wrapped in its own one-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Relies on the notebook-level ``tokenizer``, ``metric`` and
    ``postprocess_text`` objects defined in earlier cells.

    Args:
        eval_preds: a ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        dict with ``"bleu"`` (the metric's ``score``) and ``"gen_len"`` (mean
        count of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a fill value, not a vocabulary id, and cannot be decoded. Replace
    # it with the pad token in the predictions as well as in the labels, since
    # generated sequences may be padded with it when batches are gathered.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [6]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained base checkpoint ("google/mt5-small"); this `model` is the
# one trained on the 5-pair sample dataset below.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
# Display the module tree of the loaded model.
model

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

### 1. Sample dataset (English to codemixed, 5 sentence pairs)

In [13]:
# Five hand-written English -> code-mixed (English-Hindi) pairs, used as a smoke
# test of the training pipeline. Each record has an 'id' and a 'translation'
# dict keyed by language code ('en' source, 'en-hi' target).
arr = {'id': ['0', '1', '2', '3', '4'],
 'translation': [{'en': 'She takes a job of teacher in the town .',
   'en-hi': 'She the town में teacher की नौकरी करती है।'},
  {'en': 'The scheduled tribes communities are concentrated in two belts .',
   'en-hi': 'अनुसूचित जनजातियां in two belts ही केंद्रित हैं।'},
  {'en': 'They will declare the names in next few days .',
   'en-hi': 'will declare the इनके नाम सार्वजनिक in next कुछ दिनों'},
  {'en': 'It is a pleasure working with him .',
   'en-hi': 'उनके साथ काम करके a pleasure It है।'},
  {'en': 'Committees have been formed in every district of the state .',
   'en-hi': 'वार्ड have been formed in सभी जिलों of the प्रदेश .'}]}

In [14]:
import pandas as pd
# Tabular view of the sample pairs: one row per id, with the translation dict as a single column.
dt = pd.DataFrame(arr)
dt

Unnamed: 0,id,translation
0,0,{'en': 'She takes a job of teacher in the town...
1,1,{'en': 'The scheduled tribes communities are c...
2,2,{'en': 'They will declare the names in next fe...
3,3,"{'en': 'It is a pleasure working with him .', ..."
4,4,{'en': 'Committees have been formed in every d...


In [15]:
from datasets import Dataset

# Convert the DataFrame into a datasets.Dataset with 'id' and 'translation' features.
dataset = Dataset.from_pandas(dt)
dataset

Dataset({
    features: ['id', 'translation'],
    num_rows: 5
})

In [16]:
# Hold out 20% (1 of the 5 pairs) for evaluation. A fixed seed makes the split,
# and therefore the reported metrics, reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size = 0.2, seed=42)

In [17]:
# Show the resulting train/test splits and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 4
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 1
    })
})

In [18]:
# Inspect one training record to confirm the {'id', 'translation'} layout.
dataset["train"][0]

{'id': '1',
 'translation': {'en': 'The scheduled tribes communities are concentrated in two belts .',
  'en-hi': 'अनुसूचित जनजातियां in two belts ही केंद्रित हैं।'}}

In [19]:
# Keys into each record's 'translation' dict, plus an (empty) prefix prepended to every source.
source_lang = "en"
target_lang = "en-hi"
prefix = ""

def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Reads the notebook-level ``source_lang``, ``target_lang``, ``prefix`` and
    ``tokenizer``. Sources and targets are both truncated to 128 tokens.

    Args:
        examples: a batch whose ``"translation"`` entry is a list of dicts
            keyed by language code.

    Returns:
        The tokenizer output for the batch (sources as inputs, targets passed
        as ``text_target``).
    """
    sources = []
    references = []
    for pair in examples["translation"]:
        sources.append(prefix + pair[source_lang])
        references.append(pair[target_lang])
    return tokenizer(sources, text_target=references, max_length=128, truncation=True)

In [20]:
# Tokenize both splits in batches; the original 'id' and 'translation' columns are kept.
tkdataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [21]:
# Peek at the tokenized training rows (the train split holds only 4 rows, so this shows all of them).
tkdataset["train"][:5]

{'id': ['1', '2', '3', '4'],
 'translation': [{'en': 'The scheduled tribes communities are concentrated in two belts .',
   'en-hi': 'अनुसूचित जनजातियां in two belts ही केंद्रित हैं।'},
  {'en': 'They will declare the names in next few days .',
   'en-hi': 'will declare the इनके नाम सार्वजनिक in next कुछ दिनों'},
  {'en': 'It is a pleasure working with him .',
   'en-hi': 'उनके साथ काम करके a pleasure It है।'},
  {'en': 'Committees have been formed in every district of the state .',
   'en-hi': 'वार्ड have been formed in सभी जिलों of the प्रदेश .'}],
 'input_ids': [[486,
   31499,
   285,
   92346,
   299,
   259,
   74540,
   418,
   81113,
   345,
   281,
   2956,
   65032,
   263,
   259,
   260,
   1],
  [259,
   10837,
   898,
   35656,
   265,
   287,
   41616,
   281,
   6844,
   259,
   6924,
   5382,
   259,
   260,
   1],
  [1385, 339, 259, 262, 259, 82427, 10357, 514, 4065, 259, 260, 1],
  [44663,
   263,
   783,
   2101,
   259,
   41525,
   281,
   6338,
   32388,
   304,


In [22]:
# Name of the Weights & Biases project, set via environment variable before the trainer is created.
%env WANDB_PROJECT=sample_en_enhi_mt

env: WANDB_PROJECT=sample_en_enhi_mt


In [23]:
# Training configuration for the 5-pair smoke test.
# NOTE(review): output_dir is relative ("../scratch/...") while the full-size
# run below uses the absolute "/scratch/..." — confirm this is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir="../scratch/prashantk/sample_en_enhi_mt_model",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    eval_steps = 1,  # evaluate after every optimisation step
    predict_with_generate=True,  # evaluation decodes generated sequences, as compute_metrics expects
    fp16=False,
    push_to_hub=True,
    report_to="wandb",
    run_name="sample_en_enhi_model"
)

# Uses the notebook-level model, tokenizer, data_collator and compute_metrics
# defined in earlier cells.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tkdataset["train"],
    eval_dataset=tkdataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

/home/prashantk/mt-model-training-scripts/../scratch/prashantk/sample_en_enhi_mt_model is already a clone of https://huggingface.co/kapilrk04/sample_en_enhi_mt_model. Make sure you pull the latest changes with `repo.git_pull()`.


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,33.25856,0.0,2.0
2,No log,32.914196,0.0,2.0
3,No log,33.044575,0.0,2.0


TrainOutput(global_step=3, training_loss=26.773841857910156, metrics={'train_runtime': 18.2119, 'train_samples_per_second': 0.659, 'train_steps_per_second': 0.165, 'total_flos': 210673704960.0, 'train_loss': 26.773841857910156, 'epoch': 3.0})

### 2. English to codemixed, 1m+ sentence pairs

In [8]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the model from a local directory — the same path used as output_dir by
# the English -> code-mixed trainer below. This rebinds the notebook-level `model`.
model = AutoModelForSeq2SeqLM.from_pretrained("/scratch/prashantk/mt5_based_en_enhi_mt_model")

In [None]:
from transformers import DataCollatorForSeq2Seq

# Re-creates the collator with the same arguments as the helper cell near the top of the notebook.
# NOTE(review): `checkpoint` is the checkpoint-name string, not the `model`
# loaded above — confirm whether the model object was intended here.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [10]:
# Display the module tree of the reloaded model.
model

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [11]:
from datasets import load_dataset

# English -> code-mixed corpus from the Hugging Face Hub; use_auth_token=True
# uses the credentials stored by the login cell.
en_dataset = load_dataset("kapilrk04/codemix-en_enhi", use_auth_token=True)

Found cached dataset parquet (/scratch/prashantk/cache/kapilrk04___parquet/kapilrk04--codemix-en_enhi-8fa8773b3690eaa4/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
# Show the available splits and row counts (only a 'train' split is provided).
en_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 1253785
    })
})

In [13]:
# Inspect one record to confirm the {'id', 'translation'} layout.
en_dataset["train"][0]

{'id': '1',
 'translation': {'en': 'Meanwhile , three people came there on a bike .',
  'en-hi': 'Meanwhile a bike पर three people आते दिखाई दिए।'}}

In [14]:
# Hold out 10% for evaluation. The seed is fixed so the same rows land in the
# test split in every session; without it, a model reloaded from an earlier run
# would be evaluated on a fresh random split that can overlap the rows it was
# trained on.
en_dataset = en_dataset["train"].train_test_split(test_size=0.1, seed=42)

In [15]:
# Show the train/test splits and their row counts.
en_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 1128406
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 125379
    })
})

In [16]:
# Keys into each record's 'translation' dict, plus an (empty) prefix prepended to every source.
source_lang = "en"
target_lang = "en-hi"
prefix = ""


def preprocess_function(examples):
    """Tokenize a batch of English -> code-mixed translation pairs.

    Reads the notebook-level ``source_lang``, ``target_lang``, ``prefix`` and
    ``tokenizer``. Sources and targets are both truncated to 128 tokens.

    Args:
        examples: a batch whose ``"translation"`` entry is a list of dicts
            keyed by language code.

    Returns:
        The tokenizer output for the batch (sources as inputs, targets passed
        as ``text_target``).
    """
    sources = []
    references = []
    for pair in examples["translation"]:
        sources.append(prefix + pair[source_lang])
        references.append(pair[target_lang])
    return tokenizer(sources, text_target=references, max_length=128, truncation=True)

In [17]:
# Tokenize both splits in batches; the original 'id' and 'translation' columns are kept.
tokenized_en_dataset = en_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1128406 [00:00<?, ? examples/s]

Map:   0%|          | 0/125379 [00:00<?, ? examples/s]

In [18]:
# Peek at the first five tokenized training rows.
tokenized_en_dataset["train"][:5]

{'id': ['1074812', '1092141', '124784', '599792', '164536'],
 'translation': [{'en': 'India imports palm oil from Malaysia and Indonesia .',
   'en-hi': 'भारत Malaysia and Indonesia से palm oil खरीदता है।'},
  {'en': 'There is a mythological story behind this ritual .',
   'en-hi': 'There is a mythological story इस प्रथा पीछे .'},
  {'en': 'Everyone has shot their respective segments at home .',
   'en-hi': 'सभी has shot अपने-अपने हिस्से at home .'},
  {'en': 'This is the biggest achievement for me .',
   'en-hi': 'This is the biggest बड़ी जीत मेरे लिए .'},
  {'en': 'It was Mumbais second defeat in three games .',
   'en-hi': 'second मुंबई three मैचों में यह पहली defeat'}],
 'input_ids': [[4783,
   12058,
   263,
   39317,
   10494,
   702,
   9277,
   305,
   3664,
   259,
   260,
   1],
  [5258,
   339,
   259,
   262,
   259,
   111501,
   35157,
   9891,
   259,
   25386,
   714,
   82572,
   259,
   260,
   1],
  [259,
   102162,
   1070,
   39363,
   259,
   1616,
   259,
   75105

In [18]:
# Configuration for the English -> code-mixed model. output_dir is the same
# directory the model was loaded from above.
training_args = Seq2SeqTrainingArguments(
    output_dir="/scratch/prashantk/mt5_based_en_enhi_mt_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,  # evaluation decodes generated sequences, as compute_metrics expects
    fp16=False,
    push_to_hub=True,
)

# Uses the notebook-level model, tokenizer, data_collator and compute_metrics
# defined in earlier cells.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_en_dataset["train"],
    eval_dataset=tokenized_en_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Training is disabled in this cell; the next cell only evaluates the loaded model.
# trainer.train()

/scratch/prashantk/mt5_based_en_enhi_mt_model is already a clone of https://huggingface.co/kapilrk04/mt5_based_en_enhi_mt_model. Make sure you pull the latest changes with `repo.git_pull()`.


In [19]:
# Evaluate the loaded model on the held-out split; returns the loss plus the
# BLEU / gen_len values produced by compute_metrics.
trainer.evaluate()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkapilrk-04[0m. Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 6.492368698120117,
 'eval_bleu': 0.6197,
 'eval_gen_len': 5.1377,
 'eval_runtime': 8376.9866,
 'eval_samples_per_second': 14.967,
 'eval_steps_per_second': 1.871}

### 3. Hindi to codemixed, 1m+ sentence pairs

In [26]:
from datasets import load_dataset

# Hindi -> code-mixed corpus from the Hugging Face Hub; use_auth_token=True
# uses the credentials stored by the login cell.
hi_dataset = load_dataset("kapilrk04/codemix-hi_enhi", use_auth_token=True)

Downloading readme:   0%|          | 0.00/470 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/kapilrk04___parquet/kapilrk04--codemix-hi_enhi-f28f98485267408c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1254032 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/kapilrk04___parquet/kapilrk04--codemix-hi_enhi-f28f98485267408c/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Inspect one record of the Hindi corpus.
hi_dataset["train"][0]

In [27]:
# Hold out 20% for evaluation. The seed is fixed so the split (and any metric
# computed on it) is reproducible across kernel restarts.
hi_dataset = hi_dataset["train"].train_test_split(test_size=0.2, seed=42)

In [28]:
# Show the train/test splits and their row counts.
hi_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 1003225
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 250807
    })
})

In [29]:
# Keys into each record's 'translation' dict: Hindi source, code-mixed target.
# `prefix` is an (empty) string prepended to every source sentence.
source_lang = "hi"
target_lang = "en-hi"
prefix = ""


def preprocess_function(examples):
    """Tokenize a batch of Hindi -> code-mixed translation pairs.

    Reads the notebook-level ``source_lang``, ``target_lang``, ``prefix`` and
    ``tokenizer``. Sources and targets are both truncated to 128 tokens.

    Args:
        examples: a batch whose ``"translation"`` entry is a list of dicts
            keyed by language code.

    Returns:
        The tokenizer output for the batch (sources as inputs, targets passed
        as ``text_target``).
    """
    sources = []
    references = []
    for pair in examples["translation"]:
        sources.append(prefix + pair[source_lang])
        references.append(pair[target_lang])
    return tokenizer(sources, text_target=references, max_length=128, truncation=True)

In [30]:
# Tokenize both splits in batches using the Hindi-source preprocess_function defined above.
tokenized_hi_dataset = hi_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1003225 [00:00<?, ? examples/s]

Map:   0%|          | 0/250807 [00:00<?, ? examples/s]

In [None]:
# Peek at the first five tokenized training rows.
tokenized_hi_dataset["train"][:5]

In [None]:
# Configuration for the Hindi -> code-mixed model.
# The output directory gets its own "hi_enhi" name: reusing
# "mt5_based_en_enhi_mt_model" with push_to_hub=True would write and push this
# model over the English -> code-mixed one.
training_args = Seq2SeqTrainingArguments(
    output_dir="mt5_based_hi_enhi_mt_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,  # evaluation decodes generated sequences, as compute_metrics expects
    fp16=False,
    push_to_hub=True,
)

# NOTE(review): `model` is whatever the notebook-level name currently holds. If
# the English section ran first, that is the English -> code-mixed model rather
# than a fresh "google/mt5-small" — confirm which starting point is intended.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_hi_dataset["train"],
    eval_dataset=tokenized_hi_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()