In [1]:
!pip install datasets transformers[sentencepiece] sacrebleu -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

In [18]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-mul"

In [19]:
raw_datasets = load_dataset("pavan-naik/en_to_kn")

In [20]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['english', 'kannada'],
        num_rows: 611
    })
    test: Dataset({
        features: ['english', 'kannada'],
        num_rows: 153
    })
})

In [21]:
raw_datasets['train'][1]

{'english': 'Call the fire department.',
 'kannada': 'ಅಗ್ನಿಶಾಮಕ ಇಲಾಖೆಗೆ ಕರೆ ಮಾಡಿ.'}

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [23]:
tokenizer("Hello, this is a sentence!")

{'input_ids': [13078, 3, 72, 17, 9, 18802, 58, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [24]:
max_input_length = 128
max_target_length = 128

source_lang = "en"
target_lang = "hi"


def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)


    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [25]:
preprocess_function(raw_datasets["train"][:2])

KeyError: 'translation'

In [56]:
tokenized_train_dataset = raw_datasets['train'].map(preprocess_function, batched=True)
tokenized_eval_dataset = raw_datasets['test'].map(preprocess_function, batched=True)

Map:   0%|          | 0/611 [00:00<?, ? examples/s]



Map:   0%|          | 0/153 [00:00<?, ? examples/s]

In [26]:
max_input_length = 128
max_target_length = 128

source_lang = "english"
target_lang = "kannada"


def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)


    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

**Reasoning**:
The subtask requires updating the `source_lang` and `target_lang` variables to 'english' and 'kannada' respectively. This will be done by modifying the existing code cell that defines these variables.



In [27]:
max_input_length = 128
max_target_length = 128

source_lang = "english"
target_lang = "kannada"


def preprocess_function(examples):
    inputs = [ex["english"] for ex in examples]
    targets = [ex["kannada"] for ex in examples]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)


    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

**Reasoning**:
The subtask requires updating the `source_lang` and `target_lang` variables to 'english' and 'kannada' respectively. This will be done by modifying the existing code cell that defines these variables.



In [41]:
max_input_length = 128
max_target_length = 128

source_lang = "english"
target_lang = "kannada"


def preprocess_function(examples):
    inputs = [ex["english"] for ex in examples]
    targets = [ex["kannada"] for ex in examples]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)


    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [42]:
max_input_length = 128
max_target_length = 128

source_lang = "english"
target_lang = "kannada"


def preprocess_function(examples):
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)


    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [43]:
preprocess_function(raw_datasets["train"][:2])



{'input_ids': [[21, 437, 1237, 7, 5580, 3146, 17793, 2, 0], [13992, 5, 2645, 27244, 2, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], 'labels': [[20235, 5121, 22582, 4, 12902, 12280, 1842, 4040, 2263, 2924, 3174, 1945, 2924, 25071, 2263, 7825, 2, 0], [2076, 2924, 3543, 1512, 1842, 5756, 1521, 2772, 2718, 5121, 1945, 1521, 21665, 16946, 2510, 9446, 4, 30507, 2, 0]]}

In [32]:
max_input_length = 128
max_target_length = 128

source_lang = "english"
target_lang = "kannada"


def preprocess_function(examples):
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Using text_target for label tokenization to address deprecation warning
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [33]:
max_input_length = 128
max_target_length = 128

source_lang = "english"
target_lang = "kannada"


def preprocess_function(examples):
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Using text_target for label tokenization to address deprecation warning
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [34]:
preprocess_function(raw_datasets["train"][:2])

{'input_ids': [[21, 437, 1237, 7, 5580, 3146, 17793, 2, 0], [13992, 5, 2645, 27244, 2, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], 'labels': [[20235, 5121, 22582, 4, 12902, 12280, 1842, 4040, 2263, 2924, 3174, 1945, 2924, 25071, 2263, 7825, 2, 0], [2076, 2924, 3543, 1512, 1842, 5756, 1521, 2772, 2718, 5121, 1945, 1521, 21665, 16946, 2510, 9446, 4, 30507, 2, 0]]}

In [35]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

tf_model.h5:   0%|          | 0.00/311M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-mul.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [36]:
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1

In [37]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [44]:
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [57]:
train_dataset = model.prepare_tf_dataset(
    tokenized_train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

In [58]:
validation_dataset = model.prepare_tf_dataset(
    tokenized_eval_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

In [59]:
generation_dataset = model.prepare_tf_dataset(
    tokenized_eval_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [53]:
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer, metrics=["accuracy"])

In [62]:
model.fit(train_dataset, validation_data=validation_dataset, epochs=1)



<tf_keras.src.callbacks.History at 0x7e68007cf230>

In [68]:
model.save_pretrained("tf_model1/")

In [69]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model1/")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at tf_model1/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [None]:
input_text  = "dog is an animal"

tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

In [None]:
with tokenizer.as_target_tokenizer():
    print(tokenizer.decode(out[0], skip_special_tokens=True))

In [93]:
tokenized_train_dataset = raw_datasets['train'].map(preprocess_function, batched=True)
tokenized_eval_dataset = raw_datasets['test'].map(preprocess_function, batched=True)

Map:   0%|          | 0/611 [00:00<?, ? examples/s]



Map:   0%|          | 0/153 [00:00<?, ? examples/s]

In [94]:
max_input_length = 128
max_target_length = 128

source_lang = "english"
target_lang = "kannada"


def preprocess_function(examples):
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Using text_target for label tokenization to address deprecation warning
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [96]:
tokenized_train_dataset = raw_datasets['train'].map(preprocess_function, batched=True)
tokenized_eval_dataset = raw_datasets['test'].map(preprocess_function, batched=True)

Map:   0%|          | 0/611 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]