## Installing Dependencies

In [1]:
%pip install datasets evaluate transformers[sentencepiece] accelerate

Note: you may need to restart the kernel to use updated packages.


## Login Credentials

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
!git config --global user.email "sathyam.a31@gmail.com"
!git config --global user.name "iSathyam31"

## Importing the dataset

In [4]:
from datasets import load_dataset

# Load the English/Japanese language pair of the "kde4" dataset. It only ships
# with a "train" split, so an evaluation split has to be carved out afterwards.
raw_datasets = load_dataset("kde4", lang1="en", lang2="ja")

In [5]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 131429
    })
})

In [6]:
# Keep 90% of the pairs for training and hold out the rest; the fixed seed keeps
# the split reproducible across runs.
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 118286
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 13143
    })
})

In [7]:
# Rename the held-out "test" split to "validation". The guard keeps this cell
# idempotent: once the rename has happened, re-running the cell would otherwise
# fail with KeyError because "test" no longer exists.
if "test" in split_datasets:
    split_datasets["validation"] = split_datasets.pop("test")

In [8]:
split_datasets["train"][1]["translation"]

{'en': 'Krita PhotoShop Import Filter', 'ja': 'Krita ora インポートフィルタName'}

## Importing the Model

In [9]:
from transformers import pipeline

# Baseline: run the pretrained checkpoint as-is on a short UI string to see how
# it translates before any fine-tuning.
model_checkpoint = "Helsinki-NLP/opus-mt-en-jap"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")

Device set to use cuda:0


[{'translation_text': '持 っ て い る 類 , 細か な 糸 を 生 じ させ る なら ば ,'}]

In [10]:
split_datasets["train"][172]["translation"]

{'en': "Neighbors' Loved Radio", 'ja': 'ご近所さんのお気に入りラジオ'}

In [11]:
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)

[{'translation_text': '" わたし たち の 王 クレセテ , すなわち , 今 に 至 る まで 出帆 し た . この 人 が あ る の は , 異な る こと で は な い " と 言 っ て い た .'}]

## Tokenizer

In [12]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-jap"
# `return_tensors` belongs to the tokenizer *call*, not to `from_pretrained`:
# passing it here had no effect (the tokenizer still returns plain Python lists),
# so it is omitted to avoid suggesting otherwise.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [13]:
# Tokenize one training pair: the English text is the model input, while the
# Japanese text is passed through `text_target` and ends up under "labels".
sample_pair = split_datasets["train"][1]["translation"]
en_sentence, ja_sentence = sample_pair["en"], sample_pair["ja"]

inputs = tokenizer(en_sentence, text_target=ja_sentence)
inputs

{'input_ids': [16465, 2156, 22699, 22784, 3407, 7323, 1, 32478, 31, 2315, 45185, 19121, 29, 20801, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [6, 1, 6, 1, 15508, 3983, 27591, 1696, 1852, 41294, 587, 3443, 1, 0]}

In [14]:
# Tokenizing the Japanese text like a source sentence (without `text_target`)
# gives a different token sequence than the "labels" computed for the same
# sentence, which is why targets must always be passed via `text_target`.
wrong_targets = tokenizer(ja_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(inputs["labels"]))

['▁K', 'r', 'ita', '▁or', 'a', '▁', '<unk>', 'N', '<unk>', '</s>']
['▁', '<unk>', '▁', '<unk>', '▁イン', 'ポ', 'ー', 'ト', 'フ', 'ィ', 'ル', 'タ', '<unk>', '</s>']


In [15]:
max_length = 128


def preprocess_function(examples):
    """Tokenize a batch of English/Japanese translation pairs.

    Args:
        examples: A batch whose "translation" entry is a list of dicts, each
            holding the English text under "en" and the Japanese text under "ja".

    Returns:
        The tokenizer output for the batch: English sentences are encoded as the
        model inputs and Japanese sentences as the targets, both truncated to the
        module-level ``max_length``.
    """
    sources, targets = [], []
    for pair in examples["translation"]:
        sources.append(pair["en"])
        targets.append(pair["ja"])

    return tokenizer(
        sources,
        text_target=targets,
        max_length=max_length,
        truncation=True,
    )

In [16]:
# Tokenize every split in batches and drop the original columns so that only
# the tokenizer outputs remain in the resulting datasets.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

## Training

In [17]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [18]:
from transformers import DataCollatorForSeq2Seq

# The collator pads each batch on the fly. Passing the model lets it also emit
# `decoder_input_ids` alongside the padded inputs and labels.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [19]:
# Collate training examples 1 and 2 to inspect what a padded batch looks like.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [20]:
batch["labels"]

tensor([[    6,     1,     6,     1, 15508,  3983, 27591,  1696,  1852, 41294,
           587,  3443,     1,     0],
        [ 7155, 27591,  1781, 13128,  1060,     4,     4,     4,     0,  -100,
          -100,  -100,  -100,  -100]])

In [21]:
batch["decoder_input_ids"]

tensor([[46275,     6,     1,     6,     1, 15508,  3983, 27591,  1696,  1852,
         41294,   587,  3443,     1],
        [46275,  7155, 27591,  1781, 13128,  1060,     4,     4,     4,     0,
         46275, 46275, 46275, 46275]])

In [22]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[6, 1, 6, 1, 15508, 3983, 27591, 1696, 1852, 41294, 587, 3443, 1, 0]
[7155, 27591, 1781, 13128, 1060, 4, 4, 4, 0]


## Evaluation Metrics (BLEU)

In [23]:
%pip install sacrebleu

Note: you may need to restart the kernel to use updated packages.


In [24]:
import evaluate

metric = evaluate.load("sacrebleu")

In [25]:
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [26]:
# Degenerate case: a short, repetitive prediction. It shares almost no n-grams
# with the reference and is much shorter than it, so the score collapses.
predictions = ["This This This This"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 1.683602693167689,
 'counts': [1, 0, 0, 0],
 'totals': [4, 3, 2, 1],
 'precisions': [25.0, 16.666666666666668, 12.5, 12.5],
 'bp': 0.10539922456186433,
 'sys_len': 4,
 'ref_len': 13}

### Creating a function for the metric

In [27]:
import numpy as np


def compute_metrics(eval_preds):
    """Compute the SacreBLEU score for a batch of generated translations.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its first
            element (the generated token ids) is used.

    Returns:
        A dict with a single ``"bleu"`` entry holding the corpus-level score.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the "ignore" index and cannot be decoded. Labels always contain it
    # (padding added by the data collator), and predictions can contain it too
    # when sequences of different lengths are padded while being gathered, so
    # both are mapped back to the pad token before decoding.
    pad_token_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: strip whitespace, and wrap each reference in
    # a list because the metric expects a list of references per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

## Setting up the training arguments

In [28]:
from huggingface_hub import notebook_login

# Second login prompt (one was already shown near the top of the notebook).
# The training arguments defined next set push_to_hub=True, so the session
# needs to be authenticated against the Hub at this point.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    # Output directory; with push_to_hub=True it is also the Hub repository name.
    # (Plain string: the original f-string had no placeholders.)
    "sattu-finetuned-kde4-en-to-jap",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    # No evaluation happens during training; it is run manually before and after.
    eval_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    # Evaluate with generated sequences, which compute_metrics decodes for BLEU.
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)



In [30]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated in the installed transformers version, which
    # asks for `processing_class` instead (see the warnings it used to print).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [31]:
trainer.evaluate(max_length=max_length)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/822 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 10.3817720413208,
 'eval_model_preparation_time': 0.0018,
 'eval_bleu': 0.004586053991061511,
 'eval_runtime': 522.7552,
 'eval_samples_per_second': 25.142,
 'eval_steps_per_second': 1.572}

### Train

In [32]:
trainer.train()

  0%|          | 0/44358 [00:00<?, ?it/s]

{'loss': 4.826, 'grad_norm': 14.289438247680664, 'learning_rate': 1.977636502998332e-05, 'epoch': 0.03}
{'loss': 3.6071, 'grad_norm': 12.012697219848633, 'learning_rate': 1.955092655214392e-05, 'epoch': 0.07}
{'loss': 3.2836, 'grad_norm': 10.70588207244873, 'learning_rate': 1.9325488074304522e-05, 'epoch': 0.1}
{'loss': 3.1015, 'grad_norm': 11.486024856567383, 'learning_rate': 1.9100049596465125e-05, 'epoch': 0.14}
{'loss': 2.9937, 'grad_norm': 11.767738342285156, 'learning_rate': 1.8875061995581406e-05, 'epoch': 0.17}
{'loss': 2.8449, 'grad_norm': 13.24001407623291, 'learning_rate': 1.864962351774201e-05, 'epoch': 0.2}
{'loss': 2.7507, 'grad_norm': 12.768835067749023, 'learning_rate': 1.842463591685829e-05, 'epoch': 0.24}
{'loss': 2.6719, 'grad_norm': 15.419469833374023, 'learning_rate': 1.8199197439018893e-05, 'epoch': 0.27}
{'loss': 2.5519, 'grad_norm': 16.112201690673828, 'learning_rate': 1.7973758961179496e-05, 'epoch': 0.3}
{'loss': 2.4944, 'grad_norm': 10.534053802490234, 'learn



{'loss': 1.852, 'grad_norm': 9.755768775939941, 'learning_rate': 1.3240452680463503e-05, 'epoch': 1.01}
{'loss': 1.7904, 'grad_norm': 10.84270191192627, 'learning_rate': 1.3015014202624106e-05, 'epoch': 1.05}
{'loss': 1.7993, 'grad_norm': 7.5307769775390625, 'learning_rate': 1.2789575724784706e-05, 'epoch': 1.08}
{'loss': 1.7581, 'grad_norm': 11.244287490844727, 'learning_rate': 1.2564137246945309e-05, 'epoch': 1.12}
{'loss': 1.6985, 'grad_norm': 11.480894088745117, 'learning_rate': 1.2338698769105912e-05, 'epoch': 1.15}
{'loss': 1.7426, 'grad_norm': 14.928563117980957, 'learning_rate': 1.2113711168222192e-05, 'epoch': 1.18}
{'loss': 1.7435, 'grad_norm': 10.115802764892578, 'learning_rate': 1.1888272690382795e-05, 'epoch': 1.22}
{'loss': 1.7013, 'grad_norm': 13.301074028015137, 'learning_rate': 1.1662834212543397e-05, 'epoch': 1.25}
{'loss': 1.6778, 'grad_norm': 10.21424674987793, 'learning_rate': 1.1437395734704e-05, 'epoch': 1.28}
{'loss': 1.6799, 'grad_norm': 12.599790573120117, 'le

TrainOutput(global_step=44358, training_loss=1.848842666772517, metrics={'train_runtime': 3103.2975, 'train_samples_per_second': 114.349, 'train_steps_per_second': 14.294, 'total_flos': 3330661600198656.0, 'train_loss': 1.848842666772517, 'epoch': 3.0})

### Evaluate

In [33]:
trainer.evaluate(max_length=max_length)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/822 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 1.419884204864502,
 'eval_model_preparation_time': 0.0018,
 'eval_bleu': 20.727494887708588,
 'eval_runtime': 503.2919,
 'eval_samples_per_second': 26.114,
 'eval_steps_per_second': 1.633,
 'epoch': 3.0}

In [34]:
# Upload the fine-tuned model to the Hub repository with a "translation" tag.
trainer.push_to_hub(tags="translation", commit_message="Training complete")

CommitInfo(commit_url='https://huggingface.co/iSathyam03/sattu-finetuned-kde4-en-to-jap/commit/f0b0498e73b7abf35f3be694c759dde0dec6bf97', commit_message='Training complete', commit_description='', oid='f0b0498e73b7abf35f3be694c759dde0dec6bf97', pr_url=None, repo_url=RepoUrl('https://huggingface.co/iSathyam03/sattu-finetuned-kde4-en-to-jap', endpoint='https://huggingface.co', repo_type='model', repo_id='iSathyam03/sattu-finetuned-kde4-en-to-jap'), pr_revision=None, pr_num=None)