Load Dataset

In [13]:
import numpy as np
import pandas as pd

In [14]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict
dataset = load_dataset("open_subtitles", "en-hi")
dataset['train'][100]['translation']

Found cached dataset open_subtitles (C:/Users/jmadr/.cache/huggingface/datasets/open_subtitles/en-hi/2018.0.0/c1ec973ca4b6e588740d8f167cc0e24ea3f626e70bc7ffe467e944730500e198)
100%|██████████| 1/1 [00:00<00:00, 34.41it/s]


{'en': 'Put that bicycle down.', 'hi': 'साइकिल नीचे रखो.'}

In [15]:
dataset = dataset['train'].train_test_split(test_size=0.2)

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'meta', 'translation'],
        num_rows: 74412
    })
    test: Dataset({
        features: ['id', 'meta', 'translation'],
        num_rows: 18604
    })
})

Load Tokenizer and Pre-Trained M2M100 Model 

In [5]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
tokenizer = M2M100Tokenizer.from_pretrained(r"C:\Users\jmadr\Python Projects\exploring-transformers\fine_tuning_m2m100\en-hi-m2m100-6b-1e\checkpoint-12000", src_lang="en", tgt_lang="hi")
model = M2M100ForConditionalGeneration.from_pretrained(r"C:\Users\jmadr\Python Projects\exploring-transformers\fine_tuning_m2m100\en-hi-m2m100-6b-1e\checkpoint-12000")

In [6]:
source_lang = "en"
target_lang = "hi"
prefix = "translate English to Hindi"

Evaluate Model Peformance

In [7]:
max_length = 512

In [8]:
def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, padding=True, truncation=True)
    return model_inputs

In [17]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

                                                                   

In [18]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'meta', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 74412
    })
    test: Dataset({
        features: ['id', 'meta', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 18604
    })
})

In [19]:
import evaluate 
metric = evaluate.load("sacrebleu")

In [20]:
import numpy as  np 

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Train your Model

In [2]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors='pt')

In [21]:
training_args = Seq2SeqTrainingArguments(
    output_dir="en-hi-m2m100-6b-1e",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=False,
    no_cuda=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

[codecarbon INFO @ 13:34:45] [setup] RAM Tracking...
[codecarbon INFO @ 13:34:45] [setup] GPU Tracking...
[codecarbon INFO @ 13:34:45] No GPU found.
[codecarbon INFO @ 13:34:45] [setup] CPU Tracking...
[codecarbon INFO @ 13:34:47] CPU Model on constant consumption mode: AMD Ryzen 5 3600X 6-Core Processor
[codecarbon INFO @ 13:34:47] >>> Tracker's metadata:
[codecarbon INFO @ 13:34:47]   Platform system: Windows-10-10.0.19045-SP0
[codecarbon INFO @ 13:34:47]   Python version: 3.11.3
[codecarbon INFO @ 13:34:47]   CodeCarbon version: 2.2.4
[codecarbon INFO @ 13:34:47]   Available RAM : 15.928 GB
[codecarbon INFO @ 13:34:47]   CPU count: 12
[codecarbon INFO @ 13:34:47]   CPU model: AMD Ryzen 5 3600X 6-Core Processor
[codecarbon INFO @ 13:34:47]   GPU count: None
[codecarbon INFO @ 13:34:47]   GPU model: None


In [22]:
trainer.evaluate()

100%|██████████| 3101/3101 [11:46:52<00:00, 12.73s/it]  

UnboundLocalError: cannot access local variable 'result' where it is not associated with a value