In [19]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
import numpy as np

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\liching\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [2]:
data = pd.read_csv("../Dataset/meitei_eng/train.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,eng,mani
0,0,in chapter 3 arjun said lord,ꯑꯙ꯭ꯌꯥꯌ ꯳ꯗꯥ ꯑꯔꯖꯨꯅꯅꯥ ꯍꯥꯌꯈꯤ ꯚꯒꯕꯥꯟ꯫
1,1,why do you cast me into dreadful acts when you...,ꯂꯧꯁꯤꯡꯒꯤ ꯂꯝꯕꯤ ꯑꯁꯤ ꯍꯦꯟꯅꯥ ꯐꯩ ꯍꯥꯌꯅꯥ ꯊꯥꯖꯔꯀꯄꯥ ꯃꯇꯃꯗꯥ ...
2,2,he found the way of knowledge easier to practi...,ꯃꯍꯥꯛꯅꯥ ꯂꯧꯁꯤꯡꯒꯤ ꯂꯝꯕꯤ ꯑꯁꯤ ꯆꯠꯅꯍꯅꯕꯗꯥ ꯍꯦꯟꯅꯥ ꯂꯥꯌꯕꯥ ꯎ...
3,3,so there is profit in both success and failure,ꯃꯔꯝ ꯑꯗꯨꯅꯥ ꯃꯥꯌ ꯄꯥꯀꯄꯥ ꯑꯃꯁꯨꯡ ꯃꯥꯌ ꯄꯥꯀꯄꯥ ꯑꯅꯤꯃꯛꯇꯥ ꯀꯥ...
4,4,but by now he understands well that action a p...,ꯑꯗꯨꯕꯨ ꯍꯧꯖꯤꯛ ꯐꯥꯑꯣꯕꯗꯥ ꯃꯍꯥꯛꯅꯥ ꯐꯖꯅꯥ ꯈꯉꯂꯦ ꯃꯗꯨꯗꯤ ꯑꯦꯛ...


In [33]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
# model = AutoModel.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
mani_tokenizer = tokenizer.train_new_from_iterator(data['mani'], 32100)

In [4]:
# testing

inputs = [data["eng"].iloc[ex] for ex in data.index]
targets = [data["mani"].iloc[ex] for ex in data.index]
max_length = 512
model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)
mni_token = mani_tokenizer(targets, max_length=max_length, truncation=True)

# working

In [5]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=False)
val_data.reset_index(inplace= True)

In [6]:
# build a dataset for training model.
def preprocess_function(dataset, en_tokenizer, mn_tokenizer):
    inputs = [dataset["eng"].iloc[ex] for ex in dataset.index]
    targets = [dataset["mani"].iloc[ex] for ex in dataset.index]
    # 
    model_inputs = en_tokenizer(inputs, truncation=True, padding=True, return_tensors="pt")
    labels = mn_tokenizer(targets, truncation=True, padding=True, return_tensors="pt")['input_ids']
 
    model_inputs["labels"] = labels
    
    return model_inputs

train_set = preprocess_function(train_data, tokenizer, mani_tokenizer)
val_set = preprocess_function(val_data, tokenizer, mani_tokenizer)


In [11]:
# seq2seq dataset loader
class Seq2SeqDataset(Dataset):
    def __init__(self, encoder_input_ids, decoder_input_ids):
        self.encoder_input_ids = encoder_input_ids
        self.decoder_input_ids = decoder_input_ids

    def __len__(self):
        return len(self.encoder_input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encoder_input_ids[idx],
            "decoder_input_ids": self.decoder_input_ids[idx],
        }

# Assuming train_X and val_X are tuples/lists of encoder and decoder inputs respectively
train_dataset = Seq2SeqDataset(train_set['input_ids'], train_set['labels'])
eval_dataset = Seq2SeqDataset(val_set['input_ids'], val_set['labels'])

# # training args
# training_args = TrainingArguments(
#     output_dir="../meitei_eng/output", # output directory for saving model checkpoints and logs.
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     num_train_epochs=10,
#     evaluation_strategy="epoch", # evaluate after every epoch.
#     save_total_limit=2,
#     eval_steps=500,
#     logging_steps=500,
#     learning_rate=5e-5,
#     save_steps=500, 
# )

# # Instantiate Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset
# )

# trainer.train()


In [16]:
#############

from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_metric


In [23]:
args = Seq2SeqTrainingArguments(
    output_dir= "../meitei_eng/output",
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size= 1,
    per_device_eval_batch_size= 1,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs= 5,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model= "rouge1",
    report_to= "tensorboard"
)

metric = load_metric("rouge", trust_remote_code=True)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = mani_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = mani_tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [27]:
trainer = Seq2SeqTrainer(
    model= model,
    args=args,
    train_dataset= train_dataset,
    eval_dataset= eval_dataset,
    compute_metrics=compute_metrics
)

  0%|          | 0/7080 [46:46<?, ?it/s]


In [31]:
# Start TensorBoard before training to monitor it in progress
model_dir = r"../meitei_eng/output"
%load_ext tensorboard
%tensorboard --logdir '{model_dir}'/runs

In [32]:
trainer.train()

  0%|          | 0/7075 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: last_hidden_state,past_key_values,encoder_last_hidden_state. For reference, the inputs it received are input_ids,decoder_input_ids.