In [None]:
# !pip3 install datasets
!pip install git+https://github.com/csebuetnlp/normalizer

In [3]:
import torch
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from normalizer import normalize
from datasets import load_dataset
from tqdm.auto import tqdm

# Load and Split dataset

In [4]:
dataset_url = "SKNahin/bengali-transliteration-data"
model_url = "csebuetnlp/banglat5_banglaparaphrase"

In [5]:
raw_dataset = load_dataset(dataset_url)
split_dataset = raw_dataset['train'].train_test_split(test_size=0.2)

train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

# Load model and Save weight

In [6]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_url)
tokenizer = AutoTokenizer.from_pretrained(model_url, use_fast=False)

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Save the model weights to load later

In [7]:
torch.save(model.state_dict(), "model_weights.pt")

## Save both dataset as CSV

In [8]:
train_dataset.to_csv("train.csv")
test_dataset.to_csv("test.csv")

Creating CSV from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

134045

# PyTorch dataset pipeline building

### <font color="orange">Sentence with more than 256 tokens are truncated.

In [13]:
class BanglishToBanglaDataset(Dataset):
    def __init__(self, data_path, tokenizer):
        self.data = pd.read_csv(data_path)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        d = self.data.loc[idx]
        bn, rm = d['bn'], d['rm']

        bn_t = tokenizer(normalize(bn), return_tensors="pt", padding='max_length', max_length=256, truncation=True).input_ids
        rm_t = tokenizer(normalize(rm), return_tensors="pt", padding='max_length', max_length=256, truncation=True).input_ids

        return rm_t, bn_t
        return {
            "input_ids": rm_t,
            "attention_mask": rm_t.attention_mask.squeeze(0),
            "labels": bn_t,
        }

In [14]:
dataset = BanglishToBanglaDataset("train.csv", tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
dataloader = tqdm(dataloader)

  0%|          | 0/251 [00:00<?, ?it/s]

In [None]:
from transformers import AutoModelForSeq2SeqLM
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-3) 

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
epochs = 15
for epoch in range(epochs):
    model.train()
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        # Move data to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())