# <center><font size="+5">**1. Data Preparation**</font></center>

In [None]:
!pip3 install lightning transformers jiwer pandas matplotlib seaborn torch datasets

In [2]:
import os, shutil
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pytorch_lightning as Lightning
from types import SimpleNamespace
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

2024-02-23 09:50:42.467054: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-23 09:50:42.467180: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-23 09:50:42.630918: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Notebook-wide settings, exposed as attributes (cfg.val_size, cfg.batch_size, ...).
_settings = {
    "val_size": 0.15,     # fraction of the training rows held out for validation
    "random_seed": 42,    # seed passed to train_test_split
    "num_worker": 4,
    "batch_size": 8,
}
cfg = SimpleNamespace(**_settings)

In [4]:
class BashamulDataModule(Lightning.LightningDataModule):
    """Prepare and serve the RegIPA train / validation / test splits.

    ``prepare_data`` reads ``train_regipa.csv`` and ``test_regipa.csv`` from
    ``data_dir``, drops the ``Index`` column and every row containing a null,
    strips ASCII letters and digits from ``Contents``, carves a validation
    split off the training rows and writes the three frames as CSV files under
    ``working_dir``. ``setup`` reads those files back for the requested stage.

    The class reads the notebook-level ``cfg`` namespace (``val_size`` and
    ``random_seed``) when splitting.

    Args:
        data_dir: Directory that contains the two raw RegIPA CSV files.
    """

    def __init__(self, data_dir : str = "/kaggle/input/regipa"):
        super().__init__()
        self.data_dir = data_dir
        self.working_dir = "/kaggle/working/working_dir"
        # ASCII letters and digits only. The range must be ``A-Z``: ``A-z``
        # also spans the punctuation that sits between "Z" and "a" in ASCII
        # ([, \, ], ^, _ and the backtick) and would silently delete it.
        self.english_alpha_pat = "[a-zA-Z0-9]"
        self.bangla_numerals = "[০১২৩৪৫৬৭৮৯]"
        # Processed CSV written by prepare_data() and read back by setup().
        self.files = {
            'train': os.path.join(self.working_dir, "train_proc.csv"),
            'val': os.path.join(self.working_dir, "val_proc.csv"),
            'test': os.path.join(self.working_dir, "test_proc.csv"),
        }

    def filter_bangla_numerals(self, df):
        """Return the rows of ``df`` whose ``Contents`` contain a Bangla digit.

        Rows whose ``Contents`` is missing are treated as non-matching
        (``na=False``) instead of making the boolean mask invalid.
        """
        has_numeral = df["Contents"].str.contains(self.bangla_numerals, na=False)
        return df[has_numeral]

    def _clean(self, df):
        """Drop ``Index`` and null rows, then strip ASCII alphanumerics from ``Contents``."""
        cleaned = df.drop(columns=['Index']).dropna()
        return cleaned.assign(
            Contents=cleaned["Contents"].str.replace(self.english_alpha_pat, "", regex=True)
        )

    def prepare_data(self):
        """Clean the raw CSV files, split off validation data and save all three splits.

        Raises:
            KeyError: If a raw file has no ``Index`` column (raised by ``drop``).
        """
        # Read from the configured directory rather than a hard-coded path so
        # that the ``data_dir`` constructor argument is actually honoured.
        train_df = self._clean(pd.read_csv(os.path.join(self.data_dir, "train_regipa.csv")))
        test_df = self._clean(pd.read_csv(os.path.join(self.data_dir, "test_regipa.csv")))

        # Splitting train and validation set; seeded so the split is reproducible.
        train_df, val_df = train_test_split(
            train_df,
            test_size=cfg.val_size,
            shuffle=True,
            random_state=cfg.random_seed,
        )

        # Save to the working folder.
        os.makedirs(self.working_dir, exist_ok=True)
        train_df.to_csv(self.files['train'], index=False)
        val_df.to_csv(self.files['val'], index=False)
        test_df.to_csv(self.files['test'], index=False)

    def setup(self, stage: str):
        """Load the processed split(s) needed for ``stage``.

        ``"fit"`` and ``"validate"`` load the train and validation CSV files as
        NumPy arrays (one row per sample); ``"test"`` and ``"predict"`` load
        the test CSV as a DataFrame. Any other stage loads nothing.
        """
        if stage in ('fit', 'validate'):
            # "validate" is included so val_dataloader() also works when only
            # validation is run.
            self.train_data = pd.read_csv(self.files['train']).reset_index(drop=True).to_numpy()
            self.val_data = pd.read_csv(self.files['val']).reset_index(drop=True).to_numpy()
        elif stage in ("test", "predict"):
            # NOTE(review): kept as a DataFrame (unlike the fit arrays) in case
            # other cells rely on that; iterating it yields column labels, not rows.
            self.test_data = pd.read_csv(self.files['test']).reset_index(drop=True)

    # NOTE(review): the four accessors below hand back the raw array / frame
    # loaded by setup(); nothing here wraps them in a torch DataLoader or uses
    # cfg.batch_size / cfg.num_worker — confirm that is intended.

    def train_dataloader(self):
        """Return the training rows loaded by ``setup("fit")``."""
        return self.train_data

    def test_dataloader(self):
        """Return the test DataFrame loaded by ``setup("test")`` / ``setup("predict")``."""
        return self.test_data

    def val_dataloader(self):
        """Return the validation rows loaded by ``setup("fit")`` / ``setup("validate")``."""
        return self.val_data

    def predict_dataloader(self):
        """Return the test DataFrame loaded by ``setup("test")`` / ``setup("predict")``."""
        return self.test_data

In [56]:
# Smoke-test the data module: build the processed CSV files, load the "fit"
# splits, and show the first training row.
data_module = BashamulDataModule()
data_module.prepare_data()
data_module.setup("fit")

for first_row in data_module.train_dataloader():
    print(first_row)
    break

['Narail' 'কয় কি ওই মাথায় যখন হিট হয় তখন ওরে মাইদ্দি আমি।'
 'kɔe̯ kɪ o͡ɪ̯ mɐt̪ʰɐe̯ ɟɔkʰon hɪt hɔe̯ t̪ɔkʰon oɾe mɐ͡ɪ̯d̪d̪ɪ ɐmɪ।']


# Training

In [None]:
import numpy as np
from datasets import load_metric

# Word-error-rate metric used by compute_metrics() below.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases —
# confirm the installed version still provides it.
wer_metric = load_metric("wer")


def compute_metrics(eval_preds):
    """Compute the word error rate between decoded predictions and labels.

    Args:
        eval_preds: A ``(preds, labels)`` pair of token-id arrays. When
            ``preds`` is a tuple, only its first element is used.

    Returns:
        A dict ``{"wer": <value returned by wer_metric.compute>}``.

    Uses the notebook-level ``tokenizer`` and ``wer_metric`` objects.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    def _to_text(token_ids):
        # -100 entries are swapped for the pad token id before decoding.
        decodable = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(decodable, skip_special_tokens=True)

    decoded_preds = _to_text(preds)
    decoded_labels = _to_text(labels)

    return {
        "wer": wer_metric.compute(predictions=decoded_preds, references=decoded_labels)
    }

# <center><font size="+5">**2. Model Training**</font></center>

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Name under which the fine-tuned model is saved (trainer.save_model) and later
# reloaded for inference (pipeline(model=model_id)).
model_id = "mt5-bangla-text-to-ipa"



In [None]:
# Build the seq2seq trainer and run fine-tuning; WER is reported through
# compute_metrics.
# NOTE(review): `model`, `tokenizer`, `training_args`, `ds_train`, `ds_eval` and
# `data_collator` are not assigned in any cell shown in this notebook — confirm
# the cells that define them exist, otherwise this fails on a fresh kernel.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
# Save the trained model under `model_id`; the inference pipeline below is
# created from the same name.
trainer.save_model(model_id)

# Inference

In [None]:
# Sort by length, longest text first (presumably so each inference batch holds
# texts of similar size). The original row labels are kept, so sort_index()
# can restore the original order afterwards.
# NOTE(review): in the cells shown here `test_df` is only assigned inside
# BashamulDataModule.prepare_data — confirm it is defined at notebook scope.
index = test_df["Contents"].str.len().sort_values(ascending=False).index
test_df = test_df.reindex(index)

In [None]:
from transformers import pipeline

# Text-to-text generation pipeline loaded from the model saved under `model_id`.
# device=0 is hard-coded — presumably the first GPU; confirm one is available.
pipe = pipeline("text2text-generation", model=model_id, device=0)

In [None]:
%%time
# Run every test sentence through the pipeline and keep only the generated text.
texts = test_df["Contents"].tolist()
ipas = [
    generation["generated_text"]
    for generation in pipe(texts, max_length=256, batch_size=16)
]

In [None]:
# Attach the predictions (same order as the length-sorted frame), then restore
# the original row order before previewing.
test_df["IPA"] = ipas
test_df = test_df.sort_index()
test_df.head()

In [None]:
# Write only the IPA column; the row index is written too, as a first column
# without a header (the next cell renames it).
test_df.to_csv("submission2.csv", columns=["IPA"])

In [None]:
# Reload the predictions from the same relative path the previous cell wrote
# to. The index was saved without a header, so pandas reads it back as an
# "Unnamed: ..." column.
df = pd.read_csv('submission2.csv')

# Check if there's any unnamed column
unnamed_columns = [col for col in df.columns if 'Unnamed' in col]

if unnamed_columns:
    # Rename the first unnamed column to a desired name
    new_name = 'id'  # Specify your desired name here
    old_name = unnamed_columns[0]
    df = df.rename(columns={old_name: new_name})

# Display the DataFrame after renaming the columns
print("\nDataFrame after renaming columns:")
df = df.rename(columns={'IPA': 'string'})
print(df)

# index=False: the row ids already live in the `id` column, so writing the
# RangeIndex as well would add a second, header-less column to the submission.
df.to_csv('submission.csv', index=False)

# What to do next?

Thanks for reading this far. Here is what to do next:

- Explore methods for handling out-of-vocabulary (OOV) words.
- Investigate strategies for handling Bengali numerals.
- Instead of using an encoder-decoder model, try to find ways to solve it simply using an encoder-only model like BERT.