## Library

In [1]:
!pip install datasets



In [2]:
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
from transformers import MBartForConditionalGeneration, MBartTokenizer, BartTokenizer, BartForConditionalGeneration
from transformers import Trainer, TrainingArguments, pipeline, AdamW
from transformers import DataCollatorForSeq2Seq
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from torch.utils.data import DataLoader

## Dataset Loading

In [3]:
dataset = load_dataset("coastalcph/tydi_xor_rc")
train_set = dataset["train"]
validation_set = dataset["validation"]
df = pd.DataFrame(train_set)
df_eval = pd.DataFrame(validation_set)

Using the latest cached version of the dataset since coastalcph/tydi_xor_rc couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/sohrab/.cache/huggingface/datasets/coastalcph___tydi_xor_rc/default/0.0.0/42871590d25e82c9531347ca205b56037514753b (last modified on Thu Oct 31 16:48:59 2024).


In [4]:
df.head()

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
0,উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...,WikiLeaks () is an international non-profit or...,bn,True,182,2006,
1,দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয় ?,The war in Europe concluded with an invasion o...,bn,True,48,Germany,
2,মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন...,Same-sex marriage in the United States expande...,bn,False,-1,no,
3,আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃ...,The exact number of Arab casualties is unknown...,bn,True,39,unknown,
4,বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে ?,"As Thomas Hall (2000) notes, ""The Sung Empire ...",bn,True,1219,17th century,


## Model

In [5]:
model_name = 'facebook/mbart-large-cc25'
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

In [6]:
# Dataset
train_data = df[df['lang'].isin(['fi', 'ru', 'ja'])][['context', 'question', 'answer']].sample(n=100, random_state=42)
val_data = df_eval[df_eval['lang'].isin(['fi', 'ru', 'ja'])][['context', 'question', 'answer']].sample(n=3, random_state=42)

train_data.head()

Unnamed: 0,context,question,answer
10046,Carl Gustav Jung (; ; 26 July 1875 – 6 June 19...,Ketä pidetään analyyttisen psykologian perusta...,Carl Gustav Jung
3585,Cell culture is the process by which cells are...,人間の細胞を培養することはできる？,no
9780,Dmitry Anatolyevich Medvedev (; ; born 14 Sept...,Kuinka monta vuotta Dmitri Medvedevin oli Venä...,2008 to 2012
7858,"Up until the 1980s, the largest overseas marke...","Как назывался первый индийский фильм, показанн...",Dharti Ke Lal
9947,"Franz Liszt (October 22, 1811July 31, 1886) wa...",Kuka Franz Liszt oli?,"prolific 19th-century Hungarian composer, virt..."


In [7]:
val_data.head()

Unnamed: 0,context,question,answer
1390,"Following the February Revolution of 1917, Nic...",ロシア最後の皇帝ニコライ2世を殺害したのは誰ですか？,Bolshevik guards
1745,"The Western Wall, Wailing Wall, or Kotel, know...",Ketkä rakensivat länsimuurin?,Herod the Great
2427,A mine flail is a vehicle-mounted device that ...,Когда впервые начали использовать кисте́нь?,World War II


In [8]:
def tokenize_data(data):
    inputs = [f"Context: {row['context']} Question: {row['question']}" for _, row in data.iterrows()]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")

    labels = tokenizer(data['answer'].tolist(), max_length=512, truncation=True, padding="max_length", return_tensors="pt")['input_ids']

    labels = torch.where(labels == tokenizer.pad_token_id, torch.tensor(-100), labels)

    model_inputs['labels'] = labels
    return model_inputs

# Tokenizing training datasets
train_encodings = tokenize_data(train_data)

# QADataset class
class QADataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

train_dataset = QADataset(train_encodings)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [9]:
optimizer = AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
model.train()

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        print("loss: ", total_loss)

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs} - Average Loss: {avg_loss:.4f}")




Epoch 1/3
loss:  5.4606828689575195
loss:  14.387225151062012
loss:  17.269384384155273
loss:  22.487712860107422
loss:  32.43186664581299
loss:  40.943848609924316
loss:  43.278547048568726
loss:  49.221330881118774
loss:  52.23257780075073
loss:  60.502076625823975
loss:  70.08412599563599
loss:  74.37327003479004
loss:  78.42021751403809
loss:  84.30627632141113
loss:  87.59822010993958
loss:  96.58766674995422
loss:  102.22343134880066
loss:  107.32146048545837
loss:  111.98502087593079
loss:  116.19996857643127
loss:  122.8481376171112
loss:  127.40881991386414
loss:  130.49405574798584
loss:  135.21003675460815
loss:  139.59687662124634
Epoch 1/3 - Average Loss: 5.5839
Epoch 2/3
loss:  6.94699764251709
loss:  11.101312160491943
loss:  15.71090841293335
loss:  18.414000272750854
loss:  23.894765615463257
loss:  25.927515983581543
loss:  28.836758136749268
loss:  32.8487868309021
loss:  37.33837127685547
loss:  43.193970680236816
loss:  47.43037128448486
loss:  51.90889263153076
lo

In [33]:
torch.save(model.state_dict(), "model.pt")

In [37]:
model.load_state_dict(torch.load("model.pt"))
model.to(device)
model.eval()

def generate_answer(question, context):
    input_text = f"Context: {context} Question: {question}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True).to(device)
    
    outputs = model.generate(inputs["input_ids"], 
                             attention_mask=inputs["attention_mask"],
                             max_length=50, 
                             num_beams=1,
                             temperature=0.7,
                             top_k=50,
                             top_p=0.95,
                             early_stopping=True)
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

for _, row in val_data.iterrows():
    generated_answer = generate_answer(row['question'], row['context'])
    print(f"Question: {row['question']}")
    print(f"Generated Answer: {generated_answer}")
    print(f"Expected Answer: {row['answer']}\n")


  model.load_state_dict(torch.load("model.pt"))


Question: ロシア最後の皇帝ニコライ2世を殺害したのは誰ですか？
Generated Answer: Answer: Following: Following the February 1917: Following the February 1917: Following the February 1917: Following the February Revolutionary: Following the February Revolutionary: Following the February Revolutionary: Follow
Expected Answer: Bolshevik guards

Question: Ketkä rakensivat länsimuurin?
Generated Answer: the Great, known as the Buraq Wall, is an Islam, is an Islam, is an Islam, is an Islam, is, is, is an an Islam, is, is, is, , Islam, is,
Expected Answer: Herod the Great

Question: Когда впервые начали использовать кисте́нь?
Generated Answer: mine flail. mine flail. mine flail. mine flail. The mine flail is a mine flail is a mine flail is a mine flail is a mine flail is a mine flail
Expected Answer: World War II

