## Library

In [10]:
import torch
import collections
import random
import numpy as np
from torch import nn
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, BertForMaskedLM, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.cuda.amp import GradScaler, autocast
from sklearn.metrics import accuracy_score
from torch.nn.utils.rnn import pad_sequence

## Dataset Loading

In [11]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
file_path = '/content/drive/MyDrive/NLP/tydi_xor_re.xlsx'
df = pd.read_excel(file_path)

In [13]:
df.head()

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
0,উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...,WikiLeaks () is an international non-profit or...,bn,True,182,2006,
1,দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয় ?,The war in Europe concluded with an invasion o...,bn,True,48,Germany,
2,মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন...,Same-sex marriage in the United States expande...,bn,False,-1,no,
3,আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃ...,The exact number of Arab casualties is unknown...,bn,True,39,unknown,
4,বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে ?,"As Thomas Hall (2000) notes, ""The Sung Empire ...",bn,True,1219,17th century,


# Language_Models

## Tokezination

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Function to tokenize questions
def tokenize_question(row):
    if row['lang'] in ['fi', 'ru', 'ja']:
        return tokenizer.tokenize(
            row['question']
        )
    else:
        return None

# Apply tokenization to questions
df['question_tokens'] = df.apply(tokenize_question, axis=1)

# Function to tokenize contexts
def tokenize_context(row):
    if row['lang'] in ['fi', 'ru', 'ja']:
        return tokenizer.tokenize(
            row['context']
        )
    else:
        return None

# Apply tokenization to contexts
df['context_tokens'] = df.apply(tokenize_context, axis=1)





### tokenizer display

In [15]:
print(df['question_tokens'].head(3000))
print(df['context_tokens'].head(3000))

0                                                    None
1                                                    None
2                                                    None
3                                                    None
4                                                    None
                              ...                        
2995    [オ, ##ン, ##ライン, ##ゲーム, ##の, 起, 源, は, ##い, ##つ, ？]
2996                            [唐, は, 中, 国, 統, 一, した, ？]
2997    [ル, ##ッ, ##キ, ##ズ, ##ム, ##に, ##より, 訴, 訟, に, 発,...
2998    [カ, ##ー, ##ダー, ##ル, ・, ヤ, ##ー, ##ノ, ##シ, ##ュ, ...
2999    [コ, ##ロ, ##ラ, ##ド, 州, 最, 初, の, 知, 事, は, ##だ, ##れ]
Name: question_tokens, Length: 3000, dtype: object
0                                                    None
1                                                    None
2                                                    None
3                                                    None
4                                                    None
                     

## Dataset Loader

In [16]:
class QuestionContextDataset(torch.utils.data.Dataset):
    def __init__(self, questions, contexts, question_tokens, context_tokens, labels, tokenizer, max_length=512):
        self.questions = questions
        self.contexts = contexts
        self.question_tokens = question_tokens
        self.context_tokens = context_tokens
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length  # Set max length (default to 512)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert question and context tokens to input IDs
        question_ids = self.tokenizer.convert_tokens_to_ids(self.question_tokens[idx])
        context_ids = self.tokenizer.convert_tokens_to_ids(self.context_tokens[idx])

        # Concatenate input_ids
        input_ids = question_ids + context_ids

        # Ensure the input is not longer than max_length
        if len(input_ids) > self.max_length:
            input_ids = input_ids[:self.max_length]

        # Create the attention mask (all 1s for actual tokens)
        attention_mask = [1] * len(input_ids)

        # Padding to max length if necessary
        padding_length = self.max_length - len(input_ids)
        if padding_length > 0:
            input_ids += [0] * padding_length
            attention_mask += [0] * padding_length

        # Convert labels to tensor
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': label
        }



def collate_fn(batch):
    # Extract input_ids, attention_mask, and labels from the batch
    input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
    attention_mask = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in batch]
    labels = [torch.tensor(item['labels'], dtype=torch.long) for item in batch]

    # Pad input_ids and attention_mask to the longest sequence in the batch
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True, padding_value=0)

    labels = torch.stack(labels)

    return {
        'input_ids': input_ids_padded,
        'attention_mask': attention_mask_padded,
        'labels': labels
    }



## Dataset for Finnish language

In [17]:
# 1. Filter the DataFrame for Finnish language questions
fi_df = df[df['lang'] == 'fi']

# 2. Split the DataFrame into training and validation sets
fi_train_df, fi_val_df = train_test_split(fi_df, test_size=0.2, random_state=42)

fi_df.head(100)

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang,question_tokens,context_tokens
9137,Mitkä olivat Rooman alkuvaiheet?,"In historiography, ancient Rome is Roman civil...",fi,True,0,"In historiography, ancient Rome is Roman civil...",,"[Mit, ##kä, olivat, Rooma, ##n, al, ##ku, ##va...","[In, his, ##torio, ##graphy, ,, ancient, Rome,..."
9138,Kuka oli toisen maailmansodan jälkeisen sosial...,Rákosi had difficulty managing the economy and...,fi,True,187,Mátyás Rákosi,,"[Ku, ##ka, oli, toisen, maailmansodan, j, ##äl...","[R, ##ák, ##osi, had, difficulty, managing, th..."
9139,Mikä oli roomalaisten antama nimi nykyisen Unk...,Hungary in its modern (post-1946) borders roug...,fi,True,286,Pannonia,,"[Mi, ##kä, oli, room, ##alaisten, ant, ##ama, ...","[Hungary, in, its, modern, (, post, -, 1946, )..."
9140,Kuinka monta ihmistä menehtyi Suezin kriisin a...,"On 25 January 1952, British forces attempted t...",fi,True,131,deaths of 41 Egyptians,,"[Kui, ##nka, mont, ##a, ihm, ##istä, men, ##eh...","[On, 25, January, 1952, ,, British, forces, at..."
9141,Millä vuosikymmenellä Yhdysvaltojen varhaishis...,The history of the United States began with th...,fi,True,87,"15,000 BC",,"[Mill, ##ä, vu, ##osi, ##ky, ##mmen, ##ellä, Y...","[The, history, of, the, United, States, began,..."
...,...,...,...,...,...,...,...,...,...
9232,Vaikuttiko myöhäisantiikki Suomessa?,"In 1917, Finland declared independence. A civi...",fi,False,-1,no,,"[Vai, ##kut, ##tik, ##o, my, ##ö, ##h, ##äis, ...","[In, 1917, ,, Finland, declared, independence,..."
9233,Milloin Venäjä on perustettu?,The History of Russia begins with that of the ...,fi,True,210,882,,"[Mill, ##oin, Ve, ##nä, ##jä, on, perustettu, ?]","[The, History, of, Russia, begins, with, that,..."
9234,Miksi Espanjan sisällissota jatkui jopa kolme ...,The armies kept growing. The principal source ...,fi,True,0,The armies kept growing,,"[Mi, ##ksi, Espanjan, sis, ##äl, ##lis, ##sot,...","[The, armies, kept, growing, ., The, principal..."
9235,Mistä tulee nimitys Yhdistynyt kuningaskunta?,The 1707 Acts of Union declared that the kingd...,fi,True,4,1707 Acts of Union,,"[Mis, ##tä, tulee, nimi, ##tys, Y, ##hdi, ##st...","[The, 1707, Acts, of, Union, declared, that, t..."


## BERT_Model_For_Finnish_Language

In [18]:
# Create dataset and dataloaders for training and validation
train_dataset_fi = QuestionContextDataset(fi_train_df['question'].tolist(),
                                          fi_train_df['context'].tolist(),
                                          fi_train_df['question_tokens'].tolist(),
                                          fi_train_df['context_tokens'].tolist(),
                                          fi_train_df['answerable'].tolist(),
                                          tokenizer
                                          )

val_dataset_fi = QuestionContextDataset(fi_val_df['question'].tolist(),
                                        fi_val_df['context'].tolist(),
                                        fi_val_df['question_tokens'].tolist(),
                                        fi_val_df['context_tokens'].tolist(),
                                        fi_val_df['answerable'].tolist(),
                                        tokenizer
                                        )

train_dataloader_fi = DataLoader(train_dataset_fi, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_dataloader_fi = DataLoader(val_dataset_fi, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
model.train()
model.to('cuda')

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
for epoch in range(3):
    model.train()
    for batch in train_dataloader_fi:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

# Validate the Model
model.eval()

# Function to get probabilities on the validation set
def validate_model(data_loader):
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to('cuda')
            attention_mask = batch['attention_mask'].to('cuda')
            labels = batch['labels'].to('cuda')

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    print(f'Validation Accuracy: {accuracy:.4f}')

# Validate the model
validate_model(val_dataloader_fi)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
  attention_mask = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in batch]
  labels = [torch.tensor(item['labels'], dtype=torch.long) for item in batch]


Validation Accuracy: 0.8569


## Dataset for Russian language

In [19]:
# 1. Filter the DataFrame for Russian language questions
ru_df = df[df['lang'] == 'ru']

# 2. Split the DataFrame into training and validation sets
ru_train_df, ru_val_df = train_test_split(ru_df, test_size=0.2, random_state=42)

## BERT_Model_For_Russian_Language

In [20]:
# Create dataset and dataloaders for training and validation
train_dataset_ru = QuestionContextDataset(ru_train_df['question'].tolist(),
                                          ru_train_df['context'].tolist(),
                                          ru_train_df['question_tokens'].tolist(),
                                          ru_train_df['context_tokens'].tolist(),
                                          ru_train_df['answerable'].tolist(),
                                          tokenizer
                                          )

val_dataset_ru = QuestionContextDataset(ru_val_df['question'].tolist(),
                                        ru_val_df['context'].tolist(),
                                        ru_val_df['question_tokens'].tolist(),
                                        ru_val_df['context_tokens'].tolist(),
                                        ru_val_df['answerable'].tolist(),
                                        tokenizer
                                        )

train_dataloader_ru = DataLoader(train_dataset_ru, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_dataloader_ru = DataLoader(val_dataset_ru, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
model.train()
model.to('cuda')

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
for epoch in range(3):
    model.train()
    for batch in train_dataloader_ru:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

# Validate the Model
model.eval()

# Function to get probabilities on the validation set
def validate_model(data_loader):
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to('cuda')
            attention_mask = batch['attention_mask'].to('cuda')
            labels = batch['labels'].to('cuda')

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    print(f'Validation Accuracy: {accuracy:.4f}')

# Validate the model
validate_model(val_dataloader_ru)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
  attention_mask = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in batch]
  labels = [torch.tensor(item['labels'], dtype=torch.long) for item in batch]


Validation Accuracy: 0.8634


## Dataset for Japanese Language

In [21]:
# 1. Filter the DataFrame for Japanese language questions
ja_df = df[df['lang'] == 'ja']

# 2. Split the DataFrame into training and validation sets
ja_train_df, ja_val_df = train_test_split(ja_df, test_size=0.2, random_state=42)

## BERT_Model_For_Japanese_Language

In [None]:
# Create dataset and dataloaders for training and validation
train_dataset_ja = QuestionContextDataset(ja_train_df['question'].tolist(),
                                          ja_train_df['context'].tolist(),
                                          ja_train_df['question_tokens'].tolist(),
                                          ja_train_df['context_tokens'].tolist(),
                                          ja_train_df['answerable'].tolist(),
                                          tokenizer
                                          )

val_dataset_ja = QuestionContextDataset(ja_val_df['question'].tolist(),
                                        ja_val_df['context'].tolist(),
                                        ja_val_df['question_tokens'].tolist(),
                                        ja_val_df['context_tokens'].tolist(),
                                        ja_val_df['answerable'].tolist(),
                                        tokenizer
                                        )

train_dataloader_ja = DataLoader(train_dataset_ja, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_dataloader_ja = DataLoader(val_dataset_ja, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
model.train()
model.to('cuda')

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
for epoch in range(3):
    model.train()
    for batch in train_dataloader_ja:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        labels = batch['labels'].to('cuda')

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

# Validate the Model
model.eval()

# Function to get probabilities on the validation set
def validate_model(data_loader):
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to('cuda')
            attention_mask = batch['attention_mask'].to('cuda')
            labels = batch['labels'].to('cuda')

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    print(f'Validation Accuracy: {accuracy:.4f}')

# Validate the model
validate_model(val_dataloader_ja)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
  attention_mask = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in batch]
  labels = [torch.tensor(item['labels'], dtype=torch.long) for item in batch]


Validation Accuracy: 0.8134
