## Library

In [12]:
!pip install datasets evaluate
!pip install seqeval
!pip install fugashi ipadic



In [13]:
!import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments, BertJapaneseTokenizer
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import evaluate
import seqeval
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

/bin/bash: line 1: import: command not found


## Dataset Loading

In [14]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
file_path = '/content/drive/MyDrive/NLP/tydi_xor_re.xlsx'
df = pd.read_excel(file_path)

In [16]:
df.head()

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
0,উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...,WikiLeaks () is an international non-profit or...,bn,True,182,2006,
1,দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয় ?,The war in Europe concluded with an invasion o...,bn,True,48,Germany,
2,মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন...,Same-sex marriage in the United States expande...,bn,False,-1,no,
3,আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃ...,The exact number of Arab casualties is unknown...,bn,True,39,unknown,
4,বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে ?,"As Thomas Hall (2000) notes, ""The Sung Empire ...",bn,True,1219,17th century,


## Tokenization

In [17]:
def prepare_data(examples):
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

    start_positions = []
    end_positions = []

    # Tokenize the questions and contexts together
    tokenized_examples = tokenizer(
        examples['question'], examples['context'],
        truncation=True, padding=True, return_tensors='pt', max_length=512
    )

    for i, answer in enumerate(examples['answer']):
        if answer != "no":  # Answerable question
            answer_start = examples['context'][i].find(answer)
            if answer_start != -1:  # Ensure the answer is found in the context
                answer_end = answer_start + len(answer)

                start_positions.append(tokenizer(
                    examples['context'][i][:answer_start],
                    return_tensors="pt"
                )['input_ids'].size(1))  # Token position of the start of the answer

                end_positions.append(tokenizer(
                    examples['context'][i][:answer_end],
                    return_tensors="pt"
                )['input_ids'].size(1) - 1)  # Token position of the end of the answer
            else:
                # If answer is not found, treat it as unanswerable
                start_positions.append(0)
                end_positions.append(0)
        else:  # Unanswerable question
            start_positions.append(0)
            end_positions.append(0)

    tokenized_examples['start_positions'] = start_positions
    tokenized_examples['end_positions'] = end_positions

    return tokenized_examples


## BERT_QAModel_For_Finnish_Language

### Dataset for Finnish language

In [18]:
# DataFrame for Finnish language questions
fi_df = df[df['lang'] == 'fi']

# Split the DataFrame into training and validation sets
fi_train_df, fi_val_df = train_test_split(fi_df, test_size=0.2, random_state=42)

fi_train_df = Dataset.from_pandas(fi_train_df)
fi_val_df = Dataset.from_pandas(fi_val_df)

# Tokenize both train and validation datasets
train_dataset_fi = fi_train_df.map(prepare_data, batched=True)
val_dataset_fi = fi_val_df.map(prepare_data, batched=True)

fi_df.head(100)

Map:   0%|          | 0/2123 [00:00<?, ? examples/s]



Map:   0%|          | 0/531 [00:00<?, ? examples/s]

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
9137,Mitkä olivat Rooman alkuvaiheet?,"In historiography, ancient Rome is Roman civil...",fi,True,0,"In historiography, ancient Rome is Roman civil...",
9138,Kuka oli toisen maailmansodan jälkeisen sosial...,Rákosi had difficulty managing the economy and...,fi,True,187,Mátyás Rákosi,
9139,Mikä oli roomalaisten antama nimi nykyisen Unk...,Hungary in its modern (post-1946) borders roug...,fi,True,286,Pannonia,
9140,Kuinka monta ihmistä menehtyi Suezin kriisin a...,"On 25 January 1952, British forces attempted t...",fi,True,131,deaths of 41 Egyptians,
9141,Millä vuosikymmenellä Yhdysvaltojen varhaishis...,The history of the United States began with th...,fi,True,87,"15,000 BC",
...,...,...,...,...,...,...,...
9232,Vaikuttiko myöhäisantiikki Suomessa?,"In 1917, Finland declared independence. A civi...",fi,False,-1,no,
9233,Milloin Venäjä on perustettu?,The History of Russia begins with that of the ...,fi,True,210,882,
9234,Miksi Espanjan sisällissota jatkui jopa kolme ...,The armies kept growing. The principal source ...,fi,True,0,The armies kept growing,
9235,Mistä tulee nimitys Yhdistynyt kuningaskunta?,The 1707 Acts of Union declared that the kingd...,fi,True,4,1707 Acts of Union,


### Model

In [19]:
# Model definition and training arguments
model = BertForQuestionAnswering.from_pretrained('bert-base-multilingual-cased')

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Metric for evaluation using the evaluate library
metric = evaluate.load("seqeval")

# Define compute_metrics function for F1 score evaluation
def compute_metrics(pred):
    start_pred = pred.predictions[0].argmax(axis=-1)
    end_pred = pred.predictions[1].argmax(axis=-1)

    start_labels = pred.label_ids[0]
    end_labels = pred.label_ids[1]

    def create_label_sequences(starts, ends, true_starts, true_ends, max_length):
        pred_labels = []
        true_labels = []

        for i in range(len(starts)):
            pred_sequence = ["O"] * max_length
            true_sequence = ["O"] * max_length

            if true_starts[i] != -100:
                true_sequence[true_starts[i]] = "B" if true_starts[i] == true_ends[i] else "I"
                for j in range(true_starts[i] + 1, true_ends[i] + 1):
                    true_sequence[j] = "I"

            if starts[i] != -100:
                pred_sequence[starts[i]] = "B" if starts[i] == ends[i] else "I"
                for j in range(starts[i] + 1, ends[i] + 1):
                    pred_sequence[j] = "I"

            pred_labels.append(pred_sequence)
            true_labels.append(true_sequence)

        return pred_labels, true_labels

    max_length = 512
    true_predictions, true_labels = create_label_sequences(start_pred, end_pred, start_labels, end_labels, max_length)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_fi,
    eval_dataset=val_dataset_fi,
    compute_metrics=compute_metrics
)

# Model training
trainer.train()

# Model evaluation on validation set
eval_results = trainer.evaluate()

print(f"Evaluation Results: {eval_results}")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,4.083368,0.150659,0.150659,0.150659,0.98713
2,4.207800,3.91448,0.150659,0.150659,0.150659,0.979877
3,4.207800,3.944979,0.152542,0.152542,0.152542,0.97842
4,3.494800,3.934733,0.156309,0.156309,0.156309,0.976209
5,3.494800,3.988454,0.163842,0.163842,0.163842,0.977758


Evaluation Results: {'eval_loss': 3.9884536266326904, 'eval_precision': 0.1638418079096045, 'eval_recall': 0.1638418079096045, 'eval_f1': 0.1638418079096045, 'eval_accuracy': 0.9777579154896422, 'eval_runtime': 22.1019, 'eval_samples_per_second': 24.025, 'eval_steps_per_second': 3.031, 'epoch': 5.0}


## BERT_QAModel_For_Russian_Language

### Dataset of Russian language

In [20]:
# 1. Filter the DataFrame for Russian language questions
ru_df = df[df['lang'] == 'ru']

# 2. Split the DataFrame into training and validation sets
ru_train_df, ru_val_df = train_test_split(ru_df, test_size=0.2, random_state=42)

ru_train_df = Dataset.from_pandas(ru_train_df)
ru_val_df = Dataset.from_pandas(ru_val_df)

# Tokenize both train and validation datasets
train_dataset_ru = ru_train_df.map(prepare_data, batched=True)
val_dataset_ru = ru_val_df.map(prepare_data, batched=True)

ru_df.head(100)

Map:   0%|          | 0/1903 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/476 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
7204,Сколько человек погибло от сталинских репресси...,The Great Purge or the Great Terror was a camp...,ru,True,816,"681,692-1,200,000",
7205,Какой город США самый зеленый на март 2019?,"Portland is often awarded ""Greenest City in Am...",ru,True,0,Portland,
7206,Россия участвует в борьбе с терроризмом в Ирак...,The Russian military intervention in the Syria...,ru,True,4,Russian military intervention,
7207,Можно ли работать учителем в России при наличи...,Children of elementary classes are normally se...,ru,False,-1,no,
7208,Можео ли в России с регистрацией в психиатриче...,In 2014 the Pearson/Economist Intelligence Uni...,ru,False,-1,no,
...,...,...,...,...,...,...,...
7299,Когда был запущен первый «Аполло́н»?,The Apollo program was the third United States...,ru,True,207,1969,
7300,Какая российская теннисистка является самой ти...,"Maria Yuryevna Sharapova (; born April 19, 198...",ru,True,0,Maria Yuryevna Sharapova,
7301,Сколько республик входило в состав СССР в 1990...,"In the final decades of its existence, the Sov...",ru,True,80,fifteen,
7302,Как называлась территории Словакии во времена ...,Slovakia was partly occupied by Roman legions ...,ru,True,75,Marcomannia,


### Model

In [21]:
# Model definition and training arguments
model = BertForQuestionAnswering.from_pretrained('bert-base-multilingual-cased')

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Metric for evaluation using the evaluate library
metric = evaluate.load("seqeval")

# Define compute_metrics function for F1 score evaluation
def compute_metrics(pred):
    start_pred = pred.predictions[0].argmax(axis=-1)
    end_pred = pred.predictions[1].argmax(axis=-1)

    start_labels = pred.label_ids[0]
    end_labels = pred.label_ids[1]

    def create_label_sequences(starts, ends, true_starts, true_ends, max_length):
      pred_labels = []
      true_labels = []

      for i in range(len(starts)):
          pred_sequence = ["O"] * max_length
          true_sequence = ["O"] * max_length

          # Handle true labels
          if true_starts[i] != -100:  # Ignore padding
              if true_starts[i] < max_length and true_ends[i] < max_length:
                  true_sequence[true_starts[i]] = "B" if true_starts[i] == true_ends[i] else "I"
                  for j in range(true_starts[i] + 1, true_ends[i] + 1):
                      if j < max_length:  # Check the bounds before assignment
                          true_sequence[j] = "I"

          # Handle predicted labels
          if starts[i] != -100:  # Ignore padding
              if starts[i] < max_length and ends[i] < max_length:
                  pred_sequence[starts[i]] = "B" if starts[i] == ends[i] else "I"
                  for j in range(starts[i] + 1, ends[i] + 1):
                      if j < max_length:  # Check the bounds before assignment
                          pred_sequence[j] = "I"

          pred_labels.append(pred_sequence)
          true_labels.append(true_sequence)

      return pred_labels, true_labels


    max_length = 512
    true_predictions, true_labels = create_label_sequences(start_pred, end_pred, start_labels, end_labels, max_length)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_ru,
    eval_dataset=val_dataset_ru,
    compute_metrics=compute_metrics
)

# Model training
trainer.train()

# Model evaluation on validation set
eval_results = trainer.evaluate()

print(f"Evaluation Results: {eval_results}")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,4.271239,0.138655,0.138947,0.138801,0.989648
2,No log,4.078876,0.144958,0.145263,0.14511,0.98449
3,4.369500,4.076096,0.140756,0.141053,0.140904,0.975561
4,4.369500,4.147091,0.142857,0.143158,0.143007,0.977281
5,3.548900,4.205728,0.144958,0.145263,0.14511,0.97573


Evaluation Results: {'eval_loss': 4.205728054046631, 'eval_precision': 0.14495798319327732, 'eval_recall': 0.14526315789473684, 'eval_f1': 0.14511041009463724, 'eval_accuracy': 0.9757295496323529, 'eval_runtime': 20.1739, 'eval_samples_per_second': 23.595, 'eval_steps_per_second': 2.974, 'epoch': 5.0}


## BERT_QAModel_For_Japanese_Language

### Dataset of Japanese Language

In [22]:
# 1. Filter the DataFrame for Japanese language questions
ja_df = df[df['lang'] == 'ja']

# 2. Split the DataFrame into training and validation sets
ja_train_df, ja_val_df = train_test_split(ja_df, test_size=0.2, random_state=42)

ja_train_df = Dataset.from_pandas(ja_train_df)
ja_val_df = Dataset.from_pandas(ja_val_df)

# Tokenize both train and validation datasets
train_dataset_ja = ja_train_df.map(prepare_data, batched=True)
val_dataset_ja = ja_val_df.map(prepare_data, batched=True)

print(train_dataset_ja)

Map:   0%|          | 0/2205 [00:00<?, ? examples/s]



Map:   0%|          | 0/552 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'context', 'lang', 'answerable', 'answer_start', 'answer', 'answer_inlang', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 2205
})


### Model

In [23]:
# Model definition and training arguments
model = BertForQuestionAnswering.from_pretrained('bert-base-multilingual-cased')

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Metric for evaluation using the evaluate library
metric = evaluate.load("seqeval")

# Define compute_metrics function for F1 score evaluation
def compute_metrics(pred):
    start_pred = pred.predictions[0].argmax(axis=-1)
    end_pred = pred.predictions[1].argmax(axis=-1)

    start_labels = pred.label_ids[0]
    end_labels = pred.label_ids[1]

    def create_label_sequences(starts, ends, true_starts, true_ends, max_length):
        pred_labels = []
        true_labels = []

        for i in range(len(starts)):
            pred_sequence = ["O"] * max_length
            true_sequence = ["O"] * max_length

            if true_starts[i] != -100:
                true_sequence[true_starts[i]] = "B" if true_starts[i] == true_ends[i] else "I"
                for j in range(true_starts[i] + 1, true_ends[i] + 1):
                    true_sequence[j] = "I"

            if starts[i] != -100:
                pred_sequence[starts[i]] = "B" if starts[i] == ends[i] else "I"
                for j in range(starts[i] + 1, ends[i] + 1):
                    pred_sequence[j] = "I"

            pred_labels.append(pred_sequence)
            true_labels.append(true_sequence)

        return pred_labels, true_labels

    max_length = 512
    true_predictions, true_labels = create_label_sequences(start_pred, end_pred, start_labels, end_labels, max_length)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_ja,
    eval_dataset=val_dataset_ja,
    compute_metrics=compute_metrics
)

# Model training
trainer.train()

# Model evaluation on validation set
eval_results = trainer.evaluate()

print(f"Evaluation Results: {eval_results}")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,4.017016,0.190217,0.190217,0.190217,0.991271
2,4.094500,3.883777,0.182971,0.182971,0.182971,0.986006
3,4.094500,3.894896,0.184783,0.184783,0.184783,0.98308
4,3.338700,3.892817,0.182971,0.182971,0.182971,0.981757
5,3.338700,3.947004,0.179348,0.179348,0.179348,0.981541


Evaluation Results: {'eval_loss': 3.9470043182373047, 'eval_precision': 0.1793478260869565, 'eval_recall': 0.1793478260869565, 'eval_f1': 0.1793478260869565, 'eval_accuracy': 0.9815408457880435, 'eval_runtime': 23.2292, 'eval_samples_per_second': 23.763, 'eval_steps_per_second': 2.97, 'epoch': 5.0}
