## Library

In [None]:
!pip install datasets evaluate
!pip install seqeval
!pip install fugashi ipadic

In [2]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, AdamW, BertTokenizer
from datasets import Dataset, load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import evaluate
import seqeval
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from torch.utils.data import DataLoader, Dataset
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn

## Dataset Loading

In [3]:
dataset = load_dataset("coastalcph/tydi_xor_rc")
train_set = dataset["train"]
validation_set = dataset["validation"]
df = pd.DataFrame(train_set)
df_eval = pd.DataFrame(validation_set)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.85k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/6.87M [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15326 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3028 [00:00<?, ? examples/s]

In [4]:
df.head()

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
0,উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...,WikiLeaks () is an international non-profit or...,bn,True,182,2006,
1,দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয় ?,The war in Europe concluded with an invasion o...,bn,True,48,Germany,
2,মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন...,Same-sex marriage in the United States expande...,bn,False,-1,no,
3,আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃ...,The exact number of Arab casualties is unknown...,bn,True,39,unknown,
4,বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে ?,"As Thomas Hall (2000) notes, ""The Sung Empire ...",bn,True,1219,17th century,


In [5]:
df_eval.head()

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
0,ఒరెగాన్ రాష్ట్రంలోని అతిపెద్ద నగరం ఏది ?,Portland is the largest city in the U.S. state...,te,True,0,Portland,
1,కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు ?,"The word cholera is from ""kholera"" from χολή ""...",te,True,99,Indian subcontinent,
2,కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు ?,Since it became widespread in the 19th century...,te,True,451,England,
3,మొదటి ప్రపంచ యుద్ధం ఎప్పుడు మొదలయింది ?,World War I occurred from 1914 to 1918. In ter...,te,True,26,1914,
4,మొదటి ప్రపంచ యుద్ధం ఎప్పుడు మొదలయింది ?,"World War I (often abbreviated as WWI or WW1),...",te,True,155,28 July 1914,


## BERT_QAModel_For_Finnish_Language

### Tokenization

In [6]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

def tokenize_and_map(fi_df):
    tokenized_data = []

    for _, row in fi_df.iterrows():
        question = row['question']
        context = row['context']
        answerable = row['answerable']
        answer_start = row['answer_start'] if answerable else None
        answer = row['answer'] if answerable else ""

        encoding = tokenizer.encode_plus(
            question,
            context,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_offsets_mapping=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'][0]
        attention_mask = encoding['attention_mask'][0]
        offsets = encoding['offset_mapping'][0]

        # BIO labels
        labels = ['O'] * len(offsets)

        if answerable:
            answer_start_token_index = None
            answer_end_token_index = None

            answer_end = answer_start + len(answer)

            for i, (start, end) in enumerate(offsets):
                if start == answer_start and answer_start_token_index is None:
                    answer_start_token_index = i
                if end >= answer_end and answer_end_token_index is None:
                    answer_end_token_index = i
                    break

            if answer_start_token_index is not None:
                labels[answer_start_token_index] = 'B-ANSWER'
                for i in range(answer_start_token_index + 1, answer_end_token_index + 1):
                    labels[i] = 'I-ANSWER'

        tokenized_data.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'offsets': offsets,
            'labels': labels
        })

    return tokenized_data


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



In [7]:
class QADataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = [data['input_ids'] for data in tokenized_data]
        self.attention_masks = [data['attention_mask'] for data in tokenized_data]
        self.offsets = [data['offsets'] for data in tokenized_data]

        # Convert BIO labels to a numerical format
        self.labels = self.create_labels(tokenized_data)

    def create_labels(self, tokenized_data):
        label_map = {'O': 0, 'B-ANSWER': 1, 'I-ANSWER': 2}
        labels = []

        for data in tokenized_data:
            label_list = [label_map[label] for label in data['labels']]
            labels.append(label_list)

        return labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [8]:
fi_df = df[df['lang'] == 'fi']
eval_fi_df = df_eval[df_eval['lang'] == 'fi']

tokenized_fi_data = tokenize_and_map(fi_df)
tokenized_fi_eval_data = tokenize_and_map(eval_fi_df)
train_fi_data = tokenized_fi_data
val_fi_data = tokenized_fi_eval_data

train_fi_dataset = QADataset(train_fi_data)
val_fi_dataset = QADataset(val_fi_data)

train_fi_dataloader = DataLoader(train_fi_dataset, batch_size=2, shuffle=True)
val_fi_dataloader = DataLoader(val_fi_dataset, batch_size=2, shuffle=False)

In [9]:
for batch in train_fi_dataloader:
    print("Training batch:", batch)
    break

Training batch: {'input_ids': tensor([[  101, 19803, 61256,  ...,     0,     0,     0],
        [  101, 20108, 43208,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


In [10]:
for batch in val_fi_dataloader:
    print("Training batch:", batch)
    break

Training batch: {'input_ids': tensor([[  101, 14247, 11013,  ...,     0,     0,     0],
        [  101, 19803, 61256,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


In [None]:
for i in range(5):
    print(f"Example {i}:")
    print(f"Input IDs: {tokenized_fi_data[i]['input_ids']}")
    print(f"Attention Mask: {tokenized_fi_data[i]['attention_mask']}")
    print(f"Labels: {tokenized_fi_data[i]['labels']}")
    print()


Example 0:
Input IDs: tensor([   101,  12699,  61256,  17586,  66270,  10115,  10164,  10853,  37557,
        110151,  10123,    136,    102,  10167,  10226,  29322,  28109,    117,
         21226,  14592,  10124,  12359,  71458,  27048,  10188,  10105,  30704,
         10108,  10105,  11584,  10108,  14592,  10106,  10105,  32074,  11943,
         19376,  10114,  10105,  64306,  10108,  10105,  13163,  12359,  13642,
         10106,  10105,  23255,  11943,  22992,    117,  10110,  22530,  59680,
         10105,  12359,  14648,    117,  12359,  13681,  10111,  12359,  13642,
         11444,  10105,  18042,  10108,  10105,  16672,  34873,    119,  10117,
         71458,  27048,  11941,  10146,  10151,  45050,  10350,  23931,  10106,
         10105,  11667,  71921,    117,  33573,  10188,  10105,  32074,  11943,
         19376,    117,  10189,  23616,  10708,  10105,  11584,  10108,  14592,
         10111,  10319,  20961,  15362,  10474,  11324,  10114,  10105,  34873,
         10491,  1

### Model Training

In [None]:
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=3)
model.to('cuda')

# label mapping
label_to_id = {'B-Answer': 0, 'I-Answer': 1, 'O': 2}
id_to_label = {v: k for k, v in label_to_id.items()}

model.train()

optimizer = AdamW(model.parameters(), lr=1e-5)

num_epochs = 3

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_fi_dataloader:
        optimizer.zero_grad()
        inputs = {key: val.to('cuda') for key, val in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_fi_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


Epoch 1/3, Loss: 0.0499
Epoch 2/3, Loss: 0.0388
Epoch 3/3, Loss: 0.0314


### Model Evaluation

In [None]:
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_fi_dataloader:
        inputs = {key: val.to('cuda') for key, val in batch.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2)
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(inputs['labels'].cpu().numpy())


flat_preds = [pred for sublist in all_preds for pred in sublist if pred != -100]
flat_labels = [label for sublist in all_labels for label in sublist if label != -100]


print(classification_report(flat_labels, flat_preds, target_names=list(label_to_id.keys())))

  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


              precision    recall  f1-score   support

    B-Answer       0.99      1.00      0.99    268871
    I-Answer       0.49      0.06      0.10       416
           O       0.42      0.33      0.37      2585

    accuracy                           0.99    271872
   macro avg       0.64      0.46      0.49    271872
weighted avg       0.99      0.99      0.99    271872



## BERT_QAModel_For_Russian_Language

### Tokenization

In [11]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

def tokenize_and_map(ru_df):
    tokenized_data = []

    for _, row in ru_df.iterrows():
        question = row['question']
        context = row['context']
        answerable = row['answerable']
        answer_start = row['answer_start'] if answerable else None
        answer = row['answer'] if answerable else ""

        encoding = tokenizer.encode_plus(
            question,
            context,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_offsets_mapping=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'][0]
        attention_mask = encoding['attention_mask'][0]
        offsets = encoding['offset_mapping'][0]

        # BIO labels
        labels = ['O'] * len(offsets)

        if answerable:
            answer_start_token_index = None
            answer_end_token_index = None

            answer_end = answer_start + len(answer)

            for i, (start, end) in enumerate(offsets):
                if start == answer_start and answer_start_token_index is None:
                    answer_start_token_index = i
                if end >= answer_end and answer_end_token_index is None:
                    answer_end_token_index = i
                    break

            if answer_start_token_index is not None:
                labels[answer_start_token_index] = 'B-ANSWER'
                for i in range(answer_start_token_index + 1, answer_end_token_index + 1):
                    labels[i] = 'I-ANSWER'

        tokenized_data.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'offsets': offsets,
            'labels': labels
        })

    return tokenized_data


In [12]:
class QADataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = [data['input_ids'] for data in tokenized_data]
        self.attention_masks = [data['attention_mask'] for data in tokenized_data]
        self.offsets = [data['offsets'] for data in tokenized_data]

        # Convert BIO labels to a numerical format
        self.labels = self.create_labels(tokenized_data)

    def create_labels(self, tokenized_data):
        label_map = {'O': 0, 'B-ANSWER': 1, 'I-ANSWER': 2}
        labels = []

        for data in tokenized_data:
            label_list = [label_map[label] for label in data['labels']]
            labels.append(label_list)

        return labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [13]:
ru_df = df[df['lang'] == 'ru']
eval_ru_df = df_eval[df_eval['lang'] == 'ru']

tokenized_ru_data = tokenize_and_map(ru_df)
tokenized_ru_eval_data = tokenize_and_map(eval_ru_df)

train_ru_data = tokenized_ru_data
val_ru_data = tokenized_ru_eval_data

train_ru_dataset = QADataset(train_ru_data)
val_ru_dataset = QADataset(val_ru_data)

train_ru_dataloader = DataLoader(train_ru_dataset, batch_size=2, shuffle=True)
val_ru_dataloader = DataLoader(val_ru_dataset, batch_size=2, shuffle=False)

In [14]:
for batch in train_ru_dataloader:
    print("Training batch:", batch)
    break

Training batch: {'input_ids': tensor([[  101, 45383, 10384,  ...,     0,     0,     0],
        [  101, 85904, 10387,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


In [15]:
for batch in val_ru_dataloader:
    print("Training batch:", batch)
    break

Training batch: {'input_ids': tensor([[  101,   512, 12265,  ...,     0,     0,     0],
        [  101,   511, 12668,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


In [None]:
for i in range(5):
    print(f"Example {i}:")
    print(f"Input IDs: {tokenized_ru_data[i]['input_ids']}")
    print(f"Attention Mask: {tokenized_ru_data[i]['attention_mask']}")
    print(f"Labels: {tokenized_ru_data[i]['labels']}")
    print()


Example 0:
Input IDs: tensor([   101,    526, 101351,  13181,  69624,  11602,  10332,  17465,  34845,
           557,  29749, 101617,  11905,    117,    558,  31840,  11029,    118,
           562,  10297,  33431,  10811,    118,    562,  18148,    543, 107954,
         11106,  62982,  36852,  72459,  81026,  16954,  72502,    136,    102,
         10117,  12592,    153,  51932,  10112,  10345,  10105,  12592,  39925,
         10134,    169,  17323,  10108,  13736,  76456,  45791,  10106,  10105,
         15277,  11457,  10319,  25738,  10188,  11123,  10114,  11082,    119,
         10377,  16247,    169,  12077,    118,  19707,  32385,  10525,  10108,
         10105,  33065,  12529,  10111,  12047,  27730,    117,  76456,  45791,
         10108,  59750,  11773,  69564,  10107,  10111,  10105,  11641,  12762,
         25121,    117,  48675,  15034,  58553,    117,  10846,  34891,  11046,
         10108,  10148,  25022,  17824,    117,  46298,    118,  48336,  26445,
           117,  9

### Model Training

In [None]:
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=3)
model.to('cuda')

# label mapping
label_to_id = {'B-Answer': 0, 'I-Answer': 1, 'O': 2}
id_to_label = {v: k for k, v in label_to_id.items()}

model.train()

optimizer = AdamW(model.parameters(), lr=1e-5)

num_epochs = 3

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_ru_dataloader:
        optimizer.zero_grad()
        inputs = {key: val.to('cuda') for key, val in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_ru_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


Epoch 1/3, Loss: 0.0483
Epoch 2/3, Loss: 0.0372
Epoch 3/3, Loss: 0.0308


### Model Evaluation

In [None]:
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_ru_dataloader:
        inputs = {key: val.to('cuda') for key, val in batch.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2)
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(inputs['labels'].cpu().numpy())

flat_preds = [pred for sublist in all_preds for pred in sublist if pred != -100]
flat_labels = [label for sublist in all_labels for label in sublist if label != -100]


print(classification_report(flat_labels, flat_preds, target_names=list(label_to_id.keys())))

  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


              precision    recall  f1-score   support

    B-Answer       0.99      1.00      1.00    241460
    I-Answer       0.43      0.02      0.03       363
           O       0.47      0.06      0.10      1889

    accuracy                           0.99    243712
   macro avg       0.63      0.36      0.38    243712
weighted avg       0.99      0.99      0.99    243712



## BERT_QAModel_For_Japanese_Language

### Tokenization

In [16]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

def tokenize_and_map(ja_df):
    tokenized_data = []

    for _, row in ja_df.iterrows():
        question = row['question']
        context = row['context']
        answerable = row['answerable']
        answer_start = row['answer_start'] if answerable else None
        answer = row['answer'] if answerable else ""

        encoding = tokenizer.encode_plus(
            question,
            context,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_offsets_mapping=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'][0]
        attention_mask = encoding['attention_mask'][0]
        offsets = encoding['offset_mapping'][0]

        # BIO labels
        labels = ['O'] * len(offsets)

        if answerable:
            answer_start_token_index = None
            answer_end_token_index = None

            answer_end = answer_start + len(answer)

            for i, (start, end) in enumerate(offsets):
                if start == answer_start and answer_start_token_index is None:
                    answer_start_token_index = i
                if end >= answer_end and answer_end_token_index is None:
                    answer_end_token_index = i
                    break

            if answer_start_token_index is not None:
                labels[answer_start_token_index] = 'B-ANSWER'
                for i in range(answer_start_token_index + 1, answer_end_token_index + 1):
                    labels[i] = 'I-ANSWER'

        tokenized_data.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'offsets': offsets,
            'labels': labels
        })

    return tokenized_data


In [17]:
class QADataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = [data['input_ids'] for data in tokenized_data]
        self.attention_masks = [data['attention_mask'] for data in tokenized_data]
        self.offsets = [data['offsets'] for data in tokenized_data]

        # Convert BIO labels to a numerical format
        self.labels = self.create_labels(tokenized_data)

    def create_labels(self, tokenized_data):
        label_map = {'O': 0, 'B-ANSWER': 1, 'I-ANSWER': 2}
        labels = []

        for data in tokenized_data:
            label_list = [label_map[label] for label in data['labels']]
            labels.append(label_list)

        return labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [18]:
ja_df = df[df['lang'] == 'ja']
ja_eval_df = df_eval[df_eval['lang'] == 'ja']

tokenized_ja_data = tokenize_and_map(ja_df)
tokenized_ja_eval_data = tokenize_and_map(ja_eval_df)

train_ja_data = tokenized_ja_data
val_ja_data = tokenized_ja_eval_data

train_ja_dataset = QADataset(train_ja_data)
val_ja_dataset = QADataset(val_ja_data)

train_ja_dataloader = DataLoader(train_ja_dataset, batch_size=2, shuffle=True)
val_ja_dataloader = DataLoader(val_ja_dataset, batch_size=2, shuffle=False)

In [19]:
for batch in train_ja_dataloader:
    print("Training batch:", batch)
    break

Training batch: {'input_ids': tensor([[  101,  2016, 48643,  ...,     0,     0,     0],
        [  101,  2007, 69395,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


In [20]:
for batch in val_ja_dataloader:
    print("Training batch:", batch)
    break

Training batch: {'input_ids': tensor([[  101,  4460,  1946,  ...,     0,     0,     0],
        [  101,  2044, 21612,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}


  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


In [None]:
for i in range(5):
    print(f"Example {i}:")
    print(f"Input IDs: {tokenized_ja_data[i]['input_ids']}")
    print(f"Attention Mask: {tokenized_ja_data[i]['attention_mask']}")
    print(f"Labels: {tokenized_ja_data[i]['labels']}")
    print()


Example 0:
Input IDs: tensor([   101,   2044,  73595,  12236,   7681,   3035,   4282,   8336,   1912,
          7069,  19140,  45440,  10083,    102,  10882,  10105,  15277,  10107,
         27072,  11222,  17879,  10106,  10825,  10111,  10670,    117,  10105,
         12026,  17941,  93621,    119,  10117,  64574,    118,  29050,  62390,
        105844,  10134,  34398,  10106,  11112,  10825,  10106,  45123,    117,
         10105,  10422,  11922,  19870,  11584,  12381,  10105,  10751,  57028,
         10114,  10347,  80176,  10155,  10105,  15277,  10107,  10188,  10105,
         81351,    117,  10111,  11941,  10114,  13574,  10491,  10105,  17941,
         10108,  10105,  12723,  10146,  10105,  46269,  75559,  10336,    119,
         10117,  19870,  12047,  10106,  10829,  43082,  36627,  10336,  10105,
         31989,  10108,  10105,  62390, 105844,    119,  10117,  62390, 105844,
         10134,  12857,  10155,  11984,  12087,  97286,  10371,    118,  46052,
         16231,   

### Model Training

In [None]:
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=3)
model.to('cuda')

# label mapping
label_to_id = {'B-Answer': 0, 'I-Answer': 1, 'O': 2}
id_to_label = {v: k for k, v in label_to_id.items()}

model.train()

optimizer = AdamW(model.parameters(), lr=1e-5)

num_epochs = 3

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_ja_dataloader:
        optimizer.zero_grad()
        inputs = {key: val.to('cuda') for key, val in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_ja_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


Epoch 1/3, Loss: 0.0494
Epoch 2/3, Loss: 0.0350
Epoch 3/3, Loss: 0.0268


### Model Evaluation

In [None]:
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_ja_dataloader:
        inputs = {key: val.to('cuda') for key, val in batch.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2)
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(inputs['labels'].cpu().numpy())

flat_preds = [pred for sublist in all_preds for pred in sublist if pred != -100]
flat_labels = [label for sublist in all_labels for label in sublist if label != -100]

print(classification_report(flat_labels, flat_preds, target_names=list(label_to_id.keys())))

  'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),


              precision    recall  f1-score   support

    B-Answer       0.99      0.99      0.99    279987
    I-Answer       0.46      0.36      0.41       442
           O       0.38      0.43      0.40      2195

    accuracy                           0.99    282624
   macro avg       0.61      0.59      0.60    282624
weighted avg       0.99      0.99      0.99    282624

