In [20]:
import torch
import random
import numpy as np
import pandas as pd

from sklearn.metrics import precision_recall_fscore_support

from transformers import BertTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset

In [19]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

dataset_train       = pd.read_parquet("dataset/train.parquet")
dataset_validation  = pd.read_parquet("dataset/validation.parquet")


In [4]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForQuestionAnswering.from_pretrained('bert-base-multilingual-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
dataset_train.columns
dataset_validation.columns


Index(['question', 'context', 'lang', 'answerable', 'answer_start', 'answer',
       'answer_inlang'],
      dtype='object')

In [13]:
# Custom Dataset class to prepare data for training
class QADataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get the row at the specified index
        row = self.data.iloc[idx]

        # Extract question, context, answer start and answer
        question = row['question']
        context = row['context']
        answer = row['answer']
        answer_start = row['answer_start']
        
        # Tokenize the context and question
        inputs = self.tokenizer(question, context, return_tensors='pt', padding=True, truncation=True)

        # Find the token index for the answer start and end
        answer_end = answer_start + len(answer)
        start_positions = inputs.char_to_token(0, answer_start)
        end_positions = inputs.char_to_token(0, answer_end - 1)

        # If start or end position is None, return empty labels for unanswerable questions
        if start_positions is None or end_positions is None:
            start_positions = torch.tensor([0])  # No valid answer, empty label
            end_positions = torch.tensor([0])    # No valid answer, empty label

        # Add labels for start and end positions
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'start_positions': torch.tensor(start_positions),
            'end_positions': torch.tensor(end_positions)
        }

In [14]:
# Initialize datasets and data loaders
train_dataset = QADataset(dataset_train)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)



In [16]:
# Training loop (simplified)
for batch in train_loader:
    # Move tensors to the appropriate device if needed (GPU/CPU)
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    start_positions = batch['start_positions']
    end_positions = batch['end_positions']
    
    # Forward pass
    outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                    start_positions=start_positions, end_positions=end_positions)
    
    # Loss and backward
    loss = outputs.loss
    loss.backward()

    # Update model weights (you'd typically include an optimizer here)
    print(f"Loss: {loss.item()}")

ValueError: char_to_token() is not available when using Python based tokenizers

In [None]:
# Tokenize inputs
inputs = tokenizer(questions, contexts, return_tensors="pt", padding=True, truncation=True)

# Get start and end position labels for the answers
start_positions = torch.tensor([answer[0] for answer in answers])
end_positions = torch.tensor([answer[1] for answer in answers])

# Training step
outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
loss = outputs.loss
loss.backward()  # Backpropagate

In [None]:
# Evaluation
predictions = model(**inputs)
pred_start = torch.argmax(predictions.start_logits, dim=1)
pred_end = torch.argmax(predictions.end_logits, dim=1)