In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load the train and validation splits from parquet files into pandas DataFrames
dataset_train = pd.read_parquet("dataset/train.parquet")
dataset_validation = pd.read_parquet("dataset/validation.parquet")

# Initialize the fast tokenizer for multilingual BERT (only the tokenizer is created here;
# the model itself is loaded in a later cell)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')



In [19]:
def create_labels(example, tokenizer):
    """
    Build per-token binary labels marking the answer span inside the context.

    The question and context are tokenized together as a sentence pair. A
    token is labelled 1 only if it belongs to the context segment and its
    character span lies entirely within
    ``[answer_start, answer_start + len(answer)]``. Every other position
    (question tokens, special tokens, padding) keeps the label 0.

    Args:
        example: Mapping-like record (e.g. a DataFrame row) providing the keys
            'question', 'context', 'answer_start', 'answer' and 'answerable'.
        tokenizer: A fast tokenizer; it must support ``return_offsets_mapping``
            and return an encoding exposing ``sequence_ids()``.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)`` where ``labels`` is an
        int64 NumPy array with one entry per token in ``input_ids``.
    """
    question = example['question']
    context = example['context']
    answer_start = example['answer_start']
    answer_text = example['answer']

    # Tokenize question and context together
    encoding = tokenizer(question, context, truncation=True, padding='max_length', return_offsets_mapping=True)

    # Integer labels: these are class indices, not float scores
    labels = np.zeros(len(encoding['input_ids']), dtype=np.int64)

    if example['answerable']:  # Only label the span if the question is answerable
        answer_end = answer_start + len(answer_text)

        # sequence_ids() is None for special/padding tokens, 0 for the question
        # and 1 for the context. Offsets are relative to each segment's own
        # text, so only context tokens may be compared against answer_start.
        sequence_ids = encoding.sequence_ids()

        for idx, (start, end) in enumerate(encoding['offset_mapping']):
            if sequence_ids[idx] != 1:
                continue
            if start >= answer_start and end <= answer_end:
                labels[idx] = 1  # Token lies fully inside the answer span

    return encoding['input_ids'], encoding['attention_mask'], labels

In [20]:
# Function to create encodings and labels from the dataset
def create_encodings_and_labels(dataset, tokenizer):
    """
    Tokenize and label every row of ``dataset`` and stack the results as tensors.

    Args:
        dataset: pandas DataFrame; each row is passed to ``create_labels``.
        tokenizer: Tokenizer forwarded unchanged to ``create_labels``.

    Returns:
        Tuple ``(encodings, labels)`` where ``encodings`` is a dict with the
        tensors 'input_ids' and 'attention_mask' (one row per example) and
        ``labels`` is a ``torch.long`` tensor with one row per example.
    """
    input_id_rows, attention_rows, label_rows = [], [], []

    for _, row in dataset.iterrows():
        input_ids, attention_mask, label = create_labels(row, tokenizer)
        input_id_rows.append(input_ids)
        attention_rows.append(attention_mask)
        label_rows.append(label)

    # Convert lists to tensors
    encodings = {
        'input_ids': torch.tensor(input_id_rows),
        'attention_mask': torch.tensor(attention_rows),
    }
    # Stack the per-row label arrays into one ndarray first (building a tensor
    # from a list of ndarrays is slow), and force an integer dtype because the
    # labels are class indices for the token-classification loss.
    labels = torch.tensor(np.asarray(label_rows), dtype=torch.long)

    return encodings, labels

In [27]:
# Tokenize and label the train and validation DataFrames
train_encodings, train_labels = create_encodings_and_labels(dataset_train, tokenizer)
val_encodings, val_labels = create_encodings_and_labels(dataset_validation, tokenizer)

# Pack the tensors into HuggingFace Dataset objects (one column per model input)
train_dataset = Dataset.from_dict(
    {
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask'],
        'labels': train_labels,
    }
)

val_dataset = Dataset.from_dict(
    {
        'input_ids': val_encodings['input_ids'],
        'attention_mask': val_encodings['attention_mask'],
        'labels': val_labels,
    }
)

In [22]:
# Define a custom Dataset class
class QADataset(torch.utils.data.Dataset):
    """
    Map-style PyTorch dataset over pre-tokenized encodings and token labels.

    This subclasses ``torch.utils.data.Dataset`` rather than the HuggingFace
    ``datasets.Dataset`` imported as ``Dataset`` in this notebook: the latter
    is Arrow-backed and is not meant to be initialised from plain tensors.

    Args:
        encodings: Dict mapping input names (e.g. 'input_ids',
            'attention_mask') to indexable containers with one entry per example.
        labels: Indexable container with one label row per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict holding every encoding field plus 'labels' for example ``idx``."""
        item = {key: self.encodings[key][idx] for key in self.encodings}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Number of examples, taken from the length of the labels container."""
        return len(self.labels)
    
# Wrap the encodings/labels returned by create_encodings_and_labels in QADataset objects.
# NOTE: this rebinds train_dataset / val_dataset if they were already defined by an earlier cell.
train_dataset = QADataset(train_encodings, train_labels)
val_dataset = QADataset(val_encodings, val_labels)


In [28]:
# Load pre-trained multilingual BERT with a token-classification head.
# num_labels=2 matches the two label values produced by create_labels
# (0 = not part of the answer, 1 = part of the answer).
model = BertForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Hyper-parameters and bookkeeping options for the training run
training_args = TrainingArguments(
    # Where results are written
    output_dir='./results',
    # Run evaluation once per epoch
    evaluation_strategy="epoch",
    # Length of the run
    num_train_epochs=3,
    # Batch sizes (per device) for training and evaluation
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimizer settings
    learning_rate=2e-5,
    weight_decay=0.01,
)


# Bundle model, arguments and both datasets into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)



In [30]:
# Fine-tune the model on train_dataset using the Trainer configured in the previous cell
trainer.train()



# Evaluate on val_dataset; as the last expression in the cell, its return value
# is shown as the cell output
trainer.evaluate()

: 

In [None]:
def postprocess_predictions(predictions, offsets, context):
    """
    Postprocess token-level predictions to extract the answer span.

    The answer runs from the first character of the first token predicted as
    1 to the last character of the last token predicted as 1. Tokens whose
    offset is zero-width (start == end, e.g. the ``(0, 0)`` offsets of special
    and padding tokens) are ignored, so a stray positive prediction on such a
    token cannot collapse or shift the extracted span.

    Args:
        predictions: Sequence of per-token labels (1 = answer token).
        offsets: Sequence of ``(start_char, end_char)`` pairs, one per token.
            The pairs are used to slice ``context``, so they must be character
            positions within ``context``.
        context: The text the answer is extracted from.

    Returns:
        The extracted answer substring, or "" when no usable token is
        predicted as part of the answer (treated as unanswerable).
    """
    answer_spans = [
        (start, end)
        for label, (start, end) in zip(predictions, offsets)
        if label == 1 and start != end  # skip zero-width (special/padding) tokens
    ]

    # If no answer tokens are found, the answer is unanswerable
    if not answer_spans:
        return ""

    # Span from the start of the first answer token to the end of the last one
    start_char = int(answer_spans[0][0])
    end_char = int(answer_spans[-1][1])

    return context[start_char:end_char]

# Example on how to use it
# Assuming `preds` are the token predictions and `offsets` are the offset mappings from tokenization
# pred_answer = postprocess_predictions(preds, offsets, context)
