In [24]:
import pandas as pd
from datasets import Dataset

# Load your CSV file
df = pd.read_csv('./data-collection/qa/medquad.csv')

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)


In [25]:
display(df)

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
...,...,...,...,...
16407,What is (are) Diabetic Neuropathies: The Nerve...,Focal neuropathy appears suddenly and affects ...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16408,How to prevent Diabetic Neuropathies: The Nerv...,The best way to prevent neuropathy is to keep ...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16409,How to diagnose Diabetic Neuropathies: The Ner...,Doctors diagnose neuropathy on the basis of sy...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...
16410,What are the treatments for Diabetic Neuropath...,The first treatment step is to bring blood glu...,NIDDK,Diabetic Neuropathies: The Nerve Damage of Dia...


In [27]:
# Define the filter function to remove invalid rows
def filter_invalid_rows(example):
    return isinstance(example["answer"], str)

# Preprocess the data
def preprocess_data(examples):
    questions = examples["question"]
    answers = examples["answer"]
    contexts = examples["source"]  # Assuming source is the context, change if needed

    tokenized_examples = tokenizer(
        questions, 
        contexts, 
        truncation=True, 
        max_length=384, 
        padding="max_length",
        return_tensors="pt"
    )

    # Start and end positions of the answers
    start_positions = []
    end_positions = []

    for i in range(len(questions)):
        context = contexts[i]
        answer = answers[i]
        
        # Tokenize context and answer separately to find start and end positions
        tokenized_context = tokenizer(context, truncation=True, max_length=384)
        tokenized_answer = tokenizer(answer, truncation=True, max_length=384)
        
        # Find the start and end token positions of the answer in the context
        start_pos = tokenized_context['input_ids'].index(tokenized_answer['input_ids'][1])  # Exclude [CLS] token
        end_pos = start_pos + len(tokenized_answer['input_ids']) - 3  # Exclude [CLS] and [SEP] tokens
        
        start_positions.append(start_pos)
        end_positions.append(end_pos)
    
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    
    return tokenized_examples

# Apply the filter function to the dataset
filtered_dataset = dataset.filter(filter_invalid_rows)

# Apply the preprocessing function
tokenized_dataset = filtered_dataset.map(preprocess_data, batched=True)

# Remove columns not required by the model
tokenized_dataset = tokenized_dataset.remove_columns(["question", "answer", "source", "focus_area", "__index_level_0__"])

Filter:   0%|          | 0/16412 [00:00<?, ? examples/s]

Map:   0%|          | 0/16407 [00:00<?, ? examples/s]

ValueError: 1043 is not in list

In [13]:
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Define the filter function to remove invalid rows
def filter_invalid_rows(example):
    return isinstance(example["answer"], str)

# Apply the filter function to the dataset
filtered_dataset = dataset.filter(filter_invalid_rows)

# Define the preprocess function
def preprocess_function(examples):
    inputs = tokenizer(examples["question"], padding="max_length", truncation=True, max_length=512)
    targets = tokenizer(examples["answer"], padding="max_length", truncation=True, max_length=512)
    inputs["labels"] = targets["input_ids"]
    return inputs

# Use `batched=True` and set batch size
tokenized_dataset = filtered_dataset.map(preprocess_function, batched=True, batch_size=8)

# To display tokenized dataset
print(tokenized_dataset)


Filter:   0%|          | 0/16412 [00:00<?, ? examples/s]

Map:   0%|          | 0/16407 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'source', 'focus_area', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 16407
})


In [14]:
class QATrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        start_positions = inputs.pop("start_positions")
        end_positions = inputs.pop("end_positions")
        outputs = model(**inputs)
        start_logits, end_logits = outputs.start_logits, outputs.end_logits

        # Compute the loss
        loss_fct = torch.nn.CrossEntropyLoss()
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2

        return total_loss


In [16]:
qa_trainer = QATrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

qa_trainer.train()


  0%|          | 0/12306 [00:00<?, ?it/s]

KeyError: 'start_positions'

In [3]:
from transformers import BertForQuestionAnswering, Trainer, TrainingArguments
import torch

# Load the pre-trained model
model_name = "bert-base-uncased"
model = BertForQuestionAnswering.from_pretrained(model_name)

# Define a custom Trainer subclass
class QATrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        start_logits, end_logits = outputs.start_logits, outputs.end_logits

        # Compute the loss
        loss_fct = torch.nn.CrossEntropyLoss()
        start_loss = loss_fct(start_logits, labels[:, 0])
        end_loss = loss_fct(end_logits, labels[:, 1])
        total_loss = (start_loss + end_loss) / 2

        return total_loss

# Initialize the QA Trainer with your model and training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

qa_trainer = QATrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Assuming you have split your dataset into train and eval
    eval_dataset=tokenized_dataset,
)

# Train the model using your customized Trainer
qa_trainer.train()


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/12306 [00:00<?, ?it/s]

KeyError: 'labels'

In [4]:
from transformers import BertForQuestionAnswering, Trainer, TrainingArguments
import torch

# Load the pre-trained model and tokenizer
model_name = "bert-base-uncased"
model = BertForQuestionAnswering.from_pretrained(model_name)

# Define a custom Trainer subclass
class QATrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        start_logits, end_logits = outputs.start_logits, outputs.end_logits

        # Compute the loss
        loss_fct = torch.nn.CrossEntropyLoss()
        start_loss = loss_fct(start_logits, labels[:, 0])
        end_loss = loss_fct(end_logits, labels[:, 1])
        total_loss = (start_loss + end_loss) / 2  # You can adjust how you combine the losses

        return (total_loss,)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the QA Trainer with your model and training arguments
qa_trainer = QATrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model using your customized Trainer
qa_trainer.train()


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/12306 [00:00<?, ?it/s]

KeyError: 'labels'