In [1]:
pip install transformers datasets torch sacrebleu scikit-learn





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import sacrebleu

# Step 1: Load MedQA-USMLE-4-options dataset
dataset = load_dataset('GBaker/MedQA-USMLE-4-options')

# Step 2: Use the available 'train' split
train_data = dataset['train']

# Step 3: Extract questions and correct answers
questions = [item['question'] for item in train_data]
answers = [item['answer'] for item in train_data]

# Step 4: Load BERT Large model and tokenizer
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Step 5: Function to predict answers
def predict_answer(question, model, tokenizer):
    inputs = tokenizer(question, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer_tokens = inputs['input_ids'][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer

# Step 6: Predict answers for all questions
predictions = [predict_answer(q, model, tokenizer) for q in questions]

# Step 7: Calculate BLEU score
def compute_bleu(predictions, references):
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    return bleu.score

bleu_score = compute_bleu(predictions, answers)
print(f"BLEU Score (BERT Large): {bleu_score}")

Downloading readme:   0%|          | 0.00/654 [00:00<?, ?B/s]

Downloading and preparing dataset json/GBaker--MedQA-USMLE-4-options to C:/Users/91896/.cache/huggingface/datasets/GBaker___json/GBaker--MedQA-USMLE-4-options-4a9914430e36c3d0/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/91896/.cache/huggingface/datasets/GBaker___json/GBaker--MedQA-USMLE-4-options-4a9914430e36c3d0/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  torch.utils._pytree._register_pytree_node(


Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



BLEU Score (BERT Large): 0.0841919825087901


In [1]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from datasets import load_dataset
import sacrebleu

# Step 1: Load the dataset
dataset = load_dataset('GBaker/MedQA-USMLE-4-options')
train_data = dataset['train']
questions = [item['question'] for item in train_data]
answers = [item['answer'] for item in train_data]

# Step 2: Load RoBERTa model and tokenizer
roberta_model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)
model = AutoModelForQuestionAnswering.from_pretrained(roberta_model_name)

# Step 3: Function to predict answers
def predict_answer_roberta(question, model, tokenizer):
    inputs = tokenizer(question, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer_tokens = inputs['input_ids'][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens)
    return answer

# Step 4: BLEU score computation
def compute_bleu(predictions, references):
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    return bleu.score

# Step 5: Predict answers and calculate BLEU score for RoBERTa
predictions = [predict_answer_roberta(q, model, tokenizer) for q in questions]
bleu_score = compute_bleu(predictions, answers)
print(f"BLEU Score (RoBERTa): {bleu_score}")

Found cached dataset json (C:/Users/91896/.cache/huggingface/datasets/GBaker___json/GBaker--MedQA-USMLE-4-options-4a9914430e36c3d0/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

  torch.utils._pytree._register_pytree_node(


Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]


BLEU Score (RoBERTa): 0.011614127870993073


In [2]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Step 1: Load XLM-R model and tokenizer
xlmr_model_name = "deepset/xlm-roberta-large-squad2"
tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)
model = AutoModelForQuestionAnswering.from_pretrained(xlmr_model_name)

# Step 2: Predict answers for XLM-R
def predict_answer_xlmr(question, model, tokenizer):
    inputs = tokenizer(question, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer_tokens = inputs['input_ids'][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens)
    return answer

# Step 3: Calculate BLEU score for XLM-R
predictions = [predict_answer_xlmr(q, model, tokenizer) for q in questions]
bleu_score = compute_bleu(predictions, answers)
print(f"BLEU Score (XLM-R): {bleu_score}")

Downloading tokenizer_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForQuestionAnswering: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BLEU Score (XLM-R): 0.006705457242632046


In [3]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from datasets import load_dataset
import sacrebleu

# Step 1: Load the dataset
dataset = load_dataset('GBaker/MedQA-USMLE-4-options')
train_data = dataset['train']

# Ensure correct field names for questions and answers
questions = [item['question'] for item in train_data]
answers = [item['answer'] for item in train_data]  # Adjust based on actual field

# Step 2: Load BioBERT model and tokenizer
biobert_model_name = "dmis-lab/biobert-base-cased-v1.1-squad"
tokenizer = AutoTokenizer.from_pretrained(biobert_model_name)
model = AutoModelForQuestionAnswering.from_pretrained(biobert_model_name)

# Step 3: Define function to predict answers with BioBERT
def predict_answer_biobert(question, model, tokenizer, max_length=512):
    inputs = tokenizer(question, return_tensors="pt", truncation=True, max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer_tokens = inputs['input_ids'][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens)
    return answer

# Step 4: Predict answers for all questions
predictions = [predict_answer_biobert(q, model, tokenizer) for q in questions]

# Step 5: Function to compute BLEU score
def compute_bleu(predictions, references):
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    return bleu.score

# Step 6: Calculate and print BLEU score for BioBERT
bleu_score = compute_bleu(predictions, answers)
print(f"BLEU Score (BioBERT): {bleu_score}")


Found cached dataset json (C:/Users/91896/.cache/huggingface/datasets/GBaker___json/GBaker--MedQA-USMLE-4-options-4a9914430e36c3d0/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

  torch.utils._pytree._register_pytree_node(


Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BLEU Score (BioBERT): 0.05529896622394584


In [4]:
import sacrebleu
from datasets import load_dataset

# Step 1: Load MedQA-USMLE-4-options dataset
dataset = load_dataset('GBaker/MedQA-USMLE-4-options')
train_data = dataset['train']
questions = [item['question'] for item in train_data]
answers = [item['answer'] for item in train_data]

# Step 2: Mock function to convert text to SPARQL query
def text_to_sparql(question):
    return f"SELECT ?answer WHERE {{ ?question '{question}' }}"

# Step 3: Function to predict answers using SPARQL queries (simulated)
def predict_answer_sparql(question):
    sparql_query = text_to_sparql(question)
    answer = "mocked_answer_based_on_sparql_query"  # Placeholder answer
    return answer

# Step 4: BLEU score computation
def compute_bleu(predictions, references):
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    return bleu.score

predictions = [predict_answer_sparql(q) for q in questions]
bleu_score = compute_bleu(predictions, answers)
print(f"BLEU Score (SPARQL-based models): {bleu_score}")

Found cached dataset json (C:/Users/91896/.cache/huggingface/datasets/GBaker___json/GBaker--MedQA-USMLE-4-options-4a9914430e36c3d0/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/2 [00:00<?, ?it/s]

BLEU Score (SPARQL-based models): 0.0012031990418496278


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from datasets import load_dataset
import sacrebleu
import numpy as np

# Load the dataset
dataset = load_dataset('GBaker/MedQA-USMLE-4-options')
train_data = dataset['train']

questions = [item['question'] for item in train_data]
answers = [item['answer'] for item in train_data]

# Initialize CountVectorizer and Logistic Regression
vectorizer = CountVectorizer()
classifier = LogisticRegression(max_iter=1000)

batch_size = 1000  # Set a batch size
n_batches = len(questions) // batch_size

predictions = []

for i in range(n_batches + 1):
    batch_questions = questions[i * batch_size:(i + 1) * batch_size]
    batch_answers = answers[i * batch_size:(i + 1) * batch_size]
    
    # Vectorize the batch
    X_batch = vectorizer.fit_transform(batch_questions)
    
    # Train classifier on the batch
    classifier.fit(X_batch, batch_answers)
    
    # Predict for the batch
    batch_predictions = classifier.predict(X_batch)
    predictions.extend(batch_predictions)

# Function to compute BLEU score
def compute_bleu(predictions, references):
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    return bleu.score

# Calculate and print BLEU score
bleu_score = compute_bleu(predictions, answers)
print(f"BLEU Score (CountVectorizer + Logistic Regression): {bleu_score}")



Found cached dataset json (C:/Users/91896/.cache/huggingface/datasets/GBaker___json/GBaker--MedQA-USMLE-4-options-4a9914430e36c3d0/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/2 [00:00<?, ?it/s]

BLEU Score (CountVectorizer + Logistic Regression): 100.00000000000004


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset
import sacrebleu

# Step 1: Load the dataset
dataset = load_dataset('GBaker/MedQA-USMLE-4-options')
train_data = dataset['train']
questions = [item['question'] for item in train_data]
answers = [item['answer'] for item in train_data]

# Step 2: Load GPT-2 model and tokenizer
gpt2_model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(gpt2_model_name)
model = AutoModelForCausalLM.from_pretrained(gpt2_model_name)

# Step 3: Function to predict answers using GPT-2
def predict_answer_gpt2(question, model, tokenizer, max_length=100, max_new_tokens=50):
    inputs = tokenizer(question, return_tensors="pt", truncation=True)
    
    # Generate predictions with specified `max_length` and `max_new_tokens`
    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,  # Increase max_length if input is too long
        max_new_tokens=max_new_tokens,  # Control how many tokens GPT-2 generates
        pad_token_id=tokenizer.eos_token_id,  # Ensure correct padding
        attention_mask=inputs.attention_mask
    )
    
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction

# Step 4: Predict answers for all questions
predictions = [predict_answer_gpt2(q, model, tokenizer) for q in questions]

# Step 5: Function to compute BLEU score
def compute_bleu(predictions, references):
    bleu = sacrebleu.corpus_bleu(predictions, [references])
    return bleu.score

# Step 6: Calculate and print BLEU score
bleu_score = compute_bleu(predictions, answers)
print(f"BLEU Score (GPT-2): {bleu_score}")


Found cached dataset json (C:/Users/91896/.cache/huggingface/datasets/GBaker___json/GBaker--MedQA-USMLE-4-options-4a9914430e36c3d0/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  torch.utils._pytree._register_pytree_node(
Both `max_new_tokens` (=50) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)





Both `max_new_tokens` (=50) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both

BLEU Score (GPT-2): 0.03535077658592161
