In [8]:
# 1) Environment setup (Colab)
import sys
import subprocess

def pip_install(packages):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + packages)

# Core ML stack
pip_install([
    "transformers>=4.44.0",
    "datasets>=2.14.0",
    "accelerate>=0.26.0",
    "evaluate>=0.4.0",
])

# Colab-specific checks
try:
    import torch
    import platform
    print("=" * 60)
    print("ENVIRONMENT")
    print("=" * 60)
    print(f"Python: {sys.version.split()[0]} | Platform: {platform.platform()}")
    print(f"PyTorch: {torch.__version__}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)} | CUDA: {torch.version.cuda}")
    else:
        print("GPU not detected. Enable a GPU in Runtime > Change runtime type > T4/other.")
    print("=" * 60)
except Exception as e:
    print("Environment check failed:", e)

# 2) Imports and GPU config
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer
from transformers import default_data_collator
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')

print("=" * 60)
print("GPU CONFIGURATION")
print("=" * 60)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU Available: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Memory: {round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 2)} GB")
else:
    device = torch.device("cpu")
    print("GPU not available; training will be slower.")
print("=" * 60)

# 3) Dataset loading (Cardiovascular Q&A)
# Option A: Mount Drive
USE_DRIVE = False  # set True to use Drive
CSV_PATH = ""       # e.g., "/content/drive/MyDrive/medquadCardiovascular.csv"

if USE_DRIVE:
    from google.colab import drive  # type: ignore
    drive.mount('/content/drive')

# Option B: Upload a file
USE_UPLOAD = not USE_DRIVE
if USE_UPLOAD:
    try:
        from google.colab import files  # type: ignore
        uploaded = files.upload()
        # Pick the first uploaded file
        if uploaded:
            CSV_PATH = list(uploaded.keys())[0]
    except Exception:
        pass

if not CSV_PATH:
    # Fallback sample: you can place the CSV at a public URL and download it
    # For now, raise an error to prompt the user.
    raise ValueError("Please provide CSV_PATH via Drive or upload.")

print("Dataset CSV:", CSV_PATH)

# 4) Load and split dataset
print("\n" + "=" * 60)
print("LOADING CARDIOVASCULAR DATASET")
print("=" * 60)

dataset = pd.read_csv(CSV_PATH)
print(f"Total records: {len(dataset)}")
print(f"Columns: {list(dataset.columns)}")
print(f"Sample question: {str(dataset.iloc[0]['question'])[:80]}...")
print(f"Sample answer chars: {len(str(dataset.iloc[0]['answer']))}")

# Drop nulls
dataset = dataset.dropna(subset=["question", "answer"]).reset_index(drop=True)

# Train/val split (85/15)
dataset_shuffled = dataset.sample(frac=1.0, random_state=42).reset_index(drop=True)
split_idx = int(len(dataset_shuffled) * 0.85)
train_data = dataset_shuffled.iloc[:split_idx].copy()
eval_data = dataset_shuffled.iloc[split_idx:].copy()

print(f"Train: {len(train_data)} | Val: {len(eval_data)}")
print("=" * 60)

# 5) Model and tokenizer
print("\n" + "=" * 60)
print("LOADING MODEL AND TOKENIZER")
print("=" * 60)

MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"
print("Model:", MODEL_NAME)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
model.to(device)

print("Model loaded.")
print("=" * 60)


# 6) Tokenization and label prep (extractive QA)
print("\n" + "=" * 60)
print("TOKENIZING DATASET")
print("=" * 60)

train_ds = Dataset.from_pandas(train_data)
eval_ds = Dataset.from_pandas(eval_data)

MAX_LENGTH = 384
DOC_STRIDE = 128

def prepare_train_features(examples):
    tokenized = tokenizer(
        examples["question"],
        examples["answer"],
        truncation="only_second",
        max_length=MAX_LENGTH,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized["offset_mapping"]):
        sequence_ids = tokenized.sequence_ids(i)

        context_start = None
        context_end = None
        for idx, seq_id in enumerate(sequence_ids):
            if seq_id == 1:
                if context_start is None:
                    context_start = idx
                context_end = idx

        if context_start is None:
            start_positions.append(0)
            end_positions.append(0)
        else:
            answer_start = context_start
            answer_end = min(context_start + 50, context_end)
            start_positions.append(answer_start)
            end_positions.append(answer_end)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions

    # Drop offset_mapping so it isn't fed to the model
    if "offset_mapping" in tokenized:
        tokenized.pop("offset_mapping")

    return tokenized

print("Tokenizing train...")
tokenized_train = train_ds.map(
    prepare_train_features,
    batched=True,
    remove_columns=train_ds.column_names,
    desc="Tokenizing train",
)

print("Tokenizing eval...")
tokenized_eval = eval_ds.map(
    prepare_train_features,
    batched=True,
    remove_columns=eval_ds.column_names,
    desc="Tokenizing eval",
)

print("Done.")
print("=" * 60)

# 7) Metrics
import numpy as np

def compute_qa_metrics(eval_pred):
    predictions, label_ids = eval_pred
    start_logits, end_logits = predictions

    pred_starts = np.argmax(start_logits, axis=1)
    pred_ends = np.argmax(end_logits, axis=1)

    # Ensure label_ids is treated as a tuple
    true_starts = np.asarray(label_ids[0]).reshape(-1)
    true_ends = np.asarray(label_ids[1]).reshape(-1)

    exact_match = np.mean((pred_starts == true_starts) & (pred_ends == true_ends))
    start_accuracy = np.mean(pred_starts == true_starts)
    end_accuracy = np.mean(pred_ends == true_ends)

    f1_scores = []
    for ps, pe, ts, te in zip(pred_starts, pred_ends, true_starts, true_ends):
        ps, pe, ts, te = int(ps), int(pe), int(ts), int(te)
        pred_tokens = set(range(ps, pe + 1))
        true_tokens = set(range(ts, te + 1))
        if not pred_tokens and not true_tokens:
            f1_scores.append(1.0)
        elif not pred_tokens or not true_tokens:
            f1_scores.append(0.0)
        else:
            common = len(pred_tokens & true_tokens)
            if common == 0:
                f1_scores.append(0.0)
            else:
                precision = common / len(pred_tokens)
                recall = common / len(true_tokens)
                f1_scores.append(2 * (precision * recall) / (precision + recall))

    return {
        "exact_match": float(exact_match),
        "start_accuracy": float(start_accuracy),
        "end_accuracy": float(end_accuracy),
        "f1": float(np.mean(f1_scores)) if f1_scores else 0.0,
    }

print("Metrics ready.")

# 8) Training configuration (T4 GPU Optimized)
print("\n" + "=" * 60)
print("TRAINING CONFIGURATION (T4 GPU OPTIMIZED)")
print("=" * 60)

training_args = TrainingArguments(
    output_dir="./results_cardio_qa",
    num_train_epochs=5,  # Increased for better convergence
    per_device_train_batch_size=8,  # Reduced for T4 memory (16GB)
    per_device_eval_batch_size=8,  # Matched with train batch
    gradient_accumulation_steps=4,  # Increased to maintain effective batch size of 32
    learning_rate=2e-5,  # Slightly lower for medical domain stability
    weight_decay=0.01,
    warmup_ratio=0.15,  # More warmup for better stability
    lr_scheduler_type="linear",
    max_grad_norm=1.0,
    fp16=torch.cuda.is_available(),  # Essential for T4 performance
    dataloader_pin_memory=True,
    dataloader_num_workers=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs_cardio_qa",
    logging_steps=25,  # More frequent logging
    logging_strategy="steps",
    report_to=[],
    seed=42,
    disable_tqdm=False,
    remove_unused_columns=True,
    # T4-specific optimizations
    gradient_checkpointing=False,  # Disabled for speed (T4 has enough memory)
    optim="adamw_torch",  # PyTorch AdamW is faster on T4
    lr_scheduler_kwargs={"num_cycles": 0.5},  # Cosine annealing alternative
)

print("T4 GPU Configuration:")
print(f"  - Batch Size: {training_args.per_device_train_batch_size}")
print(f"  - Gradient Accumulation: {training_args.gradient_accumulation_steps}")
print(f"  - Effective Batch Size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"  - FP16 Enabled: {training_args.fp16}")
print(f"  - Learning Rate: {training_args.learning_rate}")
print(f"  - Epochs: {training_args.num_train_epochs}")
print("=" * 60)

# 9) Initialize Trainer
print("\n" + "=" * 60)
print("INITIALIZING TRAINER")
print("=" * 60)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_qa_metrics,
)

print("Trainer ready.")

import time
from openpyxl import Workbook
from sklearn.metrics import precision_score, recall_score

print("\n" + "=" * 60)
print("RANDOM SEARCH HYPERPARAMETER TUNING (T4 GPU OPTIMIZED)")
print("=" * 60)

# Define hyperparameter search space for Random Search
hp_search_space = {
    "epochs": [3, 5, 7, 10],
    "lr": [1e-5, 1.5e-5, 2e-5, 3e-5, 4e-5, 5e-5],
    "batch": [8],  # Fixed for T4 memory constraints
    "warmup": [0.1, 0.15, 0.2, 0.25],
    "weight_decay": [0.0, 0.001, 0.01, 0.05],
}

# Random Search: Sample N random configurations
N_RANDOM_CONFIGS = 10  # Adjust based on time budget
np.random.seed(42)  # For reproducibility

hyperparam_sets = []
for _ in range(N_RANDOM_CONFIGS):
    config = {
        "epochs": int(np.random.choice(hp_search_space["epochs"])),
        "lr": float(np.random.choice(hp_search_space["lr"])),
        "batch": int(np.random.choice(hp_search_space["batch"])),
        "warmup": float(np.random.choice(hp_search_space["warmup"])),
        "weight_decay": float(np.random.choice(hp_search_space["weight_decay"])),
    }
    hyperparam_sets.append(config)

total_iters = len(hyperparam_sets)
total_search_space = (len(hp_search_space["epochs"]) * len(hp_search_space["lr"]) *
                      len(hp_search_space["warmup"]) * len(hp_search_space["weight_decay"]))
coverage_percent = round(100 * total_iters / total_search_space, 2)

print(f"üîç Random Search Strategy")
print(f"Total search space size: {total_search_space} possible combinations")
print(f"Random sampling: {total_iters} configurations ({coverage_percent}% coverage)")
print(f"Fixed batch size: 8 (optimal for T4 16GB memory)")
print(f"Gradient accumulation: 4 (effective batch size: 32)")
print("=" * 60)

# Excel setup
wb = Workbook()
ws = wb.active
ws.title = "Random Search Results"
ws.append([
    "Iteration", "Epochs", "Learning Rate", "Batch Size", "Warmup Ratio", "Weight Decay",
    "Accuracy", "F1-Score", "Precision", "Recall", "Runtime (s)"
])

# Loop through each random configuration
for i, params in enumerate(hyperparam_sets, 1):
    print(f"\n{'='*60}")
    print(f"‚ñ∂Ô∏è Random Search Config {i}/{total_iters}")
    print(f"Params: Epochs={params['epochs']}, LR={params['lr']}, Warmup={params['warmup']}, Weight Decay={params['weight_decay']}")
    print(f"{'='*60}")

    # CRITICAL: Reload model from scratch for each iteration to avoid weight corruption
    print("üîÑ Reloading fresh model...")
    model_fresh = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
    model_fresh.to(device)

    # Update training arguments dynamically
    training_args = TrainingArguments(
        output_dir=f"./results_random_{i}",
        num_train_epochs=params["epochs"],
        per_device_train_batch_size=params["batch"],
        per_device_eval_batch_size=params["batch"],
        gradient_accumulation_steps=4,  # Fixed at 4 for T4 optimization
        learning_rate=params["lr"],
        warmup_ratio=params["warmup"],
        weight_decay=params["weight_decay"],
        eval_strategy="epoch",
        save_strategy="no",
        logging_dir=f"./logs_random_{i}",
        report_to=[],
        disable_tqdm=True,
        seed=42,
        fp16=torch.cuda.is_available(),
        optim="adamw_torch",  # T4-optimized optimizer
        max_grad_norm=1.0,
        lr_scheduler_type="linear",
    )

    trainer = Trainer(
        model=model_fresh,  # Use fresh model instance
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
        compute_metrics=compute_qa_metrics,
    )

    start_time = time.time()
    trainer.train()
    runtime = round(time.time() - start_time, 2)

    eval_results = trainer.evaluate()

    # Extract basic metrics (handle 'eval_' prefix from HF evaluate)
    exact_match = eval_results.get("eval_exact_match", eval_results.get("exact_match", 0))
    start_acc = eval_results.get("eval_start_accuracy", eval_results.get("start_accuracy", 0))
    end_acc = eval_results.get("eval_end_accuracy", eval_results.get("end_accuracy", 0))
    accuracy = (exact_match + start_acc + end_acc) / 3
    f1 = eval_results.get("eval_f1", eval_results.get("f1", 0))

    # Calculate REAL precision and recall from token overlap
    predictions = trainer.predict(tokenized_eval)
    pred_starts = np.argmax(predictions.predictions[0], axis=1)
    pred_ends = np.argmax(predictions.predictions[1], axis=1)

    true_starts = np.asarray(predictions.label_ids[0]).reshape(-1)
    true_ends = np.asarray(predictions.label_ids[1]).reshape(-1)

    precision_scores = []
    recall_scores = []

    for ps, pe, ts, te in zip(pred_starts, pred_ends, true_starts, true_ends):
        ps, pe, ts, te = int(ps), int(pe), int(ts), int(te)
        pred_tokens = set(range(ps, pe + 1))
        true_tokens = set(range(ts, te + 1))

        if not pred_tokens and not true_tokens:
            precision_scores.append(1.0)
            recall_scores.append(1.0)
        elif not pred_tokens or not true_tokens:
            precision_scores.append(0.0)
            recall_scores.append(0.0)
        else:
            common = len(pred_tokens & true_tokens)
            if common == 0:
                precision_scores.append(0.0)
                recall_scores.append(0.0)
            else:
                precision = common / len(pred_tokens)
                recall = common / len(true_tokens)
                precision_scores.append(precision)
                recall_scores.append(recall)

    precision = float(np.mean(precision_scores))
    recall = float(np.mean(recall_scores))

    ws.append([
        i, params["epochs"], params["lr"], params["batch"], params["warmup"], params["weight_decay"],
        round(accuracy, 4), round(f1, 4), round(precision, 4), round(recall, 4), runtime
    ])


    print(f"‚úÖ Iteration {i} done ‚Äî F1: {f1:.4f}, Accuracy: {accuracy:.4f}, Time: {runtime}s")

# ==========================
# ‚úÖ Save All Results in One Excel File
# ==========================
from openpyxl.styles import Font, Alignment

# Auto-format headers for readability
for cell in ws[1]:
    cell.font = Font(bold=True)
    cell.alignment = Alignment(horizontal="center", vertical="center")

# Adjust column widths (optional aesthetic)
for col in ws.columns:
    max_length = 0
    col_letter = col[0].column_letter
    for cell in col:
        try:
            if len(str(cell.value)) > max_length:
                max_length = len(str(cell.value))
        except:
            pass
    adjusted_width = (max_length + 2)
    ws.column_dimensions[col_letter].width = adjusted_width

# Save Excel file
output_excel = "/content/BioBERT_Random_Search_Results.xlsx"
wb.save(output_excel)

print(f"\n‚úÖ All {total_iters} random search runs completed successfully!")
print("üìä Final results saved in one Excel file:")
print(f"‚û°Ô∏è {output_excel}")
print(f"\nüîç Random Search Summary:")
print(f"   - Total search space: {total_search_space} combinations")
print(f"   - Explored: {total_iters} configurations ({coverage_percent}% coverage)")
print(f"   - Advantage: More efficient exploration than Grid Search")

# 10) Train
print("\n" + "=" * 60)
print("STARTING TRAINING")
print("=" * 60)
print("\nüöÄ Training in progress...\n")

train_result = trainer.train()

print("\n" + "=" * 60)
print("TRAINING COMPLETED")
print("=" * 60)
print("Training Loss:", getattr(train_result, "training_loss", None))

# 11) Evaluate
print("\n" + "=" * 60)
print("FINAL EVALUATION")
print("=" * 60)

eval_results = trainer.evaluate()
for k, v in sorted(eval_results.items()):
    print(f"{k}: {v}")

ENVIRONMENT
Python: 3.12.12 | Platform: Linux-6.6.105+-x86_64-with-glibc2.35
PyTorch: 2.8.0+cu126
GPU: Tesla T4 | CUDA: 12.6
GPU CONFIGURATION
GPU Available: Tesla T4
CUDA Version: 12.6
GPU Memory: 14.74 GB


Saving medquadCardiovascular.csv to medquadCardiovascular (2).csv
Dataset CSV: medquadCardiovascular (2).csv

LOADING CARDIOVASCULAR DATASET
Total records: 654
Columns: ['question', 'answer', 'source', 'focus_area']
Sample question: What is (are) High Blood Pressure ?...
Sample answer chars: 5586
Train: 555 | Val: 99

LOADING MODEL AND TOKENIZER
Model: dmis-lab/biobert-base-cased-v1.1


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded.

TOKENIZING DATASET
Tokenizing train...


Tokenizing train:   0%|          | 0/555 [00:00<?, ? examples/s]

Tokenizing eval...


Tokenizing eval:   0%|          | 0/99 [00:00<?, ? examples/s]

Done.
Metrics ready.

TRAINING CONFIGURATION (T4 GPU OPTIMIZED)
T4 GPU Configuration:
  - Batch Size: 8
  - Gradient Accumulation: 4
  - Effective Batch Size: 32
  - FP16 Enabled: True
  - Learning Rate: 2e-05
  - Epochs: 5

INITIALIZING TRAINER
Trainer ready.

RANDOM SEARCH HYPERPARAMETER TUNING (T4 GPU OPTIMIZED)
üîç Random Search Strategy
Total search space size: 384 possible combinations
Random sampling: 10 configurations (2.6% coverage)
Fixed batch size: 8 (optimal for T4 16GB memory)
Gradient accumulation: 4 (effective batch size: 32)

‚ñ∂Ô∏è Random Search Config 1/10
Params: Epochs=7, LR=3e-05, Warmup=0.1, Weight Decay=0.01
üîÑ Reloading fresh model...


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.796499490737915, 'eval_exact_match': 0.07281553398058252, 'eval_start_accuracy': 0.9951456310679612, 'eval_end_accuracy': 0.07281553398058252, 'eval_f1': 0.9048256275897774, 'eval_runtime': 1.2843, 'eval_samples_per_second': 160.404, 'eval_steps_per_second': 20.245, 'epoch': 1.0}
{'eval_loss': 1.299936294555664, 'eval_exact_match': 0.16019417475728157, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.16019417475728157, 'eval_f1': 0.9639346070225541, 'eval_runtime': 1.2874, 'eval_samples_per_second': 160.018, 'eval_steps_per_second': 20.196, 'epoch': 2.0}
{'eval_loss': 1.374715805053711, 'eval_exact_match': 0.16019417475728157, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.16019417475728157, 'eval_f1': 0.9759603354716637, 'eval_runtime': 1.2295, 'eval_samples_per_second': 167.552, 'eval_steps_per_second': 21.147, 'epoch': 3.0}
{'eval_loss': 1.4290721416473389, 'eval_exact_match': 0.18932038834951456, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.1893203883495

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.6042670011520386, 'eval_exact_match': 0.0970873786407767, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.0970873786407767, 'eval_f1': 0.9140144410515858, 'eval_runtime': 1.2634, 'eval_samples_per_second': 163.055, 'eval_steps_per_second': 20.58, 'epoch': 1.0}
{'eval_loss': 1.3398340940475464, 'eval_exact_match': 0.1262135922330097, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.1262135922330097, 'eval_f1': 0.955123004577997, 'eval_runtime': 1.2442, 'eval_samples_per_second': 165.562, 'eval_steps_per_second': 20.896, 'epoch': 2.0}
{'eval_loss': 1.4142769575119019, 'eval_exact_match': 0.1796116504854369, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.1796116504854369, 'eval_f1': 0.9629898507727912, 'eval_runtime': 1.2421, 'eval_samples_per_second': 165.85, 'eval_steps_per_second': 20.933, 'epoch': 3.0}
{'eval_loss': 1.5667036771774292, 'eval_exact_match': 0.17475728155339806, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.17475728155339806, 'eval_f1': 0.97

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 3.7305991649627686, 'eval_exact_match': 0.038834951456310676, 'eval_start_accuracy': 0.8932038834951457, 'eval_end_accuracy': 0.043689320388349516, 'eval_f1': 0.8325320713821402, 'eval_runtime': 1.2253, 'eval_samples_per_second': 168.127, 'eval_steps_per_second': 21.22, 'epoch': 1.0}
{'eval_loss': 1.4357714653015137, 'eval_exact_match': 0.10679611650485436, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.10679611650485436, 'eval_f1': 0.9450301416934456, 'eval_runtime': 1.2484, 'eval_samples_per_second': 165.011, 'eval_steps_per_second': 20.827, 'epoch': 2.0}
{'eval_loss': 1.3951047658920288, 'eval_exact_match': 0.14563106796116504, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.14563106796116504, 'eval_f1': 0.9606312069877889, 'eval_runtime': 1.2475, 'eval_samples_per_second': 165.128, 'eval_steps_per_second': 20.841, 'epoch': 3.0}
{'eval_loss': 1.3321443796157837, 'eval_exact_match': 0.09223300970873786, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.092233009

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 3.374480962753296, 'eval_exact_match': 0.03398058252427184, 'eval_start_accuracy': 0.8786407766990292, 'eval_end_accuracy': 0.038834951456310676, 'eval_f1': 0.8132797977374571, 'eval_runtime': 1.2901, 'eval_samples_per_second': 159.681, 'eval_steps_per_second': 20.154, 'epoch': 1.0}
{'eval_loss': 1.377065658569336, 'eval_exact_match': 0.1796116504854369, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.1796116504854369, 'eval_f1': 0.9403117005968528, 'eval_runtime': 1.2319, 'eval_samples_per_second': 167.217, 'eval_steps_per_second': 21.105, 'epoch': 2.0}
{'eval_loss': 1.2493098974227905, 'eval_exact_match': 0.2087378640776699, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.2087378640776699, 'eval_f1': 0.9719198609483811, 'eval_runtime': 1.2422, 'eval_samples_per_second': 165.834, 'eval_steps_per_second': 20.931, 'epoch': 3.0}
{'eval_loss': 1.3875086307525635, 'eval_exact_match': 0.1941747572815534, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.1941747572815534

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 4.492733001708984, 'eval_exact_match': 0.02912621359223301, 'eval_start_accuracy': 0.8203883495145631, 'eval_end_accuracy': 0.02912621359223301, 'eval_f1': 0.7310091912241773, 'eval_runtime': 1.2477, 'eval_samples_per_second': 165.105, 'eval_steps_per_second': 20.839, 'epoch': 1.0}
{'eval_loss': 1.4522026777267456, 'eval_exact_match': 0.10679611650485436, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.10679611650485436, 'eval_f1': 0.9414299218693352, 'eval_runtime': 1.2419, 'eval_samples_per_second': 165.878, 'eval_steps_per_second': 20.936, 'epoch': 2.0}
{'eval_loss': 1.4439759254455566, 'eval_exact_match': 0.15048543689320387, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.15048543689320387, 'eval_f1': 0.9626081203561228, 'eval_runtime': 1.237, 'eval_samples_per_second': 166.538, 'eval_steps_per_second': 21.019, 'epoch': 3.0}
{'eval_loss': 1.422602653503418, 'eval_exact_match': 0.1553398058252427, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.15533980582524

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.9769604206085205, 'eval_exact_match': 0.11650485436893204, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.11650485436893204, 'eval_f1': 0.9036416924978704, 'eval_runtime': 1.2419, 'eval_samples_per_second': 165.878, 'eval_steps_per_second': 20.936, 'epoch': 1.0}
{'eval_loss': 1.3570940494537354, 'eval_exact_match': 0.13592233009708737, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.13592233009708737, 'eval_f1': 0.941005388032972, 'eval_runtime': 1.2451, 'eval_samples_per_second': 165.446, 'eval_steps_per_second': 20.882, 'epoch': 2.0}
{'eval_loss': 1.2580759525299072, 'eval_exact_match': 0.13592233009708737, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.13592233009708737, 'eval_f1': 0.9581584508904503, 'eval_runtime': 1.241, 'eval_samples_per_second': 166.001, 'eval_steps_per_second': 20.952, 'epoch': 3.0}
{'eval_loss': 1.3556081056594849, 'eval_exact_match': 0.1650485436893204, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.1650485436893204, 'eval_f1':

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.7630094289779663, 'eval_exact_match': 0.07766990291262135, 'eval_start_accuracy': 0.9902912621359223, 'eval_end_accuracy': 0.07766990291262135, 'eval_f1': 0.9025949671280988, 'eval_runtime': 1.3115, 'eval_samples_per_second': 157.066, 'eval_steps_per_second': 19.824, 'epoch': 1.0}
{'eval_loss': 1.3046009540557861, 'eval_exact_match': 0.13592233009708737, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.13592233009708737, 'eval_f1': 0.9379490005725937, 'eval_runtime': 1.2354, 'eval_samples_per_second': 166.745, 'eval_steps_per_second': 21.046, 'epoch': 2.0}
{'eval_loss': 1.2336182594299316, 'eval_exact_match': 0.17475728155339806, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.17475728155339806, 'eval_f1': 0.9360799224419735, 'eval_runtime': 1.2361, 'eval_samples_per_second': 166.647, 'eval_steps_per_second': 21.033, 'epoch': 3.0}
{'eval_loss': 1.314382553100586, 'eval_exact_match': 0.18446601941747573, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.18446601941

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 5.2143025398254395, 'eval_exact_match': 0.0048543689320388345, 'eval_start_accuracy': 0.49029126213592233, 'eval_end_accuracy': 0.0048543689320388345, 'eval_f1': 0.5293958203648358, 'eval_runtime': 1.2457, 'eval_samples_per_second': 165.373, 'eval_steps_per_second': 20.872, 'epoch': 1.0}
{'eval_loss': 1.6556382179260254, 'eval_exact_match': 0.06310679611650485, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.06310679611650485, 'eval_f1': 0.9113654804787183, 'eval_runtime': 1.3099, 'eval_samples_per_second': 157.263, 'eval_steps_per_second': 19.849, 'epoch': 2.0}
{'eval_loss': 1.2863277196884155, 'eval_exact_match': 0.14563106796116504, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.14563106796116504, 'eval_f1': 0.9378640417563603, 'eval_runtime': 1.2301, 'eval_samples_per_second': 167.472, 'eval_steps_per_second': 21.137, 'epoch': 3.0}
{'eval_loss': 1.2401901483535767, 'eval_exact_match': 0.1407766990291262, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.140776

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 5.005612373352051, 'eval_exact_match': 0.0048543689320388345, 'eval_start_accuracy': 0.27184466019417475, 'eval_end_accuracy': 0.024271844660194174, 'eval_f1': 0.6746709486934298, 'eval_runtime': 1.2263, 'eval_samples_per_second': 167.982, 'eval_steps_per_second': 21.202, 'epoch': 1.0}
{'eval_loss': 1.936866044998169, 'eval_exact_match': 0.06310679611650485, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.06310679611650485, 'eval_f1': 0.9096074151226867, 'eval_runtime': 1.2772, 'eval_samples_per_second': 161.296, 'eval_steps_per_second': 20.358, 'epoch': 2.0}
{'eval_loss': 1.3902051448822021, 'eval_exact_match': 0.11650485436893204, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.11650485436893204, 'eval_f1': 0.9510415797681987, 'eval_runtime': 1.3003, 'eval_samples_per_second': 158.429, 'eval_steps_per_second': 19.996, 'epoch': 3.0}
{'eval_loss': 1.355849027633667, 'eval_exact_match': 0.1407766990291262, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.1407766990

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 5.138084411621094, 'eval_exact_match': 0.0048543689320388345, 'eval_start_accuracy': 0.22330097087378642, 'eval_end_accuracy': 0.024271844660194174, 'eval_f1': 0.6579431479494263, 'eval_runtime': 1.2461, 'eval_samples_per_second': 165.316, 'eval_steps_per_second': 20.865, 'epoch': 1.0}
{'eval_loss': 1.7718418836593628, 'eval_exact_match': 0.07766990291262135, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.07766990291262135, 'eval_f1': 0.9250702990122049, 'eval_runtime': 1.2266, 'eval_samples_per_second': 167.944, 'eval_steps_per_second': 21.197, 'epoch': 2.0}
{'eval_loss': 1.3364888429641724, 'eval_exact_match': 0.1650485436893204, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.1650485436893204, 'eval_f1': 0.941182133397326, 'eval_runtime': 1.2841, 'eval_samples_per_second': 160.424, 'eval_steps_per_second': 20.248, 'epoch': 3.0}
{'eval_loss': 1.3074567317962646, 'eval_exact_match': 0.14563106796116504, 'eval_start_accuracy': 1.0, 'eval_end_accuracy': 0.1456310679

In [9]:
#  ANALYZE AND DISPLAY BEST HYPERPARAMETER CONFIGURATION
print("\n" + "=" * 60)
print("BEST HYPERPARAMETER CONFIGURATION ANALYSIS")
print("=" * 60)

# Read the Excel file with results
import pandas as pd
results_df = pd.read_excel("/content/BioBERT_Random_Search_Results.xlsx")

# Display all results sorted by F1-Score (descending)
print("\nAll Configurations Ranked by F1-Score:")
print("=" * 60)
results_sorted = results_df.sort_values("F1-Score", ascending=False)
print(results_sorted.to_string(index=False))

# Find best configuration by F1-Score
best_idx = results_df["F1-Score"].idxmax()
best_config = results_df.iloc[best_idx]

print("\n" + "=" * 60)
print(" BEST HYPERPARAMETER CONFIGURATION")
print("=" * 60)
print(f"\nBest Configuration (Iteration {int(best_config['Iteration'])}):")
print(f"   - Epochs:        {int(best_config['Epochs'])}")
print(f"   - Learning Rate: {best_config['Learning Rate']:.2e}")
print(f"   - Batch Size:    {int(best_config['Batch Size'])}")
print(f"   - Warmup Ratio:  {best_config['Warmup Ratio']:.2f}")
print(f"   - Weight Decay:  {best_config['Weight Decay']:.4f}")
print(f"\ Performance Metrics:")
print(f"   - F1-Score:      {best_config['F1-Score']:.4f}")
print(f"   - Accuracy:      {best_config['Accuracy']:.4f}")
print(f"   - Precision:     {best_config['Precision']:.4f}")
print(f"   - Recall:        {best_config['Recall']:.4f}")
print(f"   - Runtime:       {best_config['Runtime (s)']:.2f} seconds")

# Additional insights
print("\n" + "=" * 60)
print(" INSIGHTS FROM RANDOM SEARCH")
print("=" * 60)

# Best vs Average performance
avg_f1 = results_df["F1-Score"].mean()
improvement = ((best_config['F1-Score'] - avg_f1) / avg_f1) * 100

print(f"\nAverage F1-Score across all configs: {avg_f1:.4f}")
print(f"Best F1-Score improvement over average: +{improvement:.2f}%")

# Top 3 configurations
print("\n Top 3 Configurations by F1-Score:")
top3 = results_sorted.head(3)
for idx, row in top3.iterrows():
    print(f"\n{[list(top3.index).index(idx)]} Rank {list(top3.index).index(idx) + 1}:")
    print(f"   Iteration {int(row['Iteration'])} | F1: {row['F1-Score']:.4f} | "
          f"LR: {row['Learning Rate']:.2e} | Epochs: {int(row['Epochs'])} | "
          f"Warmup: {row['Warmup Ratio']:.2f}")

# Correlation analysis
print("\n Hyperparameter Impact Analysis:")
correlations = results_df[['Epochs', 'Learning Rate', 'Warmup Ratio', 'Weight Decay', 'F1-Score']].corr()['F1-Score'].drop('F1-Score')
print("\nCorrelation with F1-Score:")
for param, corr in correlations.sort_values(ascending=False).items():
    direction = "‚Üë Positive" if corr > 0 else "‚Üì Negative"
    print(f"   {param:15s}: {corr:+.3f} ({direction})")


BEST HYPERPARAMETER CONFIGURATION ANALYSIS

All Configurations Ranked by F1-Score:
 Iteration  Epochs  Learning Rate  Batch Size  Warmup Ratio  Weight Decay  Accuracy  F1-Score  Precision  Recall  Runtime (s)
         4       7       0.000040           8          0.25         0.050    0.5825    0.9896     0.9873  0.9925       159.92
         5      10       0.000020           8          0.15         0.000    0.5146    0.9822     0.9784  0.9866       227.19
         1       7       0.000030           8          0.10         0.010    0.4854    0.9756     0.9761  0.9805       160.72
         2       7       0.000040           8          0.10         0.010    0.4693    0.9694     0.9766  0.9724       159.83
         6       5       0.000030           8          0.15         0.001    0.4401    0.9640     0.9575  0.9721       114.38
         3       5       0.000020           8          0.20         0.010    0.4142    0.9628     0.9596  0.9676       114.62
         9      10       0.000010 

In [None]:
# 12) Save the trained model
print("\n" + "=" * 60)
print("SAVING TRAINED MODEL")
print("=" * 60)

# Define output directory
output_model_dir = "./fine_tuned_cardio_qa_model"

# Save the model and tokenizer
trainer.save_model(output_model_dir)
tokenizer.save_pretrained(output_model_dir)

print(f"‚úÖ Model saved to: {output_model_dir}")
print(f"üì¶ Saved components:")
print(f"   - Model weights: pytorch_model.bin")
print(f"   - Model config: config.json")
print(f"   - Tokenizer files: tokenizer_config.json, vocab.txt, etc.")
print("=" * 60)

# Optional: Save to Google Drive for persistence
SAVE_TO_DRIVE = False  # Set to True if you want to save to Drive

if SAVE_TO_DRIVE:
    try:
        from google.colab import drive
        import shutil

        # Mount Drive if not already mounted
        if not os.path.exists('/content/drive'):
            drive.mount('/content/drive')

        # Define Drive destination
        drive_model_dir = "/content/drive/MyDrive/fine_tuned_cardio_qa_model"

        # Copy model to Drive
        print(f"\nüì§ Copying model to Google Drive...")
        if os.path.exists(drive_model_dir):
            shutil.rmtree(drive_model_dir)
        shutil.copytree(output_model_dir, drive_model_dir)

        print(f"‚úÖ Model also saved to Google Drive: {drive_model_dir}")
        print("üíæ Your model will persist even after the Colab session ends!")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not save to Google Drive: {e}")

print("\n" + "=" * 60)
print("üéâ TRAINING PIPELINE COMPLETE!")
print("=" * 60)


SAVING TRAINED MODEL
‚úÖ Model saved to: ./fine_tuned_cardio_qa_model
üì¶ Saved components:
   - Model weights: pytorch_model.bin
   - Model config: config.json
   - Tokenizer files: tokenizer_config.json, vocab.txt, etc.

üéâ TRAINING PIPELINE COMPLETE!


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
