# Healthcare Diagnosis Chatbot Fine-Tuning

This notebook replaces the Hydra-driven script with explicit parameter blocks.
All hyperparameters and paths are defined upfront for clarity and easy adjustment.


In [None]:
# 1️⃣ Parameters & Configuration
import os

# Data paths
train_csv = "D:\\projects\\medicare-chatbot\\data\\processed\\symptom-disease-train-dataset.csv"
val_csv = "D:\\projects\\medicare-chatbot\\data\\processed\\symptom-disease-test-dataset.csv"
mapping_json = "D:\\projects\\medicare-chatbot\\data\\processed\\mapping.json"

# Model, BnB & PEFT settings
# hf_checkpoint = "meta-llama/Llama-3.2-1B"
hf_checkpoint = "EleutherAI/pythia-70m"
peft_r = 8
peft_alpha = 16
peft_dropout = 0.05
# peft_target_modules = ["q_proj", "v_proj"]
peft_target_modules = ["query_key_value", "dense_h_to_4h", "dense_4h_to_h"]
use_4bit = True
use_8bit = True
bnb_quant_type = "nf4"
bnb_compute_dtype = "float16"
use_nested_quant = False

# Training arguments
output_dir = "D:\\projects\\medicare-chatbot\\outputs"
num_train_epochs = 1
per_device_train_batch_size = 8
per_device_eval_batch_size = 8
learning_rate = 5e-5
weight_decay = 0.01
logging_steps = 10
save_steps = 500
eval_strategy = "epoch"
eval_steps = 200
seed = 42
pin_memory = False
label_name = "labels"
gradient_accumulation_steps = 4
warmup_steps = 2
# Optimizer to use
optim = "paged_adamw_32bit"
fp16 = False 
bf16 = False
model_dtype = "float16"
logging_strategy="steps",    # ensure step-based logging
logging_steps=1,
logging_first_step=True,     # also log at step 0

# Learning rate schedule
lr_scheduler_type = "cosine"
# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3
# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = "auto"


In [None]:
# 2️⃣ Imports
import json
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [12]:
# 3️⃣ Load & Preprocess Dataset
# Load CSVs using pandas
import pandas as pd

train_df = pd.read_csv(train_csv)[:1000]
val_df = pd.read_csv(val_csv)[:100]

# Convert pandas DataFrames to Hugging Face Dataset
from datasets import Dataset, DatasetDict

ds = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df)
})

train_df.head()

Unnamed: 0,text,label
0,I have been having migraines and headaches. I ...,Drug Reaction
1,I have asthma and I get wheezing and breathing...,Allergy
2,Signs and symptoms of primary ovarian insuffic...,Premature Ovarian Failure
3,"cough,high_fever,breathlessness,family_history...",Bronchial Asthma
4,"chills,vomiting,high_fever,sweating,headache,n...",Malaria


In [None]:
# 4️⃣ Tokenizer & Model Initialization (with PyTorch dynamic quantization)
import torch
from torch.quantization import quantize_dynamic
from peft import LoraConfig, get_peft_model

# — Tokenizer setup —
tokenizer = AutoTokenizer.from_pretrained(hf_checkpoint)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# — Load the base model (FP16 for speed on GPU, FP32 on CPU) —
model = AutoModelForCausalLM.from_pretrained(
    hf_checkpoint,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)
model.config.pad_token_id = tokenizer.pad_token_id

model = model.to("cpu")
# — Apply dynamic quantization (CPU only) —
# model = quantize_dynamic(
#     model,
#     {torch.nn.Linear},
#     dtype=torch.qint8
# )


# — Attach LoRA adapters for efficient fine-tuning —
lora_cfg = LoraConfig(
    r=peft_r,
    lora_alpha=peft_alpha,
    target_modules=peft_target_modules,
    lora_dropout=peft_dropout,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_cfg)

# Now `model` is quantized and PEFT-ready—proceed to Trainer…


In [None]:
# 5️⃣ Tokenization Function
def tokenize_fn(batch):
    """Prepare combined prompt+answer sequences and align labels.

    Each example becomes:
        "Symptoms: <symptom text>\n\nDisease: <disease> <eos>"
    Prompt tokens are masked out in ``labels`` with ``-100`` so the loss is
    only computed on the answer portion.
    """
    # Build prompt and answer strings
    prompts = [f"Symptoms: {sym}\n\nDisease:" for sym in batch["text"]]
    answers = [str(label) for label in batch["label"]]

    # Full texts fed to the model
    full_texts = [p + " " + a + tokenizer.eos_token for p, a in zip(prompts, answers)]
    tokenized = tokenizer(full_texts, truncation=True, padding="longest")

    # Construct labels that ignore (mask) the prompt part
    labels = []
    for p, a in zip(prompts, answers):
        prompt_ids = tokenizer(p, add_special_tokens=False).input_ids
        answer_ids = tokenizer(" " + a + tokenizer.eos_token, add_special_tokens=False).input_ids
        labels.append([-100] * len(prompt_ids) + answer_ids)

    # Left-pad labels to the max sequence length in this batch so labels
    # shape matches ``input_ids``.
    max_len = max(len(l) for l in labels)
    for l in labels:
        l += [-100] * (max_len - len(l))

    tokenized["labels"] = labels
    return tokenized

tokenized = ds.map(
    tokenize_fn, batched=True
)

# Set PyTorch format
tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

batch = tokenized["train"][:1]
print(batch.keys())

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1000/1000 [00:01<00:00, 863.18 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 2024.11 examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])





In [None]:
# 6️⃣ Training Setup & Execution
training_args = TrainingArguments(
    no_cuda=True,
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # optim=optim,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    logging_steps=logging_steps,
    save_steps=save_steps,
    eval_strategy=eval_strategy,
    seed=seed,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    dataloader_pin_memory=pin_memory,
    # dataloader_drop_last=True,
    label_names=["labels"],
    # report_to="none",
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    # peft_config=lora_cfg,
    # dataset_text_field="text",
    # max_seq_length=max_seq_length,
    processing_class=tokenizer,
    # packing=packing,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()
trainer.save_model(output_dir)


Epoch,Training Loss,Validation Loss
1,No log,3.515504


In [22]:
# 6️⃣ Training Setup & Execution
training_args = TrainingArguments(
    no_cuda=True,
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # optim=optim,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    logging_steps=10,
    logging_strategy="steps",
    logging_first_step=True,
    save_steps=save_steps,
    # eval_strategy=eval_strategy,
    seed=seed,
    fp16=fp16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    dataloader_pin_memory=pin_memory,
    # dataloader_drop_last=True,
    label_names=["labels"],
    # report_to="none",
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    # eval_dataset=tokenized["validation"],
    # peft_config=lora_cfg,
    # dataset_text_field="text",
    # max_seq_length=max_seq_length,
    processing_class=tokenizer,
    # packing=packing,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()
trainer.save_model(output_dir)




Step,Training Loss
1,2.1098
10,3.3711
20,3.12
30,3.2041


In [1]:
print(trainer.state.log_history)
print("Global step:", trainer.state.global_step)


NameError: name 'trainer' is not defined

In [2]:
# 1️⃣ Parameters & Configuration
import os

# Data paths
train_csv        = "D:\\projects\\medicare-chatbot\\data\\processed\\symptom-disease-train-dataset.csv"
mapping_json     = "D:\\projects\\medicare-chatbot\\data\\processed\\mapping.json"

# Model & PEFT (LoRA) settings
hf_checkpoint      = "EleutherAI/pythia-14m"
peft_r             = 8
peft_alpha         = 16
peft_dropout       = 0.05                      # 5% dropout balances regularization and learning :contentReference[oaicite:0]{index=0}
peft_target_modules= ["query_key_value",       # attention QKV
                       "dense_h_to_4h",        # MLP “up” projection
                       "dense_4h_to_h"]        # MLP “down” projection

# Training arguments
output_dir                   = "D:\\projects\\medicare-chatbot\\outputs"
num_train_epochs             = 3                    # more epochs to fully train on 1 000 examples
per_device_train_batch_size  = 2                    # small batch size for CPU memory limits :contentReference[oaicite:1]{index=1}
gradient_accumulation_steps  = 4                    # effective batch size = 2×4 = 8
learning_rate                = 2e-4                 # recommended for small LoRA‐tuned models :contentReference[oaicite:2]{index=2}
weight_decay                 = 0.0                  # no weight decay to avoid over-regularization
warmup_ratio                 = 0.1                  # 10% of total steps for warmup
lr_scheduler_type            = "linear"             # linear decay after warmup
max_grad_norm                = 0.3                  # gradient clipping for stability
logging_strategy             = "steps"              # log every N steps
logging_steps                = 1                   # frequent feedback on CPU :contentReference[oaicite:3]{index=3}
logging_first_step           = True                 # also log at step 0
report_to                     = "none"               # disable W&B/TB integrations
no_cuda                      = True                 # CPU-only run
seed                         = 42                   # for reproducibility

# DataLoader and tokenization
group_by_length              = True                 # speed up by grouping similar-length sequences :contentReference[oaicite:4]{index=4}
max_seq_length               = None                 # allow tokenizer default (often 1024–2048)

# (SFTTrainer will handle everything else under the hood)



In [None]:
# 5️⃣ SFTTrainer Setup & Train
sft_config = SFTConfig(                 # build SFT config :contentReference[oaicite:36]{index=36}
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    max_grad_norm=max_grad_norm,
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    logging_first_step=logging_first_step,
    report_to=report_to,
    no_cuda=no_cuda,
    seed=seed
)


trainer = SFTTrainer(                   # instantiate SFTTrainer :contentReference[oaicite:37]{index=37}
    model=model,
    train_dataset=tokenized,
    tokenizer=tokenizer,
    peft_config=lora_cfg,
    args=sft_config
)

trainer.train() 