In [60]:
# 1️⃣ Parameters & Configuration
import os

# Data paths - Update these to match your dataset structure
train_csv = "D:\\projects\\medicare-chatbot\\data\\raw\\symptom-disease-train-dataset.csv"
val_csv = "D:\\projects\\medicare-chatbot\\data\\raw\\symptom-disease-test-dataset.csv"
mapping_file = "D:\\projects\\medicare-chatbot\\data\\raw\\mapping.json"

# Model settings for CPU training
hf_checkpoint = "distilbert-base-uncased"  # Changed to DistilBERT for better CPU performance

# PEFT LoRA settings optimized for CPU
peft_r = 8
peft_alpha = 32  # Increased for better performance
peft_dropout = 0.1
# DistilBERT target modules for LoRA
peft_target_modules = ["q_lin", "k_lin", "v_lin", "out_lin"]

# Remove all quantization settings for CPU training
use_4bit = False
use_8bit = False
bnb_quant_type = None
bnb_compute_dtype = None
use_nested_quant = False

# Training arguments optimized for CPU
output_dir = "D:\\projects\\medicare-chatbot\\outputs\\distilbert"
num_train_epochs = 1
per_device_train_batch_size = 8  # Smaller batch size for CPU
per_device_eval_batch_size = 8
learning_rate = 2e-4  # Higher learning rate for LoRA
weight_decay = 0.01
logging_steps = 1
save_steps = 500
eval_strategy = "steps"
eval_steps = 200
seed = 42
pin_memory = False
label_name = "labels"
gradient_accumulation_steps = 4
warmup_steps = 100

# CPU-specific optimizer settings
optim = "adamw_torch"  # Standard AdamW for CPU
fp16 = False  # Disable mixed precision for CPU
bf16 = False
model_dtype = "float32"  # Use float32 for CPU training

# Learning rate schedule
lr_scheduler_type = "linear"
max_grad_norm = 1.0
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
max_seq_length = 512
packing = False

# CPU device settings
device_map = None  # Let PyTorch handle device placement
pin_memory = False  # Explicitly set to False for CPU training
dataloader_pin_memory = False  # Additional setting for dataloader


In [4]:
# 2️⃣ Imports
import json
import torch
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    DistilBertTokenizer, 
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import evaluate

# Set device to CPU
device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cpu


In [83]:
# 3️⃣ Helper Functions
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

def load_and_prepare_data(train_path: str, val_path: str, mapping_path: str):
    """
    Load and prepare the symptom-disease dataset using mapping.json for labels.
    
    Args:
        train_path (str): Path to the training CSV with columns "text" and "label" (string disease names).
        val_path (str): Path to the validation CSV with the same format.
        mapping_path (str): Path to mapping.json mapping disease names to integer labels.
    
    Returns:
        DatasetDict: A Hugging Face DatasetDict with 'train' and 'validation' splits.
        LabelEncoder: A fitted LabelEncoder (optional, retained for compatibility).
        int: Number of unique labels.
    """
    # 1. Load mapping.json
    with open(mapping_path, 'r') as f:
        disease2idx = json.load(f)
    
    # Build inverse mapping: idx -> disease name
    idx2disease = {idx: disease for disease, idx in disease2idx.items()}
    
    # Create ordered list of disease names by index
    num_labels = len(idx2disease)
    label_names = [disease for _, disease in idx2disease.items()]
    
    # 2. Load CSV data (first 10 rows for quick testing)
    train_df = pd.read_csv(train_path).iloc[:100]
    val_df = pd.read_csv(val_path).iloc[:100]

    # 3. Rename label to labels
    train_df = train_df.rename(columns={'label': 'labels'})
    val_df = val_df.rename(columns={'label': 'labels'})
    
    print(f"Train dataset shape: {train_df.shape}")
    print(f"Validation dataset shape: {val_df.shape}")
    print(f"Sample data:\n{train_df.head()}")
    
    # 4. Sanity check: ensure no missing mappings
    if train_df['labels'].isnull().any() or val_df['labels'].isnull().any():
        raise ValueError("Some labels in CSV not found in mapping.json")
    
    # 5. Create HF datasets
    train_dataset = Dataset.from_pandas(train_df[['text', 'labels']])
    val_dataset   = Dataset.from_pandas(val_df[['text',  'labels']])
    dataset_dict  = DatasetDict({'train': train_dataset, 'validation': val_dataset})
    
    # 6. Optionally fit a LabelEncoder for compatibility (not strictly needed)
    label_encoder = LabelEncoder()
    label_encoder.classes_ = label_names
    
    return dataset_dict, label_encoder, num_labels



In [84]:
# 4️⃣ Data Loading and Preprocessing
print("Loading and preparing dataset...")
dataset, label_encoder, num_labels = load_and_prepare_data(train_csv, val_csv, mapping_file)

print(f"Number of unique labels: {num_labels}")
print(f"Label classes: {list(label_encoder.classes_)}")

# Load tokenizer
print("Loading tokenizer...")
tokenizer = DistilBertTokenizer.from_pretrained(hf_checkpoint)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the input text for the model"""
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=max_seq_length,
        return_tensors="pt"
    )

# Apply tokenization to the dataset
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

print("Dataset preparation complete!")

Loading and preparing dataset...
Train dataset shape: (100, 2)
Validation dataset shape: (100, 2)
Sample data:
                                                text  labels
0  I have been having migraines and headaches. I ...     308
1  I have asthma and I get wheezing and breathing...      35
2  Signs and symptoms of primary ovarian insuffic...     798
3  cough,high_fever,breathlessness,family_history...     149
4  chills,vomiting,high_fever,sweating,headache,n...     596
Number of unique labels: 1082
Label classes: ['(Vertigo) Paroymsal  Positional Vertigo', 'Abdominal Aortic Aneurysm', 'Acanthosis Nigricans', 'Achalasia', 'Achilles Tendinitis', 'Achilles Tendon Rupture', 'Acl Injury', 'Acne', 'Acoustic Neuroma', 'Acromegaly', 'Actinic Keratosis', 'Acute Coronary Syndrome', 'Acute Flaccid Myelitis', 'Acute Liver Failure', 'Acute Lymphocytic Leukemia', 'Acute Myelogenous Leukemia', 'Acute Sinusitis', 'Addisons Disease', 'Adenomyosis', 'Adhd', 'Adjustment Disorders', 'Adrenal Cancer', '

Map: 100%|██████████| 100/100 [00:00<00:00, 1499.79 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1653.09 examples/s]

Dataset preparation complete!





In [85]:
# 5️⃣ Model Setup with LoRA
print("Loading model...")

# Create label mappings
id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: i for i, label in enumerate(label_encoder.classes_)}

# Load model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained(
    hf_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    torch_dtype=torch.float32  # Use float32 for CPU
)

print("Setting up LoRA configuration...")
# LoRA Configuration for sequence classification
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=peft_r,
    lora_alpha=peft_alpha,
    lora_dropout=peft_dropout,
    target_modules=peft_target_modules,
    bias="none",
    modules_to_save=["classifier"]  # Save the classification head
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

# Move model to CPU (explicitly)
model = model.to(device)
print("Model setup complete!")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model...
Setting up LoRA configuration...
trainable params: 1717562 || all params: 69503092 || trainable%: 2.47
Model setup complete!


In [86]:
# 6️⃣ Training Setup
# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall, and F1 score with debugging"""
    print("*** compute_metrics called ***")  # Debug statement
    predictions, labels = eval_pred
    print(f"Predictions shape: {predictions.shape}")  # Debug statement
    print(f"Labels shape: {labels.shape}")  # Debug statement
    
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    
    metrics = {
        'eval_accuracy': accuracy,
        'eval_f1': f1,
        'eval_precision': precision,
        'eval_recall': recall
    }
    
    print(f"Computed metrics: {metrics}")  # Debug statement
    return metrics


# Label names
label_names = list(label_encoder.classes_)

# Training arguments optimized for CPU
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_dir=f'{output_dir}/logs',
    logging_steps=logging_steps,
    eval_strategy=eval_strategy,
    save_strategy="epoch",
    load_best_model_at_end=False,
    # metric_for_best_model="loss",
    # greater_is_better=True,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    # CPU-specific settings
    fp16=fp16,
    bf16=bf16,
    dataloader_num_workers=0,  # Set to 0 for CPU to avoid multiprocessing issues
    dataloader_pin_memory=False,
    report_to="none",
    remove_unused_columns=False,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    label_names=label_names,
)

print("Training arguments configured for CPU training!")

Training arguments configured for CPU training!


In [87]:
# 7️⃣ Training
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Starting training...")
print("Note: CPU training will be slower than GPU training but requires no special hardware")

# Training
training_result = trainer.train()

print("Training completed!")
print(f"Training results: {training_result}")

Starting training...
Note: CPU training will be slower than GPU training but requires no special hardware


Step,Training Loss,Validation Loss
1,6.9984,No log
2,6.9951,No log
3,6.9952,No log
4,1.765,No log


Training completed!
Training results: TrainOutput(global_step=4, training_loss=5.688447266817093, metrics={'train_runtime': 374.4434, 'train_samples_per_second': 0.267, 'train_steps_per_second': 0.011, 'total_flos': 13919905104000.0, 'train_loss': 5.688447266817093, 'epoch': 1.0})


In [88]:
from torch.utils.data import DataLoader
eval_loader = DataLoader(tokenized_dataset['validation'], batch_size=8)
all_preds, all_labels = [], []
model.eval()
for batch in eval_loader:
    inputs = {k: v.to(device) for k,v in batch.items() if k!='labels'}
    with torch.no_grad():
        outputs = model(**inputs)
    batch_preds = outputs.logits.argmax(dim=-1).cpu().numpy()
    all_preds.extend(batch_preds)
    all_labels.extend(batch['labels'].numpy())
from sklearn.metrics import classification_report
print(classification_report(all_labels, all_preds))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           7       0.00      0.00      0.00       3.0
          10       0.00      0.00      0.00       0.0
          14       0.00      0.00      0.00       1.0
          33       0.00      0.00      0.00       2.0
          35       0.00      0.00      0.00       2.0
          72       0.00      0.00      0.00       4.0
          73       0.00      0.00      0.00       1.0
         139       0.00      0.00      0.00       1.0
         169       0.00      0.00      0.00       0.0
         186       0.00      0.00      0.00       1.0
         197       0.00      0.00      0.00       0.0
         207       0.00      0.00      0.00       2.0
         273       0.00      0.00      0.00       1.0
         275       0.00      0.00      0.00       2.0
         284       0.00      0.00      0.00       1.0
         285       0.00      0.00      0.00       3.0
         297       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [89]:
# 8️⃣ Model Saving and Evaluation
# Save the model
# print("Saving model...")
# trainer.save_model()
# tokenizer.save_pretrained(output_dir)

# Save label encoder for future use
import joblib
joblib.dump(label_encoder, f"{output_dir}/label_encoder.pkl")

# Final evaluation
print("Final evaluation...")
eval_results = trainer.evaluate()
print(f"Final evaluation results: {eval_results}")

# Save evaluation results
with open(f"{output_dir}/eval_results.json", "w") as f:
    json.dump(eval_results, f, indent=2)

print("Model and results saved successfully!")

Final evaluation...


Final evaluation results: {'eval_runtime': 69.3505, 'eval_samples_per_second': 1.442, 'eval_steps_per_second': 0.187, 'epoch': 1.0}
Model and results saved successfully!


In [90]:
# 9️⃣ Inference Function
def predict_diagnosis(symptoms_text, model, tokenizer, label_encoder, device):
    """Predict diagnosis from symptoms text"""
    # Tokenize input
    inputs = tokenizer(
        symptoms_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_seq_length
    ).to(device)
    
    # Set model to evaluation mode
    model.eval()
    
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class_id = probabilities.argmax().item()
        confidence = probabilities.max().item()
    
    # Decode prediction
    predicted_diagnosis = label_encoder.inverse_transform([predicted_class_id])[0]
    return predicted_diagnosis, confidence

# Example usage
sample_symptoms = "I have been having a lot of pain in my neck and back. I've also been having trouble with my balance and coordination."
diagnosis, confidence = predict_diagnosis(sample_symptoms, model, tokenizer, label_encoder, device)

print(f"\nSample prediction:")
print(f"Symptoms: {sample_symptoms}")
print(f"Predicted diagnosis: {diagnosis}")
print(f"Confidence: {confidence:.4f}")

AttributeError: 'list' object has no attribute 'shape'

In [None]:
# 🔟 Model Loading Function (for future use)
def load_fine_tuned_model(model_path):
    """Load the fine-tuned LoRA model for inference"""
    from peft import PeftModel
    import joblib
    
    # Load label encoder
    label_encoder = joblib.load(f"{model_path}/label_encoder.pkl")
    num_labels = len(label_encoder.classes_)
    
    # Create label mappings
    id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
    label2id = {label: i for i, label in enumerate(label_encoder.classes_)}
    
    # Load base model
    base_model = DistilBertForSequenceClassification.from_pretrained(
        hf_checkpoint,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
        torch_dtype=torch.float32
    )
    
    # Load the LoRA adapter
    model = PeftModel.from_pretrained(base_model, model_path)
    tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    
    return model, tokenizer, label_encoder

print("Model loading function defined. Use this to load your trained model in the future.")

In [40]:
print(id2label)

{0: 'Alcoholic Hepatitis', 1: 'Allergy', 2: 'Bronchial Asthma', 3: 'Cervical Spondylosis', 4: 'Dengue', 5: 'Dimorphic Hemmorhoids(Piles)', 6: 'Drug Reaction', 7: 'Gerd', 8: 'Heart Attack', 9: 'Hepatitis C', 10: 'Hyperthyroidism', 11: 'Jaundice', 12: 'Malaria', 13: 'Osteoarthristis', 14: 'Paralysis (Brain Hemorrhage)', 15: 'Posterior Cortical Atrophy', 16: 'Premature Ovarian Failure', 17: 'Temporal Lobe Seizure', 18: 'Tuberculosis', 19: 'Urinary Tract Infection'}
