In [None]:
# Core imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# ML imports
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.utils import resample
from sklearn.metrics import ConfusionMatrixDisplay
import joblib

# Import generator class
import sys
sys.path.append('../src')
from generator import NigerianDiseaseGenerator

## Data Generation

In [2]:
# Instantiate the synthetic-patient generator with random_seed=42
gen = NigerianDiseaseGenerator(random_seed=42)

# Draw 10,000 patients with balance_diseases switched off
df = gen.generate_dataset(
    10_000,
    balance_diseases=False,
)
print("Generated NATURAL dataset")

Generated NATURAL dataset


## Data Exploration

In [3]:
# Preview the top five generated patient rows
df.head(n=5)

Unnamed: 0,patient_id,age_band,gender,setting,region,season,diagnosis,fever,headache,cough,...,dizziness,confusion,rash,maculopapular_rash,rose_spots,conjunctivitis,lymph_nodes,recurrent_infections,oral_thrush,generation_timestamp
0,PT_007854,25-44,male,urban,south,rainy,diabetes,0,1,0,...,0,0,0,0,0,0,0,1,0,2025-09-07 21:04:41.213933
1,PT_003863,25-44,female,rural,north,rainy,typhoid,1,1,0,...,0,0,0,0,1,0,0,0,0,2025-09-07 21:04:41.213933
2,PT_008848,0-4,female,rural,north,transition,measles,1,0,1,...,0,0,0,1,0,1,0,0,0,2025-09-07 21:04:41.213933
3,PT_008208,15-24,male,rural,middle_belt,transition,gastroenteritis,1,0,0,...,0,0,0,0,0,0,0,0,0,2025-09-07 21:04:41.213933
4,PT_004167,25-44,female,rural,south,dry,malaria,1,1,0,...,0,1,0,0,0,1,0,0,0,2025-09-07 21:04:41.213933


In [4]:
# List every column name; the feature/target split further down selects columns by these names
df.columns

Index(['patient_id', 'age_band', 'gender', 'setting', 'region', 'season',
       'diagnosis', 'fever', 'headache', 'cough', 'chronic_cough',
       'productive_cough', 'fatigue', 'body_ache', 'chills', 'sweats',
       'night_sweats', 'weight_loss', 'loss_of_appetite', 'nausea', 'vomiting',
       'diarrhea', 'constipation', 'abdominal_pain', 'epigastric_pain',
       'heartburn', 'hunger_pain', 'sore_throat', 'runny_nose', 'chest_pain',
       'shortness_of_breath', 'rapid_breathing', 'hemoptysis', 'dysuria',
       'polyuria', 'oliguria', 'polydipsia', 'polyphagia', 'blurred_vision',
       'dizziness', 'confusion', 'rash', 'maculopapular_rash', 'rose_spots',
       'conjunctivitis', 'lymph_nodes', 'recurrent_infections', 'oral_thrush',
       'generation_timestamp'],
      dtype='object')

In [5]:
# Report the table size as (rows, columns) before any columns are dropped
df.shape

(10000, 49)

In [32]:
# Print the generator's dataset summary, including the symptom correlation section
gen.get_disease_statistics(
    df,
    pretty=True,
    include_symptom_correlations=True,
)

# Write the generated rows to disk (no index column) for reproducibility
df.to_csv("synthetic_patients.csv", index=False)


         ENHANCED NIGERIAN DISEASE DATASET SUMMARY
Total patients: 10,000

healthy        :   1802 patients (18.02%)
malaria        :   1664 patients (16.64%)
gastroenteritis:   1088 patients (10.88%)
typhoid        :   1014 patients (10.14%)
pneumonia_ari  :    967 patients ( 9.67%)
measles        :    851 patients ( 8.51%)
tuberculosis   :    754 patients ( 7.54%)
peptic_ulcer   :    674 patients ( 6.74%)
hiv            :    414 patients ( 4.14%)
diabetes       :    392 patients ( 3.92%)
hypertension   :    380 patients (  3.8%)


DIABETES:
  polyuria            :  70.2% (vs   2.2% others, 32.0x specific)
  polydipsia          :  82.1% (vs   1.9% others, 42.6x specific)
  polyphagia          :  74.7% (vs   1.7% others, 43.0x specific)
--------------------
TYPHOID:
  headache            :  86.3% (vs  40.4% others,  2.1x specific)
--------------------
MEASLES:
  cough               :  95.5% (vs  19.0% others,  5.0x specific)
  runny_nose          :  95.7% (vs   5.9% others, 16.2x spec

## Data Preprocessing

In [7]:
# Target: the 'diagnosis' column
y = df['diagnosis']

# Features: every remaining column except the patient id and the generation timestamp
X = df.drop(['diagnosis', 'patient_id', 'generation_timestamp'], axis='columns')

In [8]:
# Non-binary descriptive features that need one-hot encoding
categorical_cols = ['age_band', 'gender', 'setting', 'region', 'season']

# Every other feature column, in its original order, is treated as a binary symptom flag
symptom_cols = X.columns[~X.columns.isin(categorical_cols)].tolist()

In [9]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the class mix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
# Build the disease-name -> integer mapping from the labels of both splits
le = LabelEncoder().fit(pd.concat([y_train, y_test]))

# Encode each split's target with that mapping
y_train_encoded, y_test_encoded = (le.transform(part) for part in (y_train, y_test))

# Keep the original class names (in encoder order) for reporting
class_names = le.classes_

## Model Training & Evaluation

In [11]:
# --- Main helper function for training and evaluation ---
def _print_class_balance(y_train, sample_weights, label_encoder=None):
    """Print, for each class, its training count and the sum of its sample weights."""
    labels = np.asarray(y_train)
    weights = np.asarray(sample_weights)

    # Show disease names rather than encoded integers when an encoder is available
    if label_encoder is not None:
        labels = np.asarray(label_encoder.inverse_transform(labels))

    counts = Counter(labels)

    print("\n[Info] Class balance overview (per disease):")
    print("  Disease            Original Count   Effective Weight")
    for disease, orig_count in counts.items():
        eff_weight = weights[labels == disease].sum()
        print(f"  {disease:18} {orig_count:>7}           {eff_weight:.1f}")


def train_and_evaluate(
    model,
    X_train, y_train,
    X_test, y_test,
    class_names=None,
    calibration_class_index=None,
    balance=False,
    label_encoder=None
):
    """
    Fit `model` on the training data, predict on the test data and print
    accuracy plus a per-class classification report.

    The model is fitted in place; nothing is returned.

    Args:
        model: Estimator or Pipeline exposing `fit` and `predict`.
        X_train: Training features.
        y_train: Training target labels (can be encoded).
        X_test: Testing features.
        y_test: Testing target labels (can be encoded).
        class_names: Display names for the report rows. When `label_encoder`
            is also given they must be in the same order as
            `label_encoder.classes_`.
        calibration_class_index: Currently not used by this function; kept in
            the signature so existing callers keep working.
        balance: If True, fit with "balanced" per-sample weights computed from
            `y_train` and print a per-class count/weight overview first.
        label_encoder: Fitted LabelEncoder used to decode y_train/y_test and
            the predictions back to class names for reporting.
    """
    fit_params = {}

    if balance:
        # Weight each sample inversely to its class frequency in the training data
        sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)
        _print_class_balance(y_train, sample_weights, label_encoder)

        # A Pipeline needs the weight routed to its final step as "<step>__sample_weight"
        if hasattr(model, "steps"):
            final_step_name = model.steps[-1][0]
            fit_params[f"{final_step_name}__sample_weight"] = sample_weights
        else:
            fit_params["sample_weight"] = sample_weights

    # Train the model using the training data and calculated fit parameters
    model.fit(X_train, y_train, **fit_params)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Decode the predicted and true labels back to original class names if a label encoder was used
    report_labels = None
    if label_encoder is not None:
        y_test_decoded = label_encoder.inverse_transform(y_test)
        y_pred_decoded = label_encoder.inverse_transform(y_pred)
        if class_names is not None:
            # Pin the full label set: without `labels`, classification_report
            # raises when a class is absent from y_test/y_pred because the
            # inferred labels no longer match the length of target_names.
            report_labels = list(label_encoder.classes_)
    else:
        y_test_decoded = y_test
        y_pred_decoded = y_pred

    # Calculate and print the accuracy score
    acc = accuracy_score(y_test_decoded, y_pred_decoded)

    print(f"\nAccuracy: {acc:.3f}\n")
    print("Classification Report:")
    print(classification_report(
        y_test_decoded,
        y_pred_decoded,
        labels=report_labels,
        target_names=class_names,
    ))

In [12]:
# Define the preprocessing steps for the categorical and symptom columns
def make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    Pipeline.fit fits the step objects it was given, so every pipeline needs
    its own transformer instance; sharing one object means fitting one
    pipeline silently re-fits the transformer inside the other.
    """
    return ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),  # One-hot encode categorical columns
            ('sym', 'passthrough', symptom_cols)  # Keep symptom columns as they are
        ],
        remainder='passthrough'
    )


# Standalone instance kept under the original name for any code that references it
preprocessor = make_preprocessor()

# Create a pipeline for the Random Forest model (own preprocessor instance)
rf_pipeline = Pipeline(steps=[
    ("preprocessor", make_preprocessor()),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=42))
])

# Create a pipeline for the XGBoost model (own preprocessor instance)
xgb_pipeline = Pipeline(steps=[
    ("preprocessor", make_preprocessor()),
    ("classifier", XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        random_state=42,
        use_label_encoder=False,
        eval_metric='mlogloss'  # Evaluation metric for multi-class classification
    ))
])

In [13]:
# --- Train and evaluate both pipelines with identical settings ---
# Only the banner and the pipeline differ between the two runs.
for _banner, _pipeline in (
    ("\n========== RANDOM FOREST ==========\n", rf_pipeline),
    ("\n========== XGBOOST ==========\n", xgb_pipeline),
):
    print(_banner)
    train_and_evaluate(
        _pipeline,
        X_train, y_train_encoded,
        X_test, y_test_encoded,
        class_names=le.classes_,
        calibration_class_index=0,
        balance=True,                  # Apply class balancing
        label_encoder=le,
    )




[Info] Class balance overview (per disease):
  Disease            Original Count   Effective Weight
  hiv                    331           727.3
  pneumonia_ari          774           727.3
  typhoid                811           727.3
  diabetes               314           727.3
  malaria               1331           727.3
  tuberculosis           603           727.3
  gastroenteritis        870           727.3
  healthy               1442           727.3
  measles                681           727.3
  hypertension           304           727.3
  peptic_ulcer           539           727.3

Accuracy: 0.944

Classification Report:
                 precision    recall  f1-score   support

       diabetes       0.97      0.97      0.97        78
gastroenteritis       0.95      0.92      0.93       218
        healthy       0.96      0.98      0.97       360
            hiv       0.98      0.96      0.97        83
   hypertension       0.88      0.84      0.86        76
        malaria   

In [14]:
# Persist each trained pipeline (preprocessing + classifier) to its own file
for _path, _fitted in (
    ("random_forest_pipeline.pkl", rf_pipeline),
    ("xgboost_pipeline.pkl", xgb_pipeline),
):
    joblib.dump(_fitted, _path)

print("Models saved successfully!")

Models saved successfully!


## Bootstrap Confidence Intervals for Accuracy

In [15]:
# Compute 95% confidence interval for accuracy
def bootstrap_accuracy(model, X, y, n_bootstrap=1000, random_state=42):
    """Compute the mean and a 95% percentile CI for accuracy via bootstrapping.

    The model predicts on `X` exactly once. Each bootstrap replicate then
    resamples the per-row "prediction was correct" flags with replacement,
    instead of re-running inference for every replicate.

    Args:
        model: Fitted estimator or pipeline exposing `predict(X)`.
        X: Evaluation features, one row per entry in `y`.
        y: True labels as a 1-D sequence/array, in the same encoding that
            `model.predict` returns.
        n_bootstrap: Number of bootstrap replicates (must be >= 1).
        random_state: Seed passed to `np.random.RandomState`.

    Returns:
        (mean_acc, (lower, upper)): the mean replicate accuracy and the
        2.5th / 97.5th percentiles of the replicate accuracies.

    Raises:
        ValueError: if `n_bootstrap < 1`, `y` is empty or not 1-D, or the
            predictions do not have the same shape as `y`.
    """
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")

    y_true = np.asarray(y)
    if y_true.ndim != 1:
        raise ValueError("y must be a 1-D array of labels")
    n_samples = y_true.shape[0]
    if n_samples == 0:
        raise ValueError("Cannot bootstrap an empty evaluation set")

    # Predict once for all rows rather than once per replicate
    y_hat = np.asarray(model.predict(X))
    if y_hat.shape != y_true.shape:
        raise ValueError(
            f"model.predict returned shape {y_hat.shape}, expected {y_true.shape}"
        )
    is_correct = (y_hat == y_true)

    rng = np.random.RandomState(random_state)
    acc_scores = np.empty(n_bootstrap)
    for i in range(n_bootstrap):
        # Draw n_samples row indices with replacement from the shared RNG
        idx = rng.randint(0, n_samples, size=n_samples)
        acc_scores[i] = is_correct[idx].mean()

    # Calculate the 2.5 and 97.5 percentiles for the confidence interval
    lower = np.percentile(acc_scores, 2.5)
    upper = np.percentile(acc_scores, 97.5)
    # Calculate the mean accuracy
    mean_acc = np.mean(acc_scores)

    print(f"Bootstrap Accuracy: {mean_acc:.3f} (95% CI: {lower:.3f} - {upper:.3f})")

    return mean_acc, (lower, upper)

# --- Compute and Print Bootstrapped Accuracy for Random Forest ---
# Results are kept as (mean_acc, (lower, upper)) so both models' intervals stay
# available, and the cell no longer echoes only the last tuple as a stray repr.
print("=== Random Forest Bootstrapped Accuracy ===")
rf_bootstrap = bootstrap_accuracy(rf_pipeline, X_test, y_test_encoded)

# --- Compute and Print Bootstrapped Accuracy for XGBoost ---
print("=== XGBoost Bootstrapped Accuracy ===")
xgb_bootstrap = bootstrap_accuracy(xgb_pipeline, X_test, y_test_encoded)

=== Random Forest Bootstrapped Accuracy ===
Bootstrap Accuracy: 0.943 (95% CI: 0.933 - 0.954)
=== XGBoost Bootstrapped Accuracy ===
Bootstrap Accuracy: 0.937 (95% CI: 0.925 - 0.947)


(np.float64(0.936538), (np.float64(0.925), np.float64(0.947)))