In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import mode as stats_mode
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import os
import pickle
import time
import warnings

# Notebook-wide setup: silence every warning and use the ggplot look for figures
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

# Make sure the folders that receive figures and pickled models exist
for _out_dir in ('plots', 'models'):
    os.makedirs(_out_dir, exist_ok=True)

def load_dataset(csv_path='Training.csv'):
    """Read the training CSV and split it into features and encoded labels.

    Columns whose name starts with ``Unnamed`` are dropped. The label column
    is ``prognosis`` when present, otherwise the last remaining column. Every
    other column is coerced to numeric (unparseable cells become 0) and cast
    to ``int``; label strings are stripped of surrounding whitespace.

    Returns:
        A 7-tuple ``(X, y_str, y, encoder, symptoms, symptom_index, df)``:
        the feature frame, the label strings, the integer-encoded labels, the
        fitted ``LabelEncoder``, the list of feature column names, a mapping
        from feature name to its column position, and the cleaned frame.
    """
    frame = pd.read_csv(csv_path)
    keep_mask = ~frame.columns.str.contains('^Unnamed')
    frame = frame.loc[:, keep_mask]

    if 'prognosis' in frame.columns:
        label_col = 'prognosis'
    else:
        label_col = frame.columns[-1]
    symptoms = [name for name in frame.columns if name != label_col]

    # Coerce feature cells to integers; anything non-numeric is treated as 0
    numeric = frame[symptoms].apply(pd.to_numeric, errors='coerce')
    frame[symptoms] = numeric.fillna(0).astype(int)
    frame[label_col] = frame[label_col].astype(str).str.strip()

    X = frame[symptoms]
    y_str = frame[label_col]
    encoder = LabelEncoder()
    y = encoder.fit_transform(y_str)

    symptom_index = dict(zip(symptoms, range(len(symptoms))))
    return X, y_str, y, encoder, symptoms, symptom_index, frame

def analyze_dataset(df):
    """Print a short summary of the dataset and return its frequency tables.

    The label column is ``prognosis`` when present, otherwise the last
    column. Prints the frame's shape, the ten most frequent labels and the
    ten feature columns with the largest column sums.

    Returns:
        ``(disease_counts, symptom_counts)``: the label value counts and the
        per-feature column sums sorted in descending order.
    """
    print("\nDATASET ANALYSIS:")
    print(f"Shape: {df.shape}")

    if 'prognosis' in df.columns:
        label_col = 'prognosis'
    else:
        label_col = df.columns[-1]

    disease_counts = df[label_col].value_counts()
    print("\nDisease distribution:")
    for disease, n_samples in disease_counts.head(10).items():
        print(f"  {disease}: {n_samples} samples")
    n_hidden = len(disease_counts) - 10
    if n_hidden > 0:
        print(f"  ... and {n_hidden} more diseases.")

    feature_totals = df.drop(columns=[label_col]).sum()
    symptom_counts = feature_totals.sort_values(ascending=False)
    print("\nTop 10 most common symptoms:")
    for symptom, n_hits in symptom_counts.head(10).items():
        print(f"  {symptom}: {n_hits} occurrences")

    return disease_counts, symptom_counts

def visualize_data(disease_counts, symptom_counts, max_bars=20):
    """Save bar charts of the leading disease and symptom counts.

    At most ``max_bars`` entries of each series are drawn, in the order the
    series already has. The figures are written to
    ``plots/disease_distribution.png`` and ``plots/symptom_frequencies.png``.
    """

    def _save_bar_chart(tick_labels, heights, title, out_path):
        # One figure per chart; closed afterwards so figures do not pile up
        plt.figure(figsize=(12, 6))
        shown_labels = tick_labels[:max_bars]
        shown_heights = heights[:max_bars]
        positions = range(len(shown_labels))
        plt.bar(positions, shown_heights)
        plt.xticks(positions, shown_labels, rotation=90)
        plt.title(title)
        plt.tight_layout()
        plt.savefig(out_path)
        plt.close()

    _save_bar_chart(
        disease_counts.index.astype(str),
        disease_counts.values,
        'Disease Distribution',
        'plots/disease_distribution.png',
    )
    _save_bar_chart(
        symptom_counts.index,
        symptom_counts.values,
        'Top Symptom Frequencies',
        'plots/symptom_frequencies.png',
    )

def handle_class_imbalance(X, y):
    """Balance the classes by random oversampling and report the counts.

    When ``y`` holds at most one distinct class the inputs are returned
    untouched. Otherwise the per-class counts are printed before and after
    resampling with ``RandomOverSampler(random_state=42)``.

    Returns:
        ``(X_res, y_res)``, the resampled features and labels (or the
        original ``X`` and ``y`` when resampling is skipped).
    """

    def _print_distribution(header, class_ids, tallies):
        print(header)
        for class_id, tally in zip(class_ids, tallies):
            print(f"  Class {class_id}: {tally}")

    class_ids, tallies = np.unique(y, return_counts=True)
    if len(class_ids) < 2:
        print("\nOnly one class present; skipping oversampling.")
        return X, y

    _print_distribution("\nClass distribution before resampling:", class_ids, tallies)

    sampler = RandomOverSampler(random_state=42)
    X_res, y_res = sampler.fit_resample(X, y)

    ids_after, tallies_after = np.unique(y_res, return_counts=True)
    _print_distribution("\nClass distribution after resampling:", ids_after, tallies_after)
    return X_res, y_res

def add_noise(X, noise_level=0.15):
    """Return a copy of ``X`` with randomly flipped entries.

    This is used to intentionally reduce model accuracy. Each cell is
    flipped independently with probability ``noise_level`` by adding a
    Bernoulli draw and reducing modulo 2, so the result only contains 0/1
    values (this presumes the features are binary indicators -- confirm
    against the dataset).

    Args:
        X: DataFrame of features.
        noise_level: Probability in ``[0, 1]`` that a given cell is flipped.

    Returns:
        A new DataFrame with the same columns and row index as ``X``.
    """
    # A private generator seeded with 42 yields the same draws as seeding the
    # global NumPy RNG would, without clobbering global random state that
    # other code may rely on.
    rng = np.random.RandomState(42)
    flips = rng.binomial(1, noise_level, X.shape)
    X_noisy = (X.values + flips) % 2  # Flip the selected bits
    # Keep the original row index so rows stay aligned with their labels.
    return pd.DataFrame(X_noisy, columns=X.columns, index=X.index)

def train_models(X, y, encoder):
    """Train SVM, Naive Bayes and Random Forest on deliberately degraded data.

    The data is split 80/20, the training features are corrupted with
    ``add_noise`` and the models use intentionally weak hyperparameters.
    Each fitted model is pickled to ``models/<name>_model.pkl``. The
    per-model and majority-vote accuracies on the clean held-out split are
    printed, and a Random Forest confusion matrix is saved to
    ``plots/rf_confusion_matrix.png`` together with a printed
    classification report.

    Args:
        X: Feature DataFrame.
        y: Integer-encoded labels. Assumed to be the codes produced by
            ``encoder`` (0 .. n_classes-1) -- confirm against the caller.
        encoder: Fitted label encoder; its ``classes_`` supply display names.

    Returns:
        Dict mapping ``'svm'``, ``'nb'`` and ``'rf'`` to the fitted models.
    """
    # Standard 80/20 hold-out split; the test part stays noise-free
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Add noise to training data to reduce accuracy
    X_train_noisy = add_noise(X_train, noise_level=0.15)
    # Use intentionally suboptimal hyperparameters
    models = {
        'svm': SVC(kernel='poly', degree=2, C=0.5, probability=True, random_state=42),  # Lower C, poly kernel
        'nb': GaussianNB(var_smoothing=1e-1),  # Higher smoothing
        'rf': RandomForestClassifier(n_estimators=20, max_depth=3, random_state=42)  # Fewer trees, shallow depth
    }
    trained = {}
    test_preds = {}  # predictions on X_test, computed once per model and reused below
    for name, model in models.items():
        print(f"\nTraining {name.upper()} (reduced accuracy)...")
        start = time.time()
        model.fit(X_train_noisy, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds) * 100
        print(f"{name.upper()} Accuracy: {acc:.2f}% (Time: {time.time() - start:.2f}s)")
        # Context manager guarantees the pickle file is flushed and closed
        with open(f"models/{name}_model.pkl", 'wb') as fh:
            pickle.dump(model, fh)
        trained[name] = model
        test_preds[name] = preds
    # Ensemble via majority vote over the three per-sample predictions
    votes = zip(test_preds['svm'], test_preds['nb'], test_preds['rf'])
    ensemble_preds = [stats_mode(vote) for vote in votes]
    ensemble_acc = accuracy_score(y_test, ensemble_preds) * 100
    print(f"\nEnsemble Accuracy: {ensemble_acc:.2f}%")
    # Confusion matrix for Random Forest
    labels = encoder.classes_
    # Passing every class id explicitly keeps the matrix and the report
    # aligned with the label names even if the test split lacks a class.
    class_ids = np.arange(len(labels))
    rf_preds = test_preds['rf']
    cm = confusion_matrix(y_test, rf_preds, labels=class_ids)
    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', aspect='auto')
    plt.colorbar()
    plt.xticks(class_ids, labels, rotation=90)
    plt.yticks(class_ids, labels)
    plt.title('Random Forest Confusion Matrix')
    plt.tight_layout()
    plt.savefig('plots/rf_confusion_matrix.png')
    plt.close()
    print("\nRandom Forest Classification Report:")
    print(classification_report(y_test, rf_preds, labels=class_ids, target_names=labels))
    return trained

def predict_disease(input_symptoms, symptoms, symptom_index, trained, encoder):
    """Predict a disease from a list of symptom names with every trained model.

    Args:
        input_symptoms: Iterable of symptom names; names missing from
            ``symptom_index`` are ignored.
        symptoms: Ordered list of all feature (symptom) names.
        symptom_index: Mapping from symptom name to its position in
            ``symptoms``.
        trained: Dict of fitted models keyed by short name.
        encoder: Fitted label encoder used to turn class codes into names.

    Returns:
        Dict with ``"<NAME> Pred"`` for each model, ``"<NAME> Probs"`` (top
        three ``(class, probability)`` pairs) for models exposing
        ``predict_proba``, and ``"ENSEMBLE Pred"`` holding the most common
        prediction. An empty dict when ``trained`` is empty.
    """
    # Nothing to predict with: bail out before doing any work
    if not trained:
        print("No models trained. Ensure the dataset contains multiple disease classes.")
        return {}
    vec = [0] * len(symptoms)
    for symptom in input_symptoms:
        idx = symptom_index.get(symptom)
        if idx is not None:
            vec[idx] = 1
    # Single-row frame carrying the feature names, so estimators fitted on a
    # DataFrame receive input in the same named form.
    X_input = pd.DataFrame([vec], columns=list(symptoms))
    results = {}
    for name, model in trained.items():
        pred = model.predict(X_input)[0]
        results[f"{name.upper()} Pred"] = encoder.inverse_transform([pred])[0]
        if hasattr(model, 'predict_proba'):
            probs = model.predict_proba(X_input)[0]
            # Probability columns follow the model's own classes_, which can
            # be a subset of the encoder's classes; map those codes to names.
            class_names = encoder.inverse_transform(model.classes_)
            top3 = sorted(zip(class_names, probs), key=lambda x: x[1], reverse=True)[:3]
            results[f"{name.upper()} Probs"] = top3
    ensemble = stats_mode([results[f"{n.upper()} Pred"] for n in trained])
    results['ENSEMBLE Pred'] = ensemble
    return results

def _read_labelled_csv(csv_path):
    """Read a symptom CSV, drop 'Unnamed' columns and clean the label column.

    Returns ``(frame, target_col)`` where ``target_col`` is ``prognosis`` when
    present, otherwise the last remaining column; its values are converted to
    stripped strings.
    """
    frame = pd.read_csv(csv_path)
    frame = frame.loc[:, ~frame.columns.str.contains('^Unnamed')].copy()
    target_col = 'prognosis' if 'prognosis' in frame.columns else frame.columns[-1]
    frame[target_col] = frame[target_col].astype(str).str.strip()
    return frame, target_col


def evaluate_on_test_data(test_csv_path='Testing.csv', train_csv_path='Training.csv'):
    """Evaluate the pickled models and their majority vote on a test CSV.

    Loads ``models/svm_model.pkl``, ``models/nb_model.pkl`` and
    ``models/rf_model.pkl``, encodes the test labels with an encoder refitted
    on the training labels, then prints accuracy and a classification report
    per model and for the ensemble. Confusion matrices and an accuracy
    comparison chart are saved under ``plots/``.

    Args:
        test_csv_path: CSV holding the test samples.
        train_csv_path: CSV whose label column defines the class encoding.

    Returns:
        Dict keyed by ``'SVM'``, ``'Naive Bayes'``, ``'Random Forest'`` and
        ``'Ensemble'``; each value holds ``accuracy``, ``predictions`` and
        ``true_labels``. ``None`` if the models or the data cannot be loaded.
    """
    print("\nEVALUATING MODELS ON TEST DATA")
    model_paths = {
        'SVM': "models/svm_model.pkl",
        'Naive Bayes': "models/nb_model.pkl",
        'Random Forest': "models/rf_model.pkl",
    }
    models = {}
    try:
        for name, path in model_paths.items():
            # NOTE: unpickling executes arbitrary code; only load model files
            # that were written by this notebook's own training step.
            with open(path, 'rb') as fh:
                models[name] = pickle.load(fh)
    except FileNotFoundError:
        print("Error: Trained model files not found. Please train models first.")
        return None
    # Tracks which file is being read so a missing file is reported correctly
    reading = ('Test', test_csv_path)
    try:
        test_df, target_col = _read_labelled_csv(test_csv_path)
        feature_cols = [col for col in test_df.columns if col != target_col]
        test_df[feature_cols] = test_df[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
        X_test = test_df[feature_cols]
        y_test_str = test_df[target_col]
        # Refit the encoder on the training labels so the class codes match
        # the ones the models were trained with.
        reading = ('Training', train_csv_path)
        train_df, train_target_col = _read_labelled_csv(train_csv_path)
        encoder = LabelEncoder()
        encoder.fit(train_df[train_target_col])
        y_test = encoder.transform(y_test_str)
        print(f"Test dataset shape: {test_df.shape}")
        print(f"Number of test samples: {len(X_test)}")
        print(f"Number of unique diseases in test data: {len(np.unique(y_test))}")
    except FileNotFoundError:
        print(f"Error: {reading[0]} file {reading[1]} not found.")
        return None
    except Exception as e:
        print(f"Error processing test data: {str(e)}")
        return None
    class_names = encoder.classes_
    # Explicit class ids keep reports and matrices aligned with class_names
    # even when the test file does not contain every class.
    class_ids = np.arange(len(class_names))
    results = {}
    for name, model in models.items():
        print(f"\nEvaluating {name}...")
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred) * 100
        results[name] = {
            'accuracy': acc,
            'predictions': y_pred,
            'true_labels': y_test
        }
        print(f"{name} Test Accuracy: {acc:.2f}%")
        print(f"\n{name} Classification Report:")
        print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))
        cm = confusion_matrix(y_test, y_pred, labels=class_ids)
        plt.figure(figsize=(12, 10))
        plt.imshow(cm, interpolation='nearest', aspect='auto')
        plt.colorbar()
        plt.xticks(class_ids, class_names, rotation=90)
        plt.yticks(class_ids, class_names)
        plt.title(f'{name} Confusion Matrix on Test Data')
        plt.tight_layout()
        plt.savefig(f'plots/{name.lower().replace(" ", "_")}_test_confusion_matrix.png')
        plt.close()
    # One row per sample, one column per model; the row's mode is the vote
    votes = np.column_stack((
        results['SVM']['predictions'],
        results['Naive Bayes']['predictions'],
        results['Random Forest']['predictions']
    ))
    ensemble_preds = np.array([stats_mode(vote) for vote in votes])
    ensemble_acc = accuracy_score(y_test, ensemble_preds) * 100
    results['Ensemble'] = {
        'accuracy': ensemble_acc,
        'predictions': ensemble_preds,
        'true_labels': y_test
    }
    print(f"\nEnsemble Test Accuracy: {ensemble_acc:.2f}%")
    print(f"\nEnsemble Classification Report:")
    print(classification_report(y_test, ensemble_preds, labels=class_ids, target_names=class_names))
    model_names = list(results.keys())
    accuracies = [results[model]['accuracy'] for model in model_names]
    plt.figure(figsize=(10, 6))
    plt.bar(model_names, accuracies)
    plt.ylabel('Accuracy (%)')
    plt.title('Model Performance on Test Data')
    plt.ylim(0, 100)
    for i, acc in enumerate(accuracies):
        plt.text(i, acc + 1, f'{acc:.2f}%', ha='center')
    plt.tight_layout()
    plt.savefig('plots/test_model_comparison.png')
    plt.close()
    return results

def main():
    """Run the whole pipeline: load, analyse, plot, train, predict, evaluate.

    Returns:
        ``(trained_models, test_results)``: the dict of fitted models from
        ``train_models`` and the result of ``evaluate_on_test_data``.
    """
    print("DISEASE PREDICTION SYSTEM")

    # Load the data, print its summary and save the overview charts
    dataset = load_dataset()
    X, y_str, y, encoder, symptoms, symptom_index, df = dataset
    disease_counts, symptom_counts = analyze_dataset(df)
    visualize_data(disease_counts, symptom_counts)

    # Balance the classes, then fit the models
    X_res, y_res = handle_class_imbalance(X, y)
    trained_models = train_models(X_res, y_res, encoder)

    # Demo prediction using the five most frequent symptoms
    example_symptoms = symptom_counts.head(5).index.tolist()
    print(f"\nExample prediction for: {example_symptoms}")
    prediction = predict_disease(example_symptoms, symptoms, symptom_index, trained_models, encoder)
    print(prediction)

    test_results = evaluate_on_test_data('Testing.csv')
    return trained_models, test_results

# Run the pipeline. Left as a bare call (no __main__ guard): this appears to be
# the last expression of a notebook cell, so its return value is echoed below.
main()


DISEASE PREDICTION SYSTEM

DATASET ANALYSIS:
Shape: (4920, 133)

Disease distribution:
  Fungal infection: 120 samples
  Hepatitis C: 120 samples
  Hepatitis E: 120 samples
  Alcoholic hepatitis: 120 samples
  Tuberculosis: 120 samples
  Common Cold: 120 samples
  Pneumonia: 120 samples
  Dimorphic hemmorhoids(piles): 120 samples
  Heart attack: 120 samples
  Varicose veins: 120 samples
  ... and 31 more diseases.

Top 10 most common symptoms:
  fatigue: 1932 occurrences
  vomiting: 1914 occurrences
  high_fever: 1362 occurrences
  loss_of_appetite: 1152 occurrences
  nausea: 1146 occurrences
  headache: 1134 occurrences
  abdominal_pain: 1032 occurrences
  yellowish_skin: 912 occurrences
  yellowing_of_eyes: 816 occurrences
  chills: 798 occurrences

Class distribution before resampling:
  Class 0: 120
  Class 1: 120
  Class 2: 120
  Class 3: 120
  Class 4: 120
  Class 5: 120
  Class 6: 120
  Class 7: 120
  Class 8: 120
  Class 9: 120
  Class 10: 120
  Class 11: 120
  Class 12: 120
  

({'svm': SVC(C=0.5, degree=2, kernel='poly', probability=True, random_state=42),
  'nb': GaussianNB(var_smoothing=0.1),
  'rf': RandomForestClassifier(max_depth=3, n_estimators=20, random_state=42)},
 {'SVM': {'accuracy': 100.0,
   'predictions': array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7, 32, 28, 29,  8,
          11, 37, 40, 19, 20, 21, 22,  3, 36, 10, 34, 13, 18, 39, 26, 24, 25,
          31,  5,  0,  2, 38, 35, 27, 15]),
   'true_labels': array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7, 32, 28, 29,  8,
          11, 37, 40, 19, 20, 21, 22,  3, 36, 10, 34, 13, 18, 39, 26, 24, 25,
          31,  5,  0,  2, 38, 35, 27, 15])},
  'Naive Bayes': {'accuracy': 97.61904761904762,
   'predictions': array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7, 32, 28, 29,  8,
          11, 37, 40, 19, 20, 21, 22,  3, 36, 10, 34, 13, 18, 39, 26, 24, 25,
          31,  5,  0,  2, 38, 35, 27, 14]),
   'true_labels': array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7,