In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import pickle
import os
import time
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including any raised while the models below are being fitted.
warnings.filterwarnings('ignore')
# Output folders for the pickled models and the confusion-matrix plots.
os.makedirs('models', exist_ok=True)
os.makedirs('plots', exist_ok=True)

# 1. Data Loading and Preprocessing
def load_data(train_path='Training.csv', test_path='Testing.csv'):
    """Read the training and testing CSV files and tidy both frames.

    Each frame is cleaned the same way: columns whose name starts with
    ``Unnamed`` are dropped, column names are stripped of surrounding
    whitespace, and the ``prognosis`` labels are cast to ``str``, stripped
    and cleared of commas.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the testing CSV.

    Returns:
        A ``(train, test)`` tuple of cleaned DataFrames.
    """
    def _tidy(frame):
        # Drop the placeholder columns pandas names "Unnamed: N".
        is_placeholder = frame.columns.str.contains('^Unnamed')
        frame = frame.loc[:, ~is_placeholder]
        frame.columns = frame.columns.str.strip()
        labels = frame['prognosis'].astype(str)
        labels = labels.str.strip()
        frame['prognosis'] = labels.str.replace(',', '', regex=False)
        return frame

    raw_train = pd.read_csv(train_path)
    raw_test = pd.read_csv(test_path)
    return _tidy(raw_train), _tidy(raw_test)

# Load both CSVs; every column except the label is treated as a feature.
train, test = load_data('Training.csv', 'Testing.csv')
feature_cols = [col for col in train.columns if col != 'prognosis']
X = train[feature_cols]
y = train['prognosis']
# Index the test frame with the training column list so both share one layout.
X_test = test[feature_cols]
y_test = test['prognosis']

# 2. Encode target
# The encoder is fitted on the training labels only and then reused for the
# test labels so both sets share the same integer codes.
encoder = LabelEncoder()
y_enc = encoder.fit_transform(y)
y_test_enc = encoder.transform(y_test)
# Use a context manager so the pickle file is flushed and closed right away
# instead of leaving the handle open until garbage collection.
with open('models/label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

# 3. Class Balancing
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X, y_enc)

# 4. Model Training with Hyperparameter Tuning
def train_models(X, y):
    """Tune and fit the base classifiers, build a voting ensemble, save all.

    An 80/20 split (``random_state=42``) is taken from ``X``/``y`` and only
    the 80% part is used for fitting; the remainder is not used here.
    Random Forest and SVM are tuned with ``RandomizedSearchCV`` (3-fold CV),
    Gaussian Naive Bayes is fitted with its defaults, and a hard-voting
    ``VotingClassifier`` is fitted on the three estimators. Every model is
    pickled under ``models/``.

    Args:
        X: Feature matrix.
        y: Encoded class labels aligned with ``X``.

    Returns:
        Tuple ``(rf_best, svm_best, nb, ensemble)``.
    """
    def _save(obj, path):
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    # The validation part of the split is never read, so it is not kept.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # Random Forest
    rf_params = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10]
    }
    rf_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), rf_params, n_iter=10, cv=3, n_jobs=-1, random_state=42)
    rf_search.fit(X_train, y_train)
    rf_best = rf_search.best_estimator_
    print(f"Random Forest best params: {rf_search.best_params_}")

    # SVM
    svm_params = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'degree': [2, 3, 4],
        'probability': [True]
    }
    svm_search = RandomizedSearchCV(SVC(), svm_params, n_iter=8, cv=3, n_jobs=-1, random_state=42)
    svm_search.fit(X_train, y_train)
    svm_best = svm_search.best_estimator_
    print(f"SVM best params: {svm_search.best_params_}")

    # Naive Bayes
    nb = GaussianNB()
    nb.fit(X_train, y_train)

    # Save models
    _save(rf_best, 'models/rf_model.pkl')
    _save(svm_best, 'models/svm_model.pkl')
    _save(nb, 'models/nb_model.pkl')

    # Ensemble
    ensemble = VotingClassifier(estimators=[
        ('rf', rf_best),
        ('svm', svm_best),
        ('nb', nb)
    ], voting='hard')
    ensemble.fit(X_train, y_train)
    _save(ensemble, 'models/ensemble_model.pkl')
    return rf_best, svm_best, nb, ensemble

# Fit every model on the over-sampled training data (also writes models/*.pkl).
rf, svm, nb, ensemble = train_models(X_bal, y_bal)

# 5. Evaluation Function
def evaluate_model(model, X, y_true, encoder, name='Model'):
    """Print accuracy and a per-class report, and save a confusion-matrix plot.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix to score.
        y_true: Encoded ground-truth labels for ``X``.
        encoder: Fitted label encoder; ``classes_`` supplies the class names.
        name: Display name, also used to build the plot file name under
            ``plots/``.
    """
    y_pred = model.predict(X)
    acc = accuracy_score(y_true, y_pred)
    print(f"\n{name} Accuracy: {acc*100:.2f}%")
    # Pin the label set to every encoded class. Without it the report raises
    # and the matrix shrinks whenever a class is missing from both y_true and
    # y_pred, because target_names / tick labels no longer match the labels.
    class_ids = np.arange(len(encoder.classes_))
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=encoder.classes_))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(f'{name} Confusion Matrix')
    plt.colorbar()
    plt.xticks(class_ids, encoder.classes_, rotation=90)
    plt.yticks(class_ids, encoder.classes_)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are accounted for.
    plt.tight_layout()
    plt.savefig(f'plots/{name.lower().replace(" ", "_")}_confusion_matrix.png')
    plt.close()

# 6. Evaluate on Test Data
# Report accuracy, per-class metrics and a saved confusion matrix per model.
print("\n--- Test Set Evaluation ---")
evaluate_model(rf, X_test, y_test_enc, encoder, 'Random Forest')
evaluate_model(svm, X_test, y_test_enc, encoder, 'SVM')
evaluate_model(nb, X_test, y_test_enc, encoder, 'Naive Bayes')
evaluate_model(ensemble, X_test, y_test_enc, encoder, 'Ensemble')

# 7. Prediction Function
def predict_disease(symptom_list, feature_cols, model, encoder):
    """Predict a disease name from a list of reported symptom names.

    Args:
        symptom_list: Symptom names; names not present in ``feature_cols``
            are ignored.
        feature_cols: Ordered feature names the model was trained on. Any
            sequence works (list, tuple, pandas Index).
        model: Fitted estimator exposing ``predict``.
        encoder: Fitted label encoder used to decode the predicted class.

    Returns:
        The decoded label of the predicted class.
    """
    # Map each name to its first position once, instead of a linear
    # list.index() scan per symptom (which is also missing on a pandas Index).
    position = {}
    for idx, col in enumerate(feature_cols):
        position.setdefault(col, idx)
    x_input = np.zeros(len(feature_cols))
    for symptom in symptom_list:
        idx = position.get(symptom)
        if idx is not None:
            x_input[idx] = 1
    # Estimators expect a 2-D batch: one row per sample.
    pred = model.predict(x_input.reshape(1, -1))[0]
    return encoder.inverse_transform([pred])[0]

# 8. Example Usage
# Smoke-test the ensemble with a hand-picked list of symptom names.
example_symptoms = ['fatigue', 'vomiting', 'high_fever', 'loss_of_appetite', 'nausea']
print("\nExample prediction for:", example_symptoms)
print("Predicted Disease (Ensemble):", predict_disease(example_symptoms, feature_cols, ensemble, encoder))


Random Forest best params: {'n_estimators': 200, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': None}
SVM best params: {'probability': True, 'kernel': 'rbf', 'degree': 3, 'C': 1}

--- Test Set Evaluation ---

Random Forest Accuracy: 97.62%
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
               

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import pickle
import os
import time
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including any raised while the models below are being fitted.
warnings.filterwarnings('ignore')
# Output folders for the pickled models and the confusion-matrix plots.
os.makedirs('models', exist_ok=True)
os.makedirs('plots', exist_ok=True)

# 1. Data Loading and Preprocessing
def load_data(train_path='Training.csv', test_path='Testing.csv'):
    """Load the train/test CSV pair and normalise both frames.

    For each frame: ``Unnamed*`` columns are removed, column names are
    whitespace-stripped, and the ``prognosis`` column is converted to
    ``str``, stripped, and has every comma removed.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the testing CSV.

    Returns:
        ``(train, test)`` as cleaned DataFrames.
    """
    def _normalise(table):
        keep = ~table.columns.str.contains('^Unnamed')
        table = table.loc[:, keep]
        table.columns = table.columns.str.strip()
        target = table['prognosis'].astype(str).str.strip()
        table['prognosis'] = target.str.replace(',', '', regex=False)
        return table

    train_raw = pd.read_csv(train_path)
    test_raw = pd.read_csv(test_path)
    cleaned_train = _normalise(train_raw)
    cleaned_test = _normalise(test_raw)
    return cleaned_train, cleaned_test

# Load both CSVs; every column except the label is treated as a feature.
train, test = load_data('Training.csv', 'Testing.csv')
feature_cols = [col for col in train.columns if col != 'prognosis']
X = train[feature_cols]
y = train['prognosis']
# Index the test frame with the training column list so both share one layout.
X_test = test[feature_cols]
y_test = test['prognosis']

# 2. Encode target
# The encoder is fitted on the training labels only and then reused for the
# test labels so both sets share the same integer codes.
encoder = LabelEncoder()
y_enc = encoder.fit_transform(y)
y_test_enc = encoder.transform(y_test)
# Use a context manager so the pickle file is flushed and closed right away
# instead of leaving the handle open until garbage collection.
with open('models/label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

# 3. Class Balancing
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X, y_enc)

# 4. Model Training with Hyperparameter Tuning
def train_models(X, y):
    """Tune and fit the base classifiers, build a voting ensemble, save all.

    An 80/20 split (``random_state=42``) is taken from ``X``/``y`` and only
    the 80% part is used for fitting; the remainder is not used here.
    Random Forest and SVM are tuned with ``RandomizedSearchCV`` (3-fold CV),
    Gaussian Naive Bayes is fitted with its defaults, and a hard-voting
    ``VotingClassifier`` is fitted on the three estimators. Every model is
    pickled under ``models/``.

    Args:
        X: Feature matrix.
        y: Encoded class labels aligned with ``X``.

    Returns:
        Tuple ``(rf_best, svm_best, nb, ensemble)``.
    """
    def _save(obj, path):
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    # The validation part of the split is never read, so it is not kept.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # Random Forest
    rf_params = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10]
    }
    rf_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), rf_params, n_iter=10, cv=3, n_jobs=-1, random_state=42)
    rf_search.fit(X_train, y_train)
    rf_best = rf_search.best_estimator_
    print(f"Random Forest best params: {rf_search.best_params_}")

    # SVM
    svm_params = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'degree': [2, 3, 4],
        'probability': [True]
    }
    svm_search = RandomizedSearchCV(SVC(), svm_params, n_iter=8, cv=3, n_jobs=-1, random_state=42)
    svm_search.fit(X_train, y_train)
    svm_best = svm_search.best_estimator_
    print(f"SVM best params: {svm_search.best_params_}")

    # Naive Bayes
    nb = GaussianNB()
    nb.fit(X_train, y_train)

    # Save models
    _save(rf_best, 'models/rf_model.pkl')
    _save(svm_best, 'models/svm_model.pkl')
    _save(nb, 'models/nb_model.pkl')

    # Ensemble
    ensemble = VotingClassifier(estimators=[
        ('rf', rf_best),
        ('svm', svm_best),
        ('nb', nb)
    ], voting='hard')
    ensemble.fit(X_train, y_train)
    _save(ensemble, 'models/ensemble_model.pkl')
    return rf_best, svm_best, nb, ensemble

# Fit every model on the over-sampled training data (also writes models/*.pkl).
rf, svm, nb, ensemble = train_models(X_bal, y_bal)

# 5. Evaluation Function
def evaluate_model(model, X, y_true, encoder, name='Model'):
    """Print accuracy and a per-class report, and save a confusion-matrix plot.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix to score.
        y_true: Encoded ground-truth labels for ``X``.
        encoder: Fitted label encoder; ``classes_`` supplies the class names.
        name: Display name, also used to build the plot file name under
            ``plots/``.
    """
    y_pred = model.predict(X)
    acc = accuracy_score(y_true, y_pred)
    print(f"\n{name} Accuracy: {acc*100:.2f}%")
    # Pin the label set to every encoded class. Without it the report raises
    # and the matrix shrinks whenever a class is missing from both y_true and
    # y_pred, because target_names / tick labels no longer match the labels.
    class_ids = np.arange(len(encoder.classes_))
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=encoder.classes_))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(f'{name} Confusion Matrix')
    plt.colorbar()
    plt.xticks(class_ids, encoder.classes_, rotation=90)
    plt.yticks(class_ids, encoder.classes_)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are accounted for.
    plt.tight_layout()
    plt.savefig(f'plots/{name.lower().replace(" ", "_")}_confusion_matrix.png')
    plt.close()

# 6. Evaluate on Test Data
# Report accuracy, per-class metrics and a saved confusion matrix per model.
print("\n--- Test Set Evaluation ---")
evaluate_model(rf, X_test, y_test_enc, encoder, 'Random Forest')
evaluate_model(svm, X_test, y_test_enc, encoder, 'SVM')
evaluate_model(nb, X_test, y_test_enc, encoder, 'Naive Bayes')
evaluate_model(ensemble, X_test, y_test_enc, encoder, 'Ensemble')

# 7. Prediction Function
def predict_disease(symptom_list, feature_cols, model, encoder):
    """Predict a disease name from a list of reported symptom names.

    Args:
        symptom_list: Symptom names; names not present in ``feature_cols``
            are ignored.
        feature_cols: Ordered feature names the model was trained on. Any
            sequence works (list, tuple, pandas Index).
        model: Fitted estimator exposing ``predict``.
        encoder: Fitted label encoder used to decode the predicted class.

    Returns:
        The decoded label of the predicted class.
    """
    # Map each name to its first position once, instead of a linear
    # list.index() scan per symptom (which is also missing on a pandas Index).
    position = {}
    for idx, col in enumerate(feature_cols):
        position.setdefault(col, idx)
    x_input = np.zeros(len(feature_cols))
    for symptom in symptom_list:
        idx = position.get(symptom)
        if idx is not None:
            x_input[idx] = 1
    # Estimators expect a 2-D batch: one row per sample.
    pred = model.predict(x_input.reshape(1, -1))[0]
    return encoder.inverse_transform([pred])[0]

# 8. Example Usage
# Smoke-test the ensemble with a hand-picked list of symptom names.
example_symptoms = ['fatigue', 'vomiting', 'high_fever', 'loss_of_appetite', 'nausea']
print("\nExample prediction for:", example_symptoms)
print("Predicted Disease (Ensemble):", predict_disease(example_symptoms, feature_cols, ensemble, encoder))


Random Forest best params: {'n_estimators': 200, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': None}
SVM best params: {'probability': True, 'kernel': 'rbf', 'degree': 3, 'C': 1}

--- Test Set Evaluation ---

Random Forest Accuracy: 97.62%
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
               

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Configuration
# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of passes over the training loader in train().
NUM_EPOCHS = 50
# Batch size used by both the training and the test DataLoader.
BATCH_SIZE = 32
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001
# Embedding / transformer width used by SymptomTransformer.
D_MODEL = 128
# Number of attention heads per encoder layer.
NHEAD = 8
# Number of stacked transformer encoder layers.
NUM_LAYERS = 4

# 1. Data Loading & Preprocessing
def load_data(train_path='Training.csv'):
    """Load the training CSV and return model-ready arrays.

    Columns whose name starts with ``Unnamed`` are dropped; every column
    other than ``prognosis`` is treated as a feature and cast to ``int``.

    Args:
        train_path: Path of the training CSV.

    Returns:
        ``(X, y_enc, encoder, feature_cols)``: the integer feature array, the
        encoded ``prognosis`` labels, the fitted ``LabelEncoder`` and the
        ordered list of feature column names.
    """
    frame = pd.read_csv(train_path)
    keep = ~frame.columns.str.contains('^Unnamed')
    frame = frame.loc[:, keep]

    feature_cols = [name for name in frame.columns if name != 'prognosis']
    features = frame[feature_cols].astype(int).values

    # Map the disease names to integer class ids.
    encoder = LabelEncoder()
    labels = encoder.fit_transform(frame['prognosis'])

    return features, labels, encoder, feature_cols

# Load the data once; the feature and class counts are reused by the model
# constructor and by predict_disease().
X, y, encoder, symptom_names = load_data()
num_symptoms = len(symptom_names)
num_diseases = len(encoder.classes_)

# 2. SAT Model Architecture
class SymptomTransformer(nn.Module):
    """Transformer-encoder classifier over a sequence of symptom tokens.

    ``forward`` takes an integer tensor of shape ``(batch, seq_len)``,
    embeds every token, adds positional encodings, runs the encoder stack,
    mean-pools over the sequence axis and returns ``(batch, num_diseases)``
    logits.

    NOTE(review): the rest of this file appears to feed 0/1 indicator
    vectors, so only embedding rows 0 and 1 would ever be looked up; the
    table size is left at ``num_symptoms`` to keep the interface unchanged.
    """

    def __init__(self, num_symptoms, num_diseases):
        super().__init__()
        self.embedding = nn.Embedding(num_symptoms, D_MODEL)
        self.pos_encoder = PositionalEncoding(D_MODEL)
        # batch_first=True is required: the tensors here are laid out as
        # (batch, seq, dim). With the default (seq, batch, dim) layout the
        # attention would run across the samples of a batch instead of
        # across the positions of one sample.
        encoder_layer = nn.TransformerEncoderLayer(
            D_MODEL, NHEAD, dim_feedforward=512, dropout=0.1, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, NUM_LAYERS)
        self.classifier = nn.Sequential(
            nn.Linear(D_MODEL, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_diseases)
        )

    def forward(self, x):
        # Scale embeddings by sqrt(d_model) before adding position encodings.
        x = self.embedding(x) * np.sqrt(D_MODEL)
        x = self.pos_encoder(x)
        x = self.transformer(x)
        x = x.mean(dim=1)  # Pool over the sequence axis
        return self.classifier(x)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first input.

    Args:
        d_model: Width of the last input dimension (odd widths are allowed).
        max_len: Number of positions precomputed in the ``pe`` buffer.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even
        # columns, so trim the cosine block to fit instead of crashing.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer: moves with .to(device) but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        x = x + self.pe[:x.size(1), :]
        return x

# 3. Dataset Preparation
class SymptomDataset(Dataset):
    """Dataset pairing each feature row with its class label.

    Both inputs are converted to ``torch.LongTensor`` on construction.
    """

    def __init__(self, features, labels):
        self.labels = torch.LongTensor(labels)
        self.features = torch.LongTensor(features)

    def __len__(self):
        # One sample per feature row.
        n_rows = len(self.features)
        return n_rows

    def __getitem__(self, idx):
        row = self.features[idx]
        target = self.labels[idx]
        return row, target

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_dataset = SymptomDataset(X_train, y_train)
test_dataset = SymptomDataset(X_test, y_test)

# Only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# 4. Training Setup
# Model, loss and optimizer are module-level globals read by train(),
# evaluate() and predict_disease().
model = SymptomTransformer(num_symptoms, num_diseases).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# 5. Training Loop
def train():
    """Train the module-level ``model`` for ``NUM_EPOCHS`` epochs.

    Each epoch iterates ``train_loader`` once, taking one optimizer step per
    batch, then prints the loss averaged over the number of batches.
    """
    model.train()
    for epoch_idx in range(NUM_EPOCHS):
        batch_losses = []
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(DEVICE)
            batch_y = batch_y.to(DEVICE)

            # Standard step: clear grads, forward, backward, update.
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()

            batch_losses.append(batch_loss.item())

        print(f'Epoch {epoch_idx+1}: Loss {sum(batch_losses)/len(train_loader):.4f}')

def evaluate():
    """Print the accuracy of the module-level ``model`` on ``test_loader``.

    Switches the model to eval mode and runs without gradient tracking.
    """
    model.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(DEVICE)
            batch_y = batch_y.to(DEVICE)
            logits = model(batch_x)
            # The predicted class is the index of the largest logit per row.
            guesses = logits.argmax(dim=1)
            seen += batch_y.size(0)
            hits += (guesses == batch_y).sum().item()
    print(f'Test Accuracy: {100 * hits / seen:.2f}%')

# Train and evaluate
# Fit the model, then report accuracy on the held-out split.
train()
evaluate()

# 6. Prediction Function
def predict_disease(symptom_list):
    """Predict a disease name for the given symptom names.

    Builds a 0/1 indicator vector over the module-level ``symptom_names``
    (names not in that list are ignored), runs the module-level ``model`` on
    it and decodes the highest-scoring class with ``encoder``.

    Args:
        symptom_list: Iterable of symptom names.

    Returns:
        The decoded label of the predicted class.
    """
    symptom_indices = [symptom_names.index(symptom) for symptom in symptom_list if symptom in symptom_names]
    input_tensor = torch.zeros(num_symptoms, dtype=torch.long)
    for idx in symptom_indices:
        input_tensor[idx] = 1
    # Add the batch axis: the model expects (batch, seq_len).
    input_tensor = input_tensor.unsqueeze(0).to(DEVICE)

    # Inference must run with dropout disabled; otherwise the prediction is
    # random whenever the model is still in training mode. The previous mode
    # is restored afterwards so an ongoing training run is not disturbed.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(input_tensor)
            _, predicted = torch.max(output, 1)
    finally:
        model.train(was_training)

    return encoder.inverse_transform(predicted.cpu().numpy())[0]

# Example usage
# Smoke-test the trained model with a hand-picked list of symptom names.
test_symptoms = ['fatigue', 'vomiting', 'high_fever', 'loss_of_appetite', 'nausea']
print(f"Predicted Disease: {predict_disease(test_symptoms)}")


In [16]:
pip install torch

^C
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import pickle
import os
import time
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including any raised while the models below are being fitted.
warnings.filterwarnings('ignore')
# Output folders for the pickled models and the confusion-matrix plots.
os.makedirs('models', exist_ok=True)
os.makedirs('plots', exist_ok=True)

# 1. Data Loading and Preprocessing
def load_data(train_path='Training.csv', test_path='Testing.csv'):
    """Read the training and testing CSV files and clean both frames.

    For each frame: columns whose name starts with ``Unnamed`` are dropped,
    column names are stripped of surrounding whitespace, and the
    ``prognosis`` labels are cast to ``str``, stripped and cleared of commas.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the testing CSV.

    Returns:
        A ``(train, test)`` tuple of cleaned DataFrames.
    """
    def _clean(frame):
        # .copy() makes the filtered frame independent of the parsed one, so
        # the column assignment below never targets a view.
        frame = frame.loc[:, ~frame.columns.str.contains('^Unnamed')].copy()
        frame.columns = frame.columns.str.strip()
        # astype(str) keeps the .str accessor usable when the labels were
        # parsed as numbers; regex=False makes the comma a literal match.
        labels = frame['prognosis'].astype(str).str.strip()
        frame['prognosis'] = labels.str.replace(',', '', regex=False)
        return frame

    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return _clean(train), _clean(test)

# Load both CSVs; every column except the label is treated as a feature.
train, test = load_data()
feature_cols = [col for col in train.columns if col != 'prognosis']
X = train[feature_cols]
y = train['prognosis']
# Index the test frame with the training column list so both share one layout.
X_test = test[feature_cols]
y_test = test['prognosis']

# 2. Encode target
# The encoder is fitted on the training labels only and then reused for the
# test labels so both sets share the same integer codes.
encoder = LabelEncoder()
y_enc = encoder.fit_transform(y)
y_test_enc = encoder.transform(y_test)
# Use a context manager so the pickle file is flushed and closed right away
# instead of leaving the handle open until garbage collection.
with open('models/label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

# 3. Class Balancing
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X, y_enc)

# 4. Model Training with Hyperparameter Tuning
def train_models(X, y):
    """Tune and fit five base classifiers plus a voting ensemble; save all.

    An 80/20 split (``random_state=42``) is taken from ``X``/``y`` and only
    the 80% part is used for fitting; the remainder is not used here.
    Random Forest, SVM, XGBoost and Logistic Regression are tuned with
    ``RandomizedSearchCV`` (3-fold CV), Gaussian Naive Bayes is fitted with
    its defaults, and a hard-voting ``VotingClassifier`` is fitted on all
    five estimators. Every model is pickled under ``models/``.

    Args:
        X: Feature matrix.
        y: Encoded class labels aligned with ``X``.

    Returns:
        Tuple ``(rf_best, svm_best, nb, xgb_best, lr_best, ensemble)``.
    """
    def _save(obj, path):
        # Context manager guarantees the pickle is flushed and the handle closed.
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    # The validation part of the split is never read, so it is not kept.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # Random Forest
    rf_params = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10]
    }
    rf_search = RandomizedSearchCV(RandomForestClassifier(random_state=42), rf_params, n_iter=10, cv=3, n_jobs=-1, random_state=42)
    rf_search.fit(X_train, y_train)
    rf_best = rf_search.best_estimator_
    print(f"Random Forest best params: {rf_search.best_params_}")

    # SVM
    svm_params = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'degree': [2, 3, 4],
        'probability': [True]
    }
    svm_search = RandomizedSearchCV(SVC(), svm_params, n_iter=8, cv=3, n_jobs=-1, random_state=42)
    svm_search.fit(X_train, y_train)
    svm_best = svm_search.best_estimator_
    print(f"SVM best params: {svm_search.best_params_}")

    # Naive Bayes
    nb = GaussianNB()
    nb.fit(X_train, y_train)

    # XGBoost
    xgb_params = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
    xgb_search = RandomizedSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
                                   xgb_params, n_iter=10, cv=3, n_jobs=-1, random_state=42)
    xgb_search.fit(X_train, y_train)
    xgb_best = xgb_search.best_estimator_
    print(f"XGBoost best params: {xgb_search.best_params_}")

    # Logistic Regression
    lr_params = {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [1000]
    }
    lr_search = RandomizedSearchCV(LogisticRegression(multi_class='ovr'),
                                  lr_params, n_iter=10, cv=3, n_jobs=-1, random_state=42)
    lr_search.fit(X_train, y_train)
    lr_best = lr_search.best_estimator_
    print(f"Logistic Regression best params: {lr_search.best_params_}")

    # Save models
    _save(rf_best, 'models/rf_model.pkl')
    _save(svm_best, 'models/svm_model.pkl')
    _save(nb, 'models/nb_model.pkl')
    _save(xgb_best, 'models/xgb_model.pkl')
    _save(lr_best, 'models/lr_model.pkl')

    # Ensemble
    ensemble = VotingClassifier(estimators=[
        ('rf', rf_best),
        ('svm', svm_best),
        ('nb', nb),
        ('xgb', xgb_best),
        ('lr', lr_best)
    ], voting='hard')
    ensemble.fit(X_train, y_train)
    _save(ensemble, 'models/ensemble_model.pkl')

    return rf_best, svm_best, nb, xgb_best, lr_best, ensemble

# Fit every model on the over-sampled training data (also writes models/*.pkl).
rf, svm, nb, xgb, lr, ensemble = train_models(X_bal, y_bal)

# 5. Evaluation Function
def evaluate_model(model, X, y_true, encoder, name='Model'):
    """Print accuracy and a per-class report, and save a confusion-matrix plot.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix to score.
        y_true: Encoded ground-truth labels for ``X``.
        encoder: Fitted label encoder; ``classes_`` supplies the class names.
        name: Display name, also used to build the plot file name under
            ``plots/``.
    """
    y_pred = model.predict(X)
    acc = accuracy_score(y_true, y_pred)
    print(f"\n{name} Accuracy: {acc*100:.2f}%")
    # Pin the label set to every encoded class. Without it the report raises
    # and the matrix shrinks whenever a class is missing from both y_true and
    # y_pred, because target_names / tick labels no longer match the labels.
    class_ids = np.arange(len(encoder.classes_))
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=encoder.classes_))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(f'{name} Confusion Matrix')
    plt.colorbar()
    plt.xticks(class_ids, encoder.classes_, rotation=90)
    plt.yticks(class_ids, encoder.classes_)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are accounted for.
    plt.tight_layout()
    plt.savefig(f'plots/{name.lower().replace(" ", "_")}_confusion_matrix.png')
    plt.close()

# 6. Evaluate on Test Data
# Report accuracy, per-class metrics and a saved confusion matrix per model.
print("\n--- Test Set Evaluation ---")
evaluate_model(rf, X_test, y_test_enc, encoder, 'Random Forest')
evaluate_model(svm, X_test, y_test_enc, encoder, 'SVM')
evaluate_model(nb, X_test, y_test_enc, encoder, 'Naive Bayes')
evaluate_model(xgb, X_test, y_test_enc, encoder, 'XGBoost')
evaluate_model(lr, X_test, y_test_enc, encoder, 'Logistic Regression')
evaluate_model(ensemble, X_test, y_test_enc, encoder, 'Ensemble')

# 7. Prediction Function
def predict_disease(symptom_list, feature_cols, model, encoder):
    """Predict a disease name from a list of reported symptom names.

    Args:
        symptom_list: Symptom names; names not present in ``feature_cols``
            are ignored.
        feature_cols: Ordered feature names the model was trained on. Any
            sequence works (list, tuple, pandas Index).
        model: Fitted estimator exposing ``predict``.
        encoder: Fitted label encoder used to decode the predicted class.

    Returns:
        The decoded label of the predicted class.
    """
    # Map each name to its first position once, instead of a linear
    # list.index() scan per symptom (which is also missing on a pandas Index).
    position = {}
    for idx, col in enumerate(feature_cols):
        position.setdefault(col, idx)
    x_input = np.zeros(len(feature_cols))
    for symptom in symptom_list:
        idx = position.get(symptom)
        if idx is not None:
            x_input[idx] = 1
    # Estimators expect a 2-D batch: one row per sample.
    pred = model.predict(x_input.reshape(1, -1))[0]
    return encoder.inverse_transform([pred])[0]

# 8. Example Usage
# Smoke-test the ensemble with a hand-picked list of symptom names.
example_symptoms = ['fatigue', 'vomiting', 'high_fever', 'loss_of_appetite', 'nausea']
print("\nExample prediction for:", example_symptoms)
print("Predicted Disease (Ensemble):", predict_disease(example_symptoms, feature_cols, ensemble, encoder))


Random Forest best params: {'n_estimators': 200, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': None}
SVM best params: {'probability': True, 'kernel': 'rbf', 'degree': 3, 'C': 1}
XGBoost best params: {'subsample': 0.6, 'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.01, 'colsample_bytree': 0.6}
Logistic Regression best params: {'solver': 'saga', 'penalty': 'l1', 'max_iter': 1000, 'C': 10}


KeyboardInterrupt: 

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import mode as stats_mode
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import os
import pickle
import time
import warnings

# Suppress warnings and set style
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Use matplotlib's built-in 'ggplot' style for all figures below.
plt.style.use('ggplot')

# Create the output directories for saved plots and pickled models
os.makedirs('plots', exist_ok=True)
os.makedirs('models', exist_ok=True)

def load_dataset(csv_path='Training.csv'):
    """Load a symptom CSV and return features, labels and lookup helpers.

    ``Unnamed*`` columns are dropped. The target column is ``prognosis``
    when present, otherwise the last column. Feature columns are coerced to
    numbers (unparseable cells become 0) and cast to ``int``; target values
    are cast to ``str`` and stripped.

    Args:
        csv_path: Path of the CSV file.

    Returns:
        ``(X, y_str, y, encoder, symptoms, symptom_index, df)``: feature
        frame, string labels, encoded labels, the fitted ``LabelEncoder``,
        the feature-name list, a name-to-position dict, and the cleaned frame.
    """
    df = pd.read_csv(csv_path)
    named = ~df.columns.str.contains('^Unnamed')
    df = df.loc[:, named]

    if 'prognosis' in df.columns:
        target_col = 'prognosis'
    else:
        target_col = df.columns[-1]
    symptoms = [name for name in df.columns if name != target_col]

    numeric = df[symptoms].apply(pd.to_numeric, errors='coerce')
    df[symptoms] = numeric.fillna(0).astype(int)
    df[target_col] = df[target_col].astype(str).str.strip()

    features = df[symptoms]
    label_text = df[target_col]
    encoder = LabelEncoder()
    label_codes = encoder.fit_transform(label_text)

    symptom_index = dict(zip(symptoms, range(len(symptoms))))
    return features, label_text, label_codes, encoder, symptoms, symptom_index, df

def analyze_dataset(df):
    """Print a short summary of the dataset and return the underlying counts.

    Prints the frame shape, up to ten disease counts and the ten largest
    per-symptom column sums.

    Args:
        df: Frame whose target column is ``prognosis`` (or, failing that,
            the last column); all other columns are summed as symptoms.

    Returns:
        ``(disease_counts, symptom_counts)``: the target's value counts and
        the per-column sums sorted in descending order.
    """
    print("\nDATASET ANALYSIS:")
    print(f"Shape: {df.shape}")

    if 'prognosis' in df.columns:
        label_col = 'prognosis'
    else:
        label_col = df.columns[-1]

    disease_counts = df[label_col].value_counts()
    print("\nDisease distribution:")
    for disease, count in disease_counts.head(10).items():
        print(f"  {disease}: {count} samples")
    hidden = len(disease_counts) - 10
    if hidden > 0:
        print(f"  ... and {len(disease_counts)-10} more diseases.")

    per_symptom = df.drop(columns=[label_col]).sum()
    symptom_counts = per_symptom.sort_values(ascending=False)
    print("\nTop 10 most common symptoms:")
    for symptom, count in symptom_counts.head(10).items():
        print(f"  {symptom}: {count} occurrences")

    return disease_counts, symptom_counts

def visualize_data(disease_counts, symptom_counts, max_bars=20):
    """Save bar charts of the disease and symptom counts under ``plots/``.

    Args:
        disease_counts: Series of counts indexed by disease.
        symptom_counts: Series of counts indexed by symptom.
        max_bars: Maximum number of leading entries drawn in each chart.
    """
    def _save_bar_chart(names, heights, title, out_path):
        # One figure per chart; closed after saving so figures do not pile up.
        plt.figure(figsize=(12, 6))
        slots = range(len(names))
        plt.bar(slots, heights)
        plt.xticks(slots, names, rotation=90)
        plt.title(title)
        plt.tight_layout()
        plt.savefig(out_path)
        plt.close()

    _save_bar_chart(
        disease_counts.index.astype(str)[:max_bars],
        disease_counts.values[:max_bars],
        'Disease Distribution',
        'plots/disease_distribution.png',
    )
    _save_bar_chart(
        symptom_counts.index[:max_bars],
        symptom_counts.values[:max_bars],
        'Top Symptom Frequencies',
        'plots/symptom_frequencies.png',
    )

def handle_class_imbalance(X, y):
    """Over-sample ``X``/``y`` with ``RandomOverSampler`` and print the counts.

    When ``y`` holds at most one distinct class the inputs are returned
    unchanged.

    Args:
        X: Feature matrix.
        y: Class labels aligned with ``X``.

    Returns:
        ``(X_res, y_res)`` after resampling, or ``(X, y)`` when skipped.
    """
    def _print_distribution(header, labels, sizes):
        print(header)
        for cls, cnt in zip(labels, sizes):
            print(f"  Class {cls}: {cnt}")

    classes, counts = np.unique(y, return_counts=True)
    if not len(classes) > 1:
        print("\nOnly one class present; skipping oversampling.")
        return X, y

    _print_distribution("\nClass distribution before resampling:", classes, counts)
    sampler = RandomOverSampler(random_state=42)
    X_res, y_res = sampler.fit_resample(X, y)
    classes_res, counts_res = np.unique(y_res, return_counts=True)
    _print_distribution("\nClass distribution after resampling:", classes_res, counts_res)
    return X_res, y_res

def add_noise(X, noise_level=0.15):
    """Flip a random subset of binary feature values.

    The noise is added on purpose, to reduce the accuracy of models trained
    on the result. Each cell is flipped (0 <-> 1) independently with
    probability ``noise_level``; the draw is seeded with 42, so the same
    input shape always receives the same flips.

    Args:
        X: DataFrame of 0/1 feature values.
        noise_level: Per-cell flip probability.

    Returns:
        A new DataFrame with the same columns and row index as ``X``.
    """
    # A private generator yields the same draws as seeding the global one
    # with 42, without clobbering the caller's global NumPy random state.
    rng = np.random.RandomState(42)
    noise = rng.binomial(1, noise_level, X.shape)
    X_noisy = (X.values + noise) % 2  # Flip some bits
    # Keep the original row index so rows stay traceable to their source.
    return pd.DataFrame(X_noisy, columns=X.columns, index=X.index)

def train_models(X, y, encoder):
    """Fit deliberately weakened SVM, NB and RF models and report on them.

    The data is split 80/20 (``random_state=42``). Per the inline comments,
    accuracy is lowered on purpose: the training part is perturbed with
    ``add_noise`` and the hyperparameters are chosen to be suboptimal. Each
    model is scored on the clean held-out part and pickled to
    ``models/<name>_model.pkl``. A majority-vote ensemble accuracy is
    printed, and the Random Forest confusion matrix is saved to
    ``plots/rf_confusion_matrix.png`` together with a printed report.

    Args:
        X: Feature DataFrame.
        y: Encoded class labels aligned with ``X``.
        encoder: Fitted label encoder; ``classes_`` supplies display names.

    Returns:
        Dict mapping ``'svm'``, ``'nb'`` and ``'rf'`` to the fitted models.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Add noise to training data to reduce accuracy
    X_train_noisy = add_noise(X_train, noise_level=0.15)
    # Use intentionally suboptimal hyperparameters
    models = {
        'svm': SVC(kernel='poly', degree=2, C=0.5, probability=True, random_state=42),  # Lower C, poly kernel
        'nb': GaussianNB(var_smoothing=1e-1),  # Higher smoothing
        'rf': RandomForestClassifier(n_estimators=20, max_depth=3, random_state=42)  # Fewer trees, shallow depth
    }
    trained = {}
    # Test-set predictions are kept so each model predicts only once.
    test_preds = {}
    for name, model in models.items():
        print(f"\nTraining {name.upper()} (reduced accuracy)...")
        start = time.time()
        model.fit(X_train_noisy, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds) * 100
        print(f"{name.upper()} Accuracy: {acc:.2f}% (Time: {time.time() - start:.2f}s)")
        # Context manager closes the pickle file instead of leaking the handle.
        with open(f"models/{name}_model.pkl", 'wb') as model_file:
            pickle.dump(model, model_file)
        trained[name] = model
        test_preds[name] = preds
    # Ensemble via majority vote
    votes = zip(test_preds['svm'], test_preds['nb'], test_preds['rf'])
    ensemble_preds = [stats_mode(vote) for vote in votes]
    ensemble_acc = accuracy_score(y_test, ensemble_preds) * 100
    print(f"\nEnsemble Accuracy: {ensemble_acc:.2f}%")
    # Confusion matrix for Random Forest
    rf_preds = test_preds['rf']
    cm = confusion_matrix(y_test, rf_preds)
    labels = encoder.classes_
    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', aspect='auto')
    plt.colorbar()
    ticks = np.arange(len(labels))
    plt.xticks(ticks, labels, rotation=90)
    plt.yticks(ticks, labels)
    plt.title('Random Forest Confusion Matrix')
    plt.tight_layout()
    plt.savefig('plots/rf_confusion_matrix.png')
    plt.close()
    print("\nRandom Forest Classification Report:")
    print(classification_report(y_test, rf_preds, target_names=labels))
    return trained

def predict_disease(input_symptoms, symptoms, symptom_index, trained, encoder):
    """Predict a disease with every trained model and a majority vote.

    Args:
        input_symptoms: Symptom names; names missing from ``symptom_index``
            are ignored.
        symptoms: Ordered feature names (sets the input vector length).
        symptom_index: Mapping from symptom name to position in ``symptoms``.
        trained: Mapping from model name to fitted model.
        encoder: Fitted label encoder used to decode class ids.

    Returns:
        Dict with ``"<NAME> Pred"`` per model, ``"<NAME> Probs"`` (top three
        ``(class, probability)`` pairs) for models with ``predict_proba``,
        and ``"ENSEMBLE Pred"``; an empty dict when ``trained`` is empty.
    """
    flags = [0] * len(symptoms)
    for slot in (symptom_index[s] for s in input_symptoms if s in symptom_index):
        flags[slot] = 1
    X_input = np.array(flags).reshape(1, -1)

    if not trained:
        print("No models trained. Ensure the dataset contains multiple disease classes.")
        return {}

    results = {}
    for name, model in trained.items():
        tag = name.upper()
        code = model.predict(X_input)[0]
        results[f"{tag} Pred"] = encoder.inverse_transform([code])[0]
        if not hasattr(model, 'predict_proba'):
            continue
        scores = model.predict_proba(X_input)[0]
        ranked = sorted(zip(encoder.classes_, scores), key=lambda pair: pair[1], reverse=True)
        results[f"{tag} Probs"] = ranked[:3]

    # Majority vote over the decoded per-model predictions.
    ballots = [results[f"{name.upper()} Pred"] for name in trained]
    results['ENSEMBLE Pred'] = stats_mode(ballots)
    return results

def _load_pickled_model(path):
    """Return the object pickled at ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load model files that
    # this project wrote itself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _save_confusion_matrix_plot(cm, class_names, title, out_path):
    """Draw ``cm`` as a heat map labelled with ``class_names`` and save it."""
    plt.figure(figsize=(12, 10))
    plt.imshow(cm, interpolation='nearest', aspect='auto')
    plt.colorbar()
    ticks = np.arange(len(class_names))
    plt.xticks(ticks, class_names, rotation=90)
    plt.yticks(ticks, class_names)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


def evaluate_on_test_data(test_csv_path='Testing.csv', train_csv_path='Training.csv'):
    """Evaluate the pickled SVM, Naive Bayes and Random Forest models on a test CSV.

    The models are read from ``models/svm_model.pkl``, ``models/nb_model.pkl``
    and ``models/rf_model.pkl``. A label encoder is re-fitted on the target
    column of ``train_csv_path`` so test labels map to the same class ids.
    For each model the accuracy and classification report are printed and a
    confusion-matrix image is written under ``plots/``; a majority-vote
    ensemble is scored as well and a bar chart comparing all accuracies is
    saved to ``plots/test_model_comparison.png``.

    Args:
        test_csv_path: Path of the CSV holding the test samples.
        train_csv_path: Path of the training CSV used only to fit the label
            encoder.

    Returns:
        ``None`` if the model files or a CSV are missing or the test data
        cannot be processed. Otherwise a dict keyed by ``'SVM'``,
        ``'Naive Bayes'``, ``'Random Forest'`` and ``'Ensemble'``; each value
        is a dict with ``'accuracy'`` (percent), ``'predictions'`` and
        ``'true_labels'``.
    """
    print("\nEVALUATING MODELS ON TEST DATA")
    try:
        models = {
            'SVM': _load_pickled_model("models/svm_model.pkl"),
            'Naive Bayes': _load_pickled_model("models/nb_model.pkl"),
            'Random Forest': _load_pickled_model("models/rf_model.pkl"),
        }
    except FileNotFoundError:
        print("Error: Trained model files not found. Please train models first.")
        return None

    try:
        test_df = pd.read_csv(test_csv_path)
        test_df = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]
        # Fall back to the last column when there is no 'prognosis' header.
        target_col = 'prognosis' if 'prognosis' in test_df.columns else test_df.columns[-1]
        feature_cols = [col for col in test_df.columns if col != target_col]
        # Non-numeric or missing symptom flags are treated as "absent" (0).
        test_df[feature_cols] = test_df[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
        test_df[target_col] = test_df[target_col].astype(str).str.strip()
        X_test = test_df[feature_cols]
        y_test_str = test_df[target_col]

        # Fit the encoder on the training labels so class ids match the ones
        # the models were trained with.
        encoder = LabelEncoder()
        train_df = pd.read_csv(train_csv_path)
        train_target_col = 'prognosis' if 'prognosis' in train_df.columns else train_df.columns[-1]
        train_df[train_target_col] = train_df[train_target_col].astype(str).str.strip()
        encoder.fit(train_df[train_target_col])
        y_test = encoder.transform(y_test_str)

        print(f"Test dataset shape: {test_df.shape}")
        print(f"Number of test samples: {len(X_test)}")
        print(f"Number of unique diseases in test data: {len(np.unique(y_test))}")
    except FileNotFoundError as err:
        # Either CSV can be the missing one; name the file that actually failed.
        missing = err.filename if err.filename else test_csv_path
        kind = "Test" if str(missing) == str(test_csv_path) else "Training"
        print(f"Error: {kind} file {missing} not found.")
        return None
    except Exception as e:
        print(f"Error processing test data: {str(e)}")
        return None

    class_names = encoder.classes_
    # Pin the label set: without it the report raises and the matrix no longer
    # lines up with the tick labels whenever the test set lacks some class.
    label_ids = np.arange(len(class_names))

    results = {}
    for name, model in models.items():
        print(f"\nEvaluating {name}...")
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred) * 100
        results[name] = {
            'accuracy': acc,
            'predictions': y_pred,
            'true_labels': y_test
        }
        print(f"{name} Test Accuracy: {acc:.2f}%")
        print(f"\n{name} Classification Report:")
        print(classification_report(y_test, y_pred, labels=label_ids, target_names=class_names))
        cm = confusion_matrix(y_test, y_pred, labels=label_ids)
        _save_confusion_matrix_plot(
            cm,
            class_names,
            f'{name} Confusion Matrix on Test Data',
            f'plots/{name.lower().replace(" ", "_")}_test_confusion_matrix.png',
        )

    # One row per sample, one column per model (SVM, Naive Bayes, Random Forest).
    votes = np.column_stack([results[name]['predictions'] for name in models])
    ensemble_preds = np.array([stats_mode(vote) for vote in votes])
    ensemble_acc = accuracy_score(y_test, ensemble_preds) * 100
    results['Ensemble'] = {
        'accuracy': ensemble_acc,
        'predictions': ensemble_preds,
        'true_labels': y_test
    }
    print(f"\nEnsemble Test Accuracy: {ensemble_acc:.2f}%")
    print("\nEnsemble Classification Report:")
    print(classification_report(y_test, ensemble_preds, labels=label_ids, target_names=class_names))

    # Bar chart comparing the accuracy of every model and the ensemble.
    model_names = list(results.keys())
    accuracies = [results[model]['accuracy'] for model in model_names]
    plt.figure(figsize=(10, 6))
    plt.bar(model_names, accuracies)
    plt.ylabel('Accuracy (%)')
    plt.title('Model Performance on Test Data')
    plt.ylim(0, 100)
    for i, acc in enumerate(accuracies):
        plt.text(i, acc + 1, f'{acc:.2f}%', ha='center')
    plt.tight_layout()
    plt.savefig('plots/test_model_comparison.png')
    plt.close()
    return results

def main():
    """Run the pipeline end to end and return the models and test results.

    Loads the dataset, analyses and visualises it, rebalances the classes,
    trains the models, prints an example prediction for the five most common
    symptoms, and finally evaluates the saved models on ``Testing.csv``.

    Returns:
        Tuple of (trained models, result of ``evaluate_on_test_data``).
    """
    print("DISEASE PREDICTION SYSTEM")

    # The second item (labels as text) is not needed here.
    (features, _label_text, labels, label_encoder,
     symptom_names, symptom_lookup, frame) = load_dataset()

    per_disease, per_symptom = analyze_dataset(frame)
    visualize_data(per_disease, per_symptom)

    balanced_X, balanced_y = handle_class_imbalance(features, labels)
    models = train_models(balanced_X, balanced_y, label_encoder)

    # Demo prediction using the first five entries of the symptom counts.
    demo_symptoms = per_symptom.head(5).index.tolist()
    print(f"\nExample prediction for: {demo_symptoms}")
    demo_result = predict_disease(
        demo_symptoms, symptom_names, symptom_lookup, models, label_encoder
    )
    print(demo_result)

    held_out = evaluate_on_test_data('Testing.csv')
    return models, held_out

# Run the full pipeline; main() returns (trained models, test results), and as
# the cell's final expression that tuple is echoed in the cell output.
main()


DISEASE PREDICTION SYSTEM

DATASET ANALYSIS:
Shape: (4920, 133)

Disease distribution:
  Fungal infection: 120 samples
  Hepatitis C: 120 samples
  Hepatitis E: 120 samples
  Alcoholic hepatitis: 120 samples
  Tuberculosis: 120 samples
  Common Cold: 120 samples
  Pneumonia: 120 samples
  Dimorphic hemmorhoids(piles): 120 samples
  Heart attack: 120 samples
  Varicose veins: 120 samples
  ... and 31 more diseases.

Top 10 most common symptoms:
  fatigue: 1932 occurrences
  vomiting: 1914 occurrences
  high_fever: 1362 occurrences
  loss_of_appetite: 1152 occurrences
  nausea: 1146 occurrences
  headache: 1134 occurrences
  abdominal_pain: 1032 occurrences
  yellowish_skin: 912 occurrences
  yellowing_of_eyes: 816 occurrences
  chills: 798 occurrences

Class distribution before resampling:
  Class 0: 120
  Class 1: 120
  Class 2: 120
  Class 3: 120
  Class 4: 120
  Class 5: 120
  Class 6: 120
  Class 7: 120
  Class 8: 120
  Class 9: 120
  Class 10: 120
  Class 11: 120
  Class 12: 120
  

({'svm': SVC(C=0.5, degree=2, kernel='poly', probability=True, random_state=42),
  'nb': GaussianNB(var_smoothing=0.1),
  'rf': RandomForestClassifier(max_depth=3, n_estimators=20, random_state=42)},
 {'SVM': {'accuracy': 100.0,
   'predictions': array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7, 32, 28, 29,  8,
          11, 37, 40, 19, 20, 21, 22,  3, 36, 10, 34, 13, 18, 39, 26, 24, 25,
          31,  5,  0,  2, 38, 35, 27, 15]),
   'true_labels': array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7, 32, 28, 29,  8,
          11, 37, 40, 19, 20, 21, 22,  3, 36, 10, 34, 13, 18, 39, 26, 24, 25,
          31,  5,  0,  2, 38, 35, 27, 15])},
  'Naive Bayes': {'accuracy': 97.61904761904762,
   'predictions': array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7, 32, 28, 29,  8,
          11, 37, 40, 19, 20, 21, 22,  3, 36, 10, 34, 13, 18, 39, 26, 24, 25,
          31,  5,  0,  2, 38, 35, 27, 14]),
   'true_labels': array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7,

In [5]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import pickle

# Load and preprocess data
def load_data(train_path='Training.csv', test_path='Testing.csv'):
    """Read the training and testing CSVs and tidy their columns.

    For each file, every column containing at least one missing value is
    dropped and surrounding whitespace is stripped from the column names.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the testing CSV.

    Returns:
        Tuple ``(train, test)`` of cleaned DataFrames.
    """
    cleaned = []
    for csv_path in (train_path, test_path):
        frame = pd.read_csv(csv_path)
        # axis=1 removes columns (not rows) that hold any NaN.
        frame = frame.dropna(axis=1)
        frame.columns = frame.columns.str.strip()
        cleaned.append(frame)

    train, test = cleaned
    return train, test

# Read both CSVs from their default locations.
train, test = load_data()

# Every column except the target is treated as a feature.
feature_cols = [name for name in train.columns if name != 'prognosis']

# Split features and target; the test frame is indexed with the training
# feature list.
X_train, y_train = train[feature_cols], train['prognosis']
X_test, y_test = test[feature_cols], test['prognosis']

# Encode labels: the encoder is fitted on the training labels only and then
# reused for the test labels.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# Handle class imbalance by random oversampling of the training data
# (fixed seed).
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X_train, y_train_enc)

# Model training with hyperparameter tuning
def train_models():
    """Tune and fit XGBoost and logistic-regression models on the balanced data.

    Reads the module-level ``X_bal`` / ``y_bal``, runs a 3-fold randomized
    hyperparameter search for each model and pickles both fitted searches to
    ``xgb_model.pkl`` and ``lr_model.pkl`` in the working directory.

    Returns:
        Tuple ``(xgb, lr)`` of the fitted ``RandomizedSearchCV`` objects.
    """
    # Fixed seed, matching the oversampler, so repeated runs pick the same
    # candidates and produce the same models.
    seed = 42

    # XGBoost
    xgb_params = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
    }
    xgb = RandomizedSearchCV(
        XGBClassifier(random_state=seed), xgb_params,
        n_iter=10, cv=3, random_state=seed,
    )
    xgb.fit(X_bal, y_bal)

    # Logistic Regression (liblinear is listed because the grid includes 'l1').
    lr_params = {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear'],
        'max_iter': [1000],
    }
    lr = RandomizedSearchCV(
        LogisticRegression(random_state=seed), lr_params,
        n_iter=10, cv=3, random_state=seed,
    )
    lr.fit(X_bal, y_bal)

    # Save models; the context managers guarantee the files are flushed and
    # closed even if pickling fails part-way.
    with open('xgb_model.pkl', 'wb') as xgb_file:
        pickle.dump(xgb, xgb_file)
    with open('lr_model.pkl', 'wb') as lr_file:
        pickle.dump(lr, lr_file)

    return xgb, lr

# Run both hyperparameter searches (this also writes the two pickle files).
xgb_model, lr_model = train_models()

# Soft-voting ensemble over the best estimator found by each search, fitted
# on the balanced training data.
ensemble = VotingClassifier(
    voting='soft',
    estimators=[
        ('xgb', xgb_model.best_estimator_),
        ('lr', lr_model.best_estimator_),
    ],
).fit(X_bal, y_bal)

# Evaluation
def evaluate(model, X, y):
    """Print the accuracy and classification report of ``model`` on ``X``/``y``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix to predict on.
        y: True labels for ``X``.
    """
    predicted = model.predict(X)
    accuracy = accuracy_score(y, predicted)
    print(f"Accuracy: {accuracy:.2f}")
    report = classification_report(y, predicted)
    print(report)

# Print a heading followed by the test-set metrics for each fitted model.
for _heading, _fitted in (
    ("XGBoost Performance:", xgb_model),
    ("\nLogistic Regression Performance:", lr_model),
    ("\nEnsemble Performance:", ensemble),
):
    print(_heading)
    evaluate(_fitted, X_test, y_test_enc)

# Prediction function
def predict_disease(symptoms, feature_names):
    """Predict the disease name for a set of symptoms using the ensemble.

    Args:
        symptoms: Iterable of symptom names. Names that do not occur in
            ``feature_names`` are ignored.
        feature_names: Ordered feature names defining the layout of the input
            vector (e.g. ``feature_cols``). Any iterable is accepted.

    Returns:
        The decoded label predicted by the module-level ``ensemble`` for a
        vector with 1.0 at the position of every recognised symptom and 0.0
        elsewhere.
    """
    feature_names = list(feature_names)

    # Build the name -> position table once so each symptom is a constant-time
    # lookup instead of two linear scans; setdefault keeps the first position
    # of a duplicated name, matching list.index().
    positions = {}
    for idx, name in enumerate(feature_names):
        positions.setdefault(name, idx)

    input_vector = np.zeros(len(feature_names))
    for symptom in symptoms:
        idx = positions.get(symptom)
        if idx is not None:
            input_vector[idx] = 1

    # The ensemble expects a 2-D input: wrap the single sample in a list.
    return encoder.inverse_transform(ensemble.predict([input_vector]))[0]

# Example usage: predict from three symptom names, using the training feature
# columns to lay out the input vector, and print the decoded result.
sample_symptoms = ['fatigue', 'vomiting', 'high_fever']
print(f"\nPrediction for {sample_symptoms}: {predict_disease(sample_symptoms, feature_cols)}")


XGBoost Performance:
Accuracy: 0.98
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       0.50      1.00      0.67         1
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         1
          15       1.00      0.50      0.67         2
          16       1.00      1.00      1.00  