In [2]:
pip install optuna

Collecting optuna
  Using cached optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Using cached alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading SQLAlchemy-2.0.37-cp311-cp311-win_amd64.whl.metadata (9.9 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Using cached Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.1.1-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Using cached optuna-4.2.0-py3-none-any.whl (383 kB)
Using cached alembic-1.14.1-py3-none-any.whl (233 kB)
Downloading SQLAlchemy-2.0.37-cp311-cp311-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\yogin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-2.1.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\yogin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    roc_curve, 
    roc_auc_score,
    precision_recall_curve
)
from sklearn.feature_selection import mutual_info_classif, SelectKBest

class HeartDiseaseClassifier:
    def __init__(self, random_state: int = 42):
        self.random_state = random_state
        self.label_encoder = LabelEncoder()
        self.preprocessor = None
        self.model = None
        self.feature_names = None
        self.disease_labels = None

    def preprocess_data(self, data: pd.DataFrame):
        # Preserve original disease labels
        self.disease_labels = data['Disease'].unique()
        
        # Separate features and target
        X = data.drop('Disease', axis=1)
        y = data['Disease']
        
        # Encode target variable
        y = self.label_encoder.fit_transform(y)
        
        # Identify numeric and categorical columns
        numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = X.select_dtypes(include=['object']).columns
        
        # Save feature names
        self.feature_names = list(numeric_features) + list(categorical_features)
        
        # Create preprocessing steps
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        
        # Combine preprocessing steps
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])
        
        # Apply preprocessing to the entire dataset
        X_preprocessed = self.preprocessor.fit_transform(X)
        
        return X_preprocessed, y
    
    def create_model(self):
        # Create stacked ensemble with multiple base models
        base_models = [
            ('rf', RandomForestClassifier(
                n_estimators=200, 
                max_depth=15, 
                random_state=self.random_state
            )),
            ('svm', SVC(
                probability=True, 
                kernel='rbf', 
                random_state=self.random_state
            ))
        ]
        
        self.model = Pipeline([
            ('feature_selection', SelectKBest(score_func=mutual_info_classif, k=10)),
            ('classifier', StackingClassifier(
                estimators=base_models,
                final_estimator=LogisticRegression(multi_class='ovr', max_iter=1000),
                cv=5
            ))
        ])

    def train_and_evaluate(self, X, y):
        # Split data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=self.random_state
        )
        
        # Train model
        self.model.fit(X_train, y_train)
        
        # Predictions
        y_pred = self.model.predict(X_test)
        y_pred_proba = self.model.predict_proba(X_test)
        
        # Evaluation metrics
        report = classification_report(y_test, y_pred, output_dict=True)
        
        # Visualizations
        self._plot_confusion_matrix(y_test, y_pred)
        self._plot_roc_curve(X_test, y_test)
        self._plot_precision_recall_curve(X_test, y_test)
        
        return {
            'classification_report': report,
            'accuracy': report['accuracy'],
            'macro_f1': report['macro avg']['f1-score'],
            'y_test': y_test,
            'y_pred_proba': y_pred_proba
        }
    
    def _plot_confusion_matrix(self, y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        plt.savefig('confusion_matrix.png')
        plt.close()
    
    def _plot_roc_curve(self, X_test, y_test):
        y_pred_proba = self.model.predict_proba(X_test)
        
        plt.figure(figsize=(10, 8))
        for i in range(len(np.unique(y_test))):
            fpr, tpr, _ = roc_curve(y_test == i, y_pred_proba[:, i])
            roc_auc = roc_auc_score(y_test == i, y_pred_proba[:, i])
            plt.plot(fpr, tpr, label=f'ROC curve (class {i}, AUC = {roc_auc:.2f})')
        
        plt.plot([0, 1], [0, 1], linestyle='--')
        plt.title('Receiver Operating Characteristic')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc="lower right")
        plt.savefig('roc_curve.png')
        plt.close()
    
    def _plot_precision_recall_curve(self, X_test, y_test):
        y_pred_proba = self.model.predict_proba(X_test)
        
        plt.figure(figsize=(10, 8))
        for i in range(len(np.unique(y_test))):
            precision, recall, _ = precision_recall_curve(y_test == i, y_pred_proba[:, i])
            plt.plot(recall, precision, label=f'Precision-Recall curve (class {i})')
        
        plt.title('Precision-Recall Curve')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.legend(loc="lower right")
        plt.savefig('precision_recall_curve.png')
        plt.close()
    
    def manual_testing(self, test_data: pd.DataFrame):
        # Ensure test data has correct column names
        test_df = pd.DataFrame(test_data, columns=self.feature_names)
        
        # Preprocess and predict probabilities
        preprocessed_test = self.preprocessor.transform(test_df)
        probabilities = self.model.predict_proba(preprocessed_test)
        
        results = []
        for prob in probabilities:
            # Map probabilities to disease labels
            disease_risks = [
                {
                    'Disease': self.disease_labels[i],
                    'Risk Score': round(p * 100, 2)
                } 
                for i, p in enumerate(prob)
            ]
            
            # Sort risks from highest to lowest
            disease_risks_sorted = sorted(disease_risks, key=lambda x: x['Risk Score'], reverse=True)
            
            results.append(disease_risks_sorted)
        
        return results

def print_detailed_predictions(manual_predictions):
    """
    Print detailed disease risk predictions
    """
    for i, risks in enumerate(manual_predictions):
        print(f"\nSample {i + 1} Detailed Risk Analysis:")
        
        # Highlight the highest risk disease
        highest_risk = risks[0]
        print(f"  🚨 Highest Risk Disease: {highest_risk['Disease']}")
        print(f"     PREDICTED RISK: {highest_risk['Risk Score']}%")
        
        # Print other diseases in descending risk order
        print("  Additional Disease Risks:")
        for risk in risks[1:]:
            print(f"    - {risk['Disease']}: {risk['Risk Score']}%")
        print("-" * 40)

def main():
    # Load data
    data = pd.read_csv('heart_disease_imputed_clinical.csv')
    
    # Initialize and train classifier
    classifier = HeartDiseaseClassifier()
    X, y = classifier.preprocess_data(data)
    classifier.create_model()
    
    # Evaluate model
    results = classifier.train_and_evaluate(X, y)
    
    # Print results
    print("Classification Report:")
    print(results['classification_report'])
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"Macro F1-Score: {results['macro_f1']:.4f}")
    
    # Manual testing example
    test_samples = data.sample(5).drop('Disease', axis=1)
    manual_predictions = classifier.manual_testing(test_samples)
    
    # Print detailed predictions
    print_detailed_predictions(manual_predictions)

if __name__ == "__main__":
    main()



Classification Report:
{'0': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 60.0}, '1': {'precision': 0.9767441860465116, 'recall': 0.7, 'f1-score': 0.8155339805825242, 'support': 60.0}, '2': {'precision': 0.896551724137931, 'recall': 0.8524590163934426, 'f1-score': 0.8739495798319328, 'support': 61.0}, '3': {'precision': 0.8235294117647058, 'recall': 0.9180327868852459, 'f1-score': 0.8682170542635659, 'support': 61.0}, '4': {'precision': 0.8461538461538461, 'recall': 0.9016393442622951, 'f1-score': 0.873015873015873, 'support': 61.0}, '5': {'precision': 0.9672131147540983, 'recall': 0.9833333333333333, 'f1-score': 0.9752066115702479, 'support': 60.0}, '6': {'precision': 0.8529411764705882, 'recall': 0.9508196721311475, 'f1-score': 0.8992248062015504, 'support': 61.0}, '7': {'precision': 0.9838709677419355, 'recall': 1.0, 'f1-score': 0.991869918699187, 'support': 61.0}, 'accuracy': 0.9134020618556701, 'macro avg': {'precision': 0.9183755533837021, 'recall': 0.91328551912

In [7]:
import pandas as pd
import joblib

# Load your training data
data = pd.read_csv('heart_disease_imputed_clinical.csv')

# Initialize and train classifier
classifier = HeartDiseaseClassifier()
X, y = classifier.preprocess_data(data)
classifier.create_model()
classifier.train_and_evaluate(X, y)

# Save the model components
model_data = {
    'preprocessor': classifier.preprocessor,
    'model': classifier.model,
    'feature_names': classifier.feature_names,
    'disease_labels': classifier.disease_labels
}

# Save to pickle file
joblib.dump(model_data, 'heart_disease_model.pkl')



['heart_disease_model.pkl']