In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [None]:
# Load the dataset
# `url` is the Kaggle dataset *page* (HTML), not a direct CSV link, so
# pd.read_csv cannot parse it. Download the CSV from that page and point
# DATA_PATH at the local copy.
url = "https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease?datasetId=1936563&sortBy=voteCount"
# NOTE(review): presumably the file shipped in the Kaggle download is named
# 'heart_2020_cleaned.csv' -- adjust the path to wherever the file was saved.
DATA_PATH = "heart_2020_cleaned.csv"
data = pd.read_csv(DATA_PATH)

# Display first 5 rows
print(data.head())

In [None]:
# Check the shape of the data
print(data.shape)

# General information
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would only append a stray "None" line.
data.info()

In [None]:
# Summary statistics as computed by DataFrame.describe()
summary_stats = data.describe()
print(summary_stats)

In [None]:
# Share of missing entries per column, as a percentage of all rows
row_count = len(data)
missing_values = data.isna().sum()
missing_percentage = missing_values.div(row_count).mul(100)
print(missing_percentage)

# Keep only the rows that have no missing entry at all
data = data.dropna(axis=0, how='any')

In [None]:
# Count the rows that repeat an earlier row exactly
duplicate_mask = data.duplicated()
duplicates = duplicate_mask.sum()
print(f"Duplicate Rows: {duplicates}")

# Keep the first occurrence of each row and discard the repeats
data = data.drop_duplicates()

In [None]:
# Distribution of the target variable
# Pass the frame through data= and the column name through x=. Recent seaborn
# releases accept only `data` positionally, so a Series passed as the first
# positional argument is no longer interpreted as the x variable.
sns.countplot(data=data, x='HeartDisease')
plt.title('Heart Disease Distribution')
plt.show()

In [None]:
# Visualize 'HeartDisease' with respect to categorical features
# Columns are referenced by name with data=/x=/hue= keywords. Recent seaborn
# releases accept only `data` positionally, so a Series passed as the first
# positional argument is no longer interpreted as the x variable.
sns.countplot(data=data, x='Sex', hue='HeartDisease')
plt.title('Heart Disease by Sex')
plt.show()

In [None]:
# Collapse the two qualified 'Diabetic' answers into plain 'Yes' / 'No':
# 'Yes (during pregnancy)' becomes 'Yes', 'No, borderline diabetes' becomes 'No'
diabetic_recode = {
    'No, borderline diabetes': 'No',
    'Yes (during pregnancy)': 'Yes',
}
data['Diabetic'] = data['Diabetic'].replace(diabetic_recode)

In [None]:
# Encode 'HeartDisease' as 0 ('No') / 1 ('Yes')
target_codes = {'No': 0, 'Yes': 1}
# Only map while the column still holds the raw labels: running .map() a second
# time on already-encoded 0/1 values would turn every entry into NaN.
if not data['HeartDisease'].isin(list(target_codes.values())).all():
    encoded_target = data['HeartDisease'].map(target_codes)
    if encoded_target.isna().any():
        # Fail here with a clear message instead of passing NaN targets on
        # to the models.
        unexpected = data.loc[encoded_target.isna(), 'HeartDisease'].unique()
        raise ValueError(f"Unexpected 'HeartDisease' values: {unexpected!r}")
    data['HeartDisease'] = encoded_target

In [None]:
# Label encode the categorical columns whose levels are presumably ordered.
# NOTE(review): LabelEncoder numbers the levels in sorted (alphabetical) order;
# confirm that this matches the intended ranking, especially for 'GenHealth',
# and switch to an explicit level -> rank mapping if it does not.
label_encoder = LabelEncoder()
for ordinal_column in ('AgeCategory', 'GenHealth'):
    data[ordinal_column] = label_encoder.fit_transform(data[ordinal_column])

# One-hot encode the rest of the categorical columns.
# 'Race' is deliberately left to this step: it is a nominal feature, and
# integer codes would impose an arbitrary order (and distances) on it for the
# linear and distance-based models trained on these features.
data = pd.get_dummies(data, drop_first=True)

In [None]:
# The target is the 'HeartDisease' column; every other column is a feature
y = data['HeartDisease']
X = data.drop(columns=['HeartDisease'])

In [None]:
# Split data into train and test sets (70% train, 30% test).
# stratify=y keeps the class proportions of the target the same in both
# splits, so the test metrics are not skewed by an unlucky draw of the classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Standardize every feature column. The scaler is fitted on the training
# split only and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is called
        on it with the training data.
    X_train, y_train
        Features and labels passed to ``model.fit``.
    X_test, y_test
        Features passed to ``model.predict`` and the labels the predictions
        are compared against.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix, classification_report)``, i.e. the
        results of ``accuracy_score``, ``confusion_matrix`` and
        ``classification_report`` called with ``y_test`` and the predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Evaluate the three metrics in the order callers unpack them.
    metric_functions = (accuracy_score, confusion_matrix, classification_report)
    return tuple(metric(y_test, predictions) for metric in metric_functions)

In [None]:
# Initialize models
# Seed every estimator that draws random numbers so a re-run reproduces the
# same scores (same value as the random_state used for the train/test split).
RANDOM_STATE = 42
models = {
    # max_iter raised from the default of 100 so the solver is not cut off
    # before it converges.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'GradientBoost': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Fit each candidate on the scaled training data and collect its test metrics
metric_names = ('Accuracy', 'Confusion Matrix', 'Classification Report')
results = {}
for name, model in models.items():
    accuracy, cm, cr = train_and_evaluate(
        model, X_train_scaled, X_test_scaled, y_train, y_test
    )
    results[name] = dict(zip(metric_names, (accuracy, cm, cr)))

# Print the collected metrics, one model after another
for name, result in results.items():
    summary_lines = (
        f"Model: {name}",
        f"Accuracy: {result['Accuracy']}",
        f"Confusion Matrix:\n{result['Confusion Matrix']}",
        f"Classification Report:\n{result['Classification Report']}",
    )
    print(*summary_lines, sep="\n")