In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier # Corrected typo here
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score

# Apply seaborn's white-grid look to every figure drawn in this notebook.
sns.set_style(style='whitegrid')

In [None]:
# Fetch the heart-disease dataset through the Kaggle Hub API. The returned
# value is used below as the directory that contains the CSV file.
print("Downloading dataset...")
path = kagglehub.dataset_download("redwankarimsony/heart-disease-data")

# Read the CSV found under the download location into the working frame.
file_path = '{}/heart_disease_uci.csv'.format(path)
df = pd.read_csv(file_path)

print("Dataset downloaded and loaded successfully.")
print("Data shape: {}".format(df.shape))

# Last expression of the cell: rich preview of the first rows.
df.head()

In [None]:
# First look at the raw frame: schema, summary statistics and missing data.
print("Dataset Information:")
df.info()

summary_stats = df.describe()
print("\nDescriptive Statistics:")
print(summary_stats)

# Single number: how many cells are missing across the whole frame.
total_missing = df.isna().sum().sum()
print("\nMissing Values:")
print(total_missing)

In [None]:
# Missing-value count broken down by column.
df.isna().sum()

In [None]:
# Class balance of the target column `num`.
# The rest of this notebook models `num` as a multi-class label (see the
# 'multi:softmax' XGBoost setup and the weighted metric averages), so the
# title must not describe it as a plain 0/1 flag.
# NOTE(review): "1+ = Disease" follows the usual UCI heart-disease encoding;
# confirm against the dataset description.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='num', data=df, palette='viridis', hue='num', legend=False, ax=ax)
ax.set_title('Distribution of Heart Disease Target (0 = No Disease, 1+ = Disease)')
ax.set_xlabel('Target')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Visualize how four key features relate to the target column `num`.
fig, axes = plt.subplots(2, 2, figsize=(18, 14))
fig.suptitle('Key Features vs. Heart Disease', fontsize=16)

# Age vs. Target
sns.histplot(ax=axes[0, 0], data=df, x='age', hue='num', multiple='stack', palette='plasma').set_title('Age Distribution by Target')

# Max Heart Rate vs. Target
sns.boxplot(ax=axes[0, 1], data=df, x='num', y='thalch', palette='magma', hue='num', legend=False).set_title('Max Heart Rate by Target')

# Chest Pain Type vs. Target
# The bar order is fixed explicitly and the tick labels are left to seaborn,
# so every bar is labelled with the value it actually counts. Overwriting the
# tick labels with a hand-written list silently mislabels bars whenever the
# categories are drawn in a different order than the list assumes.
# Missing values are excluded from the ordering.
cp_order = sorted(df['cp'].dropna().unique())
cp_plot = sns.countplot(ax=axes[1, 0], data=df, x='cp', hue='num', palette='cividis', order=cp_order)
cp_plot.set_title('Chest Pain Type by Target')

# Sex vs. Target (same approach: explicit order, labels taken from the data)
sex_order = sorted(df['sex'].dropna().unique())
sex_plot = sns.countplot(ax=axes[1, 1], data=df, x='sex', hue='num', palette='inferno', order=sex_order)
sex_plot.set_title('Sex by Target')

# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
# Non-numeric columns are excluded because .corr() needs numbers.
numerical_df = df.select_dtypes(include=np.number)
correlations = numerical_df.corr()

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
# Re-display the first rows as a reminder of the raw columns before encoding.
df.head(n=5)

In [None]:
# One-hot encode every object-typed column of `df`.
# Produces `df_encoded`: the non-object columns of `df` followed by one
# indicator column per category.
categorical_cols = df.select_dtypes(include='object').columns
print(f"Categorical columns: {list(categorical_cols)}")

# Dense output so the result can be wrapped in a DataFrame directly.
# NOTE(review): missing values in these columns are handed to the encoder
# unchanged — confirm that the resulting encoding of NaN is what is wanted.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_data = encoder.fit_transform(df[categorical_cols])

# Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
# 0..n-1 index would misalign rows (and introduce NaNs) as soon as `df`
# carries any other index, e.g. after filtering rows.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Swap the original categorical columns for their encoded counterparts.
df_encoded = df.drop(categorical_cols, axis=1)
df_encoded = pd.concat([df_encoded, encoded_df], axis=1)

print("\nDataFrame after One-Hot Encoding:")
display(df_encoded.head())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns # Import seaborn

# Build the train/test split, impute and scale the features, then fit and
# evaluate a Logistic Regression baseline.
# Leaves X_train, X_test, y_train, y_test, scaler and numerical_cols in the
# namespace for the model cells that follow.

# Features: everything except the target ('num') and the row identifier ('id').
X = df_encoded.drop(['num', 'id'], axis=1)
y = df_encoded['num']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the assignments below never write into (or warn
# about writing into) a slice of `X`.
X_train = X_train.copy()
X_test = X_test.copy()

# Mean imputation using statistics from the training set only (no leakage).
# Every column is filled in both sets: a column that is complete in the
# training split can still have gaps in the test split, and those would
# otherwise reach the model as NaN.
train_means = X_train.mean(numeric_only=True)
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Standardize the numeric columns; the scaler is fitted on the training set
# and only applied to the test set.
scaler = StandardScaler()
numerical_cols = X_train.select_dtypes(include=np.number).columns
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000)  # raised iteration cap to help convergence
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Plot the confusion matrix with integer annotations
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

print(f"Accuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

In [None]:
# Fit and evaluate a depth-limited decision tree on the prepared split.
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All reproduces the reported numbers.
model = DecisionTreeClassifier(criterion="gini", max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Plot the confusion matrix with integer annotations
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

print(f"Accuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

In [None]:
# Fit and evaluate a small random forest (10 trees) on the prepared split.
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All reproduces the reported numbers.
model = RandomForestClassifier(n_estimators=10, criterion="gini", random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Plot the confusion matrix with integer annotations
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

print(f"Accuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

In [None]:
import xgboost as xgb

# Gradient-boosted trees on the same prepared split.
# 'multi:softmax' yields the predicted class label directly, whereas
# 'multi:softprob' would yield one probability per class.
n_classes = len(np.unique(y_train))
model = xgb.XGBClassifier(objective='multi:softmax', num_class=n_classes, eval_metric='mlogloss')
model.fit(X_train, y_train)

# Predict on the held-out rows and score the result.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Confusion matrix as an annotated heatmap (integer cell counts).
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt='d', ax=ax)
ax.set_title('Confusion Matrix - XGBoost')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Text summary of the same evaluation.
print(f"Accuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

In [None]:
# Compare the four classifiers on the held-out test set.
# Each model is fitted exactly once and all four metrics are computed from the
# same predictions. Refitting per metric is wasteful, and for the unseeded
# tree-based models it mixes numbers from different fitted models in one row.
candidate_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(criterion="gini", max_depth=10, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=10, criterion="gini", random_state=42),
    'XGBoost': xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=len(np.unique(y_train)),
        eval_metric='mlogloss',
    ),
}

# Column-oriented results, one list entry per model.
results = {
    'Model': [],
    'Accuracy': [],
    'Precision (weighted)': [],
    'Recall (weighted)': [],
    'F1-score (weighted)': [],
}

for candidate_name, candidate in candidate_models.items():
    candidate_pred = candidate.fit(X_train, y_train).predict(X_test)
    results['Model'].append(candidate_name)
    results['Accuracy'].append(accuracy_score(y_test, candidate_pred))
    # Weighted averages account for class imbalance; zero_division=0 keeps
    # classes that are never predicted from raising warnings.
    results['Precision (weighted)'].append(
        precision_score(y_test, candidate_pred, average='weighted', zero_division=0)
    )
    results['Recall (weighted)'].append(
        recall_score(y_test, candidate_pred, average='weighted', zero_division=0)
    )
    results['F1-score (weighted)'].append(
        f1_score(y_test, candidate_pred, average='weighted', zero_division=0)
    )

results_df = pd.DataFrame(results)
display(results_df)

# Pick best model based on Accuracy
best_model_name = results_df.loc[results_df['Accuracy'].idxmax(), 'Model']
print(f"\nBest performing model based on Accuracy: {best_model_name}")

In [None]:
import joblib

# Refit the winning model on the full dataset and persist it.
# The estimator is chosen from `best_model_name` rather than hard-coded, so the
# saved model always matches the name reported in the final message.

# Prepare the full dataset (same feature/target definition as the split above).
X_full = df_encoded.drop(['num', 'id'], axis=1)
y_full = df_encoded['num']

# Mean-impute every column that has gaps.
impute_means_full = X_full.mean(numeric_only=True)
X_full = X_full.fillna(impute_means_full)

# Standardize the numeric columns with a scaler fitted on the full data.
scaler_full = StandardScaler()
numerical_cols_full = X_full.select_dtypes(include=np.number).columns
X_full[numerical_cols_full] = scaler_full.fit_transform(X_full[numerical_cols_full])

# One factory per model name used in the comparison table; settings mirror the
# ones evaluated there. Factories are lazy so only the chosen model is built.
model_factories = {
    'Logistic Regression': lambda: LogisticRegression(max_iter=1000),
    'Decision Tree': lambda: DecisionTreeClassifier(criterion="gini", max_depth=10, random_state=42),
    'Random Forest': lambda: RandomForestClassifier(n_estimators=10, criterion="gini", random_state=42),
    'XGBoost': lambda: xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=len(np.unique(y_full)),
        eval_metric='mlogloss',
    ),
}
best_model = model_factories[best_model_name]()
best_model.fit(X_full, y_full)

# Save the model
filename = 'best_heart_disease_model.joblib'
joblib.dump(best_model, filename)

# The model was trained on encoded, imputed and scaled features, so it cannot
# score new rows without the objects that produced them. They are stored in
# a separate file so the model file keeps its original content.
preprocessing_filename = 'best_heart_disease_preprocessing.joblib'
joblib.dump(
    {
        'encoder': encoder,
        'categorical_cols': list(categorical_cols),
        'impute_means': impute_means_full,
        'scaler': scaler_full,
        'scaled_cols': list(numerical_cols_full),
        'feature_columns': list(X_full.columns),
    },
    preprocessing_filename,
)

print(f"\nBest model ({best_model_name}) trained on full dataset and saved as {filename}")
print(f"Preprocessing objects saved as {preprocessing_filename}")