In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except ImportError:
    print("XGBoost not available - will use GradientBoosting as alternative")
    from sklearn.ensemble import GradientBoostingClassifier
    XGBOOST_AVAILABLE = False
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score,
                             recall_score, f1_score, matthews_corrcoef,
                             confusion_matrix, classification_report)
import pickle
import warnings
warnings.filterwarnings('ignore')

In [43]:
import os

for dirname, f, filenames in os.walk('/kaggle'):
    for filename in filenames:
        print(f)
        print(os.path.join(dirname, filename))

[]
/kaggle/lib/kaggle/gcp.py
[]
/kaggle/input/datasets/saikatkumar/heart-disease-dataset-uci-new/HeartDiseaseTrain-Test.csv
[]
/kaggle/input/datasets/ketangangal/heart-disease-dataset-uci/HeartDiseaseTrain-Test.csv
['.virtual_documents']
/kaggle/working/model_xgboost.pkl
['.virtual_documents']
/kaggle/working/model_knn.pkl
['.virtual_documents']
/kaggle/working/model_comparison.csv
['.virtual_documents']
/kaggle/working/model_logistic_regression.pkl
['.virtual_documents']
/kaggle/working/model_random_forest.pkl
['.virtual_documents']
/kaggle/working/heart_disease_test.csv
['.virtual_documents']
/kaggle/working/scaler.pkl
['.virtual_documents']
/kaggle/working/label_encoders.pkl
['.virtual_documents']
/kaggle/working/model_naive_bayes.pkl
['.virtual_documents']
/kaggle/working/model_decision_tree.pkl
[]
/kaggle/working/.virtual_documents/__notebook_source__.ipynb


In [45]:
# Load the dataset
print("Loading dataset...")
df = pd.read_csv("/kaggle/input/datasets/saikatkumar/heart-disease-dataset-uci-new/HeartDiseaseTrain-Test.csv")
print(f"Dataset shape: {df.shape}")
print(f"\nFirst few rows:\n{df.head()}")
print(f"\nDataset info:")
print(df.info())
print(f"\nMissing values:\n{df.isnull().sum()}")
print(f"\nTarget distribution:\n{df['target'].value_counts()}")

Loading dataset...
Dataset shape: (1025, 14)

First few rows:
   age     sex chest_pain_type  resting_blood_pressure  cholestoral  \
0   52    Male  Typical angina                     125          212   
1   53    Male  Typical angina                     140          203   
2   70    Male  Typical angina                     145          174   
3   61    Male  Typical angina                     148          203   
4   62  Female  Typical angina                     138          294   

      fasting_blood_sugar               rest_ecg  Max_heart_rate  \
0    Lower than 120 mg/ml  ST-T wave abnormality             168   
1  Greater than 120 mg/ml                 Normal             155   
2    Lower than 120 mg/ml  ST-T wave abnormality             125   
3    Lower than 120 mg/ml  ST-T wave abnormality             161   
4  Greater than 120 mg/ml  ST-T wave abnormality             106   

  exercise_induced_angina  oldpeak        slope vessels_colored_by_flourosopy  \
0                    

In [46]:
# Data Preprocessing
print("\n" + "="*50)
print("PREPROCESSING DATA")
print("="*50)
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('target')  # Remove target from numerical columns

print(f"\nCategorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")


PREPROCESSING DATA

Categorical columns: ['sex', 'chest_pain_type', 'fasting_blood_sugar', 'rest_ecg', 'exercise_induced_angina', 'slope', 'vessels_colored_by_flourosopy', 'thalassemia']
Numerical columns: ['age', 'resting_blood_pressure', 'cholestoral', 'Max_heart_rate', 'oldpeak']


In [47]:
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [48]:
X = df.drop('target', axis=1)
y = df['target']

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [50]:
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


Training set size: 820
Test set size: 205


In [51]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [52]:
test_data = X_test.copy()
test_data['target'] = y_test
test_data.to_csv('heart_disease_test.csv', index=False)
print("\nTest data saved as 'heart_disease_test.csv'")


Test data saved as 'heart_disease_test.csv'


In [53]:
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)


In [54]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42,max_depth=10,min_samples_split=10,min_samples_leaf=5),
    'kNN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(n_estimators=70, max_depth=15,min_samples_split=8, min_samples_leaf=6,max_features='sqrt',random_state=42)
}


if XGBOOST_AVAILABLE:
    models['XGBoost'] = XGBClassifier(n_estimators=70,random_state=42,max_depth=3,learning_rate=0.05)
else:
    models['XGBoost'] = GradientBoostingClassifier(n_estimators=70,random_state=42,max_depth=3,learning_rate=0.05)
    print("\nNote: Using GradientBoostingClassifier instead of XGBoost")


In [55]:
results = []

In [56]:
print("\n" + "="*50)
print("TRAINING AND EVALUATING MODELS")
print("="*50)

for model_name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {model_name}...")
    print(f"{'='*50}")

    model.fit(X_train_scaled, y_train)
    test_acc = model.score(X_test_scaled, y_test)

    # Make predictions
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    mcc = matthews_corrcoef(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC Score: {auc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"MCC Score: {mcc:.4f}")

    # Store results
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'AUC': auc,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'MCC': mcc
    })

    # Save the trained model
    model_filename = f"model_{model_name.replace(' ', '_').lower()}.pkl"
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved as {model_filename}")

results_df = pd.DataFrame(results)
print("\n" + "="*50)
print("FINAL RESULTS COMPARISON")
print("="*50)
print(results_df.to_string(index=False))

# Save results to CSV
results_df.to_csv('model_comparison.csv', index=False)
print("\nResults saved to 'model_comparison.csv'")





TRAINING AND EVALUATING MODELS

Training Logistic Regression...
Accuracy: 0.8439
AUC Score: 0.9320
Precision: 0.8230
Recall: 0.8857
F1 Score: 0.8532
MCC Score: 0.6891
Model saved as model_logistic_regression.pkl

Training Decision Tree...
Accuracy: 0.9366
AUC Score: 0.9885
Precision: 0.9510
Recall: 0.9238
F1 Score: 0.9372
MCC Score: 0.8736
Model saved as model_decision_tree.pkl

Training kNN...
Accuracy: 0.8634
AUC Score: 0.9689
Precision: 0.8598
Recall: 0.8762
F1 Score: 0.8679
MCC Score: 0.7267
Model saved as model_knn.pkl

Training Naive Bayes...
Accuracy: 0.8439
AUC Score: 0.9135
Precision: 0.8288
Recall: 0.8762
F1 Score: 0.8519
MCC Score: 0.6884
Model saved as model_naive_bayes.pkl

Training Random Forest...
Accuracy: 0.9415
AUC Score: 0.9847
Precision: 0.9189
Recall: 0.9714
F1 Score: 0.9444
MCC Score: 0.8842
Model saved as model_random_forest.pkl

Training XGBoost...
Accuracy: 0.9073
AUC Score: 0.9677
Precision: 0.8772
Recall: 0.9524
F1 Score: 0.9132
MCC Score: 0.8173
Model saved

In [59]:
print("\n" + "="*50)
print("MODEL PERFORMANCE OBSERVATIONS")
print("="*50)

observations = []

for _, row in results_df.iterrows():
    model = row['Model']
    acc = row['Accuracy']
    auc = row['AUC']
    f1 = row['F1']
    mcc = row['MCC']

    # Generate observation based on performance
    if acc >= 0.85 and auc >= 0.90:
        obs = f"Excellent performance with high accuracy ({acc:.3f}) and AUC ({auc:.3f}). Well-balanced model suitable for deployment."
    elif acc >= 0.80:
        obs = f"Good performance with accuracy of {acc:.3f} and AUC of {auc:.3f}. Reliable predictions with balanced precision-recall."
    elif acc >= 0.75:
        obs = f"Moderate performance with accuracy of {acc:.3f}. May need hyperparameter tuning or feature engineering for improvement."
    else:
        obs = f"Below average performance (accuracy: {acc:.3f}). Consider different algorithms or data preprocessing strategies."

    observations.append({'Model': model, 'Observation': obs})
    print(f"\n{model}:")
    print(f"  {obs}")

# Save observations
obs_df = pd.DataFrame(observations)
obs_df.to_csv('model_observations.csv', index=False)

print("\n" + "="*50)
print("TRAINING COMPLETE!")
print("="*50)
print("\nGenerated files:")
print("  - model_*.pkl (6 trained models)")
print("  - scaler.pkl (feature scaler)")
print("  - label_encoders.pkl (categorical encoders)")
print("  - heart_disease_test.csv (test dataset for Streamlit)")
print("  - model_comparison.csv (evaluation results)")
print("  - model_observations.csv (performance observations)")


MODEL PERFORMANCE OBSERVATIONS

Logistic Regression:
  Good performance with accuracy of 0.844 and AUC of 0.932. Reliable predictions with balanced precision-recall.

Decision Tree:
  Excellent performance with high accuracy (0.937) and AUC (0.988). Well-balanced model suitable for deployment.

kNN:
  Excellent performance with high accuracy (0.863) and AUC (0.969). Well-balanced model suitable for deployment.

Naive Bayes:
  Good performance with accuracy of 0.844 and AUC of 0.914. Reliable predictions with balanced precision-recall.

Random Forest:
  Excellent performance with high accuracy (0.941) and AUC (0.985). Well-balanced model suitable for deployment.

XGBoost:
  Excellent performance with high accuracy (0.907) and AUC (0.968). Well-balanced model suitable for deployment.

TRAINING COMPLETE!

Generated files:
  - model_*.pkl (6 trained models)
  - scaler.pkl (feature scaler)
  - label_encoders.pkl (categorical encoders)
  - heart_disease_test.csv (test dataset for Streamlit)