In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/heart-disease-dataset


In [5]:
import pandas as pd
import kagglehub
import os # Import the os module

# Download latest version
# This downloads the dataset to a specific path and stores it in the 'path' variable
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")

print("Path to dataset files:", path)

# Construct the full path to the heart.csv file within the downloaded directory
# Assumes the heart.csv file is directly inside the downloaded folder.
# If it's in a subdirectory, adjust the path accordingly.
csv_file_path = os.path.join(path, 'heart.csv')

# Load the dataset using the correct path
data = pd.read_csv(csv_file_path)
X = data.drop('target', axis=1)  # Features
y = data['target']               # Target variable

print("Dataset loaded successfully.")

Path to dataset files: /kaggle/input/heart-disease-dataset
Dataset loaded successfully.


In [6]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz

# Train the decision tree
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X, y)

# Export to DOT format
dot_data = export_graphviz(clf, out_file=None,
                           feature_names=X.columns,
                           class_names=['No Disease', 'Disease'],
                           filled=True, rounded=True)

# Visualize with Graphviz
graph = graphviz.Source(dot_data)
graph.render("decision_tree", view=True)  # Saves as 'decision_tree.pdf'

'decision_tree.pdf'

In [7]:
# Train a pruned decision tree
clf_pruned = DecisionTreeClassifier(max_depth=3, random_state=42)
clf_pruned.fit(X, y)

# Visualize the pruned tree
dot_data_pruned = export_graphviz(clf_pruned, out_file=None,
                                  feature_names=X.columns,
                                  class_names=['No Disease', 'Disease'],
                                  filled=True, rounded=True)
graph_pruned = graphviz.Source(dot_data_pruned)
graph_pruned.render("pruned_decision_tree", view=True)

'pruned_decision_tree.pdf'

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train both models
clf.fit(X_train, y_train)  # Unpruned decision tree
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict and evaluate
y_pred_tree = clf.predict(X_test)
y_pred_rf = rf.predict(X_test)
print(f"Decision Tree Accuracy: {accuracy_score(y_test, y_pred_tree):.2f}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")

Decision Tree Accuracy: 0.99
Random Forest Accuracy: 0.99


In [9]:
# Random forest feature importances
importances = rf.feature_importances_
feature_names = X.columns
for name, importance in sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True):
    print(f"{name}: {importance:.3f}")

cp: 0.135
ca: 0.127
thalach: 0.122
oldpeak: 0.122
thal: 0.111
age: 0.078
chol: 0.075
trestbps: 0.071
exang: 0.058
slope: 0.046
sex: 0.029
restecg: 0.019
fbs: 0.008


In [10]:
from sklearn.model_selection import cross_val_score

# Decision tree cross-validation
tree_scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print(f"Decision Tree CV Accuracy: {tree_scores.mean():.2f} ± {tree_scores.std():.2f}")

# Random forest cross-validation
rf_scores = cross_val_score(rf, X, y, cv=5, scoring='accuracy')
print(f"Random Forest CV Accuracy: {rf_scores.mean():.2f} ± {rf_scores.std():.2f}")

Decision Tree CV Accuracy: 1.00 ± 0.00
Random Forest CV Accuracy: 1.00 ± 0.01
