# Data Science Specialization (Spring 2025, RUC)
## Workshop: Decision Trees
## Exercise Part I

## 1. Imports

In [None]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

## 2. Data Preparation

In [None]:
# Load the heart data set from a CSV file located in the current working directory.
data = pd.read_csv(r'heart_kaggle.csv')

In [None]:
# Preview the first rows of the data frame.
data.head()

In [None]:
# Print a summary of the data frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# List the distinct values that occur in the 'output' column.
data['output'].unique()

In [None]:
# Luckily, all features are numeric already.
# In general, there would be two options to handle 'object' type features:
# 1. Factorize:
#    data['XYZ'] = pd.factorize(data['XYZ'])[0]   # 'XYZ' is a categorical feature
# 2. One-hot encoding:
#    data = pd.get_dummies(data, columns=['XYZ'])

In [None]:
# The data set does not contain any NaN (missing) values. Otherwise, we could drop the affected rows using:
# data = data.dropna()   # dropna() returns a new DataFrame, so the result must be assigned

Apparently, the output column contains the class label. So it should be excluded from the features.

In [None]:
# Every column except the class label 'output' is used as a feature.
feature_cols = data.columns.drop('output')
feature_cols

In [None]:
# Feature matrix: all rows, restricted to the columns listed in feature_cols.
X = data.loc[:, feature_cols]
# Target vector: the class label column.
y = data['output']

We split the data into a training set (70%) and a test set (30%).

In [None]:
# Hold out 30% of the rows as the test set; random_state fixes the shuffle
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

## 3. Training

In [None]:
# Grow a decision tree that chooses its splits by information gain (entropy).
# random_state is fixed because scikit-learn permutes the features at every
# split, so ties between equally good splits would otherwise be broken
# differently from run to run and the metrics below would not be reproducible.
dtree = DecisionTreeClassifier(criterion='entropy', random_state=1)
dtree = dtree.fit(X_train, y_train)

## 4. Testing and Evaluating

In [None]:
# Predict the class label of every sample in the held-out test set.
y_pred = dtree.predict(X_test)

In [None]:
# Accuracy: fraction of test samples whose predicted label equals the true label.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
# Recall for class 1: TP / (TP + FN), i.e. the share of true class-1 samples that were predicted as 1.
print("Recall:", metrics.recall_score(y_test, y_pred, pos_label=1))

In [None]:
# Precision for class 1: TP / (TP + FP), i.e. the share of class-1 predictions that are correct.
print("Precision:", metrics.precision_score(y_test, y_pred, pos_label=1))

In [None]:
# Confusion matrix: rows correspond to true labels, columns to predicted labels.
print("Confusion Matrix:", metrics.confusion_matrix(y_test.values, y_pred))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Pass the classifier's own class order to confusion_matrix so that the rows
# and columns are guaranteed to line up with the tick labels handed to the
# display, even if a class happens to be missing from y_test / y_pred.
cm = confusion_matrix(y_test.values, y_pred, labels=dtree.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=dtree.classes_)
disp.plot()

## 5. Visualizing Decision Trees

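You can use Scikit-learn's `export_graphviz` function to display the tree within a Jupyter notebook. For plotting the tree, you also need to install graphviz and pydotplus. Note that the `dot` command-line tool used below is part of the Graphviz system package, which must be installed separately from the Python bindings.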

* pip install graphviz

* pip install pydotplus

The **export_graphviz** function converts the decision tree classifier into a dot file, and pydotplus converts this dot file to PNG or another form that can be displayed in Jupyter.

In [None]:
from sklearn import tree
from sklearn.tree import export_graphviz
import graphviz

# Write the fitted tree to a Graphviz .dot file.
# - class_names=True labels the classes symbolically; to show readable names,
#   pass a list instead, e.g. class_names=['Negative', 'Positive'].
# - feature_names is converted to a plain list because some scikit-learn
#   releases validate this parameter as a list and reject a pandas Index.
# - impurity=False hides the impurity value in each node; filled=True colours
#   the nodes.
export_graphviz(dtree, out_file='heart_kaggle.dot',
                class_names=True,
                feature_names=list(feature_cols),
                impurity=False,
                filled=True)

# Additionally draw the tree with matplotlib (no Graphviz installation needed).
tree.plot_tree(dtree)

In [None]:
# Render the exported .dot file to a PNG with the Graphviz `dot` command-line tool
# (the leading "!" runs the line as a shell command in IPython/Jupyter).
! dot -Tpng heart_kaggle.dot -o heart_kaggle.png

from IPython import display
# Show the rendered PNG inline in the notebook.
display.Image("heart_kaggle.png")

In [None]:
import graphviz

# export_graphviz writes the .dot file as UTF-8, so read it back with the same
# encoding instead of relying on the platform's default locale encoding.
with open('heart_kaggle.dot', encoding='utf-8') as f:
    tree_graph = f.read()

graph = graphviz.Source(tree_graph)
# Render the DOT source to heart_kaggle.pdf (PDF is graphviz's default output
# format) and open the result in the system's default viewer.
graph.render("heart_kaggle", view=True)