In [1]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from ucimlrepo import fetch_ucirepo
import pandas as pd

In [2]:
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 

# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets
y = y.rename(columns={"num":"diagnosis"})

#Finds the index for an 80:20 split of the data
split = int(len(X)*0.8)


In [3]:
#Creates a decision tree model based on the provided data and prints a report of its performance

def decision_tree (X_train, X_test, y_train, y_test):
    classification_tree = tree.DecisionTreeClassifier()
    classification_tree = classification_tree.fit(X_train, y_train)

    y_pred = classification_tree.predict(X_test)
    model_report = classification_report(y_test, y_pred, zero_division=0)
    print(model_report)

In [4]:
#Decision Tree, no feature or target modification

X_train = X[:split]
X_test = X[split:]
y_train = y[:split]
y_test = y[split:]

decision_tree(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.61      0.86      0.71        29
           1       0.20      0.08      0.12        12
           2       0.50      0.23      0.32        13
           3       0.29      0.40      0.33         5
           4       0.00      0.00      0.00         2

    accuracy                           0.51        61
   macro avg       0.32      0.32      0.30        61
weighted avg       0.46      0.51      0.46        61



In [5]:
#Features with no correlation to target removed

X_mod = X.copy()
X_mod = X_mod.drop(['trestbps', 'chol', 'fbs'], axis=1)

X_train = X_mod[:split]
X_test = X_mod[split:]
y_train = y[:split]
y_test = y[split:]

decision_tree(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.62      0.79      0.70        29
           1       0.12      0.08      0.10        12
           2       0.33      0.15      0.21        13
           3       0.17      0.20      0.18         5
           4       0.00      0.00      0.00         2

    accuracy                           0.44        61
   macro avg       0.25      0.25      0.24        61
weighted avg       0.40      0.44      0.41        61



In [6]:
#Target data condensed down binary options

y_mod = y.copy()
y_mod['diagnosis'] = y_mod['diagnosis'].map(lambda d: 1 if d != 0 else 0)

X_train = X[:split]
X_test = X[split:]
y_train = y_mod[:split]
y_test = y_mod[split:]

decision_tree(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.62      0.86      0.72        29
           1       0.81      0.53      0.64        32

    accuracy                           0.69        61
   macro avg       0.72      0.70      0.68        61
weighted avg       0.72      0.69      0.68        61



In [7]:
#Target data condensed to binary option, features with no correlation to diagnosis removed

X_mod = X.copy()
X_mod = X_mod.drop(['trestbps', 'chol', 'fbs'], axis=1)
y_mod = y.copy()
y_mod['diagnosis'] = y_mod['diagnosis'].map(lambda d: 1 if d != 0 else 0)

X_train = X_mod[:split]
X_test = X_mod[split:]
y_train = y_mod[:split]
y_test = y_mod[split:]

decision_tree(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.56      0.76      0.65        29
           1       0.68      0.47      0.56        32

    accuracy                           0.61        61
   macro avg       0.62      0.61      0.60        61
weighted avg       0.63      0.61      0.60        61



In [8]:
#Creates a Random Forest Classifier based on provided data and prints a report on the model's performance

def random_forest(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=100, 
                            max_depth=5, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            max_features='sqrt',
                            criterion='gini', 
                            )

    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_test)
    model_report = classification_report(y_test, y_pred, zero_division=0)
    print(model_report)

In [9]:
#Creates a random forest model based on unmodified data

X_train = X[:split]
X_test = X[split:]
y_train = y[:split]['diagnosis']
y_test = y[split:]['diagnosis']

random_forest(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.61      0.97      0.75        29
           1       0.67      0.17      0.27        12
           2       0.00      0.00      0.00        13
           3       0.18      0.40      0.25         5
           4       0.00      0.00      0.00         2

    accuracy                           0.52        61
   macro avg       0.29      0.31      0.25        61
weighted avg       0.44      0.52      0.43        61



In [10]:
#Creates a random forest model; target labels have been condensed into binary options

y_mod = y.copy()
y_mod['diagnosis'] = y_mod['diagnosis'].map(lambda d: 1 if d != 0 else 0)

X_train = X[:split]
X_test = X[split:]
y_train = y_mod[:split]['diagnosis']
y_test = y_mod[split:]['diagnosis']

random_forest(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.67      0.90      0.76        29
           1       0.86      0.59      0.70        32

    accuracy                           0.74        61
   macro avg       0.77      0.75      0.73        61
weighted avg       0.77      0.74      0.73        61

