In [None]:
# Initial imports
import datetime
import os
import sqlalchemy
import pandas as pd
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

import pydotplus
from IPython.display import Image

In [None]:
# Database Setup
# The connection URL can be overridden with the HEART_FAILURE_DB_URL environment
# variable so real credentials never have to be committed with the notebook.
# The fallback is the local development database this notebook was written for.
DATABASE_URL = os.environ.get(
    "HEART_FAILURE_DB_URL",
    "postgresql+psycopg2://postgres:postgres@localhost:5432/final_project",
)
engine = create_engine(DATABASE_URL)

# Reflect existing database into a new model
Base = automap_base()

# Reflect the tables
Base.prepare(autoload_with=engine)

# Create session (link) from Python to the PostgreSQL database.
# The raw SQL query below runs on its own connection and does not use it.
session = Session(engine)

# Names given, in order, to the positional fields of each heart_failure row.
# NOTE(review): assumes the table has these 12 columns in exactly this order -
# confirm against the schema.
# NOTE(review): "exercise_aniga" looks like a typo for "exercise_angina"; it is
# kept unchanged because downstream column names are derived from it.
HEART_FAILURE_COLUMNS = [
    "age",
    "sex",
    "chest_pain_type",
    "resting_bp",
    "cholesterol",
    "fasting_bs",
    "resting_ecg",
    "max_hr",
    "exercise_aniga",
    "old_peak",
    "st_slope",
    "heart_disease",
]

# Engine.execute() no longer exists in SQLAlchemy 2.0, so run the query on an
# explicit connection; the context manager releases it once the rows are read.
with engine.connect() as connection:
    heart_data = connection.execute(
        sqlalchemy.text("SELECT * FROM heart_failure")
    ).fetchall()

# Create list and load to dataframe
heart_data_list = [
    {column: row[position] for position, column in enumerate(HEART_FAILURE_COLUMNS)}
    for row in heart_data
]

session.close()

# Passing the column list keeps the expected columns even when the table is empty.
raw_heart_df = pd.DataFrame(heart_data_list, columns=HEART_FAILURE_COLUMNS)
raw_heart_df

In [None]:
# Feature matrix: every column except the target. drop() returns a new frame,
# so raw_heart_df itself is left untouched.
X = raw_heart_df.drop(columns="heart_disease")
X.head()

In [None]:
# One-hot encode the feature frame with pandas' default get_dummies settings
X = pd.get_dummies(data=X)
X.head()

In [None]:
# Top 5 features from the v1 importance run. To train on only these, select
# them with a list of column names (double brackets):
# X = X[['st_slope_Up', 'cholesterol', 'max_hr', 'old_peak', 'st_slope_Flat']]

# Show the column names of the encoded feature frame
X.columns

In [None]:
# Target vector as a 1-D NumPy array.
# Series.ravel() is deprecated in recent pandas releases; to_numpy() returns the
# same flat array of heart_disease labels.
y = raw_heart_df["heart_disease"].to_numpy()
y[:5]

In [None]:
# Split features and target into train and test sets; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [None]:
# Standardizer that centers each feature and scales it to unit variance
# (with_mean / with_std spelled out; both are the library defaults)
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Fit the standard scaler on the training features only.
# fit() returns the scaler itself, so X_scaler is the same fitted object.
scaler.fit(X_train)
X_scaler = scaler

In [None]:
# Apply the fitted scaler to the training and testing features, in that order
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [None]:
# Random forest classifier: 500 trees, seeded so results are repeatable
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [None]:
# Train the forest on the scaled training data. fit() trains the estimator in
# place and returns it, so rf_model keeps referring to the fitted model; the
# trailing semicolon suppresses the estimator repr in the cell output.
rf_model.fit(X_train_scaled, y_train);

In [None]:
# Predict the class of every row in the scaled testing set
predictions = rf_model.predict(X=X_test_scaled)

In [None]:
# Accuracy: share of test rows whose prediction matches the true label
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix of true labels against predicted labels, wrapped in a
# labelled DataFrame for display
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [None]:
# Report: headings, confusion matrix table, accuracy, then the per-class report
print("Confusion Matrix: Test 1", "All features", sep="\n")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

In [None]:
# The fitted forest exposes one importance score per input feature
importances = rf_model.feature_importances_
# Pair each score with its column name and list them from most to least important
sorted(zip(importances, X.columns), reverse=True)

In [None]:
# Create DOT data
# export_graphviz draws a single decision tree, so export one fitted member of
# the forest (rf_model.estimators_) rather than the forest itself.
# The model was trained on the standardized features, so the split thresholds
# shown in the graph are in scaled units.
TREE_INDEX = 0
dot_data = tree.export_graphviz(
    rf_model.estimators_[TREE_INDEX],
    out_file=None,
    feature_names=list(X.columns),  # plain list of names, one per feature
    class_names=["0", "1"],
    filled=True,
)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph
Image(graph.create_png())

In [None]:
# Saving the tree as PNG
# The path is relative to the notebook's working directory.
# NOTE(review): presumably the ../Resources directory must already exist - confirm.
file_path = "../Resources/heart_disease_tree_v1.png"
# Write the graph built in the previous cell to file_path as a PNG image
graph.write_png(file_path)