In [2]:
# Initial imports
import os
from pathlib import Path

import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

In [3]:
# Define the connection string.
# Set the HEART_DB_URL environment variable to point at a different server or
# account without editing the notebook (this keeps real credentials out of the
# file); the fallback below is the local development database.
connection_string = os.environ.get(
    'HEART_DB_URL',
    'postgresql://postgres:postgres@localhost:5432/Heart_db',
)
# Create the engine
engine = create_engine(connection_string)
# Read the whole heart_disease_health_indicators table into a pandas DataFrame
query = "SELECT * FROM heart_disease_health_indicators;"
indicators_df = pd.read_sql_query(query, engine)
# Display the DataFrame
indicators_df

Unnamed: 0,heartdiseaseorattack,highbp,highchol,cholcheck,bmi,smoker,stroke,diabetes,physactivity,fruits,...,anyhealthcare,nodocbccost,genhlth,menthlth,physhlth,diffwalk,sex,age,education,income
0,0,1,1,1,40,1,0,0,0,0,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,0,25,1,0,0,1,0,...,0,1,3,0,0,0,0,7,6,1
2,0,1,1,1,28,0,0,0,0,1,...,1,1,5,30,30,1,0,9,4,8
3,0,1,0,1,27,0,0,0,1,1,...,1,0,2,0,0,0,0,11,3,6
4,0,1,1,1,24,0,0,0,1,1,...,1,0,2,3,0,0,0,11,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253656,0,0,0,1,25,0,0,0,1,1,...,1,0,1,0,0,0,0,4,6,8
253657,0,0,1,1,24,0,0,0,0,0,...,1,0,3,0,0,0,0,7,5,3
253658,0,0,0,0,27,0,0,0,1,0,...,1,1,2,0,0,0,0,3,6,5
253659,0,0,1,1,37,0,0,2,0,0,...,1,0,4,0,0,0,0,6,4,1


In [4]:
# Inspect the dtype pandas assigned to each column
indicators_df.dtypes

heartdiseaseorattack    int64
highbp                  int64
highchol                int64
cholcheck               int64
bmi                     int64
smoker                  int64
stroke                  int64
diabetes                int64
physactivity            int64
fruits                  int64
veggies                 int64
hvyalcoholconsump       int64
anyhealthcare           int64
nodocbccost             int64
genhlth                 int64
menthlth                int64
physhlth                int64
diffwalk                int64
sex                     int64
age                     int64
education               int64
income                  int64
dtype: object

In [5]:
# Count the missing values in each column
indicators_df.isnull().sum()

heartdiseaseorattack    0
highbp                  0
highchol                0
cholcheck               0
bmi                     0
smoker                  0
stroke                  0
diabetes                0
physactivity            0
fruits                  0
veggies                 0
hvyalcoholconsump       0
anyhealthcare           0
nodocbccost             0
genhlth                 0
menthlth                0
physhlth                0
diffwalk                0
sex                     0
age                     0
education               0
income                  0
dtype: int64

In [7]:
# Separate the predictor columns (X) from the target column (y)
X = indicators_df.drop(columns=['heartdiseaseorattack'])
y = indicators_df['heartdiseaseorattack']

In [8]:
# Preview the feature matrix (target column removed)
X

Unnamed: 0,highbp,highchol,cholcheck,bmi,smoker,stroke,diabetes,physactivity,fruits,veggies,...,anyhealthcare,nodocbccost,genhlth,menthlth,physhlth,diffwalk,sex,age,education,income
0,1,1,1,40,1,0,0,0,0,1,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,25,1,0,0,1,0,0,...,0,1,3,0,0,0,0,7,6,1
2,1,1,1,28,0,0,0,0,1,0,...,1,1,5,30,30,1,0,9,4,8
3,1,0,1,27,0,0,0,1,1,1,...,1,0,2,0,0,0,0,11,3,6
4,1,1,1,24,0,0,0,1,1,1,...,1,0,2,3,0,0,0,11,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253656,0,0,1,25,0,0,0,1,1,1,...,1,0,1,0,0,0,0,4,6,8
253657,0,1,1,24,0,0,0,0,0,1,...,1,0,3,0,0,0,0,7,5,3
253658,0,0,0,27,0,0,0,1,0,0,...,1,1,2,0,0,0,0,3,6,5
253659,0,1,1,37,0,0,2,0,0,1,...,1,0,4,0,0,0,0,6,4,1


In [9]:
# Preview the target vector
y

0         0
1         0
2         0
3         0
4         0
         ..
253656    0
253657    0
253658    0
253659    0
253660    0
Name: heartdiseaseorattack, Length: 253661, dtype: int64

In [10]:
# Hold out a test set. test_size=0.25 spells out the fraction that
# train_test_split uses by default; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [11]:
# Create the StandardScaler instance
# NOTE(review): tree-based models are generally insensitive to feature scaling,
# so this step is likely optional for the random forest — confirm before removing
scaler = StandardScaler()

In [12]:
# Fit the Standard Scaler with the training data only; the object returned by
# fit() is kept as X_scaler and is what the later transform calls use
X_scaler = scaler.fit(X_train)

In [13]:
# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [14]:
# Create the random forest classifier instance
# n_jobs=-1 trains the 500 trees in parallel on all available cores
rf_model = RandomForestClassifier(n_estimators=500, random_state=78, n_jobs=-1)
# Fit the model. y_train is already a 1-D Series, so it is passed as-is:
# Series.ravel() is deprecated in recent pandas releases and is not needed here
rf_model = rf_model.fit(X_train_scaled, y_train)
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
# Calculating the confusion matrix; labels=[0, 1] pins the row/column order
# (and the 2x2 shape) so it always matches the hard-coded labels below
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
# NOTE(review): the test split holds far fewer 1s than 0s, so accuracy alone
# is misleading — read it together with the per-class recall in the report
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,56736,832
Actual 1,5206,642


Accuracy Score : 0.9047874353475464
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     57568
           1       0.44      0.11      0.18      5848

    accuracy                           0.90     63416
   macro avg       0.68      0.55      0.56     63416
weighted avg       0.87      0.90      0.88     63416



In [16]:
# Get the feature importance array
importances = rf_model.feature_importances_
# Pair each importance with its column name and rank from most to least important
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
# Show the full ranking (every feature, not a top-N slice)
importances_sorted

[(0.19155203040889884, 'bmi'),
 (0.10988433282705962, 'age'),
 (0.1039239431330232, 'income'),
 (0.08433705211920092, 'physhlth'),
 (0.07277474549739804, 'education'),
 (0.06925751423838533, 'genhlth'),
 (0.06403909727364314, 'menthlth'),
 (0.032983739178781155, 'fruits'),
 (0.028888241766365525, 'physactivity'),
 (0.02861232328343158, 'diabetes'),
 (0.028599838729251082, 'stroke'),
 (0.02743618805677107, 'highbp'),
 (0.027253490765514532, 'veggies'),
 (0.0256928768405911, 'diffwalk'),
 (0.023988558336111358, 'highchol'),
 (0.022811659547207156, 'sex'),
 (0.022582707660285636, 'smoker'),
 (0.014813326263858582, 'nodocbccost'),
 (0.009065926175324906, 'hvyalcoholconsump'),
 (0.00767737781075282, 'anyhealthcare'),
 (0.0038250300881445187, 'cholcheck')]