In [17]:
# Initial imports
import os
from pathlib import Path

import pandas as pd
from sqlalchemy import create_engine
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [18]:
# Define the connection string.
# The URL is read from the HEART_DB_URL environment variable so that real credentials
# never have to be written into the notebook. The fallback is the default local
# development database and should not be used for anything but a local sandbox.
connection_string = os.environ.get(
    "HEART_DB_URL", "postgresql://postgres:postgres@localhost:5432/Heart_db"
)
# Create the engine
engine = create_engine(connection_string)
# Load every row and column of the CHDIndicators table into a pandas DataFrame
query = "SELECT * FROM CHDIndicators;"
indicators_df = pd.read_sql_query(query, engine)
# Display the DataFrame
indicators_df

Unnamed: 0,male,age,education,currentsmoker,cigsperday,bpmeds,prevalentstroke,prevalenthyp,diabetes,totchol,sysbp,diabp,bmi,heartrate,glucose,tenyearchd
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
3654,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
3655,0,52,2.0,0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
3656,1,40,3.0,0,0.0,0.0,0,1,0,185.0,141.0,98.0,25.60,67.0,72.0,0


In [19]:
# Show the dtype pandas assigned to each column of the loaded table
indicators_df.dtypes

male                 int64
age                  int64
education          float64
currentsmoker        int64
cigsperday         float64
bpmeds             float64
prevalentstroke      int64
prevalenthyp         int64
diabetes             int64
totchol            float64
sysbp              float64
diabp              float64
bmi                float64
heartrate          float64
glucose            float64
tenyearchd           int64
dtype: object

In [20]:
# Count the missing values in each column
indicators_df.isna().sum()

male               0
age                0
education          0
currentsmoker      0
cigsperday         0
bpmeds             0
prevalentstroke    0
prevalenthyp       0
diabetes           0
totchol            0
sysbp              0
diabp              0
bmi                0
heartrate          0
glucose            0
tenyearchd         0
dtype: int64

In [31]:
# Separate the predictors from the target:
# X keeps every column except `tenyearchd`, y is the `tenyearchd` column itself.
X = indicators_df.drop(columns="tenyearchd")
y = indicators_df["tenyearchd"]

In [22]:
# Display the feature matrix (every column except the target `tenyearchd`)
X

Unnamed: 0,male,age,education,currentsmoker,cigsperday,bpmeds,prevalentstroke,prevalenthyp,diabetes,totchol,sysbp,diabp,bmi,heartrate,glucose
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0
3654,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0
3655,0,52,2.0,0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0
3656,1,40,3.0,0,0.0,0.0,0,1,0,185.0,141.0,98.0,25.60,67.0,72.0


In [23]:
# Display the target vector (the `tenyearchd` column)
y

0       0
1       0
2       0
3       1
4       0
       ..
3653    1
3654    0
3655    0
3656    0
3657    0
Name: tenyearchd, Length: 3658, dtype: int64

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\senel\anaconda3\envs\dev\lib\site-packages\sklearn\utils\_param_validation.py)

In [24]:
# Splitting into Train and Test sets
# No test_size is passed, so train_test_split falls back to its default split ratio;
# random_state=42 fixes the shuffle so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [25]:
# Create the StandardScaler instance (not yet fitted to any data at this point)
scaler = StandardScaler()

In [26]:
# Fit the Standard Scaler with the training data
# Only X_train is passed here, so the test rows do not influence the scaling parameters.
X_scaler = scaler.fit(X_train)

In [27]:
# Apply the fitted scaler to both partitions: the training set and the test set
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [28]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
# Fit the model. y_train is a 1-D pandas Series, so it is passed directly:
# Series.ravel() is deprecated in recent pandas releases and is not needed here.
rf_model = rf_model.fit(X_train_scaled, y_train)
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
# Calculating the confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 so the row/column names below always line up,
# even if one of the classes is absent from both y_test and predictions.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,762,4
Actual 1,143,6


Accuracy Score : 0.839344262295082
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       766
           1       0.60      0.04      0.08       149

    accuracy                           0.84       915
   macro avg       0.72      0.52      0.49       915
weighted avg       0.80      0.84      0.78       915



In [30]:
# Get the feature importance array
importances = rf_model.feature_importances_
# Pair each importance with its column name and rank from most to least important.
# The model has 15 input columns, so the [:15] slice shows the complete ranking.
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
importances_sorted[:15]

[(0.1368495636410372, 'sysbp'),
 (0.12964237317108168, 'bmi'),
 (0.12569806797028962, 'age'),
 (0.12133417946939376, 'totchol'),
 (0.11829947025086886, 'diabp'),
 (0.11429026856564829, 'glucose'),
 (0.0970858272551195, 'heartrate'),
 (0.04918888390349, 'cigsperday'),
 (0.03668859273141856, 'education'),
 (0.0218199791946412, 'male'),
 (0.01703948358564989, 'prevalenthyp'),
 (0.012903669737742016, 'currentsmoker'),
 (0.007650841785660607, 'bpmeds'),
 (0.006543868842288572, 'diabetes'),
 (0.004964929895670184, 'prevalentstroke')]