In [1]:
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

from config import password

In [2]:
# Create a DB connection string and connect to the database to retrieve the data.
# The password (read from the local config module) is percent-encoded before it is
# placed in the URL: characters such as "@", "/", ":" or "%" would otherwise be
# parsed as URL delimiters and produce a wrong host or password.
encoded_password = quote(str(password), safe="")
db_string = f"postgresql://postgres:{encoded_password}@localhost:5432/Be_Heart_Smart"
engine = create_engine(db_string)
con = engine.connect()

In [3]:
# Load the whole cleaned cardio table from the database into a DataFrame.
cardio_query = "Select * From cardio_data_cleaned"
cardio_df = pd.read_sql(cardio_query, con)

# Preview the first rows.
cardio_df.head()


Unnamed: 0,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio
0,51.0,1.0,171.0,29.0,110.0,70.0,2.0,1.0,0.0,0.0,1.0,1.0
1,49.0,1.0,160.0,30.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,1.0
2,58.0,1.0,143.0,30.0,103.0,61.0,2.0,1.0,0.0,0.0,1.0,0.0
3,47.0,2.0,170.0,31.0,150.0,90.0,2.0,2.0,0.0,0.0,1.0,1.0
4,42.0,1.0,146.0,32.0,100.0,70.0,1.0,1.0,0.0,0.0,0.0,0.0


In [4]:
# Sample a quarter of the data to run mock ML algorithms.
# A fixed random_state makes the draw repeatable, so the class counts, the
# train/test split and every metric further down can be reproduced on a re-run.
cardio_df_sample = cardio_df.sample(frac=0.25, random_state=1)

cardio_df_sample.head()

Unnamed: 0,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio
29887,60.0,1.0,163.0,60.0,120.0,80.0,3.0,3.0,0.0,0.0,1.0,1.0
23060,41.0,1.0,160.0,67.0,100.0,70.0,1.0,1.0,0.0,0.0,0.0,0.0
18269,53.0,1.0,158.0,68.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0
66795,39.0,2.0,180.0,100.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
16804,63.0,1.0,157.0,78.0,100.0,60.0,1.0,1.0,0.0,0.0,1.0,1.0


In [5]:
# Cast every column of the sample to integer (the values are loaded as floats).
cardio_df_sample = cardio_df_sample.astype(dtype=int)
cardio_df_sample.head()

Unnamed: 0,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio
29887,60,1,163,60,120,80,3,3,0,0,1,1
23060,41,1,160,67,100,70,1,1,0,0,0,0
18269,53,1,158,68,140,90,1,1,0,0,1,1
66795,39,2,180,100,120,80,1,1,0,0,1,0
16804,63,1,157,78,100,60,1,1,0,0,1,1


In [6]:
# Separate the target (output) from the features (input).

# Target variable: the "cardio" label column.
y = cardio_df_sample["cardio"]

# Feature variables: every remaining column.
X = cardio_df_sample.drop(columns=["cardio"])

In [7]:
# Preview the feature matrix to confirm the "cardio" target column is gone.
X.head()

Unnamed: 0,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active
29887,60,1,163,60,120,80,3,3,0,0,1
23060,41,1,160,67,100,70,1,1,0,0,0
18269,53,1,158,68,140,90,1,1,0,0,1
66795,39,2,180,100,120,80,1,1,0,0,1
16804,63,1,157,78,100,60,1,1,0,0,1


In [8]:
# Count how many samples fall into each target class to check the class balance.
from collections import Counter
Counter(y)

Counter({1: 8472, 0: 8602})

In [9]:
# Split the data into train and test sets.
# The target classes are already close to evenly split; stratifying on y keeps
# that same class ratio in both the train and the test partition.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
X_train.shape

(12805, 11)

In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features only and scale them
# in the same step, so no statistics from the test set enter the scaler.
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)

# Apply the already-fitted scaler to the test features.
X_test_scaled = X_scaler.transform(X_test)


In [11]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier: lbfgs solver, at most 200 iterations,
# fixed random_state.
model = LogisticRegression(solver="lbfgs", max_iter=200, random_state=1)

In [12]:
# Train the logistic regression model on the scaled training features.
# The fitted estimator is the cell's last expression, so its repr is displayed.

model.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [13]:
# Predict the test-set labels with the logistic regression model.
y_pred = model.predict(X_test_scaled)

# Put predicted and actual labels side by side; the original row labels carried
# over from y_test are discarded in favour of a fresh 0..n-1 index.
predictions = (
    pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
    .reset_index(drop=True)
)
predictions.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,1,1
5,0,1
6,1,0
7,0,0
8,0,0
9,1,1


In [14]:
from sklearn.metrics import accuracy_score

# Fraction of test samples the logistic regression labelled correctly.
lr_accuracy = accuracy_score(y_test, y_pred)
print(lr_accuracy)

0.7362379948465683


In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix for the logistic regression: rows are the actual classes,
# columns are the predicted classes.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[1730  443]
 [ 683 1413]]


In [16]:
# Per-class precision, recall and f1-score for the logistic regression predictions.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.72      0.80      0.75      2173
           1       0.76      0.67      0.72      2096

    accuracy                           0.74      4269
   macro avg       0.74      0.74      0.73      4269
weighted avg       0.74      0.74      0.74      4269



In [17]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with 128 trees and a fixed random_state.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=1,
)

In [18]:
# Train the random forest on the same scaled training features used for the
# logistic regression; the result of fit() is bound back to rf_model.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [19]:
# Predict the test-set labels with the random forest model.
# NOTE(review): this reuses the name y_pred, so the logistic-regression
# predictions are overwritten from this cell onward; re-running an earlier
# metric cell after this point would report random-forest results.
y_pred = rf_model.predict(X_test_scaled)

In [20]:
# Put the predicted and actual labels side by side in a DataFrame, replacing
# the row labels inherited from y_test with a fresh 0..n-1 index.
predictions = (
    pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
    .reset_index(drop=True)
)
predictions.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,1,0
4,1,1
5,0,1
6,1,0
7,1,0
8,0,0
9,1,1


In [21]:
# Print the accuracy score of the current y_pred against the test labels.
rf_accuracy = accuracy_score(y_test, y_pred)
print(rf_accuracy)

0.7205434527992504


In [22]:
# Print the confusion matrix for the current y_pred against the test labels.
# NOTE(review): `matrix` is the same name used for the logistic-regression
# confusion matrix, which is replaced here.
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[1584  589]
 [ 604 1492]]


In [23]:
# Print the classification report (per-class precision, recall, f1-score) for
# the current y_pred against the test labels.
# NOTE(review): `report` is the same name used for the logistic-regression
# report, which is replaced here.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.72      0.73      0.73      2173
           1       0.72      0.71      0.71      2096

    accuracy                           0.72      4269
   macro avg       0.72      0.72      0.72      4269
weighted avg       0.72      0.72      0.72      4269

