In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine
from config import password
from sqlalchemy import inspect

In [24]:
# Build the PostgreSQL connection URL for the Be_Heart_Smart database on
# localhost:5432 (user "postgres"); the password comes from the local config module.
# NOTE(review): the password is interpolated verbatim — one containing URL
# special characters (e.g. "@" or "/") would presumably need URL-encoding; confirm.
db_string = f"postgresql://postgres:{password}@localhost:5432/Be_Heart_Smart"
engine = create_engine(db_string)
# Open the connection that the data-loading cell reads through.
# NOTE(review): no visible cell closes `con`; consider con.close() once loading is done.
con = engine.connect()

In [25]:
# Load every row and column of the cardio_data_cleaned table into a DataFrame,
# reading through the connection opened in the previous cell.

cardio_df = pd.read_sql("Select * From cardio_data_cleaned", con)

# Preview the first five rows
cardio_df.head()


Unnamed: 0,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio
0,51.0,1.0,171.0,29.0,110.0,70.0,2.0,1.0,0.0,0.0,1.0,1.0
1,49.0,1.0,160.0,30.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,1.0
2,58.0,1.0,143.0,30.0,103.0,61.0,2.0,1.0,0.0,0.0,1.0,0.0
3,47.0,2.0,170.0,31.0,150.0,90.0,2.0,2.0,0.0,0.0,1.0,1.0
4,42.0,1.0,146.0,32.0,100.0,70.0,1.0,1.0,0.0,0.0,0.0,0.0


In [26]:
# Sample a quarter of the rows to run mock ML algorithms.
# random_state is fixed (same seed as the train/test split and the models) so
# that Restart & Run All draws the same rows every time; without it each run
# trains and evaluates on a different subset and the metrics are not reproducible.
cardio_df_sample = cardio_df.sample(frac=0.25, random_state=1)

# Preview the first five sampled rows (the index keeps the original row labels)
cardio_df_sample.head()

Unnamed: 0,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio
61139,51.0,2.0,174.0,100.0,120.0,80.0,1.0,1.0,0.0,0.0,0.0,0.0
19707,46.0,1.0,158.0,94.0,110.0,70.0,3.0,3.0,0.0,0.0,1.0,0.0
34222,61.0,1.0,164.0,82.0,120.0,90.0,1.0,1.0,0.0,0.0,1.0,0.0
15409,44.0,1.0,156.0,81.0,140.0,80.0,1.0,1.0,0.0,0.0,0.0,1.0
37700,48.0,1.0,165.0,69.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,1.0


In [29]:
# Cast every column of the sample to int (the table loads as floats, e.g. 51.0 -> 51)
cardio_df_sample = cardio_df_sample.astype(int)
cardio_df_sample.head()

Unnamed: 0,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio
61139,51,2,174,100,120,80,1,1,0,0,0,0
19707,46,1,158,94,110,70,3,3,0,0,1,0
34222,61,1,164,82,120,90,1,1,0,0,1,0
15409,44,1,156,81,140,80,1,1,0,0,0,1
37700,48,1,165,69,120,80,1,1,0,0,1,1


In [30]:
# Separate the sample into the label to predict and the model inputs

# Target (output): the "cardio" column
y = cardio_df_sample["cardio"]

# Features (input): every remaining column
X = cardio_df_sample.drop(columns=["cardio"])

In [31]:
# Preview the feature frame to confirm the target column is no longer in it
X.head()

Unnamed: 0,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active
61139,51,2,174,100,120,80,1,1,0,0,0
19707,46,1,158,94,110,70,3,3,0,0,1
34222,61,1,164,82,120,90,1,1,0,0,1
15409,44,1,156,81,140,80,1,1,0,0,0
37700,48,1,165,69,120,80,1,1,0,0,1


In [32]:
# Count the rows per class in the target to see how balanced the two classes are
from collections import Counter
Counter(y)

Counter({0: 8675, 1: 8399})

In [33]:
# Split the data into train and test sets. The target classes are already
# close to evenly split; stratifying on y keeps that ratio in both sets.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)
X_train.shape

(12805, 11)

In [34]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only
X_scaler = StandardScaler()
X_scaler.fit(X_train)

# Apply the same fitted scaler to both the training and the test features
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [35]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier: lbfgs solver, at most 200 iterations, fixed seed
model = LogisticRegression(solver="lbfgs", max_iter=200, random_state=1)

In [36]:
# Train the logistic regression model on the scaled training features

model.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [37]:
# Predict the class of every row in the scaled test set
y_pred = model.predict(X_test_scaled)

# Put predictions next to the true labels; the index is reset so rows are
# numbered 0..n-1 instead of carrying the sampled row labels from y_test
predictions = pd.DataFrame(
    {"Prediction": y_pred, "Actual": y_test}
).reset_index(drop=True)
predictions.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,1,0
4,0,1
5,0,1
6,0,0
7,0,0
8,0,0
9,1,1


In [38]:
# Accuracy of the logistic regression predictions against the test labels
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7303818224408527


In [39]:
# classification_report is imported here and first used in the next cell
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix for the logistic regression: rows are the actual classes,
# columns are the predicted classes
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[1722  447]
 [ 704 1396]]


In [40]:
# Per-class precision, recall, f1-score and support for the logistic regression
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.71      0.79      0.75      2169
           1       0.76      0.66      0.71      2100

    accuracy                           0.73      4269
   macro avg       0.73      0.73      0.73      4269
weighted avg       0.73      0.73      0.73      4269



In [41]:
# Create a random forest classifier with 128 trees and a fixed seed

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

In [42]:
# Train the random forest on the same scaled training features used for the
# logistic regression; the result of fit() is bound back to rf_model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [43]:
# Predict the test set with the random forest.
# NOTE: this rebinds y_pred, replacing the logistic regression predictions, so
# the metric cells below now describe the random forest.
y_pred = rf_model.predict(X_test_scaled)

In [44]:
# Put the random forest predictions next to the true labels; the index is reset
# so rows are numbered 0..n-1 instead of carrying the row labels from y_test
predictions = pd.DataFrame(
    {"Prediction": y_pred, "Actual": y_test}
).reset_index(drop=True)
predictions.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,1,0
3,1,0
4,0,1
5,0,1
6,0,0
7,0,0
8,0,0
9,1,1


In [45]:
# Accuracy of the random forest predictions (accuracy_score was imported in the
# logistic regression accuracy cell above)
print(accuracy_score(y_test, y_pred))

0.7074256266104474


In [46]:
# Confusion matrix for the random forest: rows are the actual classes,
# columns are the predicted classes (rebinds `matrix` from the earlier model)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[1568  601]
 [ 648 1452]]


In [47]:
# Per-class precision, recall, f1-score and support for the random forest
# (rebinds `report` from the earlier model)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.71      0.72      0.72      2169
           1       0.71      0.69      0.70      2100

    accuracy                           0.71      4269
   macro avg       0.71      0.71      0.71      4269
weighted avg       0.71      0.71      0.71      4269

