In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func

In [3]:
from config import db_password

In [4]:
# Create a connection with the database in postgres
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/Be_Heart_Smart"
engine = create_engine(db_string)

In [44]:
# Read the data from the database table
cardio_df = pd.read_sql_table("cardio_combined", engine)
cardio_df.sample(5)

Unnamed: 0,id,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio_disease,BMI,weight_status,obesity_status
57344,83500.0,46.0,1.0,168.0,71.0,110.0,70.0,1.0,1.0,0.0,0.0,1.0,1.0,25.2,overweight,no
35819,15104.0,41.0,1.0,163.0,54.5,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,20.5,normal,no
43948,1104.0,63.0,1.0,158.0,88.0,130.0,80.0,1.0,2.0,0.0,0.0,1.0,1.0,35.3,obese,yes
49392,2010.0,51.0,1.0,163.0,67.0,140.0,80.0,1.0,1.0,0.0,0.0,1.0,1.0,25.2,overweight,no
54972,44136.0,56.0,2.0,166.0,81.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0,29.4,overweight,yes


### Pre-process the data before applying supervised machine learning

In [45]:
# Check the datatypes of the column
cardio_df.dtypes

id                float64
age               float64
gender            float64
height            float64
weight            float64
systolic_bp       float64
diastolic_bp      float64
cholesterol       float64
glucose           float64
smoker            float64
alcohol_intake    float64
active            float64
cardio_disease    float64
BMI               float64
weight_status      object
obesity_status     object
dtype: object

In [22]:
# # Change the continuous variables weight_status, and obesity_status from string to numeric.
# # Defining a function string_to_numeric.
# def string_to_numeric(variable):
#     if variable == "underweight":
#         return 1
#     elif variable == "normal":
#         return 2
#     elif variable == "overweight":
#         return 3
#     else:
#         return 4

In [17]:
# # Call the function string_to_numeric on column weight_status 
# cardio_df["weight_status"] = cardio_df["weight_status"].apply(string_to_numeric)

# # Change the obesity_status to numeric
# cardio_df["obesity_status"] = cardio_df["obesity_status"].apply(lambda x: 1 if x == "yes" else 0)

In [46]:
# Generate binary values for weight_status, and obesity_status using get_dummies
cardio_encoded_df = pd.get_dummies(cardio_df, columns=["weight_status", "obesity_status"], prefix=["weight", "obesity"] )

# Re-aarange columns in the merged DataFrame
rearranged_columns = ["id", "age", "gender", "height", "weight", "BMI", "weight_underweight", "weight_normal", 
                      "weight_overweight", "weight_obese", "obesity_no", "obesity_yes", 
                       "systolic_bp", "diastolic_bp", "cholesterol", "glucose", "smoker", "alcohol_intake", 
                      "active", "cardio_disease"]
cardio_encoded_df = cardio_encoded_df[rearranged_columns]
cardio_encoded_df.head()

Unnamed: 0,id,age,gender,height,weight,BMI,weight_underweight,weight_normal,weight_overweight,weight_obese,obesity_no,obesity_yes,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio_disease
0,26503.0,49.0,1.0,160.0,30.0,11.7,1,0,0,0,1,0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,1.0
1,54851.0,59.0,1.0,154.0,32.0,13.5,1,0,0,0,1,0,110.0,60.0,1.0,1.0,0.0,0.0,1.0,0.0
2,21040.0,62.0,1.0,143.0,34.0,16.6,1,0,0,0,1,0,100.0,70.0,1.0,1.0,0.0,0.0,1.0,0.0
3,47872.0,57.0,1.0,153.0,34.0,14.5,1,0,0,0,1,0,110.0,70.0,3.0,3.0,0.0,0.0,1.0,1.0
4,23318.0,59.0,1.0,165.0,35.0,12.9,1,0,0,0,1,0,100.0,70.0,1.0,1.0,0.0,0.0,1.0,0.0


In [47]:
# Drop column id as it is not related to onsent of cardiovascular disease
df = cardio_encoded_df.drop(["id"], axis=1)

In [48]:
# Set the Target and Features of the model
y = df["cardio_disease"]
X = df.drop(columns="cardio_disease")

In [78]:
# Import model dependencies, and divide data into train and test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [95]:
# Examine the split
print(X_train.shape)
print(X_test.shape)

(51222, 18)
(17075, 18)


In [79]:
# We will scale our data, and use standard scalar.
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()

# Create a StandardScaler instances
scaler = StandardScaler()

# # Fit the training set only on the continuous variables
# X_scaler = scaler.fit(X_train[["age", "height", "weight", "BMI", "systolic_bp", "diastolic_bp"]])

# # Transform on the contunuous variables
# X_train_scaled = X_scaler.transform(X_train[["age", "height", "weight", "BMI", "systolic_bp", "diastolic_bp"]])
# X_test_scaled = X_scaler.transform(X_test[["age", "height", "weight", "BMI", "systolic_bp", "diastolic_bp"]])

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [80]:
# Check the scaled data
len(X_train_scaled[0])

18

In [81]:
X_train_scaled[0:5]

array([[ 1.20827254, -0.73110306, -1.0805279 ,  0.49188852,  1.1361569 ,
        -0.10146851, -0.75624205, -0.76074519,  1.68968822, -1.50203811,
         1.50203811,  3.32952374, -0.12859304, -0.53770738, -0.39477347,
        -0.31083677, -0.23782221, -2.01406702],
       [-0.41617536, -0.73110306, -1.0805279 ,  0.21076451,  0.80874914,
        -0.10146851, -0.75624205, -0.76074519,  1.68968822, -1.50203811,
         1.50203811, -0.3935414 , -0.12859304, -0.53770738, -0.39477347,
        -0.31083677, -0.23782221,  0.49650781],
       [ 0.91291838, -0.73110306, -1.59283818, -1.19485553, -0.52014118,
        -0.10146851,  1.32232794, -0.76074519, -0.59182516,  0.66576207,
        -0.66576207, -0.3935414 , -1.20551748, -0.53770738, -0.39477347,
        -0.31083677, -0.23782221,  0.49650781],
       [ 0.91291838, -0.73110306,  0.07217022,  0.0702025 ,  0.01911866,
        -0.10146851, -0.75624205,  1.31450059, -0.59182516,  0.66576207,
        -0.66576207,  1.46799117, -0.12859304, -0.537

In [82]:
df_scaled = pd.DataFrame(X_train_scaled)
df_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1.208273,-0.731103,-1.080528,0.491889,1.136157,-0.101469,-0.756242,-0.760745,1.689688,-1.502038,1.502038,3.329524,-0.128593,-0.537707,-0.394773,-0.310837,-0.237822,-2.014067
1,-0.416175,-0.731103,-1.080528,0.210765,0.808749,-0.101469,-0.756242,-0.760745,1.689688,-1.502038,1.502038,-0.393541,-0.128593,-0.537707,-0.394773,-0.310837,-0.237822,0.496508
2,0.912918,-0.731103,-1.592838,-1.194856,-0.520141,-0.101469,1.322328,-0.760745,-0.591825,0.665762,-0.665762,-0.393541,-1.205517,-0.537707,-0.394773,-0.310837,-0.237822,0.496508
3,0.912918,-0.731103,0.07217,0.070203,0.019119,-0.101469,-0.756242,1.314501,-0.591825,0.665762,-0.665762,1.467991,-0.128593,-0.537707,-0.394773,-0.310837,4.204822,0.496508
4,-0.859207,1.367796,-0.055907,0.351327,0.385045,-0.101469,-0.756242,1.314501,-0.591825,-1.502038,1.502038,0.226969,0.948331,2.4099,-0.394773,3.217123,-0.237822,0.496508


In [83]:
# Check if standard deviation is 1, and mean is at zero
df_scaled.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
count,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0,51222.0
mean,-2.314243e-16,-3.645213e-16,1.487597e-15,2.179351e-16,-8.111681e-16,-9.401961e-16,8.745103000000001e-17,9.322258e-16,2.193743e-16,-2.096029e-15,2.506366e-15,-9.516561e-16,-4.69291e-16,-1.28503e-15,4.342123e-15,-6.872623000000001e-17,-1.885776e-15,-3.503399e-15
std,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001
min,-3.517394,-0.7311031,-3.770157,-3.162724,-3.370515,-0.1014685,-0.7562421,-0.7607452,-0.5918252,-1.502038,-0.6657621,-2.875585,-4.436291,-0.5377074,-0.3947735,-0.3108368,-0.2378222,-2.014067
25%,-0.7115295,-0.7311031,-0.6962952,-0.6326075,-0.6742154,-0.1014685,-0.7562421,-0.7607452,-0.5918252,-1.502038,-0.6657621,-0.3935414,-0.128593,-0.5377074,-0.3947735,-0.3108368,-0.2378222,0.4965078
50%,0.02685588,-0.7311031,0.07217022,-0.1406405,-0.2119927,-0.1014685,-0.7562421,-0.7607452,-0.5918252,0.6657621,-0.6657621,-0.3935414,-0.128593,-0.5377074,-0.3947735,-0.3108368,-0.2378222,0.4965078
75%,0.7652413,1.367796,0.7125581,0.5621695,0.5198599,-0.1014685,1.322328,1.314501,1.689688,0.6657621,1.502038,0.8474803,0.9483314,0.9360965,-0.3947735,-0.3108368,-0.2378222,0.4965078
max,1.651304,1.367796,4.29873,8.855328,11.24728,9.855275,1.322328,1.314501,1.689688,0.6657621,1.502038,3.329524,4.179105,2.4099,3.101265,3.217123,4.204822,0.4965078


In [96]:
# Import PCA dependency
from sklearn.decomposition import PCA

In [97]:
# Applying PCA to reduce dimensions from 18 to 3

# Initialize PCA model
pca = PCA(n_components=3)

# Get three principal components for the data.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.fit_transform(X_test_scaled)

### Perform the logistic regression

In [98]:
# Create an instance of the logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

In [99]:
# Fit the model to the train set
classifier.fit(X_train_pca, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [102]:
# Make predictions using the test set
y_pred = classifier.predict(X_test_pca)

In [103]:
# Compare the predictions with the actual values and visualize in a DataFrame
prediction_df = pd.DataFrame({"Prediction": y_pred, "Actual":y_test})

In [104]:
prediction_df.head(20)

Unnamed: 0,Prediction,Actual
63577,1.0,0.0
1893,0.0,0.0
3303,0.0,0.0
44226,0.0,0.0
37902,0.0,0.0
60087,0.0,0.0
39496,1.0,0.0
26927,1.0,1.0
11650,1.0,1.0
38227,0.0,0.0


In [105]:
# Access performance of the model
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.6684626647144949


In [106]:
# Obtain the confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[5711 2941]
 [2720 5703]]


In [107]:
# Obtain the classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.68      0.66      0.67      8652
         1.0       0.66      0.68      0.67      8423

    accuracy                           0.67     17075
   macro avg       0.67      0.67      0.67     17075
weighted avg       0.67      0.67      0.67     17075



In [109]:
# Validate the model
print(f"Training Data Score: {classifier.score(X_train_pca, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_pca, y_test)}")

Training Data Score: 0.6822459099605638
Testing Data Score: 0.6684626647144949
