### Performing a Linear Regression on the Raw Data

In [20]:
# Import dependencies
import pandas as pd

In [2]:
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func

In [3]:
from config import db_password

In [4]:
# Create a connection with the database in postgres
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/Be_Heart_Smart"
engine = create_engine(db_string)

In [5]:
# Read the data from the database table
cardio_raw_df = pd.read_sql_table("cardio_info", engine)

In [6]:
# Inspect at the DataFrame
cardio_raw_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,82567.0,18804.0,2.0,165.0,10.0,180.0,1100.0,2.0,2.0,0.0,0.0,1.0,1.0
1,48318.0,21582.0,2.0,178.0,11.0,130.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0
2,85931.0,21855.0,1.0,162.0,21.0,120.0,80.0,2.0,1.0,0.0,0.0,1.0,1.0
3,42156.0,20408.0,2.0,177.0,22.0,120.0,80.0,1.0,1.0,1.0,1.0,1.0,0.0
4,38312.0,23284.0,1.0,157.0,23.0,110.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0


In [7]:
# Drop column id as it is not related to onsent of cardiovascular disease
df = cardio_raw_df.drop(["id"], axis=1)

In [8]:
# Set the Target and Features of the model
y = df["cardio"]
X = df.drop(columns="cardio")

In [9]:
# Import model dependencies, and divide data into train and test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [10]:
# Create an instance of the logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

In [11]:
# Fit the model to the train set
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [12]:
# Make predictions using the test set
y_pred = classifier.predict(X_test)

In [13]:
# Compare the predictions with the actual values and visualize in a DataFrame
prediction_df = pd.DataFrame({"Prediction": y_pred, "Actual":y_test})
prediction_df.head(20)

Unnamed: 0,Prediction,Actual
10703,1.0,0.0
36513,0.0,1.0
27085,1.0,0.0
35942,1.0,1.0
54583,1.0,1.0
24211,1.0,1.0
15970,1.0,0.0
55405,0.0,1.0
16291,1.0,1.0
62443,0.0,0.0


In [14]:
# Access performance of the model
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.6912571428571429


In [15]:
# Obtain the confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[6285 2470]
 [2933 5812]]


In [16]:
# Obtain the classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.68      0.72      0.70      8755
         1.0       0.70      0.66      0.68      8745

    accuracy                           0.69     17500
   macro avg       0.69      0.69      0.69     17500
weighted avg       0.69      0.69      0.69     17500



### Performing a Linear Regression on the initial Cleaned Data

In [33]:
# Read the data from the database table
cardio_cleaned_df = pd.read_sql_table("cardio_cleaned", engine)

In [34]:
# Inspect at the DataFrame
cardio_cleaned_df.head()

Unnamed: 0,id,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio_disease
0,86650.0,51.0,1.0,171.0,29.0,110.0,70.0,2.0,1.0,0.0,0.0,1.0,1.0
1,26503.0,49.0,1.0,160.0,30.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,1.0
2,59853.0,58.0,1.0,143.0,30.0,103.0,61.0,2.0,1.0,0.0,0.0,1.0,0.0
3,24167.0,47.0,2.0,170.0,31.0,150.0,90.0,2.0,2.0,0.0,0.0,1.0,1.0
4,31439.0,42.0,1.0,146.0,32.0,100.0,70.0,1.0,1.0,0.0,0.0,0.0,0.0


In [37]:
# Drop column id as it is not related to onsent of cardiovascular disease
df = cardio_cleaned_df.drop(["id"], axis=1)

In [38]:
# Set the Target and Features
y = df["cardio_disease"]
X = df.drop(columns="cardio_disease")

In [39]:
# Divide data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [41]:
# Create an instance of the logistic regression model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

In [42]:
# Fit the model to the train set
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=200, random_state=1)

In [43]:
# Make predictions using the test set
y_pred = classifier.predict(X_test)

In [44]:
# Compare the predictions with the actual values and visualize in a DataFrame
prediction_df = pd.DataFrame({"Prediction": y_pred, "Actual":y_test})
prediction_df.head(20)

Unnamed: 0,Prediction,Actual
63457,0.0,0.0
1849,1.0,0.0
3136,0.0,0.0
44350,0.0,0.0
38367,0.0,0.0
60083,0.0,0.0
40040,0.0,0.0
27238,1.0,1.0
12882,1.0,1.0
38785,0.0,0.0


In [45]:
# Access performance of the model
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7165446559297218


In [46]:
# Obtain the confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[6681 1971]
 [2869 5554]]


In [47]:
# Obtain the classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.70      0.77      0.73      8652
         1.0       0.74      0.66      0.70      8423

    accuracy                           0.72     17075
   macro avg       0.72      0.72      0.72     17075
weighted avg       0.72      0.72      0.72     17075



### Notice that the in initial clean up of data has resulted in overall better scores.