## Performing a Linear Regression on the Raw Data

In [1]:
# Import dependencies
import pandas as pd

In [2]:
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func

In [3]:
from config import db_password

In [4]:
# Create a connection with the database in postgres
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/Be_Heart_Smart"
engine = create_engine(db_string)

In [5]:
# Read the data table from the database into a DataFrame
cardio_raw_df = pd.read_sql_table("cardio_info", engine)

In [6]:
# Inspect at the DataFrame
cardio_raw_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,82567.0,18804.0,2.0,165.0,10.0,180.0,1100.0,2.0,2.0,0.0,0.0,1.0,1.0
1,48318.0,21582.0,2.0,178.0,11.0,130.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0
2,85931.0,21855.0,1.0,162.0,21.0,120.0,80.0,2.0,1.0,0.0,0.0,1.0,1.0
3,42156.0,20408.0,2.0,177.0,22.0,120.0,80.0,1.0,1.0,1.0,1.0,1.0,0.0
4,38312.0,23284.0,1.0,157.0,23.0,110.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0


In [7]:
# Drop column id as it is not related to onsent of cardiovascular disease
df = cardio_raw_df.drop(["id"], axis=1)

In [8]:
# Set the Target and Features of the model
y = df["cardio"]
X = df.drop(columns="cardio")

In [9]:
# Import model dependencies, and divide data into train and test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [10]:
# Create an instance of the logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

In [11]:
# Fit the model to the train set
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [12]:
# Make predictions using the test set
y_pred = classifier.predict(X_test)

In [13]:
# Compare the predictions with the actual values and visualize in a DataFrame
prediction_df = pd.DataFrame({"Prediction": y_pred, "Actual":y_test})
prediction_df.head(20)

Unnamed: 0,Prediction,Actual
10703,1.0,0.0
36513,0.0,1.0
27085,1.0,0.0
35942,1.0,1.0
54583,1.0,1.0
24211,1.0,1.0
15970,1.0,0.0
55405,0.0,1.0
16291,1.0,1.0
62443,0.0,0.0


In [14]:
# Access performance of the model
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.6912571428571429


In [15]:
# Obtain the confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[6285 2470]
 [2933 5812]]


In [16]:
# Obtain the classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.68      0.72      0.70      8755
         1.0       0.70      0.66      0.68      8745

    accuracy                           0.69     17500
   macro avg       0.69      0.69      0.69     17500
weighted avg       0.69      0.69      0.69     17500



## Performing a Linear Regression on the initial Cleaned Data

In [17]:
# Read the data table from the database into a DataFrame
cardio_cleaned_df = pd.read_sql_table("cardio_cleaned", engine)

In [18]:
# Inspect at the DataFrame
cardio_cleaned_df.head()

Unnamed: 0,id,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio_disease
0,86650.0,51.0,1.0,171.0,29.0,110.0,70.0,2.0,1.0,0.0,0.0,1.0,1.0
1,26503.0,49.0,1.0,160.0,30.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,1.0
2,59853.0,58.0,1.0,143.0,30.0,103.0,61.0,2.0,1.0,0.0,0.0,1.0,0.0
3,24167.0,47.0,2.0,170.0,31.0,150.0,90.0,2.0,2.0,0.0,0.0,1.0,1.0
4,31439.0,42.0,1.0,146.0,32.0,100.0,70.0,1.0,1.0,0.0,0.0,0.0,0.0


In [19]:
# Drop column id as it is not related to onsent of cardiovascular disease
df = cardio_cleaned_df.drop(["id"], axis=1)

In [20]:
# Set the Target and Features
y = df["cardio_disease"]
X = df.drop(columns="cardio_disease")

In [21]:
# Divide data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [22]:
# Create an instance of the logistic regression model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

In [23]:
# Fit the model to the train set
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=200, random_state=1)

In [24]:
# Make predictions using the test set
y_pred = classifier.predict(X_test)

In [25]:
# Compare the predictions with the actual values and visualize in a DataFrame
prediction_df = pd.DataFrame({"Prediction": y_pred, "Actual":y_test})
prediction_df.head(20)

Unnamed: 0,Prediction,Actual
63457,0.0,0.0
1849,1.0,0.0
3136,0.0,0.0
44350,0.0,0.0
38367,0.0,0.0
60083,0.0,0.0
40040,0.0,0.0
27238,1.0,1.0
12882,1.0,1.0
38785,0.0,0.0


In [26]:
# Access performance of the model
print(accuracy_score(y_test, y_pred))

0.7165446559297218


In [27]:
# Obtain the confusion matrix
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[6681 1971]
 [2869 5554]]


In [28]:
# Obtain the classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.70      0.77      0.73      8652
         1.0       0.74      0.66      0.70      8423

    accuracy                           0.72     17075
   macro avg       0.72      0.72      0.72     17075
weighted avg       0.72      0.72      0.72     17075



### Notice that the in initial clean up of data has resulted in overall better scores.

## Performing Logistic Regression after adding the BMI info to the cleaned data

In [29]:
# Read the data tables from the database into a DataFrame
BMI_df = pd.read_sql_table("bmi_status", engine)

In [30]:
# Merge the two DataFrames, cardio_cleaned_df and BMI_df, on id.
cardio_df = pd.merge(cardio_cleaned_df, BMI_df, on="id")
cardio_df.sample(10)

Unnamed: 0,id,age,gender,height,weight,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio_disease,BMI,weight_status,obesity_status
1844,20008.0,47.0,1.0,151.0,52.0,120.0,80.0,2.0,1.0,0.0,0.0,1.0,1.0,22.8,normal,no
58374,64842.0,57.0,2.0,172.0,89.0,120.0,80.0,1.0,2.0,0.0,0.0,1.0,0.0,30.1,obese,yes
65716,77870.0,45.0,2.0,179.0,69.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,21.5,normal,no
21796,42113.0,54.0,1.0,160.0,60.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0,23.4,normal,no
35846,67772.0,56.0,1.0,165.0,63.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,23.1,normal,no
7399,39044.0,39.0,2.0,183.0,59.0,100.0,70.0,2.0,1.0,0.0,0.0,0.0,0.0,17.6,underweight,no
37276,83551.0,43.0,1.0,165.0,67.0,120.0,80.0,1.0,2.0,0.0,0.0,0.0,0.0,24.6,normal,no
8289,87781.0,45.0,1.0,146.0,73.0,150.0,100.0,1.0,1.0,0.0,0.0,0.0,1.0,34.2,obese,yes
59346,29328.0,64.0,2.0,173.0,82.0,140.0,110.0,1.0,1.0,0.0,0.0,1.0,1.0,27.4,overweight,no
36956,94446.0,59.0,2.0,165.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,0.0,1.0,23.9,normal,no


In [31]:
# Re-aarange columns in the merged DataFrame
rearranged_columns = ["id", "age", "gender", "height", "weight", "BMI", "weight_status", "obesity_status", 
                       "systolic_bp", "diastolic_bp", "cholesterol", "glucose", "smoker", "alcohol_intake", 
                       "active", "cardio_disease"]
cardio_df = cardio_df[rearranged_columns]
cardio_df.head()

Unnamed: 0,id,age,gender,height,weight,BMI,weight_status,obesity_status,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio_disease
0,86650.0,51.0,1.0,171.0,29.0,9.9,underweight,no,110.0,70.0,2.0,1.0,0.0,0.0,1.0,1.0
1,26503.0,49.0,1.0,160.0,30.0,11.7,underweight,no,120.0,80.0,1.0,1.0,0.0,0.0,1.0,1.0
2,59853.0,58.0,1.0,143.0,30.0,14.7,underweight,no,103.0,61.0,2.0,1.0,0.0,0.0,1.0,0.0
3,24167.0,47.0,2.0,170.0,31.0,10.7,underweight,no,150.0,90.0,2.0,2.0,0.0,0.0,1.0,1.0
4,31439.0,42.0,1.0,146.0,32.0,15.0,underweight,no,100.0,70.0,1.0,1.0,0.0,0.0,0.0,0.0


In [32]:
# Change the continuous variables weight_status, and obesity_status from string to numeric.
# Defining a function string_to_numeric.
def string_to_numeric(variable):
    if variable == "underweight":
        return 1
    elif variable == "normal":
        return 2
    elif variable == "overweight":
        return 3
    else:
        return 4
    
# Call the function string_to_numeric on column weight_status 
cardio_df["weight_status"] = cardio_df["weight_status"].apply(string_to_numeric)

# Change the obesity_status to numeric
cardio_df["obesity_status"] = cardio_df["obesity_status"].apply(lambda x: 1 if x == "yes" else 0)

cardio_df.sample(10)

Unnamed: 0,id,age,gender,height,weight,BMI,weight_status,obesity_status,systolic_bp,diastolic_bp,cholesterol,glucose,smoker,alcohol_intake,active,cardio_disease
558,52650.0,54.0,2.0,154.0,47.0,19.8,2,0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
1203,48064.0,40.0,1.0,153.0,50.0,21.4,2,0,110.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
39792,41282.0,60.0,1.0,165.0,87.0,32.0,4,1,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
2225,93977.0,53.0,1.0,157.0,52.0,21.1,2,0,110.0,60.0,1.0,1.0,0.0,0.0,1.0,0.0
55097,86068.0,60.0,2.0,170.0,90.0,31.1,4,1,160.0,80.0,1.0,1.0,0.0,0.0,1.0,1.0
52823,95396.0,62.0,2.0,170.0,70.0,24.2,2,0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
5575,88498.0,64.0,2.0,164.0,57.0,21.2,2,0,110.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
569,59997.0,59.0,1.0,145.0,47.0,22.4,2,0,130.0,100.0,1.0,1.0,0.0,0.0,1.0,1.0
54973,50017.0,51.0,2.0,170.0,89.0,30.8,4,1,140.0,90.0,1.0,1.0,0.0,0.0,0.0,0.0
12458,13019.0,53.0,1.0,154.0,93.0,39.2,4,1,160.0,100.0,1.0,1.0,0.0,0.0,1.0,1.0


In [33]:
# Drop column id as it is not related to onsent of cardiovascular disease
df = cardio_df.drop(["id"], axis=1)

# Set the Target and Features of the model
y = df["cardio_disease"]
X = df.drop(columns="cardio_disease")

# Divide data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [34]:
# Create an instance of the logistic regression model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model to the train set
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=200, random_state=1)

In [35]:
# Make predictions using the test set
y_pred = classifier.predict(X_test)

# Compare the predictions with the actual values and visualize in a DataFrame
prediction_df = pd.DataFrame({"Prediction": y_pred, "Actual":y_test})
prediction_df.head(20)

Unnamed: 0,Prediction,Actual
63457,0.0,0.0
1849,0.0,0.0
3136,0.0,0.0
44350,0.0,0.0
38367,0.0,0.0
60083,0.0,0.0
40040,0.0,0.0
27238,1.0,1.0
12882,1.0,1.0
38785,0.0,0.0


In [36]:
# Access performance of the model
print(accuracy_score(y_test, y_pred))

0.7255636896046852


In [37]:
# Obtain the confusion matrix
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[6783 1869]
 [2817 5606]]


In [38]:
# Obtain the classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.71      0.78      0.74      8652
         1.0       0.75      0.67      0.71      8423

    accuracy                           0.73     17075
   macro avg       0.73      0.72      0.72     17075
weighted avg       0.73      0.73      0.72     17075

