In [1]:
import datetime
import sqlalchemy
import pandas as pd
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Database Setup
engine = create_engine(f"postgresql+psycopg2://postgres:yourpasswordhere@localhost:5432/final_project")

# Reflect existing database into a new model
Base = automap_base()

# Reflect the tables
Base.prepare(autoload_with=engine)

# Create session (link) from Python to PG Admin
session = Session(engine)

# Create list and load to dataframe
heart_data_list = []
heart_data = engine.execute("SELECT * FROM heart_failure")
for results in heart_data:
    heart_failure = {}
    heart_failure["age"] = results[0]
    heart_failure["sex"] = results[1]
    heart_failure["chest_pain_type"] = results[2]
    heart_failure["resting_bp"] = results[3]
    heart_failure["cholesterol"] = results[4]
    heart_failure["fasting_bs"] = results[5]
    heart_failure["resting_ecg"] = results[6]
    heart_failure["max_hr"] = results[7]
    heart_failure["exercise_aniga"] = results[8]
    heart_failure["old_peak"] = results[9]
    heart_failure["st_slope"] = results[10]
    heart_failure["heart_disease"] = results[11]
    heart_data_list.append(heart_failure)

session.close()
heart_df = pd.DataFrame(heart_data_list)


In [3]:
heart_df.head(10)

Unnamed: 0,age,sex,chest_pain_type,resting_bp,cholesterol,fasting_bs,resting_ecg,max_hr,exercise_aniga,old_peak,st_slope,heart_disease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [4]:
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              918 non-null    int64  
 1   sex              918 non-null    object 
 2   chest_pain_type  918 non-null    object 
 3   resting_bp       918 non-null    int64  
 4   cholesterol      918 non-null    int64  
 5   fasting_bs       918 non-null    int64  
 6   resting_ecg      918 non-null    object 
 7   max_hr           918 non-null    int64  
 8   exercise_aniga   918 non-null    object 
 9   old_peak         918 non-null    float64
 10  st_slope         918 non-null    object 
 11  heart_disease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [5]:
missing_values = heart_df.isnull().sum()
missing_values

age                0
sex                0
chest_pain_type    0
resting_bp         0
cholesterol        0
fasting_bs         0
resting_ecg        0
max_hr             0
exercise_aniga     0
old_peak           0
st_slope           0
heart_disease      0
dtype: int64

In [6]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = heart_df['heart_disease']

# Separate the X variable, the features
X = heart_df.drop(columns='heart_disease')

In [7]:
# Review the y variable Series
y.head()

0    0
1    1
2    0
3    1
4    0
Name: heart_disease, dtype: int64

In [8]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,age,sex,chest_pain_type,resting_bp,cholesterol,fasting_bs,resting_ecg,max_hr,exercise_aniga,old_peak,st_slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up


In [9]:
# Check the balance of target values
y.value_counts()

1    508
0    410
Name: heart_disease, dtype: int64

In [10]:
#Split the data into training and testing datasets by using train_test_split

# Import the train_test_learn module
from sklearn.model_selection import train_test_split


In [11]:
#There are categorical variables: 'sex', 'chest_pain_type', 'resting_ecg', 'exercise_aniga', and 'st_slope'. 
#These variables need to be transformed into numerical columns using one-hot encoding before training the model.
#using get_dumies()
X_encoded = pd.get_dummies(X, columns=['sex', 'chest_pain_type', 'resting_ecg', 'exercise_aniga', 'st_slope'])


In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=1)

In [13]:
#Create a Logistic Regression Model with the Original Data

#Step 1: Fit a logistic regression model by using the training data (X_train and y_train).

# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
#model = LogisticRegression(random_state = 1, max_iter=1000)

# Fit the model using training data
#model.fit(X_train, y_train)

from sklearn.preprocessing import StandardScaler

# Preprocess the data (scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Instantiate the Logistic Regression model
model = LogisticRegression(random_state=1)

# Fit the model using scaled training data
model.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

In [14]:
#Step 2: Save the predictions on the testing data labels by using 
#the testing feature data (X_test) and the fitted model.

# Make a prediction using the testing data
y_pred = model.predict(X_test_scaled)



In [15]:
y_pred

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0])

In [16]:
#Step 3: Evaluate the model’s performance by doing the following:
#Calculate the accuracy score of the model.
#Generate a confusion matrix.
#Print the classification report.

# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, y_pred)

0.8779185592477489

In [17]:
###logistic regression model is correctly predicting the target variable for about 88% of the test data samples.

In [18]:
# Generate a confusion matrix for the model
confusion_mat = confusion_matrix(y_test, y_pred)
confusion_mat

array([[ 78,  11],
       [ 17, 124]])

In [19]:
#####################################
#True Positives (TP): 124
#False Positives (FP): 11
#True Negatives (TN): 78
#False Negatives (FN): 17

In [20]:
# Print the classification report for the model
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.88      0.85        89
           1       0.92      0.88      0.90       141

    accuracy                           0.88       230
   macro avg       0.87      0.88      0.87       230
weighted avg       0.88      0.88      0.88       230



In [21]:
##Predict a Logistic Regression Model with Resampled Training Data
#Step 1: Use the RandomOverSampler module from the imbalanced-learn library to resample the data. 
#Be sure to confirm that the labels have an equal number of data points.


In [22]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler


# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
random_oversample_model = RandomOverSampler(random_state = 1)

# Fit the original training data to the random_oversampler model
X_resample, y_resample = random_oversample_model.fit_resample(X_train_scaled, y_train)

In [23]:
# Count the distinct values of the resampled labels data
y_resample.value_counts()

1    367
0    367
Name: heart_disease, dtype: int64

In [24]:
#Step 2: Use the LogisticRegression classifier and the resampled data to fit the model and make predictions.

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
 = LogisticRegression(random_state=1)

# Fit the model using the resampled training data
model_resampled.fit(X_resample, y_resample)

# Make a prediction using the testing data
y_pred_resampled = model_resampled.predict(X_test_scaled)

In [25]:
#Step 3: Evaluate the model’s performance by doing the following:
#Calculate the accuracy score of the model.
#Generate a confusion matrix.
#Print the classification report.

In [26]:
# Print the balanced_accuracy score of the model 
balanced_accuracy_score(y_test, y_pred_resampled)

0.8856084150131485

In [27]:
###The resampling has led to a slight improvement in accuracy compared to the initial accuracy of around 0.88

In [28]:
# Generate a confusion matrix for the model
confusion_mat = confusion_matrix(y_test, y_pred_resampled)
confusion_mat

array([[ 80,   9],
       [ 18, 123]])

In [29]:
############################
#True Positives (TP): 123
#False Positives (FP): 9
#True Negatives (TN): 80
#False Negatives (FN): 18

In [30]:
# Print the classification report for the model
report = classification_report(y_test, y_pred_resampled)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.90      0.86        89
           1       0.93      0.87      0.90       141

    accuracy                           0.88       230
   macro avg       0.87      0.89      0.88       230
weighted avg       0.89      0.88      0.88       230



In [31]:
#######################
#The number of true positive predictions decreased slightly in the resampled data.
#The number of false positive predictions decreased in the resampled data.
#The number of true negative predictions increased in the resampled data. 
#The number of false negative predictions increased slightly in the resampled data. 

In [32]:
#####################
#After resampling, the model's performance has improved slightly, the accuracy remains at 88%, but the weighted average 
#precision, recall, and F1-score have slightly improved, there is a better overall balance between the classes.

#Class 0: Recall increased from 0.88 to 0.90.
#Class 1: Precision increased from 0.92 to 0.93

In [None]:
#your_model.save("model_name.h5")

In [34]:
import pickle

# Save the model to a file using pickle
model_filename = 'model_resampled.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)

In [37]:
with open('model_resampled.pkl', 'rb') as f:
    model = pickle.load(f)

In [41]:
preds = model.predict(X_test_scaled)

In [42]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85        89
           1       0.92      0.88      0.90       141

    accuracy                           0.88       230
   macro avg       0.87      0.88      0.87       230
weighted avg       0.88      0.88      0.88       230

