In [None]:
# import library functions

import pandas as pd
import numpy as np
from numpy import log,dot,exp,shape
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [354]:
# import the .csv file into Python

df = pd.read_csv (r'C:\Users\e135634\OneDrive - Blue Cross Blue Shield of Michigan\Desktop\Test Folder\heart.csv')

In [355]:
# quick read of the dataset
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [356]:
# first I want to see how many unique variables are in the categorical columns

df['Sex'].unique()

array(['M', 'F'], dtype=object)

In [357]:
df['ExerciseAngina'].unique()

array(['N', 'Y'], dtype=object)

In [358]:
df['ChestPainType'].unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [359]:
df['RestingECG'].unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [360]:
df['ST_Slope'].unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [361]:
# now I will turn these categorical variables into dummy variables so I can run regression analysis

df_dummies = pd.get_dummies(df, columns = ['Sex' , 'ExerciseAngina' , 'ChestPainType' , 'RestingECG' , 'ST_Slope'])

# moving HeartDisease column to the end as a visual indicator that it is our Dependent Variable for the logistic regression analysis
df_dummies = df_dummies.reindex(columns = [col for col in df_dummies.columns if col != 'HeartDisease'] + ['HeartDisease'])
df_dummies.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ExerciseAngina_N,ExerciseAngina_Y,...,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,HeartDisease
0,40,140,289,0,172,0.0,0,1,1,0,...,1,0,0,0,1,0,0,0,1,0
1,49,160,180,0,156,1.0,1,0,1,0,...,0,1,0,0,1,0,0,1,0,1
2,37,130,283,0,98,0.0,0,1,1,0,...,1,0,0,0,0,1,0,0,1,0
3,48,138,214,0,108,1.5,1,0,0,1,...,0,0,0,0,1,0,0,1,0,1
4,54,150,195,0,122,0.0,0,1,1,0,...,0,1,0,0,1,0,0,0,1,0


In [362]:
df_dummies.dtypes

Age                    int64
RestingBP              int64
Cholesterol            int64
FastingBS              int64
MaxHR                  int64
Oldpeak              float64
Sex_F                  uint8
Sex_M                  uint8
ExerciseAngina_N       uint8
ExerciseAngina_Y       uint8
ChestPainType_ASY      uint8
ChestPainType_ATA      uint8
ChestPainType_NAP      uint8
ChestPainType_TA       uint8
RestingECG_LVH         uint8
RestingECG_Normal      uint8
RestingECG_ST          uint8
ST_Slope_Down          uint8
ST_Slope_Flat          uint8
ST_Slope_Up            uint8
HeartDisease           int64
dtype: object

In [363]:
df = df_dummies.astype({"Age" : 'float' , "RestingBP" : 'float' ,"Cholesterol" : 'float' ,"FastingBS" : 'float' ,
                                "MaxHR" : 'float' ,"Sex_F" : 'float' ,"Sex_M" : 'float' ,"ExerciseAngina_N" : 'float' ,
                                "ExerciseAngina_Y" : 'float' ,"ChestPainType_ASY" : 'float' ,"ChestPainType_ATA" : 'float' ,
                                "ChestPainType_NAP" : 'float' ,"ChestPainType_TA" : 'float' ,"RestingECG_LVH" : 'float' ,
                                "RestingECG_Normal" : 'float' ,"RestingECG_ST" : 'float' ,"ST_Slope_Down" : 'float' ,"ST_Slope_Flat" : 'float' ,"ST_Slope_Up" : 'float'})

In [364]:
df.dtypes

Age                  float64
RestingBP            float64
Cholesterol          float64
FastingBS            float64
MaxHR                float64
Oldpeak              float64
Sex_F                float64
Sex_M                float64
ExerciseAngina_N     float64
ExerciseAngina_Y     float64
ChestPainType_ASY    float64
ChestPainType_ATA    float64
ChestPainType_NAP    float64
ChestPainType_TA     float64
RestingECG_LVH       float64
RestingECG_Normal    float64
RestingECG_ST        float64
ST_Slope_Down        float64
ST_Slope_Flat        float64
ST_Slope_Up          float64
HeartDisease           int64
dtype: object

In [365]:
# Noticed that several patients have incomplete data in Cholestrol- filtering out those incomplete records
df1 = df[df.Cholesterol > 0]
df1.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ExerciseAngina_N,ExerciseAngina_Y,...,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,HeartDisease
0,40.0,140.0,289.0,0.0,172.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
1,49.0,160.0,180.0,0.0,156.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
2,37.0,130.0,283.0,0.0,98.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
3,48.0,138.0,214.0,0.0,108.0,1.5,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
4,54.0,150.0,195.0,0.0,122.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0


In [366]:
# Adding a MinMax scaler to the data to get all data points between 0 and 1 before I start Logistic Regression analysis

df_scaled = df1

MMscal=MinMaxScaler()
features=['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'Sex_F', 'Sex_M', 'ExerciseAngina_N', 'ExerciseAngina_Y',
       'ChestPainType_ASY', 'ChestPainType_ATA', 'ChestPainType_NAP',
       'ChestPainType_TA', 'RestingECG_LVH', 'RestingECG_Normal',
       'RestingECG_ST', 'ST_Slope_Down', 'ST_Slope_Flat', 'ST_Slope_Up']
df_scaled[features] = MMscal.fit_transform(df1[features])
df_scaled.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ExerciseAngina_N,ExerciseAngina_Y,...,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,HeartDisease
0,0.244898,0.444444,0.393822,0.0,0.774436,0.015873,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
1,0.428571,0.62963,0.183398,0.0,0.654135,0.174603,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
2,0.183673,0.351852,0.382239,0.0,0.218045,0.015873,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
3,0.408163,0.425926,0.249035,0.0,0.293233,0.253968,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
4,0.530612,0.537037,0.212355,0.0,0.398496,0.015873,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0


In [367]:
#splitting the dataset into training and test sets using 8:2 ratio

X = df_scaled.drop('HeartDisease', axis = 1)
y = df_scaled.HeartDisease


X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [368]:
class Logistic_Regression:
    
    def __init__(self, lr = 0.001, n_iters = 1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        for _ in range(self.n_iters):
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)
            
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)
            
            self.weights -= self.lr * dw
            self.bias -= self.lr * db
    
    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        y_predicted_class = [1 if i > 0.5 else 0 for i in y_predicted]
        return y_predicted_class
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    

In [369]:
model = Logistic_Regression(lr = 0.0001, n_iters = 1000)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [370]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [371]:
print(confusion_matrix(y_test, predictions))

[[76 11]
 [ 7 56]]


In [372]:
accuracy = accuracy_score(y_test, predictions)
print(accuracy)

0.88


In [373]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89        87
           1       0.84      0.89      0.86        63

    accuracy                           0.88       150
   macro avg       0.88      0.88      0.88       150
weighted avg       0.88      0.88      0.88       150



In [374]:
# running data through pre-built sklearn function to see how my formula compares

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)
predict = lr.predict(X_test)

print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))
accuracy1 = accuracy_score(y_test, predict)
print("accuracy:" , accuracy1)

[[77 10]
 [ 6 57]]
              precision    recall  f1-score   support

           0       0.93      0.89      0.91        87
           1       0.85      0.90      0.88        63

    accuracy                           0.89       150
   macro avg       0.89      0.89      0.89       150
weighted avg       0.90      0.89      0.89       150

accuracy: 0.8933333333333333


In [375]:
# so my model isn't performing as well as the pre-built sklearn logistic regression function, but is close
# I now want to look at adding Kfold cross validation technique to my model to see if there is an improvement in accuracy

In [394]:
from sklearn.model_selection import KFold, StratifiedKFold

k = 10
k_fold = KFold(n_splits = k, random_state = None)
model = Logistic_Regression(lr = 0.0001, n_iters = 10000)

acc_scores = []

In [392]:
for training_index, testing_index in k_fold.split(X):  
    X_train, X_test = X.iloc[training_index,:], X.iloc[testing_index,:]  
    y_train, y_test = y.iloc[training_index] , y.iloc[testing_index]  
    
    model.fit(X_train, y_train)
    
    predictions = model.predict(X_test)
    
    acc = accuracy_score(predictions, y_test)
    acc_scores.append(acc)

In [393]:
mean_acc_score = sum(acc_scores) / k  
   
print("Accuracy score of each fold: ", acc_scores)  
print("Mean accuracy score: ", mean_acc_score)  

Accuracy score of each fold:  [0.9066666666666666, 0.8666666666666667, 0.9333333333333333, 0.9066666666666666, 0.9333333333333333, 0.84, 0.7432432432432432, 0.7972972972972973, 0.8108108108108109, 0.7567567567567568]
Mean accuracy score:  0.8494774774774774


In [310]:
# so adding KFold cross validation did not improve my model's accuracy score (average) although several folds were higher
# will try looking at Stratified KFold method to see if having same fold proportion across all labels improves performance

In [398]:
k = 5
stratk_fold = StratifiedKFold(n_splits = k, random_state = None)
model = Logistic_Regression(lr = 0.0001, n_iters = 10000)

acc_scores = []

for training_index, testing_index in stratk_fold.split(X,y):  
    X_train, X_test = X.iloc[training_index,:], X.iloc[testing_index,:]  
    y_train, y_test = y.iloc[training_index] , y.iloc[testing_index]  
    
    model.fit(X_train, y_train)
    
    predictions = model.predict(X_test)
    
    acc = accuracy_score(predictions, y_test)
    acc_scores.append(acc)
    
mean_acc_score = sum(acc_scores) / k  
   
print("Accuracy score of each fold: ", acc_scores)  
print("Mean accuracy score: ", mean_acc_score) 

Accuracy score of each fold:  [0.88, 0.9194630872483222, 0.87248322147651, 0.7785234899328859, 0.7919463087248322]
Mean accuracy score:  0.8484832214765099


In [None]:
# running a high iteration number takes several minutes to compute and may not be as efficient when computing larger datasets
# there was no noticeable difference in running different KFold methods in overall accuracy