In [112]:
# Importing Important Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [125]:
# Importing and Customizing the datase
df = pd.read_csv('Heart_Disease.csv')
df = df.drop_duplicates()
df = df.dropna()
df = df.interpolate()
print(df.head())
df = df[['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'TenYearCHD']]
print(df.head())

# All continuous features
x1 = np.array(df['age'])
x2 = np.array(df['cigsPerDay'])
x3 = np.array(df['totChol'])
x4 = np.array(df['sysBP'])
x5 = np.array(df['diaBP'])
y = np.array(df['TenYearCHD'])

   male  age  education  currentSmoker  cigsPerDay  BPMeds  prevalentStroke  \
0     1   39        4.0              0         0.0     0.0                0   
1     0   46        2.0              0         0.0     0.0                0   
2     1   48        1.0              1        20.0     0.0                0   
3     0   61        3.0              1        30.0     0.0                0   
4     0   46        3.0              1        23.0     0.0                0   

   prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  heartRate  glucose  \
0             0         0    195.0  106.0   70.0  26.97       80.0     77.0   
1             0         0    250.0  121.0   81.0  28.73       95.0     76.0   
2             0         0    245.0  127.5   80.0  25.34       75.0     70.0   
3             1         0    225.0  150.0   95.0  28.58       65.0    103.0   
4             0         0    285.0  130.0   84.0  23.10       85.0     85.0   

   TenYearCHD  
0           0  
1           0  
2 

In [142]:
# Performaing train test split
x1, x1_t, \
x2, x2_t, \
x3, x3_t, \
x4, x4_t, \
x5, x5_t, \
y_train, y_test = train_test_split(
    x1, x2, x3, x4, x5, y,
    test_size = 0.2,
    random_state = 42
)

# Extracting minimum and maximum values of all
x1_min = min(x1)
x1_max = max(x1)
x2_min = min(x2)
x2_max = max(x2)
x3_min = min(x3)
x3_max = max(x3)
x4_min = min(x4)
x4_max = max(x4)
x5_min = min(x5)
x5_max = max(x5)
x1_test_min = min(x1_test)
x1_test_max = max(x1_test)
x2_test_min = min(x2_test)
x2_test_max = max(x2_test)
x3_test_min = min(x3_test)
x3_test_max = max(x3_test)
x4_test_min = min(x4_test)
x4_test_max = max(x4_test)
x5_test_min = min(x5_test)
x5_test_max = max(x5_test)

# Feature Scaling
def feature_scaling(x, x_min, x_max):
    return (x - x_min) / (x_max - x_min)

x1_train = feature_scaling(x1, x1_min, x1_max)
x2_train = feature_scaling(x2, x2_min, x2_max)
x3_train = feature_scaling(x3, x3_min, x3_max)
x4_train = feature_scaling(x4, x4_min, x4_max)
x5_train = feature_scaling(x5, x5_min, x5_max)
x1_test = feature_scaling(x1_test, x1_test_min, x1_test_max)
x2_test = feature_scaling(x2_test, x2_test_min, x2_test_max)
x3_test = feature_scaling(x3_test, x3_test_min, x3_test_max)
x4_test = feature_scaling(x4_test, x4_test_min, x4_test_max)
x5_test = feature_scaling(x5_test, x5_test_min, x5_test_max)

In [146]:
# Training model on data and getting results
model = LogisticRegression()
model.fit(
    np.array([x1_train, x2_train, x3_train, x4_train, x5_train]).T,
    y_train
)

# Predicting probabilities (not classes)
y_pred_prob = model.predict_proba(
    np.array([x1_train, x2_train, x3_train, x4_train, x5_train]).T
)[:, 1]  # probabilities of class 1 (CHD = 1)

# Custom threshold
threshold = 0.2

# Applying on training data
y_pred = (y_pred_prob >= threshold).astype(int)

# Evaluate performance
print("Threshold:", threshold)
print("Accuracy:", accuracy_score(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

# Predict probabilities on test data
y_test_pred_prob = model.predict_proba(
    np.array([x1_test, x2_test, x3_test, x4_test, x5_test]).T
)[:, 1]

# Applying on test data
y_test_pred = (y_test_pred_prob >= threshold).astype(int)

# Evaluate performance for test data
print("Threshold:", threshold)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Threshold: 0.2
Accuracy: 0.746922024623803
[[1967  522]
 [ 218  217]]
              precision    recall  f1-score   support

           0       0.90      0.79      0.84      2489
           1       0.29      0.50      0.37       435

    accuracy                           0.75      2924
   macro avg       0.60      0.64      0.61      2924
weighted avg       0.81      0.75      0.77      2924

Threshold: 0.2
Test Accuracy: 0.726775956284153
[[481 129]
 [ 71  51]]
              precision    recall  f1-score   support

           0       0.87      0.79      0.83       610
           1       0.28      0.42      0.34       122

    accuracy                           0.73       732
   macro avg       0.58      0.60      0.58       732
weighted avg       0.77      0.73      0.75       732



In [133]:
# Writing custom Logistic Regression code with regularization
class LogisticRegressionCustom:
    def __init__(self, x1, x2, x3, x4, x5, y):
        self.x1 = x1
        self.x2 = x2
        self.x3 = x3
        self.x4 = x4
        self.x5 = x5
        self.y = y
        self.m = len(x1)

    def calculate_hypothesis(
        self, weight1, weight2, weight3, weight4, weight5,
        bias, x1_data, x2_data, x3_data, x4_data, x5_data
        ):

        hypothesis = []

        for i in range(len(x1_data)):
          z = weight1 * x1_data[i] + weight2 * x2_data[i] + weight3 * x3_data[i] + weight4 * x4_data[i] + weight5 * x5_data[i] + bias
          hypothesis.append(1 / (1 + np.exp(-z)))
        return np.array(hypothesis) # Convert the list to a NumPy array

    def calculate_cost(
        self, weight1, weight2, weight3, weight4, weight5,
        bias, lambda_
        ):

        hypothesis = self.calculate_hypothesis(
                          weight1, weight2, weight3, weight4, weight5,
                          bias, self.x1, self.x2, self.x3, self.x4, self.x5
                          )

        loss = -self.y * np.log(hypothesis) - (1 - self.y) * np.log(1 - hypothesis)
        cost = np.sum(loss) / self.m
        regularization_term = (lambda_ / (2 * self.m)) * (weight1 ** 2 + weight2 ** 2 + weight3 ** 2 + weight4 ** 2 + weight5 ** 2)
        cost += regularization_term
        return cost

    def gradient_descent(self, learning_rate, weight1, weight2, weight3, weight4, weight5, bias, lambda_):
        hypothesis = self.calculate_hypothesis(
                          weight1, weight2, weight3, weight4, weight5,
                          bias, self.x1, self.x2, self.x3, self.x4, self.x5
                          )

        weight1_gradient = 0
        weight2_gradient = 0
        weight3_gradient = 0
        weight4_gradient = 0
        weight5_gradient = 0
        bias_gradient = 0

        for i in range(self.m):
            weight1_gradient += (hypothesis[i] - self.y[i]) * self.x1[i]
            weight2_gradient += (hypothesis[i] - self.y[i]) * self.x2[i]
            weight3_gradient += (hypothesis[i] - self.y[i]) * self.x3[i]
            weight4_gradient += (hypothesis[i] - self.y[i]) * self.x4[i]
            weight5_gradient += (hypothesis[i] - self.y[i]) * self.x5[i]
            bias_gradient += (hypothesis[i] - self.y[i])

        # Apply averaging and regularization correctly
        weight1_gradient = (weight1_gradient / self.m) + (lambda_ / self.m) * weight1
        weight2_gradient = (weight2_gradient / self.m) + (lambda_ / self.m) * weight2
        weight3_gradient = (weight3_gradient / self.m) + (lambda_ / self.m) * weight3
        weight4_gradient = (weight4_gradient / self.m) + (lambda_ / self.m) * weight4
        weight5_gradient = (weight5_gradient / self.m) + (lambda_ / self.m) * weight5
        bias_gradient = bias_gradient / self.m

        # Update weights
        weight1 -= learning_rate * weight1_gradient
        weight2 -= learning_rate * weight2_gradient
        weight3 -= learning_rate * weight3_gradient
        weight4 -= learning_rate * weight4_gradient
        weight5 -= learning_rate * weight5_gradient
        bias -= learning_rate * bias_gradient

        return weight1, weight2, weight3, weight4, weight5, bias

    def train(self, learning_rate, weight1, weight2, weight3, weight4, weight5, bias, lambda_, epochs):
        cost_history = []
        weight1_history = []
        weight2_history = []
        weight3_history = []
        weight4_history = []
        weight5_history = []
        bias_history = []

        for i in range(epochs):
            weight1, weight2, weight3, weight4, weight5, bias = self.gradient_descent(
                                                                    learning_rate, weight1, weight2, weight3, weight4, weight5,
                                                                    bias, lambda_
                                                                    )

            cost = self.calculate_cost(weight1, weight2, weight3, weight4, weight5, bias, lambda_)

            weight1_history.append(weight1)
            weight2_history.append(weight2)
            weight3_history.append(weight3)
            weight4_history.append(weight4)
            weight5_history.append(weight5)
            bias_history.append(bias)
            cost_history.append(cost)

        return cost_history, weight1_history, weight2_history, weight3_history, weight4_history, weight5_history, bias_history


In [150]:
# Calling my own Logistic Regression Function for computation
model = LogisticRegressionCustom(x1_train, x2_train, x3_train, x4_train, x5_train, y_train)

# Call the train method and unpack its return values
cost_history, weight1_history, weight2_history, weight3_history, weight4_history, weight5_history, bias_history = model.train(
    learning_rate=0.01, weight1=1, weight2=1, weight3=1, weight4=1, weight5=1, bias=1, lambda_=0.9, epochs=1000
)

# Extrating final values to pass on predictions
w1 = weight1_history[-1]
w2 = weight2_history[-1]
w3 = weight3_history[-1]
w4 = weight4_history[-1]
w5 = weight5_history[-1]
b = bias_history[-1]

# Checking for Probabilities
def predict(weight1, weight2, weight3, weight4, weight5, bias, threshold):
    hypothesis = model.calculate_hypothesis(
                      weight1, weight2, weight3, weight4, weight5,
                      bias, x1_train, x2_train, x3_train, x4_train, x5_train
                      )

    y_pred = (hypothesis >= threshold).astype(int)
    return y_pred

threshold = 0.2
y_pred = predict(w1, w2, w3, w4, w5, b, threshold)

# Evaluate performance for training data
print("Threshold:", threshold)
print("Accuracy:", accuracy_score(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

# Applying on test data
y_test_pred_prob = model.calculate_hypothesis(
                      w1, w2, w3, w4, w5,
                      b, x1_test, x2_test, x3_test, x4_test, x5_test
                      )
y_test_pred = (y_test_pred_prob >= threshold).astype(int)

# Evaluate performance for test data
print("Threshold:", threshold)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Threshold: 0.2
Accuracy: 0.622093023255814
[[1601  888]
 [ 217  218]]
              precision    recall  f1-score   support

           0       0.88      0.64      0.74      2489
           1       0.20      0.50      0.28       435

    accuracy                           0.62      2924
   macro avg       0.54      0.57      0.51      2924
weighted avg       0.78      0.62      0.67      2924

Threshold: 0.2
Test Accuracy: 0.5819672131147541
[[366 244]
 [ 62  60]]
              precision    recall  f1-score   support

           0       0.86      0.60      0.71       610
           1       0.20      0.49      0.28       122

    accuracy                           0.58       732
   macro avg       0.53      0.55      0.49       732
weighted avg       0.75      0.58      0.63       732



Regularization avoided larger weight values to cause overfitting on the training data.