In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Framingham data set and drop every row with a missing value.
df = pd.read_csv("framingham.csv")
df = df.dropna()
print(df.columns)
print(df.head(20))
# The response column is kept separately as a NumPy array of labels.
labels = df.TenYearCHD.to_numpy()
print(labels)
# Sanity check on class imbalance: take a deliberately high-risk subgroup
# (hypertensive, education level 1, total cholesterol between 250 and 400,
# age <= 60, current smoker) and report the share of it with TenYearCHD == 1.
# The original note reports that even such a subgroup has only about 25%
# positives. The mask is built once and the rate is printed explicitly,
# because a bare expression in the middle of a cell is never displayed.
high_risk = (
    (df.prevalentHyp == 1)
    & (df.education == 1)
    & (df.totChol <= 400)
    & (df.totChol >= 250)
    & (df.age <= 60)
    & (df.currentSmoker == 1)
)
# The mean of a boolean Series is the fraction of True values; unlike
# len(...) / len(...), it yields NaN instead of raising when no row matches.
high_risk_chd_rate = (df.loc[high_risk, "TenYearCHD"] == 1).mean()
print("CHD rate in high-risk subgroup:", high_risk_chd_rate)
# Remove the response so that `df` holds features only from here on.
df = df.drop(columns=["TenYearCHD"])

Next, I analyze the correlations between features to find the most relevant ones and improve the accuracy; in practice, this did not help much.

In [None]:
# Pairwise correlations of the feature columns, drawn as an annotated
# heatmap on a fixed [-1, 1] colour scale.
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Heatmap of Correlation Matrix')
plt.show()

In [None]:
from scipy.stats import pointbiserialr

# Pairwise correlation matrix computed with scipy's point-biserial routine.
# NOTE: the loop runs over *all* feature columns, not only binary ones, so
# despite the "phi" name this is a general pairwise correlation table.
# The matrix is created as float (not object) so it can be plotted directly;
# the diagonal is never assigned and therefore stays NaN.
feature_names = list(df.columns)
phi_corr_matrix = pd.DataFrame(
    np.nan, index=feature_names, columns=feature_names, dtype=float
)

# The coefficient is symmetric in its two arguments, so each unordered pair
# is computed once and mirrored into both triangles of the matrix.
for i, col1 in enumerate(feature_names):
    for col2 in feature_names[i + 1:]:
        phi_corr, _ = pointbiserialr(df[col1], df[col2])
        phi_corr_matrix.loc[col1, col2] = phi_corr
        phi_corr_matrix.loc[col2, col1] = phi_corr

# Print the correlation matrix
print("Phi Correlation Matrix:")
print(phi_corr_matrix)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the pairwise matrix from the previous cell as an annotated heatmap;
# the cast to float is needed when the matrix holds object-typed entries.
fig, ax = plt.subplots(figsize=(8, 6))
phi_values = phi_corr_matrix.astype(float)
sns.heatmap(phi_values, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Phi Correlation Matrix Heatmap')
plt.show()


In [None]:
# Exploratory plot: several scatter plots between feature pairs were tried
# here to see whether transforming the features (log, exp, square root,
# square) would help. None of the transforms had a significant impact on
# accuracy; the author attributes the modest accuracy to the many outliers
# in the data.
smoker_root = np.sqrt(df["currentSmoker"])
cigs_root = np.sqrt(df["cigsPerDay"])
plt.scatter(smoker_root, cigs_root)
print(df.columns)

In [None]:
# The blood-pressure columns have a noticeable effect on the response and
# are strongly inter-related, so product (interaction) features are added.
hypertension = df["prevalentHyp"]
interaction_terms = {
    "nf1": hypertension * df["diaBP"],
    "nf2": hypertension * df["sysBP"],
    # Emphasises hypertension, which correlates better with the 10-year CHD
    # response than the other features do.
    "nf3": df["sysBP"] * df["diaBP"] * hypertension,
}
for feature_name, feature_values in interaction_terms.items():
    df[feature_name] = feature_values
# Other products that were tried and left disabled: age * sysBP,
# glucose * diabetes and age * totChol.
# Adding the features above has almost no effect on the accuracy.

In [None]:
def another_preprocess_data(df, labels):
    """Split features and labels 80/20 (in row order) and standardise them.

    The first four fifths of the rows form the training set and the rest the
    test set; rows are not shuffled. Mean and standard deviation are
    estimated on the training rows only and then applied to both parts, so
    no information from the test rows leaks into the scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table, one row per sample.
    labels : sequence
        One label per row of ``df``, in the same order.

    Returns
    -------
    tuple
        ``(train_features, test_features, train_labels, test_labels)`` where
        the feature parts are NumPy arrays and the label parts are slices of
        ``labels``.

    Raises
    ------
    ValueError
        If ``labels`` and ``df`` differ in length, or if there are too few
        rows to form a non-empty training set.
    """
    n_rows = len(df)
    if len(labels) != n_rows:
        raise ValueError(
            f"labels has {len(labels)} entries but df has {n_rows} rows"
        )
    l_train = (4 * n_rows) // 5
    if l_train == 0:
        raise ValueError("need at least 2 rows to form a training set")

    train_part = df.iloc[:l_train]
    test_part = df.iloc[l_train:]

    # Fit the scaling on the training rows only (population std, ddof=0).
    mean_val = train_part.mean(axis=0)
    std_dev_val = train_part.std(axis=0, ddof=0)
    # A constant column has std 0; dividing by 1 instead keeps it at 0
    # after centring rather than producing NaN/inf.
    std_dev_val = std_dev_val.where(std_dev_val != 0, 1.0)

    train_df = ((train_part - mean_val) / std_dev_val).to_numpy()
    test_df = ((test_part - mean_val) / std_dev_val).to_numpy()
    return train_df, test_df, labels[:l_train], labels[l_train:]

In [None]:
# Standardise the features and split features/labels into train and test parts.
(
    train_data,
    test_data,
    train_labels,
    test_labels,
) = another_preprocess_data(df, labels)

In [None]:
# Quick look at the training split: feature-matrix shape, then the labels.
print(train_data.shape, train_labels, sep="\n")

In [None]:
class logistic_regression:
    """Binary logistic regression fitted with full-batch gradient descent.

    Features are standardised with the mean and standard deviation of the
    training data. The same scaling is applied in ``train``, ``evaluate``
    and ``predict``, so all three operate in one feature space and accept
    data on the same (un-normalised) scale.

    Parameters
    ----------
    a : float
        Learning rate of the gradient-descent update.
    epochs : int
        Number of full passes over the training data.
    threshold : float, optional
        Probability above which a sample is classified as 1 (default 0.15).
    log_every : int or None, optional
        Print the training false-negative / false-positive rates every
        ``log_every`` epochs. The default of 1 prints after every epoch;
        0 or None disables the output.
    """

    def __init__(self, a, epochs, threshold=0.15, log_every=1):
        self.a = a
        self.epochs = epochs
        self.threshold = threshold
        self.log_every = log_every

    def sigmoid(self, n):
        """Return the logistic function 1 / (1 + exp(-n)) element-wise."""
        # Clipping keeps exp() within float64 range; outside [-500, 500]
        # the sigmoid is already indistinguishable from 0 or 1.
        n = np.clip(np.asarray(n, dtype=float), -500.0, 500.0)
        return 1.0 / (1.0 + np.exp(-n))

    def normalize(self, data):
        """Standardise ``data`` with the statistics stored by ``train``."""
        return (np.asarray(data, dtype=float) - self.mean) / self.std

    def update_w_and_b(self, data, labels, w, b, a):
        """Perform one gradient-descent step and return the new ``(w, b)``.

        ``data`` has shape (n, feat), ``labels`` shape (n,) and ``w`` shape
        (feat,). The inputs are not modified; updated copies are returned.
        """
        n = np.shape(data)[0]
        # Residual between predicted probability and the 0/1 label.
        diff = self.sigmoid(np.dot(data, w) + b) - labels
        dw = np.dot(data.T, diff) / n
        db = np.sum(diff) / n
        return w - a * dw, b - a * db

    def train(self, data, labels):
        """Fit weights and bias on ``data`` (n, feat) and ``labels`` (n,).

        Stores the fitted parameters in ``self.w`` / ``self.b`` and the
        training-set scaling in ``self.mean`` / ``self.std``.

        Raises
        ------
        ValueError
            If ``data`` is not a non-empty 2-D array or the number of labels
            does not match the number of rows.
        """
        data = np.asarray(data, dtype=float)
        labels = np.asarray(labels, dtype=float)
        if data.ndim != 2 or data.shape[0] == 0:
            raise ValueError("data must be a non-empty 2-D array")
        if labels.shape[0] != data.shape[0]:
            raise ValueError("labels and data must have the same number of rows")

        self.w = np.zeros(data.shape[1], dtype=float)
        self.b = 0.0
        self.mean = np.mean(data, axis=0)
        std = np.std(data, axis=0)
        # Constant features have std 0; dividing by 1 leaves them at 0
        # after centring instead of producing NaN/inf.
        self.std = np.where(std == 0, 1.0, std)

        # Learn on the same standardised features that predict() will use.
        scaled = self.normalize(data)
        for epoch in range(self.epochs):
            self.w, self.b = self.update_w_and_b(
                scaled, labels, self.w, self.b, self.a
            )
            if self.log_every and (epoch + 1) % self.log_every == 0:
                fnp, fpp = self._error_rates(self._classify(scaled), labels)
                print("fnr: ", fnp, "fpr, ", fpp)

    def _classify(self, scaled):
        """Return 0.0/1.0 predictions for already standardised features."""
        probabilities = self.sigmoid(np.dot(scaled, self.w) + self.b)
        return (probabilities > self.threshold).astype(float)

    @staticmethod
    def _error_rates(predictions, labels):
        """Return (false-negative rate, false-positive rate) as floats."""
        labels = np.asarray(labels)
        positives = labels == 1
        negatives = labels == 0
        n_pos = int(np.count_nonzero(positives))
        n_neg = int(np.count_nonzero(negatives))
        missed = int(np.count_nonzero(positives & (predictions == 0)))
        false_alarms = int(np.count_nonzero(negatives & (predictions == 1)))
        # A rate is undefined (NaN) when its class is absent from labels.
        fnr = missed / n_pos if n_pos else float("nan")
        fpr = false_alarms / n_neg if n_neg else float("nan")
        return fnr, fpr

    def evaluate(self, data, labels):
        """Return (false-negative rate, false-positive rate) on ``data``.

        ``data`` is on the same scale as the data passed to ``train``; the
        predictions are exactly those returned by ``predict(data)``.
        """
        return self._error_rates(self.predict(data), labels)

    def predict(self, X):
        """Return an array of 0.0/1.0 class predictions for the rows of ``X``."""
        return self._classify(self.normalize(X))

In [None]:
# Fit the hand-written logistic regression on the training split, using a
# learning rate of 0.01 and 10000 gradient-descent epochs.
model = logistic_regression(epochs=10000, a=0.01)
model.train(data=train_data, labels=train_labels)

In [None]:
# Class predictions of the fitted model for the held-out test split.
new_prediction = model.predict(X=test_data)

In [None]:
# Shapes of the predictions and of the test labels, one per line.
print(new_prediction.shape, test_labels.shape, sep="\n")

In [None]:
# Report the false-negative and false-positive rates on the test split.
test_fnr, test_fpr = model.evaluate(data=test_data, labels=test_labels)
for report_line in (
    f"Test False negative rate: {test_fnr}",
    f"Test False positive rate: {test_fpr}",
):
    print(report_line)