# Tools


In [None]:
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from numpy import array
from numpy import mean
from numpy import std
from pandas import DataFrame
from sklearn import linear_model
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, f1_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

# Data Loading

In [None]:
# Load the dataset from the CSV file (comma is pandas' default separator).
chd_raw = pd.read_csv('chd-data.csv')

# Preview the first five rows.
chd_raw.head()


Here we're renaming the column "male" to "gender", because "male" describes a value rather than the variable, and "TenYearCHD" to "CHD" because it is simpler.

In [None]:
# Give the two awkwardly named columns shorter, clearer names.
chd_raw = chd_raw.rename(columns=dict(male='gender', TenYearCHD='CHD'))

In [None]:
# Preview the first rows again to check the column names.
chd_raw.head()

# Data observation

In [None]:
# Dimensions of the dataset as (number of rows, number of columns).
chd_raw.shape

In [None]:
# Per-column dtype and non-null count, plus memory usage.
chd_raw.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
chd_raw.describe()

Inspecting the numeric columns to see which variables are categorical

In [None]:
# Select the numeric (and boolean) columns through the public pandas API
# instead of the private `_get_numeric_data`, and preview only the first
# rows rather than rendering the whole frame.
chd_raw.select_dtypes(include=['number', 'bool']).head()

# Exploratory Data Analysis

Here we're checking how the dataset is distributed on our target variable

In [None]:
sns.set_style("whitegrid")

# Class balance of the target: number of rows per 'CHD' value.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(x='CHD', data=chd_raw, ax=ax)
ax.set_title('Distribution of Target Variable (Coronary Heart Disease)')
plt.show()

Interpretation of the result:

-
-
-
-
-


Here we're plotting the distribution of the categorical variables

In [None]:
categorical_variables = [
    'gender', 'currentSmoker', 'BPMeds',
    'prevalentStroke', 'prevalentHyp', 'diabetes',
]

# One count plot per categorical variable, laid out on a 3x2 grid.
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15, 20))

for ax, column in zip(axs.ravel(), categorical_variables):
    sns.countplot(x=column, data=chd_raw, ax=ax)
    # Keep the category labels horizontal.
    plt.setp(ax.get_xticklabels(), rotation=0)

plt.tight_layout()
plt.show()

Interpretation of the result:

-
-
-
-
-

Here we're checking the variables with categorical values against the target variable "CHD"

In [None]:
# Count plots of each categorical variable, split by the 'CHD' outcome (hue).
fig, axs = plt.subplots(3, 2, figsize=(20, 20))

for ax, column in zip(axs.ravel(), categorical_variables):
    sns.countplot(data=chd_raw, x=column, hue='CHD', ax=ax)
    # Keep the category labels horizontal.
    plt.setp(ax.get_xticklabels(), rotation=0)

plt.tight_layout()
plt.show()

Interpretation of the result:

-
-
-
-
-

Here we're plotting a heatmap of the pairwise correlations between all variables

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
corr_matrix = chd_raw.corr()

fig, ax = plt.subplots(figsize=(20, 10))
# annot writes the coefficient into each cell; linewidths leaves a thin gap between cells.
sns.heatmap(corr_matrix, annot=True, linewidths=0.1, ax=ax)
plt.show()

Interpretation of the result:

-
-
-
-
-

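Here we're plotting the correlation of each feature with the target variable "CHD", sorted from highest to lowest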

In [None]:
# Compute the correlation of each feature with the target variable 'CHD'
# (the target's correlation with itself is dropped).
# NOTE(review): the '_one_hot' suffix in these names looks like a leftover; no
# one-hot encoding is visible in this notebook - consider renaming.
target_corr_one_hot = chd_raw.corr()['CHD'].drop('CHD')

# Sort correlation values in descending order
target_corr_sorted_one_hot = target_corr_one_hot.sort_values(ascending=False)

# Plot a single-column heatmap of the correlations with the target column
plt.figure(figsize=(5, 10))
# sns.set is called before the heatmap draws, so the font scale applies to this plot;
# it changes seaborn's global settings, so later plots are affected too.
sns.set(font_scale=0.8)
sns.heatmap(target_corr_sorted_one_hot.to_frame(), annot=True, cbar=False)
plt.title('Correlation with CHD')
plt.show()

Interpretation of the result:

-
-
-
-
-

Here we're creating histograms to see the distribution of data

In [None]:
# Histogram of every column, to inspect the shape of each distribution.
# pandas builds the subplot grid itself from `figsize`; passing it a single
# pre-made Axes makes it clear that figure and emit a UserWarning (hidden by
# the notebook's warning filter).
chd_raw.hist(figsize=(15, 15))
plt.show()

Interpretation of the result:

-
-


# Data Cleaning

Then we're checking how many duplicates there are in our dataset

In [None]:
# Number of rows that are exact duplicates of an earlier row.
chd_raw.duplicated().sum()

There are 0 duplicates, so nothing needs to be done here

Finding how many rows have at least one missing value, and how many values are missing in each column

In [None]:
# Flag each row that contains at least one NaN, then count the flagged rows.
row_has_nan = chd_raw.isnull().any(axis='columns')
missing_rows = row_has_nan.sum()
missing_rows

In [None]:
# Number of missing values in each column.
null_counts = chd_raw.isnull().sum(axis=0)
null_counts

Finding how many rows have more than 1 missing value

In [None]:
# Count the NaNs in each row and flag rows that are missing more than one value.
nan_per_row = chd_raw.isna().sum(axis='columns')
more_than_one_missing = nan_per_row.gt(1)

# Summing the boolean flags gives the number of affected rows.
more_than_one_missing_count = more_than_one_missing.sum()
more_than_one_missing_count

Result: we want to drop the rows that have more than 1 missing value, because 61 rows is a very small share of the more than 4000 instances

We didn't use dummy variables because every column contains only numeric values.

Here we're dropping the rows that have more than 1 missing value

In [None]:
# Keep only rows with at most one missing value, i.e. rows that have at
# least (number of columns - 1) non-null cells.
min_non_null = chd_raw.shape[1] - 1
chd_df = chd_raw.dropna(thresh=min_non_null)

In [None]:
# Missing values remaining in each column of chd_df.
chd_df.isnull().sum()

We're dropping the variable 'education' because there's no explanation of the numbers in the column

In [None]:
# Drop 'education' by producing a new frame instead of mutating in place.
# errors='ignore' keeps the cell re-runnable: the in-place version raised a
# KeyError on a second run because the column was already gone.
chd_df = chd_df.drop(columns=['education'], errors='ignore')

In [None]:
# Missing values remaining in each column of chd_df.
chd_df.isnull().sum()

Now we're filling in the rest of the missing values - decision between mode, mean or median is based on the histograms above 

In [None]:
# Fill the remaining gaps with one fill value per column (mode, median or mean).
# `chd_df[col].fillna(..., inplace=True)` is chained assignment: it acts on a
# temporary Series and does not update chd_df under pandas Copy-on-Write, so
# the imputed frame is built explicitly and re-bound instead.
fill_values = {
    'cigsPerDay': chd_df['cigsPerDay'].mode()[0],
    'BPMeds': chd_df['BPMeds'].mode()[0],
    'totChol': chd_df['totChol'].median(),
    'BMI': chd_df['BMI'].median(),
    'heartRate': chd_df['heartRate'].mean(),
    'glucose': chd_df['glucose'].median(),
}
chd_df = chd_df.fillna(value=fill_values)

In [None]:
# Missing values remaining in each column of chd_df after filling.
chd_df.isnull().sum()

# Feature Selection

Here we're checking which variables have the strongest association with the target variable, using SelectKBest with the chi-squared (Chi2) score

In [None]:
# Split features and target by column NAME rather than by position, so the
# selection stays correct if the column order or count of chd_df changes.
X = chd_df.drop(columns='CHD')
y = chd_df['CHD']

# Score every feature against the target with the chi-squared statistic.
# Only the fitted scores are used below; k does not affect them.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)

# One row per feature: its name ('Specs') and its chi-squared score.
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})

# Show all features, highest score first (no hard-coded feature count).
print(featureScores.nlargest(len(featureScores), 'Score'))

Based on the above results we have chosen to drop BMI, heartRate and currentSmoker as they have the least impact on target variable.

# Logistic Regression

In [None]:
# Features used for modelling (BMI, heartRate and currentSmoker are left out).
columns_new = [
    'sysBP', 'glucose', 'age', 'totChol', 'cigsPerDay', 'diaBP',
    'prevalentHyp', 'diabetes', 'BPMeds', 'gender', 'prevalentStroke',
]
X = chd_df.loc[:, columns_new]
y = chd_df.loc[:, "CHD"]

# Hold out a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

split_shapes = (
    "Xtrain", X_train.shape,
    "y_train", y_train.shape,
    "X_test", X_test.shape,
    "y_test", y_test.shape,
)
print(split_shapes)

In [None]:
# Oversample the minority class on the TRAINING split only; the test split
# is left untouched so evaluation reflects the real class balance.
sm = SMOTE(random_state=42)

X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print('Without SMOTE')

# Report the class distribution of the training split, matching the shapes
# printed below (the original counted the full `y`).
print('Distribution of No CHD (0) and CHD (1) %s' % Counter(y_train))
print(f'''Shape of X: {X_train.shape}
Shape of y: {y_train.shape}''')
print('')
print('With SMOTE')
# The original referenced an undefined `y_res` here (NameError).
print('Distribution of No CHD (0) and CHD (1) %s ' % Counter(y_train_res))

print(f'''Shape of X: {X_train_res.shape}
Shape of y: {y_train_res.shape}''')

In [None]:
# Share of positive (CHD = 1) labels in the resampled training target.
y_train_res.mean()

WITHOUT SMOTE

In [None]:
# Baseline logistic regression on the original (imbalanced) training split.
# max_iter is raised from the default of 100: on these unscaled features the
# solver can stop before converging (the warning is hidden by the notebook's
# warning filter). 1000 matches the SMOTE model, so the two models differ
# only in their training data.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
log_reg_y_pred = log_reg.predict(X_test)
print("Score on training set: {:.3f}".format(log_reg.score(X_train, y_train)))
print("Score on test set: {:.3f}".format(log_reg.score(X_test, y_test)))

WITH SMOTE

In [None]:
# Logistic regression trained on the SMOTE-resampled training split and
# evaluated on the untouched test split.
log_reg_sm = LogisticRegression(max_iter=1000)
# The original fitted on undefined names `X_res` / `y_res` (NameError); the
# resampled training data is X_train_res / y_train_res.
log_reg_sm.fit(X_train_res, y_train_res)
log_reg_sm_y_pred = log_reg_sm.predict(X_test)
print("Accuracy on training set: {:.3f}".format(log_reg_sm.score(X_train_res, y_train_res)))
print("Accuracy on test set: {:.3f}".format(log_reg_sm.score(X_test, y_test)))

WITHOUT SMOTE

In [None]:
# Per-class precision, recall and F1 on the test split for the model trained without SMOTE.
print(classification_report(y_test, log_reg_y_pred))

- Precision measures the accuracy of positive predictions. The model has a high precision (0.84) for predicting no CHD, meaning that when the model predicts that a patient will not have a CHD, it is correct 84% of the time. However, the precision is much lower for predicting CHD (0.5), meaning that when the model predicts a CHD, it is correct only 50% of the time.

- Recall (also known as sensitivity or true positive rate) measures the fraction of positives that were correctly identified. The model has a recall of 1.0 for no CHD and 0.02 for CHD. This means that the model correctly identifies 100% of the patients who will not have a CHD but only 2% of the patients who will have a CHD. The low recall for CHD indicates that the model is missing a large number of patients who will have a CHD.

- The F1 score is the harmonic mean of precision and recall. The F1 scores for no CHD and CHD are 0.91 and 0.03, respectively. The low F1 score for CHD indicates that the model's performance is very poor when it comes to predicting CHD.

- The accuracy of the model is 0.84, which means that the model correctly predicts whether a patient will have a CHD or not in 84% of the cases. However, accuracy can be misleading when dealing with imbalanced classes, as it is in this case.

WITH SMOTE

In [None]:
# Per-class precision, recall and F1 on the test split for the model trained with SMOTE.
print(classification_report(y_test, log_reg_sm_y_pred))

It's evident that the recall and F1 score for the CHD class are much better with SMOTE

Confusion matrix WITHOUT SMOTE

In [None]:
# Confusion matrix of the baseline model on its own training data.
cm1 = confusion_matrix(y_train, log_reg.predict(X_train))

# Draw on an explicit fresh figure: plt.subplot() re-uses whatever figure is
# current, so the heatmap could land on top of an earlier plot.
fig, ax = plt.subplots()
sns.heatmap(cm1, annot=True, fmt='d', ax=ax)  # fmt='d' prints integer counts
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Logistic Regression - Training',
)
ax.xaxis.set_ticklabels(['0', '1'])
ax.yaxis.set_ticklabels(['0', '1']);


447 people are false negatives (patients with CHD that the model predicted as no CHD), which is not good

Confusion matrix WITH SMOTE

In [None]:
# Confusion matrix of the SMOTE-trained model on the resampled training data.
cm2 = confusion_matrix(y_train_res, log_reg_sm.predict(X_train_res))

# Draw on an explicit fresh figure: plt.subplot() re-uses whatever figure is
# current, so the heatmap could land on top of an earlier plot.
fig, ax = plt.subplots()
sns.heatmap(cm2, annot=True, fmt='d', ax=ax)  # fmt='d' prints integer counts
# The title names SMOTE so this figure can be told apart from the baseline one.
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Logistic Regression (SMOTE) - Training',
)
ax.xaxis.set_ticklabels(['0', '1'])
ax.yaxis.set_ticklabels(['0', '1']);

ROC WITHOUT SMOTE

In [None]:
# Step 1: Compute the predicted probabilities for the positive class
y_prob = log_reg.predict_proba(X_test)[:,1]

# Step 2: Compute FPR, TPR, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Step 3: Compute the AUC
roc_auc = auc(fpr, tpr)

# Step 4: Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') # diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

This AUC value (0.72) indicates only moderate performance - there is a 72% chance that the model ranks a randomly chosen positive case higher than a randomly chosen negative case

ROC WITH SMOTE

In [None]:
# Step 1: Compute the predicted probabilities for the positive class
y_prob_sm = log_reg_sm.predict_proba(X_test)[:,1]

# Step 2: Compute FPR, TPR, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_prob_sm)

# Step 3: Compute the AUC
roc_auc = auc(fpr, tpr)

# Step 4: Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') # diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Random Forest

WITHOUT SMOTE

In [None]:
# Random forest on the original (imbalanced) training split.
# random_state pins the bootstrap and feature sampling so the reported
# accuracies are reproducible across runs.
rf = RandomForestClassifier(max_depth=10, random_state=42)
rf.fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)
print("Accuracy on training set: {:.3f}".format(rf.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf.score(X_test, y_test)))

WITH SMOTE

In [None]:
# Random forest trained on the SMOTE-resampled training split.
# random_state pins the bootstrap and feature sampling for reproducibility.
rf_sm = RandomForestClassifier(max_depth=10, random_state=42)
rf_sm.fit(X_train_res, y_train_res)

# Predict and score with rf_sm: the original used the non-SMOTE `rf` here, so
# every "with SMOTE" number actually described the baseline forest.
rf_sm_y_pred = rf_sm.predict(X_test)
print("Accuracy on training set: {:.3f}".format(rf_sm.score(X_train_res, y_train_res)))
print("Accuracy on test set: {:.3f}".format(rf_sm.score(X_test, y_test)))

Classification report WITHOUT SMOTE

In [None]:
# Per-class precision, recall and F1 on the test split, using the rf_y_pred predictions.
print(classification_report(y_test, rf_y_pred))

Classification report WITH SMOTE

In [None]:
# Per-class precision, recall and F1 on the test split, using the rf_sm_y_pred predictions.
print(classification_report(y_test, rf_sm_y_pred))

# Decision Tree Classifier 

In [None]:
# The decision tree uses all 14 features, including the three left out of columns_new.
columns = ['sysBP', 'glucose', 'age', 'totChol', 'cigsPerDay', 'diaBP', 'prevalentHyp', 'diabetes', 'BPMeds', 'gender', 'prevalentStroke', 'BMI', 'heartRate', 'currentSmoker']
X = chd_df[columns]
y = chd_df["CHD"]

# Hold out a stratified test split under separate names, so the split used by
# the earlier models is not overwritten.
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(
    X, y, random_state=42, stratify=y
)

# Unconstrained tree. Scoring it only on the data it was fitted on, as the
# original did, says nothing about generalisation, so the held-out accuracy
# is reported as well.
decisionTree = tree.DecisionTreeClassifier(random_state=42)
decisionTree.fit(X_train_dt, y_train_dt)
print("Accuracy on training set: {:.3f}".format(decisionTree.score(X_train_dt, y_train_dt)))
print("Accuracy on test set: {:.3f}".format(decisionTree.score(X_test_dt, y_test_dt)))



In [None]:
# Depth-limited tree (max_depth=5), evaluated on a held-out split rather than
# only on the data it was fitted on. random_state makes both the split and the
# tree reproducible.
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(
    X, y, random_state=42, stratify=y
)

decisionTree = tree.DecisionTreeClassifier(max_depth=5, random_state=42)
decisionTree.fit(X_train_dt, y_train_dt)
print("Accuracy on training set: {:.3f}".format(decisionTree.score(X_train_dt, y_train_dt)))
print("Accuracy on test set: {:.3f}".format(decisionTree.score(X_test_dt, y_test_dt)))