In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
healthData = pd.read_csv('CVD_cleaned.csv')
healthData.drop_duplicates()
healthData.rename(columns={'Age_Category': 'Age', 'Height_(cm)': 'Height', 'Weight_(kg)': 'Weight'}, inplace=True)
healthData = healthData.drop(['Checkup', 'Other_Cancer'],  axis=1)
healthData

Unnamed: 0,General_Health,Exercise,Heart_Disease,Skin_Cancer,Depression,Diabetes,Arthritis,Sex,Age,Height,Weight,BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,No,Yes,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Yes,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Yes,Yes,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308849,Very Good,Yes,No,No,No,No,No,Male,25-29,168.0,81.65,29.05,No,4.0,30.0,8.0,0.0
308850,Fair,Yes,No,No,No,Yes,No,Male,65-69,180.0,69.85,21.48,No,8.0,15.0,60.0,4.0
308851,Very Good,Yes,No,No,Yes,"Yes, but female told only during pregnancy",No,Female,30-34,157.0,61.23,24.69,Yes,4.0,40.0,8.0,4.0
308852,Very Good,Yes,No,No,No,No,No,Male,65-69,183.0,79.38,23.73,No,3.0,30.0,12.0,0.0


## KNN

We opted for a KNN model, as it does not assume any underlying data distribution. It works by classifying data points based on the majority class among their nearest neighbors, making it effective for predicting heart disease based on diverse lifestyle factors. 

KNN runs for about 7 mins

We used Minkowski as the metrics, as it is suitable for handling a combination of numerical and categorical variables which is what our dataset comprise of. Unlike other distance metrics, Minkowski metrics can accommodate mixed data types effectively. This allows us to measure distances between data points accurately, even when dealing with diverse types of variables. Hence using Minkowski over other distance metrics.

In [4]:
#Variables to be compared against chosen based on correlation matrix found previously 
X = healthData[['General_Health', 'BMI', 'Age', 'Smoking_History', 'Diabetes', 'Arthritis', 'Alcohol_Consumption', 'Green_Vegetables_Consumption']]

#target variable
y = healthData['Heart_Disease']

# Split training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define numerical and categorical columns
numerical_cols = ['BMI', 'Alcohol_Consumption', 'Green_Vegetables_Consumption']
categorical_cols = ['General_Health', 'Age', 'Smoking_History', 'Diabetes', 'Arthritis']

# Define preprocessing steps for numerical and categorical features
numerical_transformer = Pipeline(steps=[
  ('scaler', StandardScaler()) # Standardize numerical features
])

categorical_transformer = Pipeline(steps=[
  ('onehot', OneHotEncoder()) # One-hot encode categorical features
])

# Combine preprocessing steps for both numerical and categorical features
preprocessor = ColumnTransformer(transformers=[
  ('num', numerical_transformer, numerical_cols),
  ('cat', categorical_transformer, categorical_cols)
])

# Classification problem --> Create a KNN model with Minkowski distance 
knn_model = Pipeline(steps=[
  ('preprocessor', preprocessor),
  ('classifier', KNeighborsClassifier(n_neighbors = 12, metric='minkowski'))
])

# Fit the KNN model using the pipeline with the training data
knn_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = knn_model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

              precision    recall  f1-score   support

          No       0.92      1.00      0.96     56774
         Yes       0.45      0.04      0.08      4997

    accuracy                           0.92     61771
   macro avg       0.69      0.52      0.52     61771
weighted avg       0.88      0.92      0.89     61771

Accuracy: 0.91835974810186
Confusion Matrix:
[[56512   262]
 [ 4781   216]]


Yes class have a low precision, recall and f1-score, which might be due to the large data imbalance for the classes. So we will try down sampling

 ---

### Downsampling Majority Class

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.under_sampling import RandomUnderSampler
import numpy as np

rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_downsampled, y_train_downsampled = rus.fit_resample(X_train, y_train)

knn_model = Pipeline(steps=[
  ('preprocessor', preprocessor),
  ('classifier', KNeighborsClassifier(n_neighbors = 12, metric='minkowski'))
])

# Fit the KNN model using the pipeline with the training data
knn_model.fit(X_train_downsampled, y_train_downsampled)

# Make predictions on the testing set (using consistent preprocessing)

y_pred = knn_model.predict(X_test)

# Evaluate the model using alternative evaluation metrics
print(classification_report(y_test, y_pred))

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

              precision    recall  f1-score   support

          No       0.97      0.72      0.83     56774
         Yes       0.19      0.73      0.30      4997

    accuracy                           0.72     61771
   macro avg       0.58      0.73      0.56     61771
weighted avg       0.91      0.72      0.78     61771

Accuracy: 0.7215521846821324
Confusion Matrix:
[[40899 15875]
 [ 1325  3672]]


In [6]:
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.under_sampling import RandomUnderSampler
import numpy as np

num_yes_samples = len(y_train[y_train == "Yes"])
target_no_samples = int(num_yes_samples)

sampling_strategy = {'Yes': num_yes_samples, 'No': target_no_samples}

rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)

X_train_downsampled, y_train_downsampled = rus.fit_resample(X_train, y_train)

knn_model = Pipeline(steps=[
  ('preprocessor', preprocessor),
  ('classifier', KNeighborsClassifier(n_neighbors = 12, metric='minkowski'))
])

# Fit the KNN model using the pipeline with the training data
knn_model.fit(X_train_downsampled, y_train_downsampled)

# Make predictions on the testing set (using consistent preprocessing)

y_pred = knn_model.predict(X_test)

# Evaluate the model using alternative evaluation metrics
print(classification_report(y_test, y_pred))

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

              precision    recall  f1-score   support

          No       0.97      0.72      0.83     56774
         Yes       0.19      0.73      0.30      4997

    accuracy                           0.72     61771
   macro avg       0.58      0.73      0.56     61771
weighted avg       0.91      0.72      0.78     61771

Accuracy: 0.7215036182027165
Confusion Matrix:
[[40897 15877]
 [ 1326  3671]]


Model is better at identifying the 'No' class than the 'Yes' class. The model's ability to identify the 'Yes' class (recall) is good, but it comes at the cost of many false positives (low precision).

### Resampling minority class using Random Oversampling

In [9]:
import pandas as pd
from sklearn.utils import resample

# Assuming X_train and y_train are your feature and target dataframes
# Combine X_train and y_train
train_data = pd.concat([X_train, y_train], axis=1)

# Separate majority and minority classes
majority_class = train_data[train_data['Heart_Disease'] == 'No']
minority_class = train_data[train_data['Heart_Disease'] == 'Yes']

# Upsample minority class
minority_upsampled = resample(minority_class,
                              replace=True,  # sample with replacement
                              n_samples=len(majority_class),  # to match majority class
                              random_state=42)  # reproducible results

# Combine majority class with upsampled minority class
upsampled_data = pd.concat([majority_class, minority_upsampled])

# Separate features and target after upsampling
X_train_upsampled = upsampled_data.drop('Heart_Disease', axis=1)
y_train_upsampled = upsampled_data['Heart_Disease']


In [11]:

# Fit the KNN model using the pipeline with the training data
knn_model.fit(X_train_upsampled, y_train_upsampled)

# Make predictions on the testing set (using consistent preprocessing)

y_pred = knn_model.predict(X_test)

# Evaluate the model using alternative evaluation metrics
print(classification_report(y_test, y_pred))

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

              precision    recall  f1-score   support

          No       0.96      0.73      0.83     56774
         Yes       0.17      0.62      0.27      4997

    accuracy                           0.72     61771
   macro avg       0.56      0.68      0.55     61771
weighted avg       0.89      0.72      0.78     61771

Accuracy: 0.7213902964174127
Confusion Matrix:
[[41456 15318]
 [ 1892  3105]]


KNN ran for ~30mins

In [20]:
y_train.size


454218

In [80]:
len(y_test[y_test == "Yes"])


4997

---

## Evaluation

KNN might not be the best model to help predict heart disease when we are using this dataset due to the huge imbalance of data where there are much more data collected from people in the 'No' class where they do not have any heart disease as compared to people in the 'Yes' class where they have heart disease. Downsampling the majority class causes us to lose important information needed to allow the model to better recognise people that do not have heart disease, where there were more false positive detected. Resampling however took a much longer time for the KNN to complete execution, yet there were no major improvements where its result were similar to our findings for when we tried downsampling. Thus it would be better to explore other models.