In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

Loads the data, preprocesses and uses KNN, logistical regression and random forest model

Random Forest model won't even finish so that needs to be optimised for it to work

In [18]:
# Load data
clinical_data = pd.read_csv('datasets/clinical_dataset.csv')
lifestyle_data = pd.read_csv('datasets/lifestyle_dataset.csv')

# View basic information
print(clinical_data.head())
print(lifestyle_data.head())

   age  sex  cp  trtbps  chol  fbs  restecg  thalachh  exng  oldpeak  slp  \
0   63    1   3     145   233    1        0       150     0      2.3    0   
1   37    1   2     130   250    0        1       187     0      3.5    0   
2   41    0   1     130   204    0        0       172     0      1.4    2   
3   56    1   1     120   236    0        1       178     0      0.8    2   
4   57    0   0     120   354    0        1       163     1      0.6    2   

   caa  thall  output  
0    0      1       1  
1    0      2       1  
2    0      2       1  
3    0      2       1  
4    0      2       1  
  Patient ID  Age     Sex  Cholesterol Blood Pressure  Heart Rate  Diabetes  \
0    BMW7812   67    Male          208         158/88          72         0   
1    CZE1114   21    Male          389         165/93          98         1   
2    BNI9906   21  Female          324         174/99          72         1   
3    JLN3497   84    Male          383        163/100          73         1  

In [19]:
# Preprocessing the lifestyle dataset by dropping the output column
Xlifestyle = lifestyle_data.drop('Heart Attack Risk', axis=1)
Ylifestyle = lifestyle_data['Heart Attack Risk']

# Preprocessing the clinical dataset by dropping the output column
Xclinical = clinical_data.drop('output', axis=1)
Yclinical = clinical_data['output']

# Convert categorical columns to numeric using one-hot encoding before splitting
Xlifestyle_encoded = pd.get_dummies(Xlifestyle, drop_first=True)
Xclinical_encoded = pd.get_dummies(Xclinical, drop_first=True)

The dataset is barely processed, might need handling of NULL rows and better handling of categorical columns 

In [20]:
# Re-split the datasets after encoding
Xlife_train, Xlife_test, ylife_train, ylife_test = train_test_split(Xlifestyle_encoded, Ylifestyle, test_size=0.2, random_state=42)
Xclin_train, Xclin_test, yclin_train, yclin_test = train_test_split(Xclinical_encoded, Yclinical, test_size=0.2, random_state=42)

# Scaling the features
scaler = StandardScaler()
Xlife_train_scaled = scaler.fit_transform(Xlife_train)
Xlife_test_scaled = scaler.transform(Xlife_test)
Xclin_train_scaled = scaler.fit_transform(Xclin_train)
Xclin_test_scaled = scaler.transform(Xclin_test)

Using logistical regression, KNN and random forest models 

In [21]:
# Initialise models
logistic_model = LogisticRegression()
knn_model = KNeighborsClassifier()
random_forest_model = RandomForestClassifier()

In [22]:
# Perform 5-fold cross-validation for each model ----- LIFESTYLE DATA
life_logistic_cv = cross_val_score(logistic_model, Xlife_train_scaled, ylife_train, cv=5, scoring='accuracy')
print(f'Logistic Regression CV Accuracy: {life_logistic_cv.mean()}')


Logistic Regression CV Accuracy: 0.4643366619115549


In [23]:
life_knn_cv = cross_val_score(knn_model, Xlife_train_scaled, ylife_train, cv=5, scoring='accuracy')
print(f'KNN CV Accuracy: {life_knn_cv.mean()}')


KNN CV Accuracy: 0.6319543509272469


Random Forest takes super long (Probably wont ever finish), not recommended running this at the moment

In [24]:
# life_rf_cv = cross_val_score(random_forest_model, Xlife_train_scaled, ylife_train, cv=5, scoring='accuracy')
# print(f'Random Forest CV Accuracy: {life_rf_cv.mean()}')

In [25]:
# Perform 5-fold cross-validation for each model ----- CLINICAL DATA
clin_logistic_cv = cross_val_score(logistic_model, Xclin_train_scaled, yclin_train, cv=5, scoring='accuracy')
print(f'Logistic Regression CV Accuracy: {clin_logistic_cv.mean()}')

Logistic Regression CV Accuracy: 0.8180272108843537


In [26]:
clin_knn_cv = cross_val_score(knn_model, Xclin_train_scaled, yclin_train, cv=5, scoring='accuracy')
print(f'KNN CV Accuracy: {clin_knn_cv.mean()}')

KNN CV Accuracy: 0.8141156462585034


In [27]:
# RANDOM FOREST: NEEDS TO BE OPTIMISED
# clin_rf_cv = cross_val_score(random_forest_model, Xclin_train_scaled, yclin_train, cv=5, scoring='accuracy')
# print(f'Random Forest CV Accuracy: {clin_rf_cv.mean()}')

In [28]:
# Test set evaluation
logistic_model = LogisticRegression()
logistic_model.fit(Xlife_train_scaled, ylife_train)
y_pred_life_log = logistic_model.predict(Xlife_test_scaled)
print(f'Logistic Regression Test Accuracy (Lifestyle): {accuracy_score(ylife_test, y_pred_life_log)}')
logistic_model.fit(Xclin_train_scaled, yclin_train)
y_pred_clin_log = logistic_model.predict(Xclin_test_scaled)
print(f'Logistic Regression Test Accuracy (Clinical): {accuracy_score(yclin_test, y_pred_clin_log)}')

Logistic Regression Test Accuracy (Lifestyle): 0.5887050770108385
Logistic Regression Test Accuracy (Clinical): 0.8524590163934426


The lifestyle dataset consistently does alot worse than the lifestyle dataset. 
Might suggest that logistical regression is not recommended for lifestyle dataset

In [29]:
# KNN on Lifestyle data
knn_model.fit(Xlife_train_scaled, ylife_train)
y_pred_life_knn = knn_model.predict(Xlife_test_scaled)
print(f'KNN Test Accuracy (Lifestyle): {accuracy_score(ylife_test, y_pred_life_knn)}')

# KNN on Clinical data
knn_model.fit(Xclin_train_scaled, yclin_train)
y_pred_clin_knn = knn_model.predict(Xclin_test_scaled)
print(f'KNN Test Accuracy (Clinical): {accuracy_score(yclin_test, y_pred_clin_knn)}')


KNN Test Accuracy (Lifestyle): 0.6326297775242442
KNN Test Accuracy (Clinical): 0.9016393442622951


The lifestyle dataset also does horrible for KNN than Clinical data
It is worthwhile to explore a few things
1. Were many categorical columns/features or even rows lost during preprocessing?
2. Is the Lifestyle dataset much smaller or larger than the Clinical set?
3. How could features from the lifestyle set have impacted this
4. Could random forest or SVP and MLP find more complex relationships with the lifestyle dataset?