In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report

# Load the datasets
clinical_data = pd.read_csv('datasets/clinical_dataset.csv')
lifestyle_data = pd.read_csv('datasets/lifestyle_dataset.csv')

# Preprocessing the lifestyle dataset by dropping the output column
Xlifestyle = lifestyle_data.drop('Heart Attack Risk', axis=1)
Ylifestyle = lifestyle_data['Heart Attack Risk']

# Preprocessing the clinical dataset by dropping the output column
Xclinical = clinical_data.drop('output', axis=1)
Yclinical = clinical_data['output']

# Convert categorical columns to numeric using one-hot encoding before splitting
Xlifestyle_encoded = pd.get_dummies(Xlifestyle, drop_first=True)
Xclinical_encoded = pd.get_dummies(Xclinical, drop_first=True)

Approach 1: Principle Component Analysis 

When to use: PCA is suitable when you have numerical data and want to capture the most variance in fewer dimensions.
How it works: PCA transforms the data into new dimensions (principal components) that are linear combinations of the original features. The first few components capture the most variance in the data.
Benefit: This method is often useful when working with models like Logistic Regression or KNN, as it reduces the feature space and noise.

In [10]:
# Convert categorical columns to numeric using one-hot encoding
Xlifestyle_encoded = pd.get_dummies(Xlifestyle, drop_first=True)
Xclinical_encoded = pd.get_dummies(Xclinical, drop_first=True)

# Standardize the data before applying PCA
scaler = StandardScaler()

# Scale lifestyle dataset
Xlife_scaled = scaler.fit_transform(Xlifestyle_encoded)

# Scale clinical dataset
Xclinical_scaled = scaler.fit_transform(Xclinical_encoded)

# Apply PCA to reduce dimensions to 10 components for lifestyle dataset
pca_life = PCA(n_components=10)
Xlife_pca = pca_life.fit_transform(Xlife_scaled)

# Apply PCA to reduce dimensions to 10 components for clinical dataset
pca_clinical = PCA(n_components=10)
Xclinical_pca = pca_clinical.fit_transform(Xclinical_scaled)


In [11]:
from sklearn.neural_network import MLPClassifier

# Split the lifestyle dataset into training and test sets
Xlife_train, Xlife_test, Ylife_train, Ylife_test = train_test_split(Xlife_pca, Ylifestyle, test_size=0.3, random_state=42)

# Define and train the MLP classifier for lifestyle dataset
mlp_life = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_life.fit(Xlife_train, Ylife_train)

# Predictions and evaluation for MLP
Ylife_pred_mlp = mlp_life.predict(Xlife_test)
print(f'MLP Accuracy (Lifestyle): {accuracy_score(Ylife_test, Ylife_pred_mlp)}')
print(classification_report(Ylife_test, Ylife_pred_mlp))

# Same for the clinical dataset
Xclinical_train, Xclinical_test, Yclinical_train, Yclinical_test = train_test_split(Xclinical_pca, Yclinical, test_size=0.3, random_state=42)
mlp_clinical = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
mlp_clinical.fit(Xclinical_train, Yclinical_train)

# Predictions and evaluation for MLP
Yclinical_pred_mlp = mlp_clinical.predict(Xclinical_test)
print(f'MLP Accuracy (Clinical): {accuracy_score(Yclinical_test, Yclinical_pred_mlp)}')
print(classification_report(Yclinical_test, Yclinical_pred_mlp))


MLP Accuracy (Lifestyle): 0.6101179155572461
              precision    recall  f1-score   support

           0       0.65      0.84      0.73      1691
           1       0.40      0.20      0.27       938

    accuracy                           0.61      2629
   macro avg       0.53      0.52      0.50      2629
weighted avg       0.56      0.61      0.57      2629

MLP Accuracy (Clinical): 0.7692307692307693
              precision    recall  f1-score   support

           0       0.74      0.76      0.75        41
           1       0.80      0.78      0.79        50

    accuracy                           0.77        91
   macro avg       0.77      0.77      0.77        91
weighted avg       0.77      0.77      0.77        91





In [12]:
from sklearn.svm import SVC

# Define and train the SVC classifier for lifestyle dataset
svc_life = SVC(kernel='linear', random_state=42)
svc_life.fit(Xlife_train, Ylife_train)

# Predictions and evaluation for SVC
Ylife_pred_svc = svc_life.predict(Xlife_test)
print(f'SVC Accuracy (Lifestyle): {accuracy_score(Ylife_test, Ylife_pred_svc)}')
print(classification_report(Ylife_test, Ylife_pred_svc,zero_division=1))

# Same for the clinical dataset
svc_clinical = SVC(kernel='linear', random_state=42)
svc_clinical.fit(Xclinical_train, Yclinical_train)

# Predictions and evaluation for SVC
Yclinical_pred_svc = svc_clinical.predict(Xclinical_test)
print(f'SVC Accuracy (Clinical): {accuracy_score(Yclinical_test, Yclinical_pred_svc)}')
print(classification_report(Yclinical_test, Yclinical_pred_svc, zero_division=1))


SVC Accuracy (Lifestyle): 0.6432103461392165
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1691
           1       1.00      0.00      0.00       938

    accuracy                           0.64      2629
   macro avg       0.82      0.50      0.39      2629
weighted avg       0.77      0.64      0.50      2629

SVC Accuracy (Clinical): 0.8351648351648352
              precision    recall  f1-score   support

           0       0.82      0.80      0.81        41
           1       0.84      0.86      0.85        50

    accuracy                           0.84        91
   macro avg       0.83      0.83      0.83        91
weighted avg       0.83      0.84      0.83        91



Key Insights:
The clinical dataset performed better with 83.52% accuracy, whereas the lifestyle dataset achieved a lower accuracy of 64.32%.
The lifestyle dataset's precision for class 1 is 100%, but with 0% recall, meaning the model did not correctly predict any instances of heart attack risk (1), which explains the low F1-score for that class.

The clinical dataset had a much more balanced performance between precision and recall for both classes.
Next Steps:

Imbalanced Classes: The low recall for the lifestyle dataset suggests an issue with class imbalance, where the model is struggling to correctly predict the minority class. You can address this by:

Using class weights in the SVC model (class_weight='balanced').
Applying techniques like SMOTE (Synthetic Minority Over-sampling Technique) to balance the dataset.

Model Tuning: Consider tuning hyperparameters such as C and kernel for SVC, or try different classifiers like Random Forest or Gradient Boosting to see if they handle the lifestyle data better.