In [1]:
# Import dependencies
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Location of the cardiovascular dataset: a Google Sheets document published as CSV
CVD_CSV_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSDchXr1EhgCSsxlxJ3lWPhh1kT5EJS3yv4DJ2YLeMIC3y4uq-Pp4EQknrs9zAiaI3ulne2Jyi6gR6G/pub?gid=602879552&single=true&output=csv"

# Download the sheet into a DataFrame (requires network access) and preview the first rows
cvd_df = pd.read_csv(CVD_CSV_URL)
cvd_df.head()

Unnamed: 0,general_health,checkup,exercise,heart_disease,skin_cancer,other_cancer,depression,diabetes,arthritis,sex,age_category,height_cm,weight_kg,bmi,smoking_history,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150,32.66,14.54,Yes,0,30,16,12
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165,77.11,28.29,No,0,30,0,4
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163,88.45,33.47,No,4,12,3,16
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180,93.44,28.73,No,0,30,30,8
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191,88.45,24.37,Yes,0,8,4,0


# Data Preprocessing

In [3]:
# Inspect each column's dtype to see which columns are object (text) and which are numeric
cvd_df.dtypes

general_health                   object
checkup                          object
exercise                         object
heart_disease                    object
skin_cancer                      object
other_cancer                     object
depression                       object
diabetes                         object
arthritis                        object
sex                              object
age_category                     object
height_cm                         int64
weight_kg                       float64
bmi                             float64
smoking_history                  object
alcohol_consumption               int64
fruit_consumption                 int64
green_vegetables_consumption      int64
friedpotato_consumption           int64
dtype: object

In [4]:
# Partition the column names by dtype:
#   - object columns are the categorical features that will be one-hot encoded
#   - int64 / float64 columns are the numeric features that will be standardized
categorical_cols = [
    col_name
    for col_name, col_dtype in cvd_df.dtypes.items()
    if col_dtype == 'object'
]

numeric_frame = cvd_df.select_dtypes(include=['int64', 'float64'])
numeric_cols = numeric_frame.columns.tolist()

In [5]:
# Count the distinct values in each categorical column
# (indicates how many indicator columns one-hot encoding will create per feature)
cvd_df[categorical_cols].nunique()

general_health      5
checkup             5
exercise            2
heart_disease       2
skin_cancer         2
other_cancer        2
depression          2
diabetes            2
arthritis           2
sex                 2
age_category       13
smoking_history     2
dtype: int64

In [6]:
# One-hot encode every categorical column using get_dummies.
# drop_first=False keeps an indicator for every level, so two-level columns
# produce both a *_No and a *_Yes column (e.g. heart_disease_No / heart_disease_Yes).
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version: pandas >= 2.0 defaults to bool, and a frame mixing bool and float
# columns converts to an object array, which downstream estimators may reject.
encoded_df = pd.get_dummies(
    cvd_df,
    columns=categorical_cols,
    drop_first=False,
    dtype="uint8",
)
encoded_df.head()

Unnamed: 0,height_cm,weight_kg,bmi,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption,general_health_Excellent,general_health_Fair,general_health_Good,...,age_category_45-49,age_category_50-54,age_category_55-59,age_category_60-64,age_category_65-69,age_category_70-74,age_category_75-79,age_category_80+,smoking_history_No,smoking_history_Yes
0,150,32.66,14.54,0,30,16,12,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,165,77.11,28.29,0,30,0,4,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,163,88.45,33.47,4,12,3,16,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,180,93.44,28.73,0,30,30,8,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,191,88.45,24.37,0,8,4,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1


In [7]:
# Standardize the numeric columns with StandardScaler, overwriting them in encoded_df.
# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split performed in a later cell, so test rows influence the
# fitted scaling statistics (data leakage). Consider fitting on the training
# rows only and applying transform() to the test rows.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(encoded_df[numeric_cols])
encoded_df[numeric_cols] = scaled_numeric
encoded_df.head()

Unnamed: 0,height_cm,weight_kg,bmi,alcohol_consumption,fruit_consumption,green_vegetables_consumption,friedpotato_consumption,general_health_Excellent,general_health_Fair,general_health_Good,...,age_category_45-49,age_category_50-54,age_category_55-59,age_category_60-64,age_category_65-69,age_category_70-74,age_category_75-79,age_category_80+,smoking_history_No,smoking_history_Yes
0,-1.945527,-2.388688,-2.162285,-0.624388,0.005697,0.0596,0.662925,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,-0.537041,-0.299715,-0.040788,-0.624388,0.005697,-1.012964,-0.268558,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,-0.724839,0.23322,0.758438,-0.137925,-0.717605,-0.811858,1.128667,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,0.871445,0.46773,0.0271,-0.624388,0.005697,0.998094,0.197184,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,1.904335,0.23322,-0.645607,-0.624388,-0.878338,-0.744823,-0.734299,0,0,1,...,0,0,0,0,0,0,0,1,0,1


In [8]:
# Both indicator columns derived from 'heart_disease' must be kept out of the
# features; otherwise the label would be present in X.
heart_disease_indicators = ['heart_disease_Yes', 'heart_disease_No']

# Features: every column except the heart-disease indicators
X = encoded_df.drop(columns=heart_disease_indicators)

# Target: the positive-class indicator (1 where heart_disease == 'Yes')
y = encoded_df['heart_disease_Yes']

In [9]:
# Hold out part of the preprocessed data for testing.
# No test_size is given, so scikit-learn's default hold-out fraction applies;
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(
    X,
    y,
    random_state=78,
)
X_train, X_test, y_train, y_test = split_parts

# Model 1: Logistic Regression

# Model 2: Support Vector Machine

In [1]:
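# TODO(review): leftover scratch cell (its execution count, In [1], is out of sequence) — remove it or replace it with the model code for this section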

# Model 3: Decision Tree

# Model 4: Random Forest

# Model 5: Neural Network

# Model Evaluation