In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv('heart_2022_with_nans.csv')

In [5]:
df.dropna(subset=['HadHeartAttack'], inplace = True)

In [6]:
X = df.drop(columns = 'HadHeartAttack')

In [7]:
y = df['HadHeartAttack']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = .15/.85, random_state =42)

In [9]:
num_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy ='median')),
    ('scale', StandardScaler())
     ])

In [10]:
training_num_X = X_train.select_dtypes(include = ['float64'])
training_nom_X = X_train[['State',
 'Sex',
 'PhysicalActivities',
 'HadAngina',
 'HadStroke',
 'HadAsthma',
 'HadSkinCancer',
 'HadCOPD',
 'HadDepressiveDisorder',
 'HadKidneyDisease',
 'HadArthritis',
 'DeafOrHardOfHearing',
 'BlindOrVisionDifficulty',
 'DifficultyConcentrating',
 'DifficultyWalking',
 'DifficultyDressingBathing',
 'DifficultyErrands',
 'ChestScan',
 'RaceEthnicityCategory',
 'AlcoholDrinkers',
 'HIVTesting',
 'FluVaxLast12',
 'PneumoVaxEver',
 'TetanusLast10Tdap',
 'HighRiskLastYear']]

In [11]:
nom_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('OneHotEncoder', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False))
])

In [12]:
ord_columns = df[['RemovedTeeth', 'AgeCategory', 'LastCheckupTime',
                  'GeneralHealth', 'SmokerStatus', 'ECigaretteUsage', 'CovidPos', 'HadDiabetes']]

In [13]:
#delete all ordencs
teeth_cats = ['None of them', '1 to 5', '6 or more, but not all', 'All']
teeth_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('encode',OrdinalEncoder(categories=[teeth_cats])),
    ('scale', StandardScaler())
])

In [14]:
age_cats = ['Age 18 to 24','Age 25 to 29','Age 30 to 34','Age 35 to 39',
                                        'Age 40 to 44', 'Age 45 to 49','Age 50 to 54','Age 55 to 59',
                                        'Age 60 to 64','Age 65 to 69','Age 70 to 74','Age 75 to 79',
                                        'Age 80 or older']
age_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('encode',OrdinalEncoder(categories=[age_cats])),
    ('scale', StandardScaler())
])

In [15]:
last_checkup_cats = ['Within past year (anytime less than 12 months ago)',
                                        'Within past 2 years (1 year but less than 2 years ago)',
                                        'Within past 5 years (2 years but less than 5 years ago)',
                                        '5 or more years ago']
last_checkup_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('encode',OrdinalEncoder(categories=[last_checkup_cats])),
    ('scale', StandardScaler())
])

In [16]:
health_cats = ['Poor', 'Fair', 'Good', 'Very good', 'Excellent']
health_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('encode',OrdinalEncoder(categories=[health_cats])),
    ('scale', StandardScaler())
])

In [17]:
smoker_cats = ['Never smoked', 'Former smoker','Current smoker - now smokes some days','Current smoker - now smokes every day']
smoker_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('encode',OrdinalEncoder(categories=[smoker_cats])),
    ('scale', StandardScaler())
])

In [19]:
ecig_cats = ['Never used e-cigarettes in my entire life',
        'Not at all (right now)','Use them some days', 'Use them every day',]
ecig_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('encode',OrdinalEncoder(categories=[ecig_cats])),
    ('scale', StandardScaler())
])

In [20]:
covid_cats = ['No',
        'Tested positive using home test without a health professional',
        'Yes']
covid_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('encode',OrdinalEncoder(categories=[covid_cats])),
    ('scale', StandardScaler())
])

In [22]:
diabetes_cats = ['No', 'No, pre-diabetes or borderline diabetes',
        'Yes, but only during pregnancy (female)', 'Yes']
diabetes_pipeline = Pipeline(steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('encode',OrdinalEncoder(categories=[diabetes_cats])),
    ('scale', StandardScaler())
])

In [25]:
num_columns = df.select_dtypes(include = ['float64'])

In [26]:
dropped_columns = num_columns.columns.tolist() + ord_columns.columns.tolist() + ['HadHeartAttack']
print(dropped_columns)

['PhysicalHealthDays', 'MentalHealthDays', 'SleepHours', 'HeightInMeters', 'WeightInKilograms', 'BMI', 'RemovedTeeth', 'AgeCategory', 'LastCheckupTime', 'GeneralHealth', 'SmokerStatus', 'ECigaretteUsage', 'CovidPos', 'HadDiabetes', 'HadHeartAttack']


In [29]:
nom_columns = df.drop(dropped_columns, axis = 1)

In [41]:
num_columns_list = num_columns.columns.tolist()
nom_columns_list = nom_columns.columns.tolist()
ord_columns_list = ord_columns.columns.tolist()
print(ord_columns_list)

['RemovedTeeth', 'AgeCategory', 'LastCheckupTime', 'GeneralHealth', 'SmokerStatus', 'ECigaretteUsage', 'CovidPos', 'HadDiabetes']


In [44]:
ct = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_columns_list),  
        ('nom_pipeline', nom_pipeline, nom_columns_list),  
        ('teeth_pipeline', teeth_pipeline, ['RemovedTeeth']),  
      ('age_pipeline', age_pipeline, ['AgeCategory']),    
      ('last_checkup_pipeline', last_checkup_pipeline, ['LastCheckupTime']),  
       ('health_pipeline', health_pipeline, ['GeneralHealth']),  
      ('smoker_pipeline', smoker_pipeline, ['SmokerStatus']),  
      ('ecig_pipeline', ecig_pipeline, ['ECigaretteUsage']),  
      ('covid_pipeline', covid_pipeline, ['CovidPos']),  
    ('diabetes_pipeline', diabetes_pipeline, ['HadDiabetes'])  
    ],
    remainder='drop',
    n_jobs=-1
)

In [46]:
rfc = RandomForestClassifier()

In [50]:
knn = KNeighborsClassifier()

In [52]:
lr = LogisticRegression(max_iter=500)

In [91]:
X_sample = X_train.sample(10000, random_state =42)
y_sample = y_train.sample(10000, random_state =42)

In [93]:
X_tran = ct.fit_transform(X_sample)
y_tran = y_sample

In [95]:
rfc_param_grid = [{
    'n_estimators':[25,50,100],
    'max_depth':[5,10,20,40],
    'min_samples_split':[2,3,4],
    'min_samples_leaf':[1,2,3],
    'max_features':['sqrt','log2'],
    'max_leaf_nodes':[25,50,100,200]
}]

In [97]:
rfc_grid_search = GridSearchCV(rfc, rfc_param_grid, cv=3, scoring='accuracy', n_jobs=-1,error_score='raise')

In [99]:
rfc_grid_search.fit(X_tran, y_tran)

In [100]:
rfc_grid_search.best_score_

0.9458997789401016

In [101]:
rfc_grid_search.best_params_

{'max_depth': 20,
 'max_features': 'sqrt',
 'max_leaf_nodes': 200,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 25}

In [102]:
rfc2 = RandomForestClassifier(max_depth= 5,max_features= 'sqrt',max_leaf_nodes= 25,min_samples_leaf= 1,
 min_samples_split= 2,
 n_estimators= 25)

In [103]:
knn_param_grid = [{
    'n_neighbors':[5,6,7,8,9,10],
    'leaf_size':[5,10,15,30,45]
}]

In [104]:
knn_param_search = GridSearchCV(knn, knn_param_grid, cv = 5, scoring='accuracy', n_jobs=-1)

In [105]:
knn_param_search.fit(X_tran, y_tran)

In [106]:
knn_param_search.best_score_

0.9433999999999999

In [107]:
knn_param_search.best_params_

{'leaf_size': 5, 'n_neighbors': 10}

In [108]:
knn2 = KNeighborsClassifier(leaf_size= 5, n_neighbors=6)

In [109]:
vc = VotingClassifier([('RandomForest',rfc2),('LogisticRegression',lr),('KNN',knn2)])

In [110]:
vc_param_grid = {'voting':['hard','soft'],
                 'weights':[(1,1,1),(2,1,1),(1,2,1),(1,1,2)]}

In [111]:
vc_search = GridSearchCV(vc, vc_param_grid,cv =5,scoring='accuracy', n_jobs=-1)

In [112]:
vc_search.fit(X_tran, y_tran)

In [113]:
vc_search.best_score_

0.9449

In [114]:
vc_search.best_params_

{'voting': 'soft', 'weights': (1, 2, 1)}

In [115]:
vc_pipeline = make_pipeline(ct, vc)

In [116]:
vc_pipeline.fit(X_train, y_train)

In [117]:
vc_pipeline.score(X_train, y_train)

0.9464203770609412

In [118]:
val_pred = vc_pipeline.predict(X_val)    

In [119]:
accuracy_score(y_val, val_pred)

0.9435379279143418

In [120]:
test_pred = vc_pipeline.predict(X_test)

In [121]:
accuracy_score(y_test, test_pred)

0.9445642502752183