In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [42]:
df = pd.read_csv("data/heart_2020_cleaned.csv")

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin


In [54]:
class YesAndNoToBinaryTransformer(BaseEstimator, TransformerMixin):
    def fit(self, x, y = None):
        self.binary_cols = x.columns[x.nunique() == 2]
        return self
    def transform(self, x, y = None):
        x_ = x.copy()
        for col in self.binary_cols:
            x_[col] = x_[col].replace({'No':0,'Yes':1})
        return x_
class ReplaceTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, col: str, to_replace: list, values: list):
        self.col = col
        self.to_replace = to_replace
        self.values = values
    def fit(self, x: pd.DataFrame, y = None):
        return self
    def transform(self, x: pd.DataFrame, y = None):
        x_ = x.copy()
        x_[self.col].replace(self.to_replace, self.values, inplace=True)
        return x_
class GetDummiesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, dummies_kwargs:dict):
        self.dummies_kwargs = dummies_kwargs
    def fit(self, x: pd.DataFrame, y = None):
        return self
    def transform(self, x: pd.DataFrame, y = None):
        x_ = x.copy()
        x_ = pd.get_dummies(x_, **self.dummies_kwargs)
        return x_

In [27]:
df['GoodSleep'] = df['SleepTime'].between(5,9).astype(int)
df['GoodBMI'] = df['BMI'].between(15,25).astype(int)
#df.drop(['BMI','SleepTime'],axis=1, inplace=True)

In [55]:
feature_preprocessor = Pipeline(
    steps=[
        ("yes_and_no_tobinary", YesAndNoToBinaryTransformer()),
        ("diabetic_replacer",ReplaceTransformer('Diabetic',['Yes', 'No', 'No, borderline diabetes', 'Yes (during pregnancy)'], [1,0,0,0])),
        ("genhealth_replacer",ReplaceTransformer('GenHealth',['Excellent','Very good', 'Fair', 'Good', 'Poor'],[5,4,3,2,1])),
        ("race_replacer",ReplaceTransformer('Race',['American Indian/Alaskan Native','Black','Asian'], ['White','Other','Hispanic'])),
        ("age_replace",ReplaceTransformer('AgeCategory',df['AgeCategory'].unique().tolist(),range(0,df['AgeCategory'].nunique()))),
        ("dummies",GetDummiesTransformer(dummies_kwargs={'drop_first':True})),
    ]
)

In [56]:
df_processed = feature_preprocessor.fit_transform(df)

X = df_processed.drop(['HeartDisease'],axis=1)
y = df_processed[['HeartDisease']]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [57]:
lr = LogisticRegression(solver='liblinear',class_weight='balanced')
lr.fit(X_train,y_train)

y_pred_lr = lr.predict(X_test)
print(classification_report(y_pred_lr,y_test))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.74      0.97      0.84     44906
           1       0.72      0.21      0.32     19053

    accuracy                           0.74     63959
   macro avg       0.73      0.59      0.58     63959
weighted avg       0.74      0.74      0.69     63959



In [None]:
pipe = Pipeline(
    steps=[("preprocessor", feature_preprocessor), ("classifier", RandomForestClassifier())]
)

param_grid = {
    "classifier__n_estimators": [200, 500],
    "classifier__max_features": ["auto", "sqrt", "log2"],
    "classifier__max_depth": [4, 5, 6, 7, 8],
    "classifier__criterion": ["gini", "entropy"],
}

grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=1)