In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score
from sklearn.pipeline import Pipeline
import numpy as np
from src.pipelines import YesAndNoToBinaryTransformer, ReplaceTransformer, GetDummiesTransformer

In [3]:
df = pd.read_csv("data/heart_2020_cleaned.csv")

In [5]:
df['GoodSleep'] = df['SleepTime'].between(5,9).astype(int)
df['GoodBMI'] = df['BMI'].between(15,25).astype(int)

In [9]:
feature_preprocessor = Pipeline(
    steps=[
        ("yes_and_no_tobinary", YesAndNoToBinaryTransformer()),
        ("diabetic_replacer",ReplaceTransformer('Diabetic',['Yes', 'No', 'No, borderline diabetes', 'Yes (during pregnancy)'], [1,0,0,0])),
        ("genhealth_replacer",ReplaceTransformer('GenHealth',['Excellent','Very good', 'Fair', 'Good', 'Poor'],[5,4,3,2,1])),
        ("race_replacer",ReplaceTransformer('Race',['American Indian/Alaskan Native','Black','Asian'], ['White','Other','Hispanic'])),
        ("age_replace",ReplaceTransformer('AgeCategory',df['AgeCategory'].unique().tolist(),range(0,df['AgeCategory'].nunique()))),
        ("dummies",GetDummiesTransformer(dummies_kwargs={'drop_first':True})),
    ]
)

In [10]:
df_processed = feature_preprocessor.fit_transform(df)

X = df_processed.drop(['HeartDisease'],axis=1)
y = df_processed[['HeartDisease']]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [15]:
lr = LogisticRegression(solver='liblinear',class_weight='balanced')
lr.fit(X_train,y_train.values.ravel())

y_pred_lr = lr.predict(X_test)
print(classification_report(y_pred_lr,y_test))

              precision    recall  f1-score   support

           0       0.74      0.97      0.84     44836
           1       0.72      0.21      0.32     19123

    accuracy                           0.74     63959
   macro avg       0.73      0.59      0.58     63959
weighted avg       0.74      0.74      0.68     63959



In [None]:
LogisticRegression(

In [38]:
from skopt.space import Real, Integer, Categorical
from skopt import BayesSearchCV
from skopt.callbacks import DeltaYStopper,TimerCallback, VerboseCallback
from skopt.utils import use_named_args
from sklearn.model_selection import cross_val_score

space  = [
    Categorical(categories=['l1', 'l2'], name="penalty"),
    Categorical(categories=['balanced',None], name="class_weight"),

]
lr = LogisticRegression(solver='liblinear')

In [30]:
Categorical(categories=['balanced',None], name="class_weight").

Categorical(categories=('balanced', None), prior=None)

In [46]:
opt = BayesSearchCV(
    LogisticRegression(solver='liblinear'),
    search_spaces={
        "penalty":Categorical(categories=['l1', 'l2']),
        "class_weight": Categorical(categories=['balanced',None])
    },
    n_iter=32,
    n_jobs=2,
    random_state=0,
    scoring='recall'
)

opt.fit(X_train,y_train,callback=[TimerCallback(),VerboseCallback(32),DeltaYStopper(0.03)])

Iteration No: 1 started. Searching for the next optimal point.
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 5.2103
Function value obtained: -0.0890
Current minimum: -0.0890
Iteration No: 2 started. Searching for the next optimal point.
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 4.0037
Function value obtained: -0.7287
Current minimum: -0.7287
Iteration No: 3 started. Searching for the next optimal point.
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 13.6950
Function value obtained: -0.0889
Current minimum: -0.7287
Iteration No: 4 started. Searching for the next optimal point.
Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 4.0431
Function value obtained: -0.7287
Current minimum: -0.7287
Iteration No: 5 started. Searching for the next optimal point.
Iteration No: 5 ended. Search finished for the next optimal point.
Time taken: 23.5417
Function value obtain

  y = column_or_1d(y, warn=True)


BayesSearchCV(estimator=LogisticRegression(solver='liblinear'), n_iter=32,
              n_jobs=2, random_state=0, scoring='recall',
              search_spaces={'class_weight': Categorical(categories=('balanced', None), prior=None),
                             'penalty': Categorical(categories=('l1', 'l2'), prior=None)})

In [47]:
y_pred_lr = opt.best_estimator_.predict(X_test)
print(classification_report(y_pred_lr,y_test))

              precision    recall  f1-score   support

           0       0.74      0.97      0.84     44836
           1       0.72      0.21      0.32     19123

    accuracy                           0.74     63959
   macro avg       0.73      0.59      0.58     63959
weighted avg       0.74      0.74      0.68     63959



In [48]:
print(precision_score(y_pred_lr,y_test))

0.7231050228310503


In [49]:
opt.best_estimator_

LogisticRegression(class_weight='balanced', solver='liblinear')