In [298]:
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, gen_features
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [23]:
np.random.seed(seed=42)
df_data = pd.read_csv("./cardiovascular-disease-dataset/cardio_train.csv", sep=';', index_col="id")
display(df_data.describe())
display(df_data.head())

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [24]:
to_na_indices = np.random.randint(low=0, high=df_data.shape[0], size=int(0.05 * df_data.shape[0]))
df_data.iloc[to_na_indices, df_data.columns.get_loc("height")] = np.nan

to_na_indices = np.random.randint(low=0, high=df_data.shape[0], size=int(0.05 * df_data.shape[0]))
df_data.iloc[to_na_indices, df_data.columns.get_loc("weight")] = np.nan

to_na_indices = np.random.randint(low=0, high=df_data.shape[0], size=int(0.05 * df_data.shape[0]))
df_data.iloc[to_na_indices, df_data.columns.get_loc("cholesterol")] = np.nan

In [25]:
df_data["gender"] = df_data["gender"].replace({
    1: "women",
    2: "men"
})

df_data["cholesterol"]  = df_data["cholesterol"].replace({
    1: "normal",
    2: "above_normal",
    3: "well_above_normal"
})

df_data["gluc"]  = df_data["gluc"].replace({
    1: "normal",
    2: "above_normal",
    3: "well_above_normal"
})

display(df_data.describe(include="all"))
display(df_data.head())

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000,66591.0,66578.0,70000.0,70000.0,66589,70000,70000.0,70000.0,70000.0,70000.0
unique,,2,,,,,3,3,,,,
top,,women,,,,,normal,normal,,,,
freq,,45530,,,,,49789,59479,,,,
mean,19468.865814,,164.361205,74.210467,128.817286,96.630414,,,0.088129,0.053771,0.803729,0.4997
std,2467.251667,,8.226411,14.397678,154.011419,188.47253,,,0.283484,0.225568,0.397179,0.500003
min,10798.0,,55.0,10.0,-150.0,-70.0,,,0.0,0.0,0.0,0.0
25%,17664.0,,159.0,65.0,120.0,80.0,,,0.0,0.0,1.0,0.0
50%,19703.0,,165.0,72.0,120.0,80.0,,,0.0,0.0,1.0,0.0
75%,21327.0,,170.0,82.0,140.0,90.0,,,0.0,0.0,1.0,1.0


Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,men,168.0,62.0,110,80,normal,normal,0,0,1,0
1,20228,women,156.0,85.0,140,90,well above normal,normal,0,0,1,1
2,18857,women,165.0,64.0,130,70,,normal,0,0,0,1
3,17623,men,169.0,82.0,150,100,normal,normal,0,0,1,1
4,17474,women,156.0,56.0,100,60,normal,normal,0,0,0,0


In [204]:
category_features = [["cholesterol"], ["gluc"]]
binary_features = [["gender"], ["smoke"], ["alco"], ["active"]]
numeric_features = [["age"], ["height"], ["weight"], ["ap_hi"], ["ap_lo"]]
target = "cardio"

X = df_data.copy()
y = X.pop(target)

In [89]:
pd.cut(pd.Series([1, 119, 120, 121, 128, 129, 130, 138, 139, 140, 179, 180, 181]), [-np.inf, 119, 129, 139, 180, np.inf])
pd.cut(pd.Series([1, 119, 120, 121, 128, 129, 130, 138, 139, 140, 179, 180, 181]), [-np.inf, 79, 89, 120, np.inf])

0      (-inf, 79.0]
1     (89.0, 120.0]
2     (89.0, 120.0]
3      (120.0, inf]
4      (120.0, inf]
5      (120.0, inf]
6      (120.0, inf]
7      (120.0, inf]
8      (120.0, inf]
9      (120.0, inf]
10     (120.0, inf]
11     (120.0, inf]
12     (120.0, inf]
dtype: category
Categories (4, interval[float64]): [(-inf, 79.0] < (79.0, 89.0] < (89.0, 120.0] < (120.0, inf]]

In [211]:
class BloodPressureTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.systolic_ranges = [-np.inf, 119, 129, 139, 180, np.inf]
        self.diastolic_ranges = [-np.inf, 79, 89, 120, np.inf]
        self.blood_pressure_category = ["normal", "elevated", "high_pressure_stage_1", "high_pressure_stage_2", "hypertensive_crisis"]
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        df_blood_pressure = X.copy()
        df_blood_pressure["systolic"] = pd.cut(df_blood_pressure["ap_hi"], self.systolic_ranges, labels=["<120", "120-129", "130-139", "140-180", ">180"])
        df_blood_pressure["diastolic"] = pd.cut(df_blood_pressure["ap_lo"], self.diastolic_ranges, labels=["<79", "80-89", "90-120", ">120"])
        
        df_blood_pressure.loc[(df_blood_pressure["systolic"] == "<120") &
                              (df_blood_pressure["diastolic"] == "<79"), "blood_pressure"] = self.blood_pressure_category[0]
        
        df_blood_pressure.loc[(df_blood_pressure["systolic"] == "120-129") &
                              (df_blood_pressure["diastolic"] == "<79"), "blood_pressure"] = self.blood_pressure_category[1]
        
        df_blood_pressure.loc[(df_blood_pressure["systolic"] == "130-139") |
                              (df_blood_pressure["diastolic"] == "80-89"), "blood_pressure"] = self.blood_pressure_category[2]
        
        df_blood_pressure.loc[(df_blood_pressure["systolic"] == "140-180") |
                              (df_blood_pressure["diastolic"] == "90-120"), "blood_pressure"] = self.blood_pressure_category[3]
        
        df_blood_pressure.loc[(df_blood_pressure["systolic"] == ">180") |
                              (df_blood_pressure["diastolic"] == ">120"), "blood_pressure"] = self.blood_pressure_category[4]
        
        return df_blood_pressure[["blood_pressure"]]

In [241]:
gen_blood_pressure = (
    ["ap_hi", "ap_lo"],
    [
        BloodPressureTransformer(),
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder()
    ],
    {"alias": "blood_pressure"}
)

In [249]:
class UnhealtyLifestyleTransformer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        df_unhealty_lifestyle = X.copy()
        df_unhealty_lifestyle["unhealty_lifestyle"] = df_unhealty_lifestyle["smoke"] | df_unhealty_lifestyle["alco"] | df_unhealty_lifestyle["active"]
        
        return df_unhealty_lifestyle[["unhealty_lifestyle"]]

In [250]:
gen_unhealty_lifestyle = (
    ["smoke", "alco", "active"],
    [
        UnhealtyLifestyleTransformer(),
        SimpleImputer(strategy="most_frequent")
    ],
    {"alias": "unhealty_lifestyle"}
)

In [251]:
gen_category = gen_features(
    columns=category_features,
    classes=[
        {
            "class": SimpleImputer,
            "strategy": "most_frequent"
        },
        {
            "class": OneHotEncoder
        }
    ]
)

In [252]:
gen_binary = gen_features(
    columns=binary_features,
    classes=[
        {
            "class": SimpleImputer,
            "strategy": "most_frequent"
        },
        {
            "class": OrdinalEncoder
        }
    ]
)

In [253]:
gen_numeric = gen_features(
    columns=numeric_features,
    classes=[
        {
            "class": SimpleImputer,
            "strategy": "mean"
        },
        {
            "class": StandardScaler
        }
    ]
)

In [259]:
preprocess_mapper = DataFrameMapper(
    [
        gen_blood_pressure,
        gen_unhealty_lifestyle,
        *gen_category,
        *gen_binary,
        *gen_numeric,
    ],
    input_df=True,
    df_out=True
)

In [302]:
feature_selection = DataFrameMapper(
    [(
        preprocess_mapper.transformed_names_,
        SelectFromModel(RandomForestClassifier(n_estimators=100, max_depth=6))
    )]
)

In [315]:
pipeline = Pipeline(steps=[
    ("preprocess", preprocess_mapper),
    ("feature_selection", feature_selection),
    ("estimator", RandomForestClassifier(n_estimators=100, max_depth=6))
])

In [316]:
pipeline.fit(X, y)

Pipeline(memory=None,
     steps=[('preprocess', DataFrameMapper(default=False, df_out=True,
        features=[(['ap_hi', 'ap_lo'], [BloodPressureTransformer(), SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0), OneHotEncoder(categorical_features=None, categories=None,
 ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [312]:
preds = pipeline.predict(X)
display(preds)

array([0, 1, 0, ..., 1, 1, 0], dtype=int64)

In [313]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score

In [314]:
print(f"accuracy_score: {accuracy_score(y, preds)}")
print(f"roc_auc_score: {roc_auc_score(y, preds)}")
print(f"precision_score: {precision_score(y, preds)}")
print(f"recall_score: {recall_score(y, preds)}")

accuracy_score: 0.7297285714285714
roc_auc_score: 0.7296771398266274
precision_score: 0.7769920662297344
recall_score: 0.6439578032533806
