In [365]:
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, gen_features
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## Read data
We are not reading the original data from Kaggel.  
The data we are reading is based on the data from Kaggle with small changes:
* Random replacement of values with NaNs
* Transformations of categorical (int) features to text categories

In [340]:
np.random.seed(seed=42)
df_data = pd.read_csv("./cardiovascular-disease-dataset/messy/cardio_train.csv", sep=';', index_col="id")
display(df_data.describe(include="all"))
display(df_data.head())

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000,66591.0,66578.0,70000.0,70000.0,66589,70000,70000.0,70000.0,70000.0,70000.0
unique,,2,,,,,3,3,,,,
top,,women,,,,,normal,normal,,,,
freq,,45530,,,,,49789,59479,,,,
mean,19468.865814,,164.361205,74.210467,128.817286,96.630414,,,0.088129,0.053771,0.803729,0.4997
std,2467.251667,,8.226411,14.397678,154.011419,188.47253,,,0.283484,0.225568,0.397179,0.500003
min,10798.0,,55.0,10.0,-150.0,-70.0,,,0.0,0.0,0.0,0.0
25%,17664.0,,159.0,65.0,120.0,80.0,,,0.0,0.0,1.0,0.0
50%,19703.0,,165.0,72.0,120.0,80.0,,,0.0,0.0,1.0,0.0
75%,21327.0,,170.0,82.0,140.0,90.0,,,0.0,0.0,1.0,1.0


Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,men,168.0,62.0,110,80,normal,normal,0,0,1,0
1,20228,women,156.0,85.0,140,90,well_above_normal,normal,0,0,1,1
2,18857,women,165.0,64.0,130,70,,normal,0,0,0,1
3,17623,men,169.0,82.0,150,100,normal,normal,0,0,1,1
4,17474,women,156.0,56.0,100,60,normal,normal,0,0,0,0


## Initializations 

Declerations of all column types and target column

In [342]:
category_features = [["cholesterol"], ["gluc"]]
binary_features = [["gender"], ["smoke"], ["alco"], ["active"]]
numeric_features = [["age"], ["height"], ["weight"], ["ap_hi"], ["ap_lo"]]
target = "cardio"

Split the data into features and labels

In [343]:
X = df_data.copy()
y = X.pop(target)

## Split data to train test datasets

In [344]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Custom Transformers

We create custom transformers for feature engineering

### Blood Pressure Transformer

Custom transformer responsible for the creation of a new blood pressure categorical feature based on systolic (the number at the top) and diastolic (the number at the bottom) values.  
The transformer will create a new categorical feature with values according to the American Heart Association ranges of blood pressure:
* normal  
* elevated  
* high_pressure_stage_1  
* high_pressure_stage_2  
* hypertensive_crisis  
  
  
![title](images/blood_pressure.png)  
Photo from [American Heart Association](https://www.heart.org/-/media/data-import/downloadables/pe-abh-what-is-high-blood-pressure-ucm_300310.pdf?la=en&hash=CAC0F1D377BDB7BC3870993918226869524AAC3D)

In [345]:
class BloodPressureTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        
        # Systolic and diastolic blood pressure ranges based on the American Heart Association
        self.systolic_ranges = [-np.inf, 119, 129, 139, 180, np.inf]
        self.diastolic_ranges = [-np.inf, 79, 89, 120, np.inf]
        
        # Blood pressure categories
        self.blood_pressure_category = ["normal", "elevated", "high_pressure_stage_1", "high_pressure_stage_2", "hypertensive_crisis"]
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Copy the data so we will not change the original instance
        df_blood_pressure = X.copy()
        
        # Break down ranges of systolic values to categories
        df_blood_pressure["systolic"] = pd.cut(df_blood_pressure["ap_hi"], self.systolic_ranges, labels=["<120", "120-129", "130-139", "140-180", ">180"])
        
        # Break down ranges of diastolic values to categories
        df_blood_pressure["diastolic"] = pd.cut(df_blood_pressure["ap_lo"], self.diastolic_ranges, labels=["<79", "80-89", "90-120", ">120"])
        
        # Combine ranges from systolic and diastolic features to determine the category of the blood pressure feature
        df_blood_pressure.loc[(df_blood_pressure["systolic"] == "<120") &
                              (df_blood_pressure["diastolic"] == "<79"), "blood_pressure"] = self.blood_pressure_category[0]
        
        df_blood_pressure.loc[(df_blood_pressure["systolic"] == "120-129") &
                              (df_blood_pressure["diastolic"] == "<79"), "blood_pressure"] = self.blood_pressure_category[1]
        
        df_blood_pressure.loc[(df_blood_pressure["systolic"] == "130-139") |
                              (df_blood_pressure["diastolic"] == "80-89"), "blood_pressure"] = self.blood_pressure_category[2]
        
        df_blood_pressure.loc[(df_blood_pressure["systolic"] == "140-180") |
                              (df_blood_pressure["diastolic"] == "90-120"), "blood_pressure"] = self.blood_pressure_category[3]
        
        df_blood_pressure.loc[(df_blood_pressure["systolic"] == ">180") |
                              (df_blood_pressure["diastolic"] == ">120"), "blood_pressure"] = self.blood_pressure_category[4]
        
        # Return blood pressure feature as a dataframe with one column
        return df_blood_pressure[["blood_pressure"]]

### Unhealty Lifestyle Transformer

Custom transformer responsible for the creation of a new "unhealty lifestyle" feature.  
This is a boolean feature representing the use of cigarettes, alcohol, and physical inactivity. 

In [346]:
class UnhealtyLifestyleTransformer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Copy the data so we will not change the original instance
        df_unhealty_lifestyle = X.copy()
        
        # If you smoke or use alcohol or don't do physical activity, you maintain an unhealty lifestyle!
        df_unhealty_lifestyle["unhealty_lifestyle"] = df_unhealty_lifestyle["smoke"] | df_unhealty_lifestyle["alco"] | (~df_unhealty_lifestyle["active"])
        
        # Return unhealty lifestyle feature as a dataframe with one column
        return df_unhealty_lifestyle[["unhealty_lifestyle"]]

## Definition of DataFrameMapper transformers

Now we will define the pipeline of transformations and the raw features we need to complete the creation and processing of the new features and the original features.  
We will pass this to the DataFrameMapper class of the sklearn-pandas package.

In [378]:
# Input features "ap_hi", "ap_lo".
# Steps:
#    BloodPressureTransformer - create blood pressure feature based on "ap_hi", "ap_lo".
#    SimpleImputer - fill nans with the most frequent value.
#    OneHotEncoder - encode categorical values as a one-hot numeric array.
gen_blood_pressure = (
    ["ap_hi", "ap_lo"],
    [
        BloodPressureTransformer(),
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder()
    ],
    {"alias": "blood_pressure"}
)

gen_blood_pressure

(['ap_hi', 'ap_lo'],
 [BloodPressureTransformer(),
  SimpleImputer(copy=True, fill_value=None, missing_values=nan,
         strategy='most_frequent', verbose=0),
  OneHotEncoder(categorical_features=None, categories=None,
         dtype=<class 'numpy.float64'>, handle_unknown='error',
         n_values=None, sparse=True)],
 {'alias': 'blood_pressure'})

In [379]:
# Input features ["smoke", "alco", "active"].
# Steps:
#    UnhealtyLifestyleTransformer - create unhealty lifestyle feature based on "smoke", "alco", "active".
#    SimpleImputer - fill nans with the most frequent value.
gen_unhealty_lifestyle = (
    ["smoke", "alco", "active"],
    [
        UnhealtyLifestyleTransformer(),
        SimpleImputer(strategy="most_frequent")
    ],
    {"alias": "unhealty_lifestyle"}
)

gen_unhealty_lifestyle

(['smoke', 'alco', 'active'],
 [UnhealtyLifestyleTransformer(),
  SimpleImputer(copy=True, fill_value=None, missing_values=nan,
         strategy='most_frequent', verbose=0)],
 {'alias': 'unhealty_lifestyle'})

### Apply the same transformers for multiple columns with gen_features

In [380]:
# Input features [["cholesterol"], ["gluc"]] (The columns are now list of lists because we want to send 2-dimentional DataFrame to each of the transformers).
# Steps:
#    SimpleImputer - fill nans with the most frequent value.
#    OneHotEncoder - encode categorical values as a one-hot numeric array.
gen_category = gen_features(
    columns=category_features,
    classes=[
        {
            "class": SimpleImputer,
            "strategy": "most_frequent"
        },
        {
            "class": OneHotEncoder
        }
    ]
)

gen_category

[(['cholesterol'],
  [SimpleImputer(copy=True, fill_value=None, missing_values=nan,
          strategy='most_frequent', verbose=0),
   OneHotEncoder(categorical_features=None, categories=None,
          dtype=<class 'numpy.float64'>, handle_unknown='error',
          n_values=None, sparse=True)]),
 (['gluc'], [SimpleImputer(copy=True, fill_value=None, missing_values=nan,
          strategy='most_frequent', verbose=0),
   OneHotEncoder(categorical_features=None, categories=None,
          dtype=<class 'numpy.float64'>, handle_unknown='error',
          n_values=None, sparse=True)])]

In [381]:
# Input features [["gender"], ["smoke"], ["alco"], ["active"]] (The columns are now list of lists because we want to send 2-dimentional DataFrame to each of the transformers).
# Steps:
#    SimpleImputer - fill nans with the most frequent value.
#    OrdinalEncoder - encode categorical features as an integer array.
gen_binary = gen_features(
    columns=binary_features,
    classes=[
        {
            "class": SimpleImputer,
            "strategy": "most_frequent"
        },
        {
            "class": OrdinalEncoder
        }
    ]
)

gen_binary

[(['gender'], [SimpleImputer(copy=True, fill_value=None, missing_values=nan,
          strategy='most_frequent', verbose=0),
   OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)]),
 (['smoke'], [SimpleImputer(copy=True, fill_value=None, missing_values=nan,
          strategy='most_frequent', verbose=0),
   OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)]),
 (['alco'], [SimpleImputer(copy=True, fill_value=None, missing_values=nan,
          strategy='most_frequent', verbose=0),
   OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)]),
 (['active'], [SimpleImputer(copy=True, fill_value=None, missing_values=nan,
          strategy='most_frequent', verbose=0),
   OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)])]

In [382]:
# Input features [["age"], ["height"], ["weight"], ["ap_hi"], ["ap_lo"]] (The columns are now list of lists because we want to send 2-dimentional DataFrame to each of the transformers).
# Steps:
#    SimpleImputer - fill nans with the mean value.
#    StandardScaler - standardize features by removing the mean and scaling to unit variance.
gen_numeric = gen_features(
    columns=numeric_features,
    classes=[
        {
            "class": SimpleImputer,
            "strategy": "mean"
        },
        {
            "class": StandardScaler
        }
    ]
)

gen_numeric

[(['age'],
  [SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
          verbose=0),
   StandardScaler(copy=True, with_mean=True, with_std=True)]),
 (['height'],
  [SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
          verbose=0),
   StandardScaler(copy=True, with_mean=True, with_std=True)]),
 (['weight'],
  [SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
          verbose=0),
   StandardScaler(copy=True, with_mean=True, with_std=True)]),
 (['ap_hi'],
  [SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
          verbose=0),
   StandardScaler(copy=True, with_mean=True, with_std=True)]),
 (['ap_lo'],
  [SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
          verbose=0),
   StandardScaler(copy=True, with_mean=True, with_std=True)])]

### DataFrameMapper Construction

Now we will define the course of action of the DataFrameMapper and indicate that the input and output will be Pandas Dataframe.

In [383]:
preprocess_mapper = DataFrameMapper(
    [
        gen_blood_pressure,
        gen_unhealty_lifestyle,
        *gen_category,
        *gen_binary,
        *gen_numeric,
    ],
    input_df=True,
    df_out=True
)

In [367]:
feature_selection = DataFrameMapper(
    [(
        preprocess_mapper.transformed_names_,
        SelectFromModel(RandomForestClassifier(n_estimators=100, max_depth=10))
    )]
)

In [368]:
pipeline = Pipeline(steps=[
    ("preprocess", preprocess_mapper),
    ("feature_selection", feature_selection),
    ("estimator", RandomForestClassifier())
])

In [369]:
param_grid = { 
    "estimator__n_estimators": [200, 500],
    "estimator__max_features": ['auto', 'sqrt', 'log2'],
    "estimator__max_depth": [4, 5, 6, 7, 8],
    "estimator__criterion":['gini', 'entropy']
}

In [None]:
gscv_estimator = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
gscv_estimator.fit(X_train, y_train)

In [374]:
print(gscv_estimator.best_params_)    
print(gscv_estimator.best_score_)

{'estimator__criterion': 'entropy', 'estimator__max_depth': 8, 'estimator__max_features': 'log2', 'estimator__n_estimators': 200}
0.7254489795918367


In [375]:
preds = gscv_estimator.predict(X_test)
display(preds)

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [376]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score

In [377]:
print(f"accuracy_score: {accuracy_score(y_test, preds)}")
print(f"roc_auc_score: {roc_auc_score(y_test, preds)}")
print(f"precision_score: {precision_score(y_test, preds)}")
print(f"recall_score: {recall_score(y_test, preds)}")

accuracy_score: 0.7256190476190476
roc_auc_score: 0.7257726793673044
precision_score: 0.7475386050367914
recall_score: 0.6844102856058449
