In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
train_data = pd.read_csv('data/train_data.csv')
test_data = pd.read_csv('data/test_data.csv')

# Data Preprocessing

In [3]:
train_y = train_data["TenYearCHD"].copy()
train_x = train_data.drop("TenYearCHD", axis=1)
test_y = test_data["TenYearCHD"].copy()
test_x = test_data.drop("TenYearCHD", axis=1)

In [4]:
def drop_irrelevant_cols(data):
    
    # 'diabetes', 'BPMeds', 'prevalentStroke' can be dropped as they are highly unbalanced
    cols_to_be_deleted = ['diabetes', 'BPMeds', 'prevalentStroke']
    # 'currentSmoker' should also be dropped as cigsPerDay already encode it's value
    cols_to_be_deleted.append('currentSmoker')
    
    return data.drop(cols_to_be_deleted, axis = 1)

train_x = drop_irrelevant_cols(train_x)
test_x = drop_irrelevant_cols(test_x)

In [5]:
num_attribs = ["age", "sysBP", "diaBP", "glucose", "totChol","cigsPerDay", "BMI", "heartRate"]
cat_attribs = ['prevalentHyp', 'male', 'education']

## Custom Transformer

In [6]:
class FeatureEngineering(BaseEstimator, TransformerMixin):

    def __init__(self, use_age_as_cat_col = False, log_transform_cigsPerDay=False):
        self.use_age_as_cat_col = use_age_as_cat_col
        self.log_transform_cigsPerDay = log_transform_cigsPerDay

    def fit(self, X, y=None):
        return self  # nothing to do as for now

    def transform(self, X, y=None):
        # Use aggregation of sysBP and DiaBP and delete those two columns
        X['avgBP'] = X.apply(lambda x:(x['sysBP'] + x['diaBP'])/2, axis=1)
        X = X.drop(['sysBP', 'diaBP'], axis = 1)

        if self.use_age_as_cat_col:
            age_cat = pd.cut(X["age"],
                               bins=[ 30, 40, 50, 60, np.inf],
                               labels=['30s', '40s', '50s', '60 and above'])
            age_cat = pd.get_dummies(age_cat.astype(str), prefix = 'age')
            X = X.join(age_cat)
            X = X.drop(['age'], axis = 1)

        if self.log_transform_cigsPerDay:
            X['cigsPerDay'] = (X['cigsPerDay']+1).transform(np.log2)

        return np.c_[X]

# FeatureEngineering().fit_transform(train_x[num_attribs])[0]

## Pipeline

In [7]:
num_pipeline = Pipeline([('feature_eng', FeatureEngineering(True, True)),
                         ('std_scaler', StandardScaler()),])

cat_pipeline = Pipeline([('one_hot', OneHotEncoder(drop='if_binary')),])

# Can use Pipeline directly to transform but using it in ColumnTransformer
# train_x_transformed = num_pipeline.fit_transform(train_x)

In [8]:
full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs),
                                   ("cat", cat_pipeline, cat_attribs),])

In [9]:
train_x_prepared = full_pipeline.fit_transform(train_x)
train_x_prepared[8]

array([-0.33555811, -0.83446557,  1.61187563,  1.37838004, -0.63764289,
       -0.46945168, -0.46345463,  1.27374312, -0.66362195, -0.39694   ,
        0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
        0.        ])