In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cardio_train CSV; its fields are separated by ';' rather than ','.
df = pd.read_csv('../../data/external/cardio_train.csv', sep=';')

In [3]:
# Show the summary statistics first, then the first five rows.
display(df.describe(), df.head())


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [5]:
# Count the distinct values per column once, then keep the columns with more than 10.
unique_counts = df.nunique()
unique_counts[unique_counts > 10]

id        70000
age        8076
height      109
weight      287
ap_hi       153
ap_lo       157
dtype: int64

In [6]:
# Use `id` as the index: it is unique for every row (70000 distinct values in
# 70000 rows). verify_integrity makes pandas raise if duplicates ever appear,
# and rebinding `df` avoids mutating the frame in place.
df = df.set_index('id', verify_integrity=True)
# Possible follow-ups, not applied here: cast the low-cardinality columns to
# 'category', and derive an age-in-years column via df.age // 365.


In [7]:
# Partition the columns by cardinality: fewer than 5 distinct values counts as
# categorical, 5 or more as numeric. Using `>= 5` for the second group closes
# the gap that `< 5` / `> 5` leaves for a column with exactly 5 distinct values.
# `df` still contains the target column `cardio` at this point, so it is
# classified here along with the features.
distinct_counts = df.nunique()
categorical_columns = distinct_counts[distinct_counts < 5].index.to_list()
numeric_columns = distinct_counts[distinct_counts >= 5].index.to_list()

In [8]:
# Inspect the frame again now that `id` is the index instead of a column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 6.9 MB


Split The Data Into Test & Train

In [9]:
# Keep only the rows that have a target value.
X_full = df.dropna(subset=['cardio'])
# The target is the `cardio` column ...
y = X_full['cardio']
# ... and the features are every remaining column.
X_full = X_full.drop(columns=['cardio'])

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=99,
)


Create Pipeline

In [49]:
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, QuantileTransformer, FunctionTransformer
from sklearn.pipeline import Pipeline

1) Preprocessors
- There are no missing values, so imputation is not necessary.
- Negative values are not expected in any column, so the ap_hi and ap_lo columns are used with their absolute values.
- The ordinal columns are split into two groups according to whether their codes follow a hierarchy. Those with a hierarchy are already ordinally encoded; the non-hierarchical ones will be one-hot encoded.

In [42]:
# Per-column flag: True if the held-out features contain any missing value.
X_test.isnull().any()

age            False
gender         False
height         False
weight         False
ap_hi          False
ap_lo          False
cholesterol    False
gluc           False
smoke          False
alco           False
active         False
dtype: bool

In [43]:
# Columns with exactly two distinct values are treated as unordered categories
# and one-hot encoded. The list is derived from the training split so that no
# preprocessing decision depends on the held-out data.
categorical_columns = X_train.columns[X_train.nunique() == 2].tolist()
# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros at transform time instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [44]:
# The blood-pressure columns are the ones that contain unexpected negative
# readings (ap_hi and ap_lo have negative minima in the summary statistics).
# They are named explicitly so the selection does not depend on which rows
# happen to fall into the test split.
numeric_columns = ['ap_hi', 'ap_lo']
numeric_transformer = Pipeline(
    steps=[
        # Negative readings are not expected, so use the magnitude of each value.
        ('absolute_values', FunctionTransformer(func=abs)),
        # Map the values onto their quantiles (uniform output by default), which
        # limits the influence of extreme readings.
        ('outliers', QuantileTransformer(random_state=0)),
    ]
)

In [45]:
# Route each column group through its own transformer.
# ColumnTransformer's default remainder='drop' applies here, so any column that
# is listed in neither group is discarded before the model sees it.
# NOTE(review): that appears to leave out columns such as age, height and
# weight — confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_columns),
        ('categoric', categorical_transformer, categorical_columns),
    ],
)


In [65]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [66]:
# Chain the preprocessing and the classifier into one estimator, so a single
# fit/predict call runs both steps in order.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ],
)

In [67]:
# Fit the preprocessing steps and the classifier on the training split.
my_pipeline.fit(X_train,y_train)

In [68]:
# `cardio` is a 0/1 label and predict() returns class labels, so this mean
# absolute error is the share of misclassified test rows, i.e. 1 - accuracy.
# NOTE(review): accuracy_score would report the same information more directly.
mean_absolute_error( y_test, my_pipeline.predict(X_test))

0.28335714285714286