# Importing libraries

In [1]:
# references: 
# 1) https://www.codementor.io/bruce3557/beautiful-machine-learning-pipeline-with-scikit-learn-uiqapbxuj
# 2) https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf#targetText=Scikit%2Dlearn%20pipelines%20are%20a,of%20steps%20in%20your%20project.
# 3) https://michelleful.github.io/code-blog/2015/06/20/pipelines/
# 4) http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
# 5) https://www.dataquest.io/blog/introduction-to-ensembles/
# 6) https://www.pluralsight.com/guides/ensemble-modeling-scikit-learn

# dataset: https://www.kaggle.com/ronitf/heart-disease-uci/download

import pandas as pd
import numpy as np
import warnings

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression

from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [2]:
warnings.filterwarnings('ignore') 

# Basic setup

In [3]:
data = pd.read_csv('heart.csv')

In [4]:
data.shape

(303, 14)

In [5]:
# Only needed when combining df's of separate 'data' and 'target objects'
# train_df = pd.DataFrame(
#     data = np.append(data.target[:, None], data.data, axis=1), 
#     columns = ['target'] + data.feature_names
# )

In [6]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [7]:
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [8]:
data.dtypes.value_counts() 

int64      13
float64     1
dtype: int64

In [9]:
feat_categorical = ['sex', 'cp', 'restecg', 'exang', 'slope', 'ca', 'thal']

feat_numeric = ['age', 'trestbps', 'chol', 'fbs', 'thalach', 'oldpeak']

# Splitting data

### Train/ test split

In [10]:
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, data.columns != 'target'],
    data['target'],
    train_size = 0.8, 
    random_state = 42, 
    stratify = data.target
)

### Cross-validation

In [11]:
cv_splits = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 3, random_state = 42)

# Pipeline built-up - V1

In this example I'm combining both numeric and categorical features. On top of that I'm training a random forest which I optimize through using grid search.

### Numerical features

In [181]:
pipe_numeric = Pipeline(steps=[
    ('impute_num', SimpleImputer(
        missing_values = np.nan, 
        strategy = 'median', 
        copy = False, 
        add_indicator = True)
    )
])

### Categorical features

In [187]:
pipe_categorical = Pipeline(steps=[
    ('impute_cat', SimpleImputer(
        missing_values = np.nan, 
        strategy = 'constant', 
        fill_value = 99999,
        copy = False)
    ),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Combining transformers

In [188]:
transformer_union = ColumnTransformer([
    ('feat_numeric', pipe_numeric, feat_numeric),
    ('feat_categorical', pipe_categorical, feat_categorical),
])

### Meta-pipeline

In [252]:
model_pipeline = Pipeline(steps=[
    ('transformers', transformer_union),
    ('model', RandomForestClassifier(class_weight = 'balanced'))
])

### Defining parameters grid

In [256]:
grid_param = { 
    'model__n_estimators': [200, 500, 700],
    'model__max_features': [2, 4, 6],
    'model__max_depth' : [2, 4, 6]
}

grid_search = GridSearchCV(
    model_pipeline, 
    grid_param, 
    scoring = ['roc_auc'],
    cv = cv_splits, 
    refit = 'roc_auc'
)

In [257]:
grid_param

{'model__n_estimators': [200, 500, 700],
 'model__max_features': [2, 4, 6, 8],
 'model__max_depth': [2, 4, 6, 8, 10]}

### Fitting the meta-pipeline

In [258]:
grid_search.fit(x_train, y_train)

GridSearchCV(cv=<sklearn.model_selection._split.RepeatedStratifiedKFold object at 0x12b219d50>,
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('transformers',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('feat_numeric',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('impute_stage',
                                                                                          Simpl...
                                                         

### Best model

In [285]:
print(grid_search.best_params_)
print(round(grid_search.best_score_, 2), 'AUC')

{'model__max_depth': 8, 'model__max_features': 2, 'model__n_estimators': 200}
0.92 AUC


In [286]:
print(round(grid_search.cv_results_['mean_test_roc_auc'].mean(), 3))
print(round(grid_search.cv_results_['std_test_roc_auc'].mean(), 3))

0.913
0.029


### Making predictions on test set

In [309]:
# Since everything is wrapped into one big pipeline, it's sufficient to just call 'predict' on the
# pipeline object

grid_search.predict_proba(x_test)[:5, ]

array([[0.88785183, 0.11214817],
       [0.63254838, 0.36745162],
       [0.9624782 , 0.0375218 ],
       [0.30707293, 0.69292707],
       [0.49277115, 0.50722885]])

In [293]:
# All the transformations made to the data are available under the main aggregation pipeline called 'transformer union'

transformer_union.transform(x_test)

array([[ 57., 150., 276., ...,   1.,   0.,   0.],
       [ 67., 125., 254., ...,   0.,   0.,   1.],
       [ 46., 140., 311., ...,   0.,   0.,   1.],
       ...,
       [ 54., 110., 239., ...,   0.,   0.,   1.],
       [ 58., 150., 270., ...,   0.,   0.,   1.],
       [ 49., 130., 266., ...,   0.,   1.,   0.]])

In [305]:
# Comparing the performance of the model on the test set

print(round(roc_auc_score(y_test, grid_search.predict_proba(x_test)[:, 1]), 3))

0.927


# Pipeline built-up - V2

In this example I'm building V1 up by adding a set of additional binary variables calculated based on kmeans from the numeric input, and a pre-modelling feature selection piece with Lasso in the pipeline.

### Numerical features

In [91]:
impute_num = Pipeline(steps=[
    ('impute_num', SimpleImputer(
        missing_values = np.nan, 
        strategy = 'median', 
        copy = False, 
        add_indicator = True))
])

discretize_num = Pipeline(steps=[
    ('impute_num', SimpleImputer(
        missing_values = np.nan, 
        strategy = 'mean', 
        copy = False)),
    ('discretize_num', KBinsDiscretizer(
        strategy = 'kmeans')),
    ('select_num', SelectFromModel(Lasso(alpha = 0.5)))
])

pipe_numeric = Pipeline(steps=[
    ('union_num', FeatureUnion([
        ('impute_num', impute_num),
        ('discretize_num', discretize_num)
    ])
    )
])

In [92]:
x_train[x_train.columns.intersection(feat_numeric)]

Unnamed: 0,age,trestbps,chol,fbs,thalach,oldpeak
19,69,140,239,0,151,1.8
247,66,160,246,0,120,0.0
289,55,128,205,0,130,2.0
288,57,110,335,0,143,3.0
60,71,110,265,1,130,0.0
...,...,...,...,...,...,...
39,65,160,360,0,151,0.8
104,50,129,196,0,163,0.0
140,51,120,295,0,157,0.6
114,55,130,262,0,155,0.0


In [93]:
pipe_numeric.fit_transform(x_train, y_train).shape

(242, 13)

### Categorical features

In [94]:
pipe_categorical = Pipeline(steps=[
    ('impute_cat', SimpleImputer(
        missing_values = np.nan, 
        strategy = 'constant', 
        fill_value = 99999,
        copy = False)
    ),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Combining transformers

In [95]:
transformer_union = ColumnTransformer([
    ('feat_numeric', pipe_numeric, feat_numeric),
    ('feat_categorical', pipe_categorical, feat_categorical),
])

### Meta-pipeline

In [96]:
model_pipeline = Pipeline(steps=[
    ('transformers', transformer_union),
    ('model', RandomForestClassifier(class_weight = 'balanced'))
])

### Defining parameters grid

In [97]:
grid_param = { 
    'model__n_estimators': [200, 500, 700],
    'model__max_features': [2, 4, 6],
    'model__max_depth' : [2, 4, 6]
}

grid_search = GridSearchCV(
    model_pipeline, 
    grid_param, 
    scoring = ['roc_auc'],
    cv = cv_splits, 
    refit = 'roc_auc'
)

In [98]:
grid_param

{'model__n_estimators': [200, 500, 700],
 'model__max_features': [2, 4, 6],
 'model__max_depth': [2, 4, 6]}

### Fitting the meta-pipeline

In [99]:
grid_search.fit(x_train, y_train)

GridSearchCV(cv=<sklearn.model_selection._split.RepeatedStratifiedKFold object at 0x11d9e7790>,
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('transformers',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('feat_numeric',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('union_num',
                                                                                          FeatureU...
                                                         

### Best model

In [100]:
print(grid_search.best_params_)
print(round(grid_search.best_score_, 2), 'AUC')

{'model__max_depth': 4, 'model__max_features': 2, 'model__n_estimators': 200}
0.92 AUC


In [101]:
print(round(grid_search.cv_results_['mean_test_roc_auc'].mean(), 3))
print(round(grid_search.cv_results_['std_test_roc_auc'].mean(), 3))

0.915
0.029


### Making predictions on test set

In [102]:
# Since everything is wrapped into one big pipeline, it's sufficient to just call 'predict' on the
# pipeline object

grid_search.predict_proba(x_test)[:5, ]

array([[0.80830587, 0.19169413],
       [0.69689503, 0.30310497],
       [0.88008589, 0.11991411],
       [0.35649438, 0.64350562],
       [0.51033467, 0.48966533]])

In [108]:
# Comparing the performance of the model on the test set

print(round(roc_auc_score(y_test, grid_search.predict_proba(x_test)[:, 1]), 3))

0.919


# Pipeline built-up - V3

In this example I'm building V2 up by combining two different scoring algorithms in an ensemble.

### Numerical features

In [12]:
impute_num = Pipeline(steps=[
    ('impute_num', SimpleImputer(
        missing_values = np.nan, 
        strategy = 'median', 
        copy = False, 
        add_indicator = True))
])

discretize_num = Pipeline(steps=[
    ('impute_num', SimpleImputer(
        missing_values = np.nan, 
        strategy = 'mean', 
        copy = False)),
    ('discretize_num', KBinsDiscretizer(
        strategy = 'kmeans')),
    ('select_num', SelectFromModel(Lasso(alpha = 0.5)))
])

pipe_numeric = Pipeline(steps=[
    ('union_num', FeatureUnion([
        ('impute_num', impute_num),
        ('discretize_num', discretize_num)
    ])
    )
])

### Categorical features

In [13]:
pipe_categorical = Pipeline(steps=[
    ('impute_cat', SimpleImputer(
        missing_values = np.nan, 
        strategy = 'constant', 
        fill_value = 99999,
        copy = False)
    ),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Combining transformers

In [14]:
transformer_union = ColumnTransformer([
    ('feat_numeric', pipe_numeric, feat_numeric),
    ('feat_categorical', pipe_categorical, feat_categorical),
])

## Meta-pipeline and grid for each model

### Random Forest

In [44]:
model_pipeline_rf = Pipeline(steps=[
    ('transformers', transformer_union),
    ('model', RandomForestClassifier(class_weight = 'balanced'))
])

### Elastic-Net

In [43]:
model_pipeline_en = Pipeline(steps=[
    ('transformers', transformer_union),
    ('model', LogisticRegression(penalty = 'elasticnet', solver = 'saga', class_weight = 'balanced'))
])

### Training the Ensemble learner

In [45]:
ensemble = VotingClassifier(estimators=[
    ('rf', model_pipeline_rf), 
    ('en', model_pipeline_en)], voting = 'soft')

In [19]:
ensemble.get_params().keys()

dict_keys(['estimators', 'flatten_transform', 'n_jobs', 'voting', 'weights', 'rf', 'en', 'rf__memory', 'rf__steps', 'rf__verbose', 'rf__transformers', 'rf__model', 'rf__transformers__n_jobs', 'rf__transformers__remainder', 'rf__transformers__sparse_threshold', 'rf__transformers__transformer_weights', 'rf__transformers__transformers', 'rf__transformers__verbose', 'rf__transformers__feat_numeric', 'rf__transformers__feat_categorical', 'rf__transformers__feat_numeric__memory', 'rf__transformers__feat_numeric__steps', 'rf__transformers__feat_numeric__verbose', 'rf__transformers__feat_numeric__union_num', 'rf__transformers__feat_numeric__union_num__n_jobs', 'rf__transformers__feat_numeric__union_num__transformer_list', 'rf__transformers__feat_numeric__union_num__transformer_weights', 'rf__transformers__feat_numeric__union_num__verbose', 'rf__transformers__feat_numeric__union_num__impute_num', 'rf__transformers__feat_numeric__union_num__discretize_num', 'rf__transformers__feat_numeric__union

In [26]:
grid_param_ensemble = { 
    'rf__model__n_estimators': [200, 500, 700],
    'rf__model__max_features': [2, 4, 6],
    'rf__model__max_depth' : [2, 4, 6],
    
    'en__model__l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1],
    'en__model__C': [2**i for i in range(-5,1)] # the level of regularization is specified differently
    
    # 'weights' = [1, 1]
}

grid_search_ensemble = GridSearchCV(
    ensemble, 
    grid_param_ensemble, 
    scoring = ['roc_auc'],
    cv = cv_splits, 
    refit = 'roc_auc'
)

In [27]:
grid_search_ensemble.fit(x_train, y_train)

GridSearchCV(cv=<sklearn.model_selection._split.RepeatedStratifiedKFold object at 0x1117cb050>,
             error_score='raise-deprecating',
             estimator=VotingClassifier(estimators=[('rf',
                                                     Pipeline(memory=None,
                                                              steps=[('transformers',
                                                                      ColumnTransformer(n_jobs=None,
                                                                                        remainder='drop',
                                                                                        sparse_threshold=0.3,
                                                                                        transformer_weights=None,
                                                                                        transformers=[('feat_numeric',
                                                                                        

### Best model

In [32]:
print(grid_search_ensemble.best_params_)
print(round(grid_search_ensemble.best_score_, 3), 'AUC')

{'en__model__l1_ratio': 0.4, 'rf__model__max_depth': 6, 'rf__model__max_features': 4, 'rf__model__n_estimators': 200}
0.89 AUC


In [33]:
print(round(grid_search_ensemble.cv_results_['mean_test_roc_auc'].mean(), 3))
print(round(grid_search_ensemble.cv_results_['std_test_roc_auc'].mean(), 3))

0.88
0.038


### Making predictions on test set

In [35]:
# Comparing the performance of the model on the test set
print(grid_search_ensemble.predict_proba(x_test)[:5, ])
print(round(roc_auc_score(y_test, grid_search_ensemble.predict_proba(x_test)[:, 1]), 3))

[[0.8770519  0.1229481 ]
 [0.52692454 0.47307546]
 [0.8786889  0.1213111 ]
 [0.3298358  0.6701642 ]
 [0.40684985 0.59315015]]
0.889


# Original pipeline reference

In [None]:
model_pipeline = Pipeline(steps=[
  ("features", FeatureUnion([
    (
      "numerical_features",
      ColumnTransformer([
        (
          "numerical",
          Pipeline(steps=[(
            "impute_stage",
            SimpleImputer(missing_values=np.nan, strategy="median",)
          )]),
          ["feature_1"]
        )
      ])
    ), (
      "categorical_features",
      ColumnTransformer([
        (
          "country_encoding",
          Pipeline(steps=[
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
            ("reduction", NMF(n_components=8)),
          ]),
          ["country"],
        ),
      ])
    ), (
      "text_features",
      ColumnTransformer([
        (
          "title_vec",
          Pipeline(steps=[
            ("tfidf", TfidfVectorizer()),
            ("reduction", NMF(n_components=50)),
          ]),
          "title"
        )
      ])
    )
  ])),
  ("classifiers", RandomForestClassifier())
])

model_pipeline.fit(train_data, train_labels.values)
predictions = model_pipeline.predict(predict_data)