# Tree Ensemble Classifier Pipeline

The explanation for this notebook is available at https://youranalystbuddy.com/tree-ensemble-models/

For example of classification, we use the heart_disease data. The target is `HeartDisease` which is binary.

### Import and split data

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [2]:
data = pd.read_csv('heart_disease.csv')
data.head(n=2)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1


In [3]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.25)

### Processing pipeline

In [4]:
num_cols = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
cat_cols = ['Sex', 'ChestPainType','RestingECG', 'ExerciseAngina', 'ST_Slope']
target = 'HeartDisease'

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

def remove_0(X):
    X.loc[X['Cholesterol']==0, 'Cholesterol'] = np.nan
    X.loc[X['RestingBP']==0, 'RestingBP'] = np.nan
    return X

num_pipeline = Pipeline([
    ('remove 0', FunctionTransformer(remove_0, validate=False)),
    ('impute', SimpleImputer(strategy='median')),
    ('standardize', StandardScaler())
])

cat_pipeline = Pipeline([
    ('encode', OneHotEncoder())
])

process_pipeline = ColumnTransformer([
    ('numeric', num_pipeline, num_cols),
    ('class', cat_pipeline, cat_cols)
])

### Modeling pipeline

#### Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = Pipeline([
    ('processing', process_pipeline), 
    ('rfc', RandomForestClassifier())
])

data_size = train.shape[0]
n_features = process_pipeline.fit_transform(train).shape[1]

param_grid = {
    'rfc__n_estimators' : [25, 50, 100, 200],
    'rfc__max_depth' : [3, 4],
    'rfc__min_samples_split' : [0.05, 0.1, 0.2, 0.3],
    'rfc__min_samples_leaf' : [0.05, 0.1, 0.2, 0.3],
    'rfc__max_features' : [0.25, 0.5, 0.75, None],
    'rfc__max_leaf_nodes' : [5, 10, 20, None]
}


grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy', return_train_score=True)

Training

In [7]:
grid_search.fit(train,train[target])

Let's check the best model

In [8]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'rfc__max_depth': 3, 'rfc__max_features': 0.25, 'rfc__max_leaf_nodes': None, 'rfc__min_samples_leaf': 0.05, 'rfc__min_samples_split': 0.2, 'rfc__n_estimators': 25}
0.8604781550830426


And test it on testing data

In [9]:
grid_search.score(test,test[target])

0.8478260869565217

#### Gradient Boosting Model

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

gbc = Pipeline([
    ('processing', process_pipeline), 
    ('gbc', RandomForestClassifier())
])

data_size = train.shape[0]
n_features = process_pipeline.fit_transform(train).shape[1]

param_grid = {
    'gbc__n_estimators' : [25, 50, 100, 200],
    'gbc__max_depth' : [3, 4],
    'gbc__min_samples_split' : [0.05, 0.1, 0.2, 0.3],
    'gbc__min_samples_leaf' : [0.05, 0.1, 0.2, 0.3],
    'gbc__max_features' : [0.25, 0.5, 0.75, None],
    'gbc__max_leaf_nodes' : [5, 10, 20, None]
}


grid_search = GridSearchCV(gbc, param_grid, cv=5, scoring='accuracy', return_train_score=True)

Training

In [11]:
grid_search.fit(train,train[target])

Best model:

In [12]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'gbc__max_depth': 3, 'gbc__max_features': 0.25, 'gbc__max_leaf_nodes': 5, 'gbc__min_samples_leaf': 0.05, 'gbc__min_samples_split': 0.1, 'gbc__n_estimators': 50}
0.8677245318946367


And testing performance

In [13]:
grid_search.score(test,test[target])

0.8434782608695652