# Tree Ensemble Regressor Pipeline

The complete explanation of this notebook is available at The explanation for this notebook is available at https://youranalystbuddy.com/tree-ensemble-models/

For regression, we use the auto-mpg data. The target is `mpg`, miles-per-gallon of cars

### Load and split data

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [2]:
data = pd.read_csv('auto-mpg.csv')
data.head(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1


In [3]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.25)

### Processing pipeline

In [4]:
num_cols = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year']
cat_cols = ['origin']
target = 'mpg'

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#pipeline for numeric features
#we need to impute horsepower
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('standardize', StandardScaler())
])

#pipeline for class features
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder())
])

#full pipeline - combine numeric and class pipelines
process_pipeline = ColumnTransformer([
    ('numeric', num_pipeline, num_cols),
    ('class', cat_pipeline, cat_cols)
])

#### Random Forest Regressor

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rfr = Pipeline([
    ('processing', process_pipeline), 
    ('rfr', RandomForestRegressor())
])

data_size = train.shape[0]
n_features = process_pipeline.fit_transform(train).shape[1]

param_grid = {
    'rfr__n_estimators' : [25, 50, 100, 200],
    'rfr__max_depth' : [3, 4],
    'rfr__min_samples_split' : [0.05, 0.1, 0.2, 0.3],
    'rfr__min_samples_leaf' : [0.05, 0.1, 0.2, 0.3],
    'rfr__max_features' : [0.25, 0.5, 0.75, None],
    'rfr__max_leaf_nodes' : [5, 10, 20, None]
}


grid_search = GridSearchCV(rfr, param_grid, cv=5, scoring='r2', return_train_score=True)

Train the model

In [6]:
grid_search.fit(train, train[target])

The best architecture

In [7]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'rfr__max_depth': 4, 'rfr__max_features': None, 'rfr__max_leaf_nodes': 10, 'rfr__min_samples_leaf': 0.05, 'rfr__min_samples_split': 0.1, 'rfr__n_estimators': 25}
0.8398594011439748


And the testing performance 

In [8]:
grid_search.score(test, test[target])

0.8242646271081303

#### Gradient Boosting Regressor

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

gbr = Pipeline([
    ('processing', process_pipeline), 
    ('gbr', GradientBoostingRegressor())
])

data_size = train.shape[0]
n_features = process_pipeline.fit_transform(train).shape[1]

param_grid = {
    'gbr__n_estimators' : [25, 50, 100, 200],
    'gbr__max_depth' : [3, 4],
    'gbr__min_samples_split' : [0.05, 0.1, 0.2, 0.3],
    'gbr__min_samples_leaf' : [0.05, 0.1, 0.2, 0.3],
    'gbr__max_features' : [0.25, 0.5, 0.75, None],
    'gbr__max_leaf_nodes' : [5, 10, 20, None]
}


grid_search = GridSearchCV(gbr, param_grid, cv=5, scoring='r2', return_train_score=True)

Train the model

In [10]:
grid_search.fit(train, train[target])

The best architecture

In [11]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'gbr__max_depth': 4, 'gbr__max_features': 0.5, 'gbr__max_leaf_nodes': 10, 'gbr__min_samples_leaf': 0.1, 'gbr__min_samples_split': 0.1, 'gbr__n_estimators': 50}
0.8634702566212141


And testing performance

In [12]:
grid_search.score(test, test[target])

0.8495487893480208