# Demonstração do uso de pipelines no scikit-learn com o dataset do Titanic

## Código no estilo de programação procedural

In [7]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Load the dataset.
df = pd.read_csv("data/titanic.csv")

# Drop the Name, Ticket and Cabin columns (reassign instead of mutating in
# place so that re-running the cell is idempotent).
df = df.drop(columns=["Name", "Ticket", "Cabin"])

# Split into training and test sets (80% / 20%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["Survived"]),
    df["Survived"],
    test_size=0.2,
    random_state=42,
)

# One-hot encode the categorical variables. The encoding is derived from the
# training set; the test frame is then aligned to exactly the training
# columns, so a category that is absent from (or only present in) the test
# split cannot change the feature layout the tree was fitted on.
X_train_new = pd.get_dummies(X_train)
X_test_new = pd.get_dummies(X_test).reindex(columns=X_train_new.columns, fill_value=0)

# Fill missing Age values with the training-set mean.
# Assign the result back: `frame[col].fillna(..., inplace=True)` is chained
# assignment, which warns on pandas 2.x and is a no-op under Copy-on-Write.
mean_age = X_train_new["Age"].mean()
X_train_new["Age"] = X_train_new["Age"].fillna(mean_age)
X_test_new["Age"] = X_test_new["Age"].fillna(mean_age)

# Fill missing Fare values with the training-set mean (same approach).
mean_fare = X_train_new["Fare"].mean()
X_train_new["Fare"] = X_train_new["Fare"].fillna(mean_fare)
X_test_new["Fare"] = X_test_new["Fare"].fillna(mean_fare)

# Train the model: a depth-limited decision tree.
tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(X_train_new, y_train)

# Evaluate the model (score() returns mean accuracy) on both splits.
train_score = tree.score(X_train_new, y_train)
test_score = tree.score(X_test_new, y_test)
print("Train score: {}".format(train_score))
print("Test score: {}".format(test_score))

Train score: 0.8342696629213483
Test score: 0.7988826815642458


## Usando Pipeline

In [8]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder


# Load the dataset and discard the name, ticket and cabin columns.
df = pd.read_csv("data/titanic.csv")
df = df.drop(columns=["Name", "Ticket", "Cabin"])

# Separate the predictors from the target, then hold out 20% of the rows
# for testing (fixed seed).
X = df.drop(columns=["Survived"])
y = df["Survived"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Encoding, imputation and the classifier chained into a single estimator.
pipeline_steps = [
    ("one-hot encoder", OneHotEncoder()),
    ("imputer", SimpleImputer(strategy="mean")),
    ("tree", DecisionTreeClassifier(max_depth=3, random_state=0)),
]
model = Pipeline(steps=pipeline_steps)

# Fit every step on the training split only, then score both splits.
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Train score: {}".format(train_score))
print("Test score: {}".format(test_score))

Train score: 0.8342696629213483
Test score: 0.7988826815642458


## Pipeline + Cross-Validation

In [9]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

# Load the dataset and discard the name, ticket and cabin columns.
df = pd.read_csv("data/titanic.csv")
df = df.drop(columns=["Name", "Ticket", "Cabin"])

# Same three-step pipeline: one-hot encoding, mean imputation, decision tree.
pipeline_steps = [
    ("one-hot encoder", OneHotEncoder()),
    ("imputer", SimpleImputer(strategy="mean")),
    ("tree", DecisionTreeClassifier(max_depth=3, random_state=0)),
]
model = Pipeline(steps=pipeline_steps)

# Validate the model with shuffled 5-fold cross-validation (fixed seed).
# The whole pipeline is passed to cross_validate as the estimator.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
results = cross_validate(
    model,
    X=df.drop(columns=["Survived"]),
    y=df["Survived"],
    cv=kfold,
    return_estimator=True,
)

# Report the mean and standard deviation of the per-fold test scores.
fold_scores = results["test_score"]
print("Average accuracy: %f (%f)" % (fold_scores.mean(), fold_scores.std()))

Average accuracy: 0.815956 (0.024300)


## Pipeline + GridSearchCV

In [10]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Load the dataset.
df = pd.read_csv("data/titanic.csv")

# Drop the name, ticket and cabin columns (reassign instead of mutating in
# place so that re-running the cell is idempotent).
df = df.drop(columns=["Name", "Ticket", "Cabin"])

# Build the model as a pipeline: one-hot encoding, mean imputation, tree.
model = Pipeline(steps=[
    ('one-hot encoder', OneHotEncoder()),
    ('imputer', SimpleImputer(strategy='mean')),
    ('tree', DecisionTreeClassifier(max_depth=3, random_state=0))
])

# Tune hyperparameters with 5-fold cross-validation over the pipeline.
# The `tree__` prefix routes max_depth to the pipeline step named 'tree'.
parameters = {'tree__max_depth': [3, 4, 5]}
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(model, param_grid=parameters, cv=kfold, n_jobs=-1, return_train_score=True)
grid.fit(X=df.drop(['Survived'], axis=1), y=df['Survived'])

# Which parameter setting won? Print it explicitly: a bare expression in the
# middle of a cell is evaluated and discarded, only the last one is displayed.
# (In the recorded run this was {'tree__max_depth': 3}.)
print(grid.best_params_)

# Full results of the hyperparameter tuning + cross-validation
# (last expression of the cell, so Jupyter displays it).
grid.cv_results_

{'mean_fit_time': array([0.04860263, 0.05019822, 0.04559975]),
 'std_fit_time': array([0.0020585 , 0.00147057, 0.00338091]),
 'mean_score_time': array([0.01360025, 0.01340199, 0.01159973]),
 'std_score_time': array([0.00049   , 0.00049184, 0.00149645]),
 'param_tree__max_depth': masked_array(data=[3, 4, 5],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'tree__max_depth': 3},
  {'tree__max_depth': 4},
  {'tree__max_depth': 5}],
 'split0_test_score': array([0.79888268, 0.79888268, 0.79888268]),
 'split1_test_score': array([0.79775281, 0.79775281, 0.78651685]),
 'split2_test_score': array([0.85955056, 0.8258427 , 0.81460674]),
 'split3_test_score': array([0.79775281, 0.79775281, 0.7752809 ]),
 'split4_test_score': array([0.8258427 , 0.8258427 , 0.81460674]),
 'mean_test_score': array([0.81593715, 0.80920314, 0.7979798 ]),
 'std_test_score': array([0.02429306, 0.01357972, 0.01548562]),
 'rank_test_score': array([1, 2, 3]),
 'spli

## ColumnTransformer

In [11]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer

# Load the dataset and discard the name, ticket and cabin columns.
df = pd.read_csv("data/titanic.csv")
df = df.drop(columns=["Name", "Ticket", "Cabin"])

# Columns handled by each preprocessing branch.
numeric_columns = ["Age", "Fare"]
categorical_columns = ["Sex", "Embarked"]

# Preprocessing for Age and Fare: median imputation.
num_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing for Sex and Embarked: one-hot encoding.
cat_transformer = Pipeline(steps=[("one-hot encoder", OneHotEncoder())])

# Combine both branches. No `remainder` is passed, and the recorded estimator
# repr shows remainder='drop', so columns not listed above are not forwarded
# to the classifier.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numeric_columns),
        ("cat", cat_transformer, categorical_columns),
    ]
)

# Full model: column-wise preprocessing followed by the decision tree.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("tree", DecisionTreeClassifier(max_depth=3, random_state=0)),
    ]
)

# Tune the tree depth with shuffled 5-fold cross-validation (fixed seed).
# The `tree__` prefix routes max_depth to the pipeline step named 'tree'.
parameters = {"tree__max_depth": [3, 4, 5]}
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(
    model,
    param_grid=parameters,
    cv=kfold,
    n_jobs=-1,
    return_train_score=True,
)
X = df.drop(columns=["Survived"])
y = df["Survived"]
grid.fit(X=X, y=y)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                           