# Pipelines em ML

Primeiramente, pipeline é usado em TROCENTOS contextos, aqui estamos falando sobre pipelines para criação de modelos preditivos, seja de regressão, seja de classificação (também poderia ser análise não superviosionada, fica pra outra live).

Pipelines scikit learn são espécies de "contênieres" que podem ter objetos do tipo:
- Tranformer (não é de NLP, é de pré-processamento mesmo)
- Estimator (nome que o sklearn dá pra algoritmos de classificação, regressão e clustering)
- Pipeline (sim, é possível utilizar pipelines um dentro do outro)
- FeatureUnion (ajuda a juntar pipelines diferents)

Muito desse notebook teve como referência uma palestra ótima do Kevin Goetsch no [PyData Chicago de 2016](https://www.youtube.com/watch?v=URdnFlZnlaE)



In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV



In [2]:
# Ah nem, Leon, Titanic??? Como todo mundo conhece, facilita a entender o poder dos pipelines
treino = pd.read_csv('./data/train.csv')
teste = pd.read_csv('./data/test.csv')

display(treino.head())
display(teste.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Criando pipeline "na mão"

In [3]:
pipeline_da_hora = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ('min_max_scaler', StandardScaler(with_mean=False)),
    ('classificador', RandomForestClassifier())
])

pipeline_da_hora

Pipeline(steps=[('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
                ('min_max_scaler', StandardScaler(with_mean=False)),
                ('classificador', RandomForestClassifier())])

In [4]:
pipeline_da_hora.steps[1]

('min_max_scaler', StandardScaler(with_mean=False))

### Usando make_pipeline pra criar pra gente

In [5]:
make_pipeline(OneHotEncoder(handle_unknown='ignore'), StandardScaler(with_mean=False), RandomForestClassifier())

Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('randomforestclassifier', RandomForestClassifier())])

In [6]:
# Separando treino e validação!!
X = treino.drop('Survived', axis = 1)
y = treino['Survived']

X_treino, X_valid, y_treino, y_valid = train_test_split(X, y)

X_treino.shape, X_valid.shape, y_treino.shape, y_valid.shape


((668, 11), (223, 11), (668,), (223,))

In [7]:
pipeline_da_hora.fit(X_treino, y_treino)

Pipeline(steps=[('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
                ('min_max_scaler', StandardScaler(with_mean=False)),
                ('classificador', RandomForestClassifier())])

In [8]:
pipeline_da_hora.predict(X_valid)

array([0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0], dtype=int64)

In [9]:
pipeline_da_hora.score(X_valid, y_valid)

0.7847533632286996

### Vamos melhorar um pouco essa bagunça e separar as transformações das variáveis cat e numéricas

In [10]:
X_treino.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [11]:
X_treino.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
341,342,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0,C23 C25 C27,S
619,620,2,"Gavey, Mr. Lawrence",male,26.0,0,0,31028,10.5,,S
470,471,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S
658,659,2,"Eitemiller, Mr. George Floyd",male,23.0,0,0,29751,13.0,,S
429,430,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32.0,0,0,SOTON/O.Q. 392078,8.05,E10,S


In [12]:
X_treino['Name'].dtype.name

'object'

In [13]:
variaveis_categoricas = [coluna for coluna in X_treino.columns if X_treino[coluna].dtype.name == 'object']
variaveis_categoricas

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [14]:
variaveis_numericas = [coluna for coluna in X_treino.columns if coluna not in variaveis_categoricas]
variaveis_numericas

['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [15]:
# Separando tratamento para colunas categóricas (imputer = tratar dados faltantes)
pipeline_categoricas = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [16]:
# Tratamento para numéricas
pipeline_numericas = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

In [17]:
# 
pre_processamento = ColumnTransformer([
    ('cat', pipeline_categoricas, variaveis_categoricas),
    ('num', pipeline_numericas, variaveis_numericas)
])

In [18]:
pipeline_random_forest = make_pipeline(pre_processamento, RandomForestClassifier(random_state=42))
pipeline_log_reg = make_pipeline(pre_processamento, LogisticRegression(random_state=42))

In [19]:
pipeline_random_forest

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['Name', 'Sex', 'Ticket',
                                                   'Cabin', 'Embarked']),
                                                 ('num',
                                                  Pipeline(steps=[('imputer',
                                                    

In [20]:
pipeline_random_forest.fit(X_treino, y_treino)
pipeline_random_forest.score(X_valid, y_valid)

0.7892376681614349

In [21]:
pipeline_log_reg.fit(X_treino, y_treino)
pipeline_log_reg.score(X_valid, y_valid)

0.8116591928251121

### Peraí?! E o cross-validation?

In [22]:
validacao_cruzada = KFold(n_splits=10, shuffle=True, random_state=42)
validacao_cruzada

KFold(n_splits=10, random_state=42, shuffle=True)

In [23]:
cross_val_score(pipeline_random_forest, X_valid, y_valid, cv=validacao_cruzada)

array([0.65217391, 0.86956522, 0.82608696, 0.86363636, 0.90909091,
       0.77272727, 0.77272727, 0.63636364, 0.77272727, 0.90909091])

In [24]:
acuracia_media_rf = cross_val_score(pipeline_random_forest, X_valid, y_valid, cv=validacao_cruzada).mean()
acuracia_media_rf

0.7984189723320159

In [25]:
acuracia_media_log_reg = cross_val_score(pipeline_log_reg, X_valid, y_valid, cv=validacao_cruzada).mean()
acuracia_media_log_reg

0.8025691699604742

### Bônus: tunagem de hiperparâmetros