## pipeline démo

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy training set: two numeric features (feat1 has one missing value) and a label column.
train = pd.DataFrame(
    {
        'feat1': [10, 20, np.nan, 2],
        'feat2': [25., 20, 5, 3],
        'label': ['A', 'A', 'B', 'B'],
    }
)
# Toy test set: same two features, no label; feat2 has one missing value.
test = pd.DataFrame(
    {
        'feat1': [30., 5, 15],
        'feat2': [12, 10, np.nan],
    }
)

In [3]:
# Display the training frame; note the missing feat1 value in row 2.
train

Unnamed: 0,feat1,feat2,label
0,10.0,25.0,A
1,20.0,20.0,A
2,,5.0,B
3,2.0,3.0,B


In [4]:
# Display the test frame; note the missing feat2 value in row 2.
test

Unnamed: 0,feat1,feat2
0,30.0,12.0
1,5.0,10.0
2,15.0,


In [5]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [6]:
# The two pipeline steps, created separately so they can be inspected on their own:
# a median imputer for missing values and a logistic regression with default settings.
imputer, clf = SimpleImputer(strategy="median"), LogisticRegression()

In [7]:
# 2-step pipeline: impute missing values, then pass the results to the classifier.
# make_pipeline names the steps automatically (see the repr printed after fitting).
pipe = make_pipeline(imputer, clf)

In [8]:
# Feature columns used to select inputs from both the training and the test frame.
features = ['feat1', 'feat2']

In [9]:
# Training inputs and target
X = train[features]
y = train['label']
# Test inputs (the test frame has no label column)
X_test = test[features]

In [10]:
# Display the training feature matrix.
X

Unnamed: 0,feat1,feat2
0,10.0,25.0
1,20.0,20.0
2,,5.0
3,2.0,3.0


In [11]:
# Display the training target.
y

0    A
1    A
2    B
3    B
Name: label, dtype: object

In [15]:
# Display the test feature matrix.
X_test

Unnamed: 0,feat1,feat2
0,30.0,12.0
1,5.0,10.0
2,15.0,


In [13]:
# Fit the pipeline: the imputer is applied first, then the logistic regression
# classifier is trained on the imputed data.
pipe.fit(X, y)

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('logisticregression', LogisticRegression())])

In [14]:
# The pipeline applies the imputer to X_test before making the prediction.
# Note: the imputation values used here are the ones learned from the training data during fit.
pipe.predict(X_test)

array(['A', 'B', 'A'], dtype=object)

## L'attribut named_steps

In [26]:
import pandas as pd
# Read only the first six rows of the training CSV, then keep three columns
# in the order Age, Pclass, Survived.
df = pd.read_csv('http://bit.ly/kaggletrain', nrows=6)[['Age', 'Pclass', 'Survived']]

In [27]:
# Display the six-row sample; Age is missing in the last row.
df

Unnamed: 0,Age,Pclass,Survived
0,22.0,3,0
1,38.0,1,1
2,26.0,3,1
3,35.0,1,1
4,35.0,3,0
5,,3,0


In [28]:
# Inputs (Age, Pclass) and target (Survived) taken from the sample frame
X, y = df[['Age', 'Pclass']], df['Survived']

In [29]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [30]:
# Two-step pipeline built inline, both steps with default settings:
# imputation of missing values followed by logistic regression.
pipe = make_pipeline(SimpleImputer(), LogisticRegression())

In [31]:
# Fit the pipeline; the trailing semicolon suppresses the estimator repr in IPython.
pipe.fit(X, y);

In [32]:
# Display the imputation values learned for "Age" and "Pclass" (same order as the columns of X).
# The step is reached through named_steps using the name make_pipeline generated for it.
pipe.named_steps.simpleimputer.statistics_

array([31.2       ,  2.33333333])

In [33]:
# Display the fitted model coefficients for "Age" and "Pclass" (same order as the columns of X).
pipe.named_steps.logisticregression.coef_

array([[ 0.03232238, -0.83741131]])

## la description d'un pipeline

In [16]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [17]:
# Load the full training CSV (no row limit this time).
df = pd.read_csv('http://bit.ly/kaggletrain')
# Inputs mix numeric, categorical and free-text columns; each group is preprocessed differently below.
X = df[['Parch', 'Fare', 'Embarked', 'Sex', 'Name', 'Age']]
y = df['Survived']

In [18]:
# Imputer that fills missing entries with a constant (fill_value left at its default).
imp_constant = SimpleImputer(strategy='constant')
# One-hot encoder with default settings.
ohe = OneHotEncoder()

In [19]:
# Categorical branch: constant imputation followed by one-hot encoding.
imp_ohe = make_pipeline(imp_constant, ohe)
# Text branch: token counts.
vect = CountVectorizer()
# Numeric branch: imputer with default settings.
imp = SimpleImputer()

In [20]:
# pipeline step 1: route each group of columns to its own preprocessing
ct = make_column_transformer(
    (imp_ohe, ['Embarked', 'Sex']),  # impute, then one-hot encode
    (vect, 'Name'),  # single column given as a string (not a list) so the vectorizer receives 1-D text input
    (imp, ['Age', 'Fare']),  # impute missing numeric values
    ('passthrough', ['Parch']))  # forwarded unchanged

In [21]:
# pipeline step 2: feature selection keeping the top 50 percent of features, scored with chi2
selection = SelectPercentile(chi2, percentile=50)

In [22]:
# pipeline step 3: logistic regression classifier using the liblinear solver
logreg = LogisticRegression(solver='liblinear')

In [23]:
# Display estimators as diagrams instead of their text repr.
# This is a global scikit-learn setting, so it affects every estimator displayed afterwards.
from sklearn import set_config
set_config(display='diagram')

In [24]:
# Assemble the full pipeline: column preprocessing -> feature selection -> classifier.
pipe = make_pipeline(ct, selection, logreg)
# Bare expression so the notebook renders the pipeline (as a diagram, given the setting above).
pipe

In [25]:
# Export the pipeline diagram to a standalone HTML file.
from sklearn.utils import estimator_html_repr

# Write as UTF-8 explicitly: the generated HTML may contain non-ASCII characters,
# and the platform default encoding (e.g. cp1252 on Windows) is not guaranteed
# to be able to encode them.
with open('pipeline.html', 'w', encoding='utf-8') as f:
    f.write(estimator_html_repr(pipe))

## pipeline et CrossValidation

In [34]:
import pandas as pd
# Reload the full training CSV so this section can be run independently of the previous ones.
df = pd.read_csv('http://bit.ly/kaggletrain')

In [35]:
# Only two inputs in this section: one categorical column and one free-text column.
cols = ['Sex', 'Name']
X = df[cols]
y = df['Survived']

In [36]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer

In [37]:
# Preprocessing: one-hot encode 'Sex' and vectorize 'Name' into token counts.
ohe = OneHotEncoder()
vect = CountVectorizer()
# 'Sex' is selected with a list and 'Name' with a plain string.
ct = make_column_transformer((ohe, ['Sex']), (vect, 'Name'))

In [38]:
from sklearn.linear_model import LogisticRegression
# Classifier for the grid search below; random_state is fixed so repeated runs give the same scores.
clf = LogisticRegression(solver='liblinear', random_state=1)

In [39]:
from sklearn.pipeline import make_pipeline
# Preprocessing and model in one estimator, so the search can tune parameters of both.
# The auto-generated step names are the prefixes used in the parameter grid keys.
pipe = make_pipeline(ct, clf)

### Find optimal tuning parameters for the entire pipeline


In [40]:
# Parameter grid to search. Each key is a path into the pipeline:
# <step name>__<nested transformer name>__<parameter>, with parts joined by double underscores.
params = dict(
    columntransformer__countvectorizer__min_df=[1, 2],
    logisticregression__C=[0.1, 1, 10],
    logisticregression__penalty=['l1', 'l2'],
)

In [50]:
# Or: the same kind of grid written as a dict literal.
# This assignment replaces the previous `params` and searches a wider range of C values.

params={
    'columntransformer__countvectorizer__min_df':[1, 2],
    'logisticregression__C' : [0.1,1,2,5, 10],
    'logisticregression__penalty' : ['l1', 'l2'],
}

In [51]:
# Try all possible combinations of those parameter values,
# scoring each one by accuracy with 5-fold cross-validation.
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
# Trailing semicolon suppresses the estimator repr in the notebook output.
grid.fit(X, y);

In [52]:
# What was the best score (cross-validated accuracy) found during the search?
grid.best_score_

0.822635113928818

In [53]:
# Which combination of parameters produced the best score?
# Keys are the same double-underscore paths used in the parameter grid.
grid.best_params_

{'columntransformer__countvectorizer__min_df': 1,
 'logisticregression__C': 5,
 'logisticregression__penalty': 'l1'}

## Affichage des résultats de la recherche par grille

In [54]:
import pandas as pd
# Reload the full training CSV so this section can be run independently of the previous ones.
df = pd.read_csv('http://bit.ly/kaggletrain')

In [55]:
# Inputs: Pclass plus one categorical column (Sex) and one free-text column (Name).
X = df[['Pclass', 'Sex', 'Name']]
y = df['Survived']

In [56]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline

In [57]:
# Building blocks for this section: encoder, text vectorizer and classifier.
ohe = OneHotEncoder()
vect = CountVectorizer()
# random_state is fixed so repeated runs give the same scores.
clf = LogisticRegression(solver='liblinear', random_state=1)

In [58]:
# remainder='passthrough' forwards the columns not listed in a transformer (here Pclass) unchanged.
ct = make_column_transformer((ohe, ['Sex']), (vect, 'Name'), remainder='passthrough')
# Pipeline (instead of make_pipeline) lets the steps be named explicitly;
# 'model' is the prefix used by the parameter grid keys in this section.
pipe = Pipeline([('preprocessor', ct), ('model', clf)])

In [59]:
# Parameter grid to search; keys are '<step name>__<parameter>' for the 'model' step.
params = {
    'model__C': [0.1, 1, 2, 5, 10],
    'model__penalty': ['l1', 'l2'],
}

In [60]:
# Try all possible combinations of those parameter values,
# scoring each one by accuracy with 5-fold cross-validation.
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
# Trailing semicolon suppresses the estimator repr in the notebook output.
grid.fit(X, y);



In [61]:
# Turn the search results into a DataFrame, keeping only the columns needed
# to compare candidates: the parameter set, its mean test score and its rank.
results = pd.DataFrame(grid.cv_results_)[
    ['params', 'mean_test_score', 'rank_test_score']
]

In [62]:
# Sort candidates by rank, best first (rank 1 is the highest mean test score).
# The sorted frame is only displayed; `results` itself is left unsorted.
results.sort_values('rank_test_score')

Unnamed: 0,params,mean_test_score,rank_test_score
4,"{'model__C': 2, 'model__penalty': 'l1'}",0.826012,1
6,"{'model__C': 5, 'model__penalty': 'l1'}",0.821537,2
8,"{'model__C': 10, 'model__penalty': 'l1'}",0.821537,2
7,"{'model__C': 5, 'model__penalty': 'l2'}",0.820426,4
2,"{'model__C': 1, 'model__penalty': 'l1'}",0.820394,5
5,"{'model__C': 2, 'model__penalty': 'l2'}",0.819296,6
9,"{'model__C': 10, 'model__penalty': 'l2'}",0.817055,7
3,"{'model__C': 1, 'model__penalty': 'l2'}",0.812573,8
1,"{'model__C': 0.1, 'model__penalty': 'l2'}",0.791225,9
0,"{'model__C': 0.1, 'model__penalty': 'l1'}",0.788984,10
