In [15]:
import os
import pickle
import numpy as np
import pandas as pd

In [16]:
data = pd.read_csv("../data/toy_example.csv")
data.iloc[[10,48,61], 0] = np.nan
data.iloc[[22, 34], 1] = np.nan
data['tipo_antiguedad'] = pd.cut(data['antiguedad'], bins=[0, 1, 3, 6, 100], include_lowest=True,
                                 labels=['first_year', 'junior', 'senior', 'master'])
data.drop(columns=['antiguedad'], inplace=True)
data.head()

Unnamed: 0,n_medio_productos,impago,tipo_antiguedad
0,1.2,0,senior
1,1.4,0,senior
2,1.55,0,master
3,1.6,0,master
4,1.45,0,master


In [17]:
from sklearn.model_selection import train_test_split

X = data.drop(columns='impago').copy()
y = data['impago'].copy()

# FIX a random_state to make your pipeline reproducible!
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## ColumnTransformer

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [19]:
continuous_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='mean')),
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('codification', OneHotEncoder(sparse_output=False))
])

preprocessing_pipeline = ColumnTransformer([
        ('continuous', continuous_pipeline, ['n_medio_productos']),
        ('categorical', categorical_pipeline, ['tipo_antiguedad'])
    ],
    remainder = 'drop' 
)

pipeline = Pipeline([
    ('preprocess', preprocessing_pipeline),
    ('feature_selection', SelectKBest(score_func=f_classif)),
    ('classifier', LogisticRegression(random_state=42))])

pipeline.set_output(transform="pandas")

my_param_grid = [
    {
     'feature_selection__k': [1,2],
     'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [10, 50, 100],
    },
    {
     'feature_selection__k': [1,2],
     'classifier': [LogisticRegression(random_state=42)],
     'classifier__C': [0.01, 0.1, 1.0, 10.0, 100.0]
    }
]

cv = GridSearchCV(pipeline, 
                  param_grid = my_param_grid,
                  scoring = 'accuracy', refit = True,
                  cv = 3,
                 return_train_score=True)

cv.fit(X_train,y_train)

cv_results = pd.DataFrame(cv.cv_results_)
display(cv_results[
        ['param_feature_selection__k','param_classifier','param_classifier__n_estimators', 
         'param_classifier__C', 'mean_train_score', 'mean_test_score', 
         'rank_test_score']].sort_values(by="rank_test_score").head(3))
print(cv.best_estimator_.get_params())


# WATCH OUT!
with open(os.path.join('pkl','round_categorical','pipeline.pkl'),'wb') as f:
    pickle.dump(cv.best_estimator_,f)

Unnamed: 0,param_feature_selection__k,param_classifier,param_classifier__n_estimators,param_classifier__C,mean_train_score,mean_test_score,rank_test_score
11,2,LogisticRegression(random_state=42),,1.0,0.67098,0.670769,1
1,2,RandomForestClassifier(),10.0,,0.67085,0.65641,2
2,1,RandomForestClassifier(),50.0,,0.67085,0.65641,2


{'memory': None, 'steps': [('preprocess', ColumnTransformer(transformers=[('continuous',
                                 Pipeline(steps=[('scaler', StandardScaler()),
                                                 ('imputer', SimpleImputer())]),
                                 ['n_medio_productos']),
                                ('categorical',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('codification',
                                                  OneHotEncoder(sparse_output=False))]),
                                 ['tipo_antiguedad'])])), ('feature_selection', SelectKBest(k=2)), ('classifier', LogisticRegression(random_state=42))], 'verbose': False, 'preprocess': ColumnTransformer(transformers=[('continuous',
                                 Pipeline(steps=[('scaler', StandardScaler()),
                           

### TEST

In [20]:
from sklearn.metrics import accuracy_score

with open(os.path.join('pkl','round_categorical','pipeline.pkl'), 'rb') as f:
    pipe = pickle.load(f)

print("Accuracy in test: ", accuracy_score(y_test, pipe.predict(X_test)))

Accuracy in test:  0.7


## Great! But... what about feature importance?
- Considerar que la importancia de variables se refiere a lo que entra justo antes del classificador. ¿Cómo sé qué variables son? ¿y además si han pasado por one-hot-encoder...?

In [21]:
from sklearn import set_config

set_config(display="diagram")
pipe

### ¿Cuantás y qué variables tengo exactamente antes One Hot Encoder? Espeleología de Pipeliness
* `named_steps`
* `transformers_`
* ¡OJO! `feature_names_in_` solo definido si input names son strings (¡pero simple imputer no me mantiene el nombre de las columnas!)
* ¡OJO! por defecto `SimpleImputer` me está devolviendo un array
* `set_output` es nuestro amigo: https://scikit-learn.org/1.4/auto_examples/miscellaneous/plot_set_output.html#sphx-glr-auto-examples-miscellaneous-plot-set-output-py

In [22]:
print(pipe.named_steps.keys())
display(pipe.named_steps['preprocess'])
print(pipe.named_steps['preprocess'].named_transformers_.keys())
print(pipe.named_steps['preprocess'].named_transformers_['categorical'])

dict_keys(['preprocess', 'feature_selection', 'classifier'])


dict_keys(['continuous', 'categorical'])
Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('codification', OneHotEncoder(sparse_output=False))])


In [32]:
print("Variables entrada categorical-imputer: ", 
      pipe.named_steps['preprocess'].named_transformers_['categorical'].named_steps['imputer'].feature_names_in_)
print("Variables salida categorical-imputer: ", 
      pipe.named_steps['preprocess'].named_transformers_['categorical'].named_steps['imputer'].get_feature_names_out())

print("Variables entrada one hot encoder: ", 
      pipe.named_steps['preprocess'].named_transformers_['categorical'].named_steps['codification'].feature_names_in_)
print("Variables salida one hot encoder: ", 
      pipe.named_steps['preprocess'].named_transformers_['categorical'].named_steps['codification'].get_feature_names_out())

Variables entrada categorical-imputer:  ['tipo_antiguedad']
Variables salida categorical-imputer:  ['tipo_antiguedad']
Variables entrada one hot encoder:  ['tipo_antiguedad']
Variables salida one hot encoder:  ['tipo_antiguedad_first_year' 'tipo_antiguedad_junior'
 'tipo_antiguedad_master' 'tipo_antiguedad_senior']


### Volviendo al tema de feature importance y entendiendo ya `feature_names_in_` y `get_feature_names_out()`... --> ¿Quiénes son esas 2 variables?

In [24]:
pipe.named_steps['classifier'].coef_

array([[ 0.72902013, -1.14326098]])

In [25]:
pipe.named_steps['classifier'].feature_names_in_

array(['categorical__tipo_antiguedad_junior',
       'categorical__tipo_antiguedad_master'], dtype=object)

In [26]:
fimp = pd.DataFrame(pipe.named_steps['classifier'].coef_.T, columns=['coef'])
fimp['feature'] = pipe.named_steps['classifier'].feature_names_in_
fimp

Unnamed: 0,coef,feature
0,0.72902,categorical__tipo_antiguedad_junior
1,-1.143261,categorical__tipo_antiguedad_master


### ¿Y qué entró exactamente al feature selection?

In [33]:
subpl = Pipeline(pipe.steps[:-2])
x_transf = subpl.transform(X_train)
x_transf.head(2)

Unnamed: 0,continuous__n_medio_productos,categorical__tipo_antiguedad_first_year,categorical__tipo_antiguedad_junior,categorical__tipo_antiguedad_master,categorical__tipo_antiguedad_senior
40,0.364954,0.0,0.0,1.0,0.0
67,0.006896,0.0,1.0,0.0,0.0


## Great! But... what if I want to use my own encoding algorithm? NO PROB!
* Developing scikit-learn estimators: https://scikit-learn.org/stable/developers/develop.html
* Pipelines and custom transformers in Scikit-Learn: https://towardsdatascience.com/pipelines-custom-transformers-in-scikit-learn-the-step-by-step-guide-with-python-code-4a7d9b068156
* Ejemplo: `category_encoders`
* Ejemplo: mi feature selector (OJO! falta robustez en el código)
    - Avisar de que tengo algún paquete para ponerle el lazo (target encoder multiclass y mrmr)

In [28]:
from sklearn.base import BaseEstimator, TransformerMixin

class myFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, n_features):
        self.n_features=n_features
        self.selected_features=None

    def fit(self, X, y = None):
        corr = pd.concat([X,y], axis=1).corr().iloc[:-1,-1].abs()
        self.selected_features = list(corr.sort_values(ascending=False).head(self.n_features).index)
        return self
      
    def transform(self, X):
        return X.loc[:,self.selected_features]

    def get_feature_names_out(self):
        return self.selected_features

In [29]:
fs = myFeatureSelector(n_features=3)
fs.fit(x_transf)
display(fs.fit_transform(x_transf).head(2))
fs.get_feature_names_out()

Unnamed: 0,categorical__tipo_antiguedad_master,categorical__tipo_antiguedad_junior,categorical__tipo_antiguedad_first_year
40,1.0,0.0,0.0
67,0.0,1.0,0.0


['categorical__tipo_antiguedad_master',
 'categorical__tipo_antiguedad_junior',
 'categorical__tipo_antiguedad_first_year']

In [30]:
continuous_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='mean')),
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('codification', OneHotEncoder(sparse_output=False))
])

preprocessing_pipeline = ColumnTransformer([
        ('continuous', continuous_pipeline, ['n_medio_productos']),
        ('categorical', categorical_pipeline, ['tipo_antiguedad'])
    ],
    remainder = 'drop' 
)

pipeline = Pipeline([
    ('preprocess', preprocessing_pipeline),
    ('feature_selection', myFeatureSelector(n_features=3)),
    ('classifier', LogisticRegression(random_state=42))])

pipeline.set_output(transform="pandas")

my_param_grid = [
    {
     'feature_selection__n_features': [1,2,3],
     'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [10, 50, 100],
    },
    {
     'feature_selection__n_features': [1,2,3],
     'classifier': [LogisticRegression(random_state=42)],
     'classifier__C': [0.01, 0.1, 1.0, 10.0, 100.0]
    }
]

cv = GridSearchCV(pipeline, 
                  param_grid = my_param_grid,
                  scoring = 'accuracy', refit = True,
                  cv = 3,
                 return_train_score=True)

cv.fit(X_train,y_train)

cv_results = pd.DataFrame(cv.cv_results_)
display(cv_results[
        ['param_feature_selection__n_features','param_classifier','param_classifier__n_estimators', 
         'param_classifier__C', 'mean_train_score', 'mean_test_score', 
         'rank_test_score']].sort_values(by="rank_test_score").head(3))
print(cv.best_estimator_.get_params())


# WATCH OUT!
with open(os.path.join('pkl','round_categorical','pipeline.pkl'),'wb') as f:
    pickle.dump(cv.best_estimator_,f)

Unnamed: 0,param_feature_selection__n_features,param_classifier,param_classifier__n_estimators,param_classifier__C,mean_train_score,mean_test_score,rank_test_score
23,3,LogisticRegression(random_state=42),,100.0,0.697255,0.696923,1
20,3,LogisticRegression(random_state=42),,10.0,0.697255,0.696923,1
5,3,RandomForestClassifier(),50.0,,0.697255,0.696923,1


{'memory': None, 'steps': [('preprocess', ColumnTransformer(transformers=[('continuous',
                                 Pipeline(steps=[('scaler', StandardScaler()),
                                                 ('imputer', SimpleImputer())]),
                                 ['n_medio_productos']),
                                ('categorical',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('codification',
                                                  OneHotEncoder(sparse_output=False))]),
                                 ['tipo_antiguedad'])])), ('feature_selection', myFeatureSelector(n_features=3)), ('classifier', RandomForestClassifier(n_estimators=50))], 'verbose': False, 'preprocess': ColumnTransformer(transformers=[('continuous',
                                 Pipeline(steps=[('scaler', StandardScaler()),
        

# Feature Union
- Contarlo por encima con el post de Medium: https://towardsdatascience.com/pipeline-columntransformer-and-featureunion-explained-f5491f815f
- We can think of FeatureUnion as if it creates a copy of the data, transforms those copies in parallel and then pastes together the results. The term copy here is more of an analogy to aid conceptualisation than a technical reference.
- At the beginning of each pipeline, we added an extra step where we selected relevant columns using a custom transformer: ColumnSelector in line 14 and 19. Here’s how we can visually summarise the script above:
- As seen in this example, using FeatureUnion is more verbose than using ColumnTransformer. Therefore, in my opinion, it’s better to use ColumnTransformer in a case similar to this. However, FeatureUnion definitely has its place. If you ever need to transform the same input data in different ways and use them as features, FeatureUnion is the one. For example, if you are working on a text data and want to do both tf-idf vectorisation of the data as well as extract length of text, FeatureUnion is the perfect tool. 

In [31]:
import sklearn
sklearn.__version__

'1.4.0'