# Scikit-learn pipeline examples for a toy dataset with continuous and categorical features
* Rounds 6 to 8 of the PyData talk
* Toy example generated with https://guoguibing.github.io/librec/datagen.html

In [None]:
import os
import pickle
import numpy as np
import pandas as pd

In [None]:
# Load the toy dataset and blank out a handful of cells in the first two
# columns so the imputers used later have missing values to handle.
data = pd.read_csv("toy_example.csv")
for rows, col in (([10, 48, 61], 0), ([22, 34], 1)):
    data.iloc[rows, col] = np.nan

# Bucket the numeric tenure into four ordered categories and keep only the
# categorical version of the feature.
tenure_bins = [0, 1, 3, 6, 100]
tenure_labels = ['first_year', 'junior', 'senior', 'master']
data['tenure_category'] = pd.cut(
    data['tenure'],
    bins=tenure_bins,
    labels=tenure_labels,
    include_lowest=True,
)
data = data.drop(columns=['tenure'])
data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = data['nonpayment'].copy()
X = data.drop(columns=['nonpayment']).copy()

# FIX a random_state to make your pipeline reproducible!
# 80 % of the rows go to training, 20 % are held out for the final check.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Round 6

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

### TRAIN

In [None]:
# Continuous features: standardise first, then fill the gaps. StandardScaler
# disregards NaNs when fitting and leaves them in place when transforming,
# so the imputer works on already-scaled values.
continuous_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical features: fill the gaps with the mode, then one-hot encode.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('codification', OneHotEncoder(sparse_output=False))
])

# Route each column to its sub-pipeline; any column not listed is dropped.
preprocessing_pipeline = ColumnTransformer([
        ('continuous', continuous_pipeline, ['avg_products']),
        ('categorical', categorical_pipeline, ['tenure_category'])
    ],
    remainder = 'drop'
)

# The classifier set here is only a placeholder: the grid below swaps in the
# candidate models through the 'classifier' parameter.
pipeline = Pipeline([
    ('preprocess', preprocessing_pipeline),
    ('feature_selection', SelectKBest(score_func=f_classif)),
    ('classifier', LogisticRegression(random_state=42))])

# Keep DataFrames (and therefore column names) flowing between steps.
pipeline.set_output(transform="pandas")

# One sub-grid per candidate classifier, because each model has its own
# hyper-parameters. Every stochastic model gets a fixed random_state so that
# re-running the search gives the same ranking.
my_param_grid = [
    {
     'preprocess__continuous__imputer__strategy': ['mean','median'],
     'feature_selection__k': [1,2],
     'classifier': [RandomForestClassifier(random_state=42)],
     'classifier__n_estimators': [10, 50, 100],
    },
    {
     'preprocess__continuous__imputer__strategy': ['mean','median'],
     'feature_selection__k': [1,2],
     'classifier': [LogisticRegression(random_state=42)],
     'classifier__C': [0.01, 0.1, 1.0, 10.0, 100.0]
    }
]

# 3-fold grid search scored on accuracy; refit=True retrains the best
# configuration on the whole training set and exposes it as best_estimator_.
cv = GridSearchCV(pipeline,
                  param_grid = my_param_grid,
                  scoring = 'accuracy', refit = True,
                  cv = 3,
                  return_train_score=True)

cv.fit(X_train,y_train)

# Show the three best configurations next to their train / validation scores.
cv_results = pd.DataFrame(cv.cv_results_)
display(cv_results[
        ['param_preprocess__continuous__imputer__strategy','param_feature_selection__k','param_classifier','param_classifier__n_estimators',
         'param_classifier__C', 'mean_train_score', 'mean_test_score',
         'rank_test_score']].sort_values(by="rank_test_score").head(3))
print(cv.best_estimator_.get_params())


# WATCH OUT! Persist the refitted best estimator, not the unfitted `pipeline`
# template. The target directory is created first so the dump cannot fail on a
# fresh checkout.
model_dir = os.path.join('pkl','round_6')
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir,'pipeline.pkl'),'wb') as f:
    pickle.dump(cv.best_estimator_,f)

In [None]:
# Render the pipeline object built in the TRAIN cell (notebook rich display).
# NOTE(review): GridSearchCV fits clones of its estimator, so this is presumably
# the unfitted template; the tuned model is cv.best_estimator_ — confirm.
pipeline

### PREDICT

In [None]:
from sklearn.metrics import accuracy_score

# Reload the persisted pipeline exactly as a separate scoring job would.
# pickle.load executes arbitrary code: only open files you produced yourself.
model_path = os.path.join('pkl', 'round_6', 'pipeline.pkl')
with open(model_path, 'rb') as f:
    pipe = pickle.load(f)

# Score the held-out test split with the reloaded pipeline.
test_predictions = pipe.predict(X_test)
print("Accuracy in test: ", accuracy_score(y_test, test_predictions))

## Round 7

In [None]:
# Walk down the fitted pipeline: top-level steps, the preprocessing
# ColumnTransformer, and the fitted sub-pipelines it holds.
preprocess_step = pipe.named_steps['preprocess']
fitted_transformers = preprocess_step.named_transformers_

print(pipe.named_steps.keys())
display(preprocess_step)
print(fitted_transformers.keys())
print(fitted_transformers['categorical'])

In [None]:
# For each step of the categorical sub-pipeline, print the feature names it
# received and the feature names it emits.
categorical_steps = (
    pipe.named_steps['preprocess']
    .named_transformers_['categorical']
    .named_steps
)

for step_name, input_label, output_label in (
    ('imputer',
     "Variables entrada categorical-imputer: ",
     "Variables salida categorical-imputer: "),
    ('codification',
     "Variables entrada one hot encoder: ",
     "Variables salida one hot encoder: "),
):
    step = categorical_steps[step_name]
    print(input_label, step.feature_names_in_)
    if step_name == 'codification':
        # Keep the blank line that separated the two groups of prints.
        pass
    print(output_label, step.get_feature_names_out())
    if step_name == 'imputer':
        print()

In [None]:
# Tabulate how much each selected feature weighs in the final classifier.
# The grid search may pick either candidate model: linear models expose
# `coef_`, tree ensembles expose `feature_importances_` instead, so handle
# both rather than failing with AttributeError when a forest wins.
clf = pipe.named_steps['classifier']
if hasattr(clf, 'coef_'):
    fimp = pd.DataFrame(clf.coef_.T, columns=['coef'])
else:
    fimp = pd.DataFrame(clf.feature_importances_, columns=['importance'])
# Feature names are available because the pipeline passes DataFrames along.
fimp['feature'] = clf.feature_names_in_
fimp

In [None]:
# Drop the last two steps (feature selection and classifier) to look at what
# the preprocessing stage alone produces. Slicing a Pipeline yields a new
# Pipeline made of the same, already fitted, steps.
subpl = pipe[:-2]
x_transf = subpl.transform(X_train)
x_transf.head(2)

## Round 8

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class myFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the ``n_features`` columns most correlated with the target.

    Features are ranked by the absolute value of their correlation
    (``DataFrame.corr``) with ``y``. The transformer works on pandas
    DataFrames, so upstream steps must emit DataFrames (for instance through
    ``set_output(transform="pandas")``).

    Parameters
    ----------
    n_features : int
        Number of top-ranked columns to keep.

    Attributes
    ----------
    selected_features : list or None
        Names of the retained columns, best first; ``None`` until ``fit``
        has been called.
    """

    def __init__(self, n_features):
        self.n_features=n_features
        self.selected_features=None

    def fit(self, X, y = None):
        """Rank the columns of ``X`` by absolute correlation with ``y``.

        Parameters
        ----------
        X : pandas.DataFrame
            Candidate features.
        y : pandas.Series, pandas.DataFrame or array-like
            Target values. Pandas objects are aligned with ``X`` on the
            index; anything else is matched to the rows of ``X`` by position.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is missing. Without a target, ``pd.concat`` would drop
            the ``None`` and the ranking would silently be computed against
            the last feature column.
        """
        if y is None:
            raise ValueError("myFeatureSelector.fit requires a target `y`.")

        if isinstance(y, (pd.Series, pd.DataFrame)):
            target = y
        else:
            # Plain arrays / lists carry no index: attach the one of X.
            target = pd.Series(np.asarray(y).ravel(), index=X.index, name='target')

        # Last column of the correlation matrix = correlation with the target;
        # the final row (target vs. itself) is excluded.
        corr = pd.concat([X, target], axis=1).corr().iloc[:-1,-1].abs()
        self.selected_features = list(corr.sort_values(ascending=False).head(self.n_features).index)
        return self

    def transform(self, X):
        """Return only the columns chosen during ``fit``."""
        return X.loc[:,self.selected_features]

    def get_feature_names_out(self, input_features=None):
        """Return the names of the selected columns.

        ``input_features`` is ignored; it is accepted because scikit-learn
        (e.g. ``Pipeline.get_feature_names_out``) passes the incoming feature
        names as an argument.
        """
        return self.selected_features

In [None]:
# Continuous features: standardise first, then fill the gaps. StandardScaler
# disregards NaNs when fitting and leaves them in place when transforming,
# so the imputer works on already-scaled values.
continuous_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical features: fill the gaps with the mode, then one-hot encode.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('codification', OneHotEncoder(sparse_output=False))
])

# Route each column to its sub-pipeline; any column not listed is dropped.
preprocessing_pipeline = ColumnTransformer([
        ('continuous', continuous_pipeline, ['avg_products']),
        ('categorical', categorical_pipeline, ['tenure_category'])
    ],
    remainder = 'drop'
)

# Same layout as Round 6, with the custom correlation-based selector in place
# of SelectKBest. The classifier set here is only a placeholder that the grid
# replaces.
pipeline = Pipeline([
    ('preprocess', preprocessing_pipeline),
    ('feature_selection', myFeatureSelector(n_features=3)),
    ('classifier', LogisticRegression(random_state=42))])

# The custom selector indexes columns by name, so steps must pass DataFrames.
pipeline.set_output(transform="pandas")

# One sub-grid per candidate classifier. Every stochastic model gets a fixed
# random_state so that re-running the search gives the same ranking.
my_param_grid = [
    {
     'feature_selection__n_features': [1,2,3],
     'classifier': [RandomForestClassifier(random_state=42)],
     'classifier__n_estimators': [10, 50, 100],
    },
    {
     'feature_selection__n_features': [1,2,3],
     'classifier': [LogisticRegression(random_state=42)],
     'classifier__C': [0.01, 0.1, 1.0, 10.0, 100.0]
    }
]

# 3-fold grid search scored on accuracy; refit=True retrains the best
# configuration on the whole training set and exposes it as best_estimator_.
cv = GridSearchCV(pipeline,
                  param_grid = my_param_grid,
                  scoring = 'accuracy', refit = True,
                  cv = 3,
                  return_train_score=True)

cv.fit(X_train,y_train)

# Show the three best configurations next to their train / validation scores.
cv_results = pd.DataFrame(cv.cv_results_)
display(cv_results[
        ['param_feature_selection__n_features','param_classifier','param_classifier__n_estimators',
         'param_classifier__C', 'mean_train_score', 'mean_test_score',
         'rank_test_score']].sort_values(by="rank_test_score").head(3))
print(cv.best_estimator_.get_params())


# WATCH OUT! pickle stores custom classes by reference: whoever loads this
# file must have `myFeatureSelector` defined or importable under the same
# name. The target directory is created first so the dump cannot fail on a
# fresh checkout.
model_dir = os.path.join('pkl','round_8')
os.makedirs(model_dir, exist_ok=True)
with open(os.path.join(model_dir,'pipeline.pkl'),'wb') as f:
    pickle.dump(cv.best_estimator_,f)

# Render the pipeline template (the fitted model is cv.best_estimator_).
pipeline