In [43]:
import os
import pickle
import numpy as np
import pandas as pd

In [44]:
# Load the toy dataset from disk.
data = pd.read_csv("../data/toy_example.csv")

# Blank out a handful of cells by position (three rows in column 0, two rows in
# column 1) so that the imputation steps further down have missing values to fill.
data.iloc[[10, 48, 61], 0] = np.nan
data.iloc[[22, 34], 1] = np.nan

# Bucket 'antiguedad' into four labelled bins and keep only the binned version.
antiguedad_bins = [0, 1, 3, 6, 100]
antiguedad_labels = ['first_year', 'junior', 'senior', 'master']
data['tipo_antiguedad'] = pd.cut(
    data['antiguedad'],
    bins=antiguedad_bins,
    labels=antiguedad_labels,
    include_lowest=True,
)
data = data.drop(columns=['antiguedad'])
data.head()

Unnamed: 0,n_medio_productos,impago,tipo_antiguedad
0,1.2,0,senior
1,1.4,0,senior
2,1.55,0,master
3,1.6,0,master
4,1.45,0,master


In [45]:
from sklearn.model_selection import train_test_split

# Target is the 'impago' column; the features are every other column.
y = data['impago'].copy()
X = data.drop(columns='impago').copy()

# Pin random_state so the train/test partition is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## ColumnTransformer

In [46]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [47]:
# Continuous branch: standardise, then mean-impute. StandardScaler disregards NaNs
# when fitting and keeps them on transform, so the imputer fills the remaining gaps
# with the mean of the already-scaled column.
continuous_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical branch: fill gaps with the most frequent category, then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising, so a CV fold (or new data at predict time) that
# contains a bin missing from the training fold cannot break the pipeline.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('codification', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column to its branch; any column not listed here is discarded.
preprocessing_pipeline = ColumnTransformer([
        ('continuous', continuous_pipeline, ['n_medio_productos']),
        ('categorical', categorical_pipeline, ['tipo_antiguedad'])
    ],
    remainder = 'drop'
)

# Full model: preprocessing -> univariate feature selection -> classifier.
# The classifier set here is only a placeholder; the grid below swaps it.
pipeline = Pipeline([
    ('preprocess', preprocessing_pipeline),
    ('feature_selection', SelectKBest(score_func=f_classif)),
    ('classifier', LogisticRegression(random_state=42))])

# One sub-grid per classifier family so that each one is only combined with
# the hyperparameters it actually accepts.
my_param_grid = [
    {
     'feature_selection__k': [1,2],
     'classifier': [KNeighborsClassifier()],
     'classifier__n_neighbors': [1, 3, 5],
    },
    {
     'feature_selection__k': [1,2],
     'classifier': [LogisticRegression(random_state=42)],
     'classifier__C': [0.01, 0.1, 1.0, 10.0, 100.0]
    }
]

# 3-fold grid search on accuracy; refit=True retrains the best configuration
# on the whole training set and exposes it as cv.best_estimator_.
cv = GridSearchCV(pipeline,
                  param_grid = my_param_grid,
                  scoring = 'accuracy', refit = True,
                  cv = 3,
                  return_train_score=True)

cv.fit(X_train,y_train)

# Show the three best-ranked configurations with their train/validation scores.
cv_results = pd.DataFrame(cv.cv_results_)
display(cv_results[
        ['param_feature_selection__k','param_classifier','param_classifier__n_neighbors',
         'param_classifier__C', 'mean_train_score', 'mean_test_score',
         'rank_test_score']].sort_values(by="rank_test_score").head(3))
print(cv.best_estimator_.get_params())


# WATCH OUT!
# open(..., 'wb') does not create missing folders, so make sure the target
# directory exists before writing; otherwise a fresh checkout fails with
# FileNotFoundError after the whole grid search has already run.
pipeline_dir = os.path.join('pkl','round_categorical')
os.makedirs(pipeline_dir, exist_ok=True)
with open(os.path.join(pipeline_dir,'pipeline.pkl'),'wb') as f:
    pickle.dump(cv.best_estimator_,f)

Unnamed: 0,param_feature_selection__k,param_classifier,param_classifier__n_neighbors,param_classifier__C,mean_train_score,mean_test_score,rank_test_score
1,2,KNeighborsClassifier(n_neighbors=1),1,,0.67098,0.670769,1
3,2,KNeighborsClassifier(n_neighbors=1),3,,0.67098,0.670769,1
5,2,KNeighborsClassifier(n_neighbors=1),5,,0.67098,0.670769,1


{'memory': None, 'steps': [('preprocess', ColumnTransformer(transformers=[('continuous',
                                 Pipeline(steps=[('scaler', StandardScaler()),
                                                 ('imputer', SimpleImputer())]),
                                 ['n_medio_productos']),
                                ('categorical',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('codification',
                                                  OneHotEncoder())]),
                                 ['tipo_antiguedad'])])), ('feature_selection', SelectKBest(k=2)), ('classifier', KNeighborsClassifier(n_neighbors=1))], 'verbose': False, 'preprocess': ColumnTransformer(transformers=[('continuous',
                                 Pipeline(steps=[('scaler', StandardScaler()),
                                              

### TEST

In [49]:
from sklearn.metrics import accuracy_score

# Reload the pipeline persisted by the grid-search cell. The pickle is produced by
# this notebook; never unpickle files coming from an untrusted source.
pipeline_path = os.path.join('pkl', 'round_categorical', 'pipeline.pkl')
with open(pipeline_path, 'rb') as pkl_file:
    pipe = pickle.load(pkl_file)

# Score the reloaded pipeline on the held-out split.
test_predictions = pipe.predict(X_test)
print("Accuracy in test: ", accuracy_score(y_test, test_predictions))

Accuracy in test:  0.7


## Great! But... what about feature importance?
- Considerar que la importancia de variables se refiere a lo que entra justo antes del clasificador. ¿Cómo sé qué variables son? ¿Y cómo las identifico si además han pasado por un one-hot encoder?

In [51]:
from sklearn import set_config

# Ask scikit-learn to render estimators as a diagram, then evaluate the
# reloaded pipeline as the cell's last expression so its structure is displayed.
set_config(display='diagram')
pipe

### ¿Cuántas y qué variables tengo exactamente antes del One Hot Encoder?

## Great! But... what if I want to use my own encoding algorithm?