# Pipeline preprocesamiento
Se lista una serie de pasos que se ejecutan de manera secuencial sobre las columnas. <br>
Para armar un pipeline con métodos propios, cada método debe implementarse como una clase que herede de las clases base de sklearn (`BaseEstimator` y `TransformerMixin`). <br>
Esto debería quedar arriba.

In [1]:
import pandas as pd

In [2]:
# Read the BankChurners CSV into a DataFrame. The bare file name is resolved
# against the current working directory of the kernel.
df_temp = pd.read_csv(filepath_or_buffer='BankChurners.csv')

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
class CategoricalMapper(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that recodes values through a fixed lookup table.

    It inherits from the scikit-learn base classes so that it can be used as
    a step inside a pipeline / column transformer.

    Parameters
    ----------
    mapper : dict or None, default=None
        Lookup table handed unchanged to ``x.replace`` in :meth:`transform`.
        When ``None`` there is nothing to recode and the input is returned
        as received.
    """

    def __init__(self, *, mapper=None):
        # Stored as given, without validation or copying.
        self.mapper = mapper

    def fit(self, x, y=None):
        """Learn nothing from ``x``/``y``; return ``self``."""
        return self

    def transform(self, x, y=None):
        """
        Return ``x`` with its values recoded according to ``self.mapper``.

        ``x`` must expose a ``replace`` method (presumably a pandas
        DataFrame/Series -- confirm against callers). ``y`` is ignored.
        """
        # The constructor default is mapper=None; calling x.replace(None)
        # raises, so treat "no mapper" as an explicit pass-through instead.
        if self.mapper is None:
            return x
        return x.replace(self.mapper)

In [4]:
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [5]:
# Select the rows where income, marital status and education level are all
# recorded as the literal string 'Unknown', then summarise them.
unknown_cols = ['Income_Category', 'Marital_Status', 'Education_Level']
all_unknown = df_temp[unknown_cols].eq('Unknown').all(axis=1)
df_unknowns = df_temp[all_unknown]

df_unknowns.describe()

Unnamed: 0,CLIENTNUM,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
count,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
mean,727931600.0,42.714286,2.285714,34.142857,4.571429,2.285714,2.285714,8257.285714,1354.285714,6903.0,0.753429,4507.142857,62.571429,0.763,0.245
std,24356300.0,12.134171,1.976047,7.668737,1.272418,0.95119,0.48795,5378.081248,1011.71401,5770.212879,0.249404,4196.845142,18.301704,0.394679,0.27054
min,709375200.0,26.0,0.0,18.0,3.0,1.0,2.0,1801.0,0.0,1129.0,0.499,1296.0,31.0,0.348,0.0
25%,713073600.0,35.5,0.5,35.0,3.5,1.5,2.0,3920.5,659.5,1857.5,0.5865,2689.0,55.0,0.457,0.053
50%,715007600.0,43.0,3.0,36.0,5.0,3.0,2.0,7270.0,1528.0,5435.0,0.731,3337.0,69.0,0.769,0.12
75%,737644800.0,49.0,3.5,36.0,5.5,3.0,2.5,12620.5,2058.0,11197.0,0.8185,3877.5,70.0,0.915,0.41
max,769701300.0,61.0,5.0,43.0,6.0,3.0,3.0,15648.0,2517.0,15648.0,1.234,13784.0,88.0,1.48,0.669


In [6]:
# Remove the rows captured in df_unknowns.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps this cell
# re-runnable: the original raised KeyError on a second run because the
# index labels had already been dropped.
df_temp = df_temp.drop(index=df_unknowns.index, errors='ignore')

In [7]:
import numpy as np

In [8]:
# Lookup tables from raw category labels to numeric codes.
# Each label's code is its position in the list, so the list order *is* the
# encoding. 'Unknown' (where present) maps to NaN instead of a code.

attrition_mapper = {
    label: code
    for code, label in enumerate(['Existing Customer', 'Attrited Customer'])
}

gender_mapper = {
    label: code
    for code, label in enumerate(['F', 'M'])
}

card_mapper = {
    label: code
    for code, label in enumerate(['Blue', 'Silver', 'Gold', 'Platinum'])
}

income_mapper = {
    **{
        label: code
        for code, label in enumerate(['Less than $40K',
                                      '$40K - $60K',
                                      '$60K - $80K',
                                      '$80K - $120K',
                                      '$120K +'])
    },
    'Unknown': np.nan,
}

education_mapper = {
    **{
        label: code
        for code, label in enumerate(['Uneducated',
                                      'High School',
                                      'College',
                                      'Graduate',
                                      'Post-Graduate',
                                      'Doctorate'])
    },
    'Unknown': np.nan,
}

marital_mapper = {
    **{
        label: code
        for code, label in enumerate(['Single', 'Married', 'Divorced'])
    },
    'Unknown': np.nan,
}

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns

¿Por qué no poner el tratamiento de outliers dentro de un pipeline? <br>
_Removing samples does not (yet?) comply with the scikit-learn transformer API. So if you need to do this, you should do it outside any calls to scikit-learn, as preprocessing. <br>
As it is now, the transformer API is used to transform the features of a given sample into something new. This can implicitly contain information from other samples, but samples are never deleted. <br>
Another option is to attempt to impute the missing values. But again, if you need to delete samples, treat it as preprocessing before using scikit learn._

In [10]:
def treat_outiers(dataframe, col, **kwargs):
    """
    Treat upper-tail outliers of one column using the interquartile range.

    The upper fence is ``Q3 + iqr_factor * (Q3 - Q1)``, computed on
    ``dataframe[col]``. Only the upper fence is applied; low values are
    never treated.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data. It is not modified.
    col : str
        Name of the column whose quartiles define the fence.
    **kwargs
        column_action : str, default 'remove'
            With ``'remove'``, keep only rows whose value in ``col`` is
            strictly below the fence (rows equal to the fence, and rows where
            the comparison is False such as NaN, are dropped). Any other
            value leaves the data untouched.
        iqr_factor : float, default 1.5
            Multiplier applied to the IQR when building the fence.
        Any other keyword is ignored.

    Returns
    -------
    pandas.DataFrame
        The filtered rows for ``'remove'``; otherwise ``dataframe`` itself.
    """
    # Get keyword arguments
    column_action = kwargs.pop('column_action', 'remove')
    iqr_factor = kwargs.pop('iqr_factor', 1.5)

    q1 = dataframe[col].quantile(0.25)
    q3 = dataframe[col].quantile(0.75)
    iqr = q3 - q1
    outlier_threshold = q3 + (iqr * iqr_factor)
    if column_action == 'remove':
        # Boolean-mask selection builds a new frame; the caller's object is
        # left as it was.
        dataframe = dataframe[dataframe[col] < outlier_threshold]

    return dataframe

In [16]:
# Filter df_temp once per listed column; each pass works on the result of
# the previous one, so the second column's quartiles are computed after the
# first column's rows have been removed.
cols = ['Credit_Limit', 'Avg_Open_To_Buy']
for col_name in cols:
    df_temp = treat_outiers(df_temp, col_name)

In [12]:
from sklearn.pipeline import make_pipeline
# Columns stored as float64; these are the ones handed to StandardScaler.
float_columns = df_temp.select_dtypes(['float64']).columns


def _map_then_impute(mapper):
    """Build a pipeline that recodes with `mapper`, then fills NaN with the most frequent value."""
    return make_pipeline(
        CategoricalMapper(mapper=mapper),
        SimpleImputer(missing_values=np.nan, strategy='most_frequent'),
    )


# The order of the entries below fixes the order of the output columns;
# columns not named in any entry are passed through unchanged at the end.
preprocessor = make_column_transformer(
    ('drop', 'CLIENTNUM'),
    (CategoricalMapper(mapper=attrition_mapper), ['Attrition_Flag']),
    (CategoricalMapper(mapper=gender_mapper), ['Gender']),
    (_map_then_impute(marital_mapper), ['Marital_Status']),
    (_map_then_impute(education_mapper), ['Education_Level']),
    (_map_then_impute(income_mapper), ['Income_Category']),
    (CategoricalMapper(mapper=card_mapper), ['Card_Category']),
    (StandardScaler(), float_columns),
    remainder='passthrough',
)

In [13]:
# Fit the column transformer on df_temp and transform it in the same call,
# then wrap the result in a DataFrame (columns get default integer labels).
preprocessed_values = preprocessor.fit_transform(df_temp)
df_preprocessed = pd.DataFrame(preprocessed_values)
df_preprocessed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,1.0,1.0,1.0,3.0,0.0,1.842316,1.934200,2.627902,3.822275,-0.909964,45.0,3.0,39.0,5.0,1.0,3.0,777.0,1144.0,42.0
1,0.0,0.0,0.0,3.0,1.0,0.0,0.734509,0.807437,3.568206,12.573300,-0.752494,49.0,5.0,44.0,6.0,1.0,2.0,864.0,1291.0,33.0
2,0.0,1.0,1.0,3.0,4.0,0.0,-0.473962,-0.182780,8.374710,6.788156,-1.128274,51.0,3.0,36.0,4.0,1.0,0.0,0.0,1887.0,20.0
3,0.0,0.0,1.0,1.0,1.0,0.0,-0.500190,-0.836113,2.947423,6.788156,1.591659,40.0,4.0,34.0,3.0,4.0,1.0,2517.0,1171.0,20.0
4,0.0,1.0,1.0,0.0,3.0,0.0,-0.149738,0.140648,6.462151,7.487735,-1.128274,40.0,3.0,21.0,5.0,1.0,0.0,0.0,816.0,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8615,0.0,1.0,0.0,3.0,2.0,0.0,-0.327837,-0.498233,-0.256914,0.605048,0.525159,50.0,2.0,40.0,3.0,2.0,3.0,1851.0,15476.0,117.0
8616,1.0,1.0,2.0,3.0,2.0,0.0,-0.259395,-0.513433,0.204109,-0.123855,0.700523,41.0,2.0,25.0,4.0,2.0,3.0,2186.0,8764.0,69.0
8617,1.0,0.0,1.0,1.0,1.0,0.0,0.023364,0.313325,0.272578,0.441673,-1.128274,44.0,1.0,36.0,5.0,3.0,4.0,0.0,10291.0,60.0
8618,1.0,1.0,1.0,3.0,2.0,0.0,-0.008608,0.281431,-1.023764,0.039520,-1.128274,30.0,2.0,36.0,4.0,3.0,3.0,0.0,8395.0,62.0


In [14]:
# Restore column names by hand. The three groups are listed in the order in
# which they must appear in df_preprocessed.
# NOTE(review): this order is positional -- re-check it whenever the
# preprocessing steps or the input columns change.
mapped_column_names = [
    'Attrition_Flag',
    'Gender',
    'Marital_Status',
    'Education_Level',
    'Income_Category',
    'Card_Category',
]
float_column_names = [
    'Credit_Limit',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]
remaining_column_names = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Total_Revolving_Bal',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
]
df_preprocessed.columns = (
    mapped_column_names + float_column_names + remaining_column_names
)

In [15]:
# Discard the columns that are not kept for the next stage.
# Rebinding (instead of inplace=True) plus errors='ignore' keeps this cell
# re-runnable: the original raised KeyError on a second run because the
# columns had already been removed. Trade-off: a misspelled name in this
# list is silently skipped, so keep the spelling in sync with the frame.
df_preprocessed = df_preprocessed.drop(
    columns=['Dependent_count',
             'Education_Level',
             'Marital_Status',
             'Months_on_book',
             'Months_Inactive_12_mon'],
    errors='ignore',
)