In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df_clientes = pd.read_csv('../../dados/credit_score/df_clientes_variavel_target.csv')

# Train Test Split

In [3]:
SEED = 42

In [4]:
df_train, df_test = train_test_split(df_clientes, test_size=0.25, random_state=SEED)

In [5]:
print(df_train.shape, df_test.shape)

(21307, 16) (7103, 16)


In [6]:
df_train.to_csv('../../dados/credit_score/df_clientes_train.csv', index=False)
df_test.to_csv('../../dados/credit_score/df_clientes_test.csv', index=False)

In [10]:
df_train

Unnamed: 0,ID_Cliente,Tem_carro,Tem_casa_propria,Tem_telefone_trabalho,Tem_telefone_fixo,Tem_email,Idade,Anos_empregado,Tamanho_familia,Rendimento_anual,Categoria_de_renda,Grau_escolaridade,Estado_civil,Moradia,Ocupacao,Risco_de_credito
25545,5139901,1,0,0,0,0,40,16,3,166500.0,Servidor público,Ensino superior,Casado,Casa/apartamento próprio,Servidor público,0
23467,5125393,1,0,0,0,0,50,2,2,292500.0,Empregado,Ensino superior,Casado,Casa/apartamento próprio,Gerência,0
11611,5061611,1,1,0,1,0,31,2,3,225000.0,Empregado,Ensino médio,Casado,Casa/apartamento próprio,Motorista,0
21871,5115941,0,1,0,0,0,44,25,2,112500.0,Empregado,Ensino médio,Casado,Apartamento alugado,Vendas,0
1768,5021664,0,1,1,0,0,38,2,3,225000.0,Empregado,Ensino médio,Casado,Casa/apartamento próprio,Construção Civil,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,5115429,1,1,0,0,0,35,9,4,202500.0,Associado comercial,Ensino médio,Casado,Casa/apartamento próprio,Associado comercial,0
5390,5033714,0,1,1,1,0,38,8,3,135000.0,Empregado,Ensino médio,Casado,Casa/apartamento próprio,Segurança,0
860,5010126,0,1,1,1,1,35,6,4,270000.0,Empregado,Ensino médio,Casado,Casa/apartamento próprio,Gerência,0
15795,5089368,0,0,0,1,0,61,0,1,202500.0,Pensionista,Ensino médio,Solteiro,Casa/apartamento próprio,Pensionista,0


# Criação pipeline

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from imblearn.over_sampling import SMOTE

## Drop ID_Cliente

In [8]:
class DropFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, feature_to_drop='ID_Cliente') -> None:
        self.feature_to_drop = feature_to_drop

    def fit(self, df):
        return self
    
    def transform(self, df):
        if self.feature_to_drop in df.columns:
            drop_df = df.drop(columns=[self.feature_to_drop])
            return drop_df
        else:
            print(f'Variável {self.feature_to_drop} não encontrada no dataframe')
            return df

## MinMaxScaler

In [11]:
class MinMax(BaseEstimator, TransformerMixin):
    def __init__(self, min_max_scaler=['Idade', 'Anos_empregado', 'Tamanho_familia','Rendimento_anual']) -> None:
        self.min_max_scaler = min_max_scaler

    def fit(self, df):
        return self
    
    def transform(self, df):
        scaler = MinMaxScaler()
        df[self.min_max_scaler] = scaler.fit_transform(df[self.min_max_scaler])

        return df

## OneHotEncoding

In [12]:
class OneHotEncoding(BaseEstimator, TransformerMixin):
    def __init__(self, one_hot_encoder=['Estado_civil','Moradia','Ocupacao','Categoria_de_renda']):
        self.one_hot_encoder = one_hot_encoder

    def fit(self, df):
        return self
    
    def transform(self, df):
        if set(self.one_hot_encoder).issubset(df.columns):
            def one_hot_enc(df, one_hot_encoder):
                one_hot_enc = OneHotEncoder()
                one_hot_enc.fit(df[one_hot_encoder])
                feature_names = one_hot_enc.get_feature_names_out(one_hot_encoder)
                df = pd.DataFrame(one_hot_enc.transform(df[self.one_hot_encoder]).to_array(),
                                columns=feature_names, index=df.index)
            
                return df
            
            def concat_result(df, one_hot_enc_df, one_hot_encoder):
                other_features = [column for column in df.columns if column not in one_hot_encoder]
                df_concat = pd.concat([df[other_features], one_hot_enc_df], axis=1)

                return df_concat
            
            df_OneHotEncoding = one_hot_enc(df, self.one_hot_encoder)
            df_final = concat_result(df, df_OneHotEncoding, self.one_hot_encoder)

            return df_final
            
        else:
            print(f'Colunas {self.one_hot_encoder} não encontradas')
            return df


## OrdinalEncoding

In [13]:
class OrdinalFeature(BaseEstimator, TransformerMixin):
    def __init__(self, ordinal_feature=['Grau_escolaridade']):
        self.ordinal_feature = ordinal_feature

    def fit(self, df):
        return self
    
    def transform(self, df):
        ordinal_encoder = OrdinalEncoder()
        df[self.ordinal_feature] = ordinal_encoder.fit_transform(df[self.ordinal_feature])

        return df

## Oversample

In [None]:
class OverSample(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, df):
        return self
    
    def transform(self, df):
        oversampler = SMOTE(sampling_strategy='minority')
        x_bal, y_bal = oversampler.fit_resample(df.drop(columns=['Risco_de_credito']), df['Risco_de_credito'])
        df_bal = pd.concat([pd.DataFrame(x_bal), pd.DataFrame(y_bal)], axis=1)

        return df_bal

# Rodando pipeline

In [14]:
from sklearn.pipeline import Pipeline

In [None]:
def pipeline_ml(df):
    pipeline = Pipeline([
        ('feature_dropper', DropFeatures()),
        ('scaler', MinMax()),
        ('one_hot_encoder', OneHotEncoding()),
        ('ordinal_encoder', OrdinalFeature()),
        ('oversample', OverSample())
    ])

    df_pipeline = pipeline.fit_transform(df)
    return df_pipeline