In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df_clientes = pd.read_csv('../../dados/credit_score/df_clientes_variavel_target.csv')

# Train Test Split

In [3]:
SEED = 42

In [4]:
df_train, df_test = train_test_split(df_clientes, test_size=0.25, random_state=SEED)

In [5]:
print(df_train.shape, df_test.shape)

(21307, 16) (7103, 16)


In [6]:
df_train.to_csv('../../dados/credit_score/df_clientes_train.csv', index=False)
df_test.to_csv('../../dados/credit_score/df_clientes_test.csv', index=False)

In [7]:
df_train

Unnamed: 0,ID_Cliente,Tem_carro,Tem_casa_propria,Tem_telefone_trabalho,Tem_telefone_fixo,Tem_email,Idade,Anos_empregado,Tamanho_familia,Rendimento_anual,Categoria_de_renda,Grau_escolaridade,Estado_civil,Moradia,Ocupacao,Risco_de_credito
25545,5139901,1,0,0,0,0,40,16,3,166500.0,Servidor público,Ensino superior,Casado,Casa/apartamento próprio,Servidor público,0
23467,5125393,1,0,0,0,0,50,2,2,292500.0,Empregado,Ensino superior,Casado,Casa/apartamento próprio,Gerência,0
11611,5061611,1,1,0,1,0,31,2,3,225000.0,Empregado,Ensino médio,Casado,Casa/apartamento próprio,Motorista,0
21871,5115941,0,1,0,0,0,44,25,2,112500.0,Empregado,Ensino médio,Casado,Apartamento alugado,Vendas,0
1768,5021664,0,1,1,0,0,38,2,3,225000.0,Empregado,Ensino médio,Casado,Casa/apartamento próprio,Construção Civil,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,5115429,1,1,0,0,0,35,9,4,202500.0,Associado comercial,Ensino médio,Casado,Casa/apartamento próprio,Associado comercial,0
5390,5033714,0,1,1,1,0,38,8,3,135000.0,Empregado,Ensino médio,Casado,Casa/apartamento próprio,Segurança,0
860,5010126,0,1,1,1,1,35,6,4,270000.0,Empregado,Ensino médio,Casado,Casa/apartamento próprio,Gerência,0
15795,5089368,0,0,0,1,0,61,0,1,202500.0,Pensionista,Ensino médio,Solteiro,Casa/apartamento próprio,Pensionista,0


# Criação pipeline

In [42]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from imblearn.over_sampling import SMOTE

## Drop ID_Cliente

In [43]:
class DropFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, feature_to_drop='ID_Cliente') -> None:
        self.feature_to_drop = feature_to_drop

    def fit(self, df):
        return self
    
    def transform(self, df):
        if self.feature_to_drop in df.columns:
            drop_df = df.drop(columns=[self.feature_to_drop])
            return drop_df
        else:
            print(f'Variável {self.feature_to_drop} não encontrada no dataframe')
            return df

## MinMaxScaler

In [44]:
class MinMax(BaseEstimator, TransformerMixin):
    def __init__(self, min_max_scaler=['Idade', 'Anos_empregado', 'Tamanho_familia','Rendimento_anual']):
        self.min_max_scaler = min_max_scaler

    def fit(self, df):
        return self
    
    def transform(self, df):
        if set(self.min_max_scaler).issubset(df.columns):
            scaler = MinMaxScaler()
            df[self.min_max_scaler] = scaler.fit_transform(df[self.min_max_scaler])

            return df
        else:
            print(f'Colunas {[col for col in self.min_max_scaler if col not in df.columns]} não encontradas')
            return df

## OneHotEncoding

In [66]:
class OneHotEncoding(BaseEstimator, TransformerMixin):
    def __init__(self, one_hot_encoder=['Categoria_de_renda','Estado_civil','Moradia','Ocupacao']):
        self.one_hot_encoder = one_hot_encoder

    def fit(self, df):
        return self
    
    def transform(self, df):
        if set(self.one_hot_encoder).issubset(df.columns):
            def one_hot_enc(df, one_hot_encoder):
                one_hot_enc = OneHotEncoder()
                one_hot_enc.fit(df[one_hot_encoder])
                feature_names = one_hot_enc.get_feature_names_out(one_hot_encoder)
                df = pd.DataFrame(one_hot_enc.transform(df[self.one_hot_encoder]).toarray(),
                                columns=feature_names, index=df.index)
                df[feature_names] = df[feature_names].astype(int)
            
                return df
            
            def concat_result(df, one_hot_enc_df, one_hot_encoder):
                other_features = [column for column in df.columns if column not in one_hot_encoder]
                df_concat = pd.concat([df[other_features], one_hot_enc_df], axis=1)

                return df_concat
            
            df_OneHotEncoding = one_hot_enc(df, self.one_hot_encoder)
            df_final = concat_result(df, df_OneHotEncoding, self.one_hot_encoder)

            return df_final
            
        else:
            print(f'Colunas {[col for col in self.one_hot_encoder if col not in df.columns]} não encontradas')
            return df


## OrdinalEncoding

In [75]:
class OrdinalFeature(BaseEstimator, TransformerMixin):
    def __init__(self, ordinal_feature=['Grau_escolaridade']):
        self.ordinal_feature = ordinal_feature

    def fit(self, df):
        return self
    
    def transform(self, df):
        if self.ordinal_feature[0] in df.columns:
            ordinal_encoder = OrdinalEncoder(dtype=int)
            df[self.ordinal_feature] = ordinal_encoder.fit_transform(df[self.ordinal_feature])

            return df
        else:
            print(f'Variável {self.ordinal_feature} não encontrada no dataframe!')
            return df

## Oversample

In [61]:
class OverSample(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, df):
        return self
    
    def transform(self, df):
        oversampler = SMOTE(sampling_strategy='minority')
        x_bal, y_bal = oversampler.fit_resample(df.drop(columns=['Risco_de_credito']), df['Risco_de_credito'])
        df_bal = pd.concat([pd.DataFrame(x_bal), pd.DataFrame(y_bal)], axis=1)

        return df_bal

# Rodando pipeline

In [62]:
from sklearn.pipeline import Pipeline

In [63]:
def pipeline_ml(df):
    pipeline = Pipeline([
        ('feature_dropper', DropFeatures()),
        ('scaler', MinMax()),
        ('one_hot_encoder', OneHotEncoding()),
        ('ordinal_encoder', OrdinalFeature()),
        ('oversample', OverSample())
    ])

    df_pipeline = pipeline.fit_transform(df)
    return df_pipeline

In [83]:
df_train_clean = pipeline_ml(df_train)

In [84]:
df_test_clean = pipeline_ml(df_test)

In [86]:
print(df_test.shape)
print(df_test_clean.shape)

(7103, 16)
(13924, 50)


# Separando X e Y de treino

In [91]:
x_train, y_train = df_train_clean.iloc[:, :-1], df_train_clean.iloc[:, -1]
x_test, y_test = df_test_clean.iloc[:, :-1], df_test_clean.iloc[:, -1]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(41756, 49) (41756,)
(13924, 49) (13924,)


# Importando pacotes para validação do modelo

In [106]:
from sklearn import metrics
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_auc_score, RocCurveDisplay
from scipy import stats
import warnings

warnings.filterwarnings('ignore')

In [110]:
def run_model(modelo, x_train, y_train, x_test, y_test):
    modelo.fit(x_train, y_train)

    try:
        prob_predict = modelo.predict_proba(x_test)
    except:
        # Modelo não aceita resultado em probabilidade
        prob_predict = '-'

    data_bom = np.sort(prob_predict)[:, 0]
    data_mau = np.sort(prob_predict)[:, 1]
    kstest = stats.ks_2samp(data_bom, data_mau)


    print(f'\n-------Resultados--------')
    
    return prob_predict

In [96]:
from sklearn.linear_model import LogisticRegression

model_logistic = LogisticRegression()

In [107]:
predict, prob = run_model(model_logistic, x_train, y_train, x_test, y_test)


-------Resultados--------


In [109]:
np.sort(prob)[:,0]

array([0.48845603, 0.31871654, 0.4186163 , ..., 0.29108715, 0.25252377,
       0.42575859])