In [16]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from xai_agg.agg_exp import *
from xai_agg.utils import *

# Disable pandas column limit
pd.set_option('display.max_columns', None)

In [19]:
raw = pd.read_csv("../data/PAKDD2010.tsv", sep="\t", encoding="unicode_escape", header=None)

raw.columns = [
    "ID_CLIENT", "CLERK_TYPE", "PAYMENT_DAY", "APPLICATION_SUBMISSION_TYPE", "QUANT_ADDITIONAL_CARDS", 
    "POSTAL_ADDRESS_TYPE", "SEX", "MARITAL_STATUS", "QUANT_DEPENDANTS", "EDUCATION_LEVEL", 
    "STATE_OF_BIRTH", "CITY_OF_BIRTH", "NACIONALITY", "RESIDENCIAL_STATE", "RESIDENCIAL_CITY", 
    "RESIDENCIAL_BOROUGH", "FLAG_RESIDENCIAL_PHONE", "RESIDENCIAL_PHONE_AREA_CODE", "RESIDENCE_TYPE", 
    "MONTHS_IN_RESIDENCE", "FLAG_MOBILE_PHONE", "FLAG_EMAIL", "PERSONAL_MONTHLY_INCOME", "OTHER_INCOMES", 
    "FLAG_VISA", "FLAG_MASTERCARD", "FLAG_DINERS", "FLAG_AMERICAN_EXPRESS", "FLAG_OTHER_CARDS", 
    "QUANT_BANKING_ACCOUNTS", "QUANT_SPECIAL_BANKING_ACCOUNTS", "PERSONAL_ASSETS_VALUE", "QUANT_CARS", 
    "COMPANY", "PROFESSIONAL_STATE", "PROFESSIONAL_CITY", "PROFESSIONAL_BOROUGH", "FLAG_PROFESSIONAL_PHONE", 
    "PROFESSIONAL_PHONE_AREA_CODE", "MONTHS_IN_THE_JOB", "PROFESSION_CODE", "OCCUPATION_TYPE", 
    "MATE_PROFESSION_CODE", "EDUCATION_LEVEL", "FLAG_HOME_ADDRESS_DOCUMENT", "FLAG_RG", "FLAG_CPF", 
    "FLAG_INCOME_PROOF", "PRODUCT", "FLAG_ACSP_RECORD", "AGE", "RESIDENCIAL_ZIP_3", "PROFESSIONAL_ZIP_3", 
    "TARGET_LABEL_BAD"
]

raw.drop(columns=["ID_CLIENT", "CLERK_TYPE", "QUANT_ADDITIONAL_CARDS"], inplace=True)

display(raw["MATE_PROFESSION_CODE"].value_counts())

display(raw.head())

MATE_PROFESSION_CODE
0.0     13632
11.0     6005
9.0      1035
16.0      208
2.0       110
12.0       34
10.0       31
6.0        16
13.0       15
7.0        11
8.0         7
15.0        3
17.0        3
14.0        2
5.0         1
1.0         1
3.0         1
4.0         1
Name: count, dtype: int64

Unnamed: 0,PAYMENT_DAY,APPLICATION_SUBMISSION_TYPE,POSTAL_ADDRESS_TYPE,SEX,MARITAL_STATUS,QUANT_DEPENDANTS,EDUCATION_LEVEL,STATE_OF_BIRTH,CITY_OF_BIRTH,NACIONALITY,RESIDENCIAL_STATE,RESIDENCIAL_CITY,RESIDENCIAL_BOROUGH,FLAG_RESIDENCIAL_PHONE,RESIDENCIAL_PHONE_AREA_CODE,RESIDENCE_TYPE,MONTHS_IN_RESIDENCE,FLAG_MOBILE_PHONE,FLAG_EMAIL,PERSONAL_MONTHLY_INCOME,OTHER_INCOMES,FLAG_VISA,FLAG_MASTERCARD,FLAG_DINERS,FLAG_AMERICAN_EXPRESS,FLAG_OTHER_CARDS,QUANT_BANKING_ACCOUNTS,QUANT_SPECIAL_BANKING_ACCOUNTS,PERSONAL_ASSETS_VALUE,QUANT_CARS,COMPANY,PROFESSIONAL_STATE,PROFESSIONAL_CITY,PROFESSIONAL_BOROUGH,FLAG_PROFESSIONAL_PHONE,PROFESSIONAL_PHONE_AREA_CODE,MONTHS_IN_THE_JOB,PROFESSION_CODE,OCCUPATION_TYPE,MATE_PROFESSION_CODE,EDUCATION_LEVEL.1,FLAG_HOME_ADDRESS_DOCUMENT,FLAG_RG,FLAG_CPF,FLAG_INCOME_PROOF,PRODUCT,FLAG_ACSP_RECORD,AGE,RESIDENCIAL_ZIP_3,PROFESSIONAL_ZIP_3,TARGET_LABEL_BAD
0,5,Web,1,F,6,1,0,RN,Assu,1,RN,Santana do Matos,Centro,Y,105.0,1.0,15.0,N,1,900.0,0.0,1,1,0,0,0,0,0,0.0,0,N,,,,N,,0,9.0,4.0,,,0,0,0,0,1,N,32,595,595,1
1,15,Carga,1,F,2,0,0,RJ,rio de janeiro,1,RJ,RIO DE JANEIRO,CAMPO GRANDE,Y,20.0,1.0,1.0,N,1,750.0,0.0,0,0,0,0,0,0,0,0.0,0,Y,,,,N,,0,11.0,4.0,11.0,,0,0,0,0,1,N,34,230,230,1
2,5,Web,1,F,2,0,0,RN,GARANHUNS,1,RN,Parnamirim,Boa Esperanca,Y,105.0,1.0,,N,1,500.0,0.0,0,0,0,0,0,0,0,0.0,0,N,,,,N,,0,11.0,,,,0,0,0,0,1,N,27,591,591,0
3,20,Web,1,F,2,0,0,PE,CABO,1,PE,CABO,PONTE DOS CARVALHOS,N,,,,N,1,500.0,0.0,0,0,0,0,0,0,0,0.0,0,N,,,,N,,0,,,,,0,0,0,0,1,N,61,545,545,0
4,10,Web,1,M,2,0,0,RJ,RIO DE JANEIRO,1,RJ,Rio de Janeiro,Santa Cruz,Y,20.0,1.0,12.0,N,1,1200.0,0.0,0,0,0,0,0,0,0,0.0,0,N,,,,N,,0,9.0,5.0,,,0,0,0,0,1,N,48,235,235,1


In [18]:
preprocessed_data = raw.copy()

preprocessed_data.drop(columns=["STATE_OF_BIRTH", "CITY_OF_BIRTH", "NACIONALITY", "RESIDENCIAL_STATE",
                                "RESIDENCIAL_CITY", "RESIDENTIAL_BOROUGH", "PROFESSIONAL_CITY", "PROFESSIONAL_BOROUGH",
                                "PROFESSIONAL_STATE", "PROFESSION_CODE", "OCCUPATION_TYPE", "MATE_PROFESSION_CODE", "FLAG_HOME_ADDRESS_DOCUMENT", "FLAG_RG", "FLAG_CPF", "FLAG_INCOME_PROOF", "FLAG_ACSP_RECORD", "RESIDENCIAL_ZIP_3", "PROFESSIONAL_ZIP_3"], inplace=True)

preprocessed_data["POSTAL_ADDRESS_TYPE"] = preprocessed_data["POSTAL_ADDRESS_TYPE"].map({1: "Home", 2: "Other"})

display(preprocessed_data["POSTAL_ADDRESS_TYPE"].value_counts())

categorical_fearures = ["APPLICATION_SUBMISSION_TYPE", "POSTAL_ADDRESS_TYPE", "SEX",
                        "MARITAL_STATUS", "EDUCATION_LEVEL", "STATE_OF_BIRTH", "CITY_OF_BIRTH",]

POSTAL_ADDRESS_TYPE
1    49673
2      327
Name: count, dtype: int64