# Exploração Inicial e Processamento de Dados

In [67]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler


In [68]:
df_train = pd.read_csv('../data/raw/train.csv', low_memory=False)
df_train.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


## Pré-processamento

In [69]:
colunas_drop = ['ID', 'Customer_ID', 'Month', 'Name', 'SSN', 'Monthly_Inhand_Salary', 'Delay_from_due_date', 'Delay_from_due_date', 'Total_EMI_per_month', 'Changed_Credit_Limit', 
                'Num_Credit_Inquiries', 'Credit_History_Age',  'Payment_Behaviour', 'Payment_of_Min_Amount']
df_train_transformed = df_train.astype(str).apply(lambda x: x.str.lower()).copy()
df_train_transformed.drop(colunas_drop, axis=1, inplace=True)
df_train_transformed.head()

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Num_of_Delayed_Payment,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Amount_invested_monthly,Monthly_Balance,Credit_Score
0,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",7.0,_,809.98,26.822619623699016,80.41529543900253,312.49408867943663,good
1,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",,good,809.98,31.94496005538421,118.28022162236736,284.62916249607184,good
2,-500,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",7.0,good,809.98,28.60935202206993,81.699521264648,331.2098628537912,good
3,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",4.0,good,809.98,31.37786186958235,199.4580743910713,223.45130972736783,good
4,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",,good,809.98,24.797346908844982,41.420153086217326,341.48923103222177,good


### Age

In [70]:
df_train_transformed['Age'] = pd.to_numeric(df_train_transformed['Age'], errors='coerce')
frequencia_age = df_train_transformed['Age'].value_counts().sort_index()
frequencia_age

Age
-500.0      886
 14.0      1129
 15.0      1488
 16.0      1378
 17.0      1438
           ... 
 8674.0       1
 8678.0       1
 8682.0       1
 8697.0       1
 8698.0       1
Name: count, Length: 1661, dtype: int64

In [71]:
df_train_features = df_train_transformed[(df_train_transformed['Age'] > 0) 
                                    & (df_train_transformed['Age'] < 110)].copy()
df_train_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 92372 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       92372 non-null  float64
 1   Occupation                92372 non-null  object 
 2   Annual_Income             92372 non-null  object 
 3   Num_Bank_Accounts         92372 non-null  object 
 4   Num_Credit_Card           92372 non-null  object 
 5   Interest_Rate             92372 non-null  object 
 6   Num_of_Loan               92372 non-null  object 
 7   Type_of_Loan              92372 non-null  object 
 8   Num_of_Delayed_Payment    92372 non-null  object 
 9   Credit_Mix                92372 non-null  object 
 10  Outstanding_Debt          92372 non-null  object 
 11  Credit_Utilization_Ratio  92372 non-null  object 
 12  Amount_invested_monthly   92372 non-null  object 
 13  Monthly_Balance           92372 non-null  object 
 14  Credit_Scor

### Occupation

In [72]:
df_train_features['Occupation'] = df_train_features['Occupation'].replace('_______', 'sem_informacao')
df_train_features.head()

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Num_of_Delayed_Payment,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Amount_invested_monthly,Monthly_Balance,Credit_Score
0,23.0,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",7.0,_,809.98,26.822619623699016,80.41529543900253,312.49408867943663,good
1,23.0,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",,good,809.98,31.94496005538421,118.28022162236736,284.62916249607184,good
3,23.0,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",4.0,good,809.98,31.37786186958235,199.4580743910713,223.45130972736783,good
4,23.0,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",,good,809.98,24.797346908844982,41.420153086217326,341.48923103222177,good
5,23.0,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",4.0,good,809.98,27.26225871052017,62.430172331195294,340.4792117872438,good


### Valores numéricos

In [73]:
df_train_features['Age']                      = pd.to_numeric(df_train_features['Age'], errors='coerce').astype(int)
df_train_features['Annual_Income']            = pd.to_numeric(df_train_features['Annual_Income'], errors='coerce').fillna(0)
df_train_features['Num_Bank_Accounts']        = pd.to_numeric(df_train_features['Num_Bank_Accounts'], errors='coerce').fillna(0).astype(int)
df_train_features['Num_Credit_Card']          = pd.to_numeric(df_train_features['Num_Credit_Card'], errors='coerce').fillna(0).astype(int)
df_train_features['Interest_Rate']            = pd.to_numeric(df_train_features['Interest_Rate'], errors='coerce').fillna(0).astype(int)
df_train_features['Num_of_Loan']              = pd.to_numeric(df_train_features['Num_of_Loan'], errors='coerce').fillna(0).astype(int)
df_train_features['Num_of_Delayed_Payment']   = pd.to_numeric(df_train_features['Num_of_Delayed_Payment'], errors='coerce').fillna(0).astype(int)
df_train_features['Outstanding_Debt']         = pd.to_numeric(df_train_features['Num_of_Delayed_Payment'], errors='coerce').fillna(0).astype(int)
df_train_features['Credit_Utilization_Ratio'] = pd.to_numeric(df_train_features['Credit_Utilization_Ratio'], errors='coerce').fillna(0)
df_train_features['Amount_invested_monthly']  = pd.to_numeric(df_train_features['Amount_invested_monthly'], errors='coerce').fillna(0)
df_train_features['Monthly_Balance']          = pd.to_numeric(df_train_features['Monthly_Balance'], errors='coerce').fillna(0)


#drops
df_train_features.head()

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Num_of_Delayed_Payment,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Amount_invested_monthly,Monthly_Balance,Credit_Score
0,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",7,_,7,26.82262,80.415295,312.494089,good
1,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",0,good,0,31.94496,118.280222,284.629162,good
3,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",4,good,4,31.377862,199.458074,223.45131,good
4,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",0,good,0,24.797347,41.420153,341.489231,good
5,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",4,good,4,27.262259,62.430172,340.479212,good


In [74]:
df_train_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 92372 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       92372 non-null  int64  
 1   Occupation                92372 non-null  object 
 2   Annual_Income             92372 non-null  float64
 3   Num_Bank_Accounts         92372 non-null  int64  
 4   Num_Credit_Card           92372 non-null  int64  
 5   Interest_Rate             92372 non-null  int64  
 6   Num_of_Loan               92372 non-null  int64  
 7   Type_of_Loan              92372 non-null  object 
 8   Num_of_Delayed_Payment    92372 non-null  int64  
 9   Credit_Mix                92372 non-null  object 
 10  Outstanding_Debt          92372 non-null  int64  
 11  Credit_Utilization_Ratio  92372 non-null  float64
 12  Amount_invested_monthly   92372 non-null  float64
 13  Monthly_Balance           92372 non-null  float64
 14  Credit_Scor

### Type_of_Loan

In [75]:
frequencia_Type_of_Loan = df_train_transformed['Type_of_Loan'].value_counts().sort_index()
frequencia_Type_of_Loan

df_train_features['Type_of_Loan_clean'] = (
    df_train_features['Type_of_Loan']
    .str.replace('and', '', regex=False)
    .str.lower()
    .str.replace(r' ', '', regex=True)
    .str.strip()
)

df_train_features['loan_list'] = df_train_features['Type_of_Loan_clean'].str.split(',')

df_loans_exploded = df_train_features.explode('loan_list')
df_loans_exploded['loan_list'].value_counts()




loan_list
paydayloan               37459
credit-builderloan       37334
notspecified             36571
homeequityloan           36165
studentloan              36042
mortgageloan             35993
personalloan             35942
debtconsolidationloan    35807
autoloan                 35088
nan                      10504
Name: count, dtype: int64

In [76]:
mlb = MultiLabelBinarizer()
loan_dummies = pd.DataFrame(
    mlb.fit_transform(df_train_features['loan_list']),
    columns=mlb.classes_,
    index=df_train_features.index
)

# Adicionar prefixo 'TypeLoan_' nas colunas
loan_dummies = loan_dummies.add_prefix('TypeLoan_')

# Juntar no DataFrame original
df_train_features = pd.concat([df_train_features, loan_dummies], axis=1)

In [77]:
df_train_features.head()

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Num_of_Delayed_Payment,Credit_Mix,...,TypeLoan_autoloan,TypeLoan_credit-builderloan,TypeLoan_debtconsolidationloan,TypeLoan_homeequityloan,TypeLoan_mortgageloan,TypeLoan_nan,TypeLoan_notspecified,TypeLoan_paydayloan,TypeLoan_personalloan,TypeLoan_studentloan
0,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",7,_,...,1,1,0,1,0,0,0,0,1,0
1,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",0,good,...,1,1,0,1,0,0,0,0,1,0
3,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",4,good,...,1,1,0,1,0,0,0,0,1,0
4,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",0,good,...,1,1,0,1,0,0,0,0,1,0
5,23,scientist,19114.12,3,4,3,4,"auto loan, credit-builder loan, personal loan,...",4,good,...,1,1,0,1,0,0,0,0,1,0


In [78]:
colunas_drop = ['TypeLoan_nan', 'TypeLoan_notspecified', 'loan_list', 'Type_of_Loan_clean', 'Type_of_Loan']
df_train_features.drop(colunas_drop, axis=1, inplace=True)
df_train_features.head()

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Num_of_Delayed_Payment,Credit_Mix,Outstanding_Debt,...,Monthly_Balance,Credit_Score,TypeLoan_autoloan,TypeLoan_credit-builderloan,TypeLoan_debtconsolidationloan,TypeLoan_homeequityloan,TypeLoan_mortgageloan,TypeLoan_paydayloan,TypeLoan_personalloan,TypeLoan_studentloan
0,23,scientist,19114.12,3,4,3,4,7,_,7,...,312.494089,good,1,1,0,1,0,0,1,0
1,23,scientist,19114.12,3,4,3,4,0,good,0,...,284.629162,good,1,1,0,1,0,0,1,0
3,23,scientist,19114.12,3,4,3,4,4,good,4,...,223.45131,good,1,1,0,1,0,0,1,0
4,23,scientist,19114.12,3,4,3,4,0,good,0,...,341.489231,good,1,1,0,1,0,0,1,0
5,23,scientist,19114.12,3,4,3,4,4,good,4,...,340.479212,good,1,1,0,1,0,0,1,0


### Credit_Mix

In [79]:
frequencia_cm = df_train_transformed['Credit_Mix'].value_counts().sort_index()
frequencia_cm

Credit_Mix
_           20195
bad         18989
good        24337
standard    36479
Name: count, dtype: int64

In [80]:
df_train_features['Credit_Mix'] = df_train_features['Credit_Mix'].replace('_', 'sem_informacao')
df_train_features.head()

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Num_of_Delayed_Payment,Credit_Mix,Outstanding_Debt,...,Monthly_Balance,Credit_Score,TypeLoan_autoloan,TypeLoan_credit-builderloan,TypeLoan_debtconsolidationloan,TypeLoan_homeequityloan,TypeLoan_mortgageloan,TypeLoan_paydayloan,TypeLoan_personalloan,TypeLoan_studentloan
0,23,scientist,19114.12,3,4,3,4,7,sem_informacao,7,...,312.494089,good,1,1,0,1,0,0,1,0
1,23,scientist,19114.12,3,4,3,4,0,good,0,...,284.629162,good,1,1,0,1,0,0,1,0
3,23,scientist,19114.12,3,4,3,4,4,good,4,...,223.45131,good,1,1,0,1,0,0,1,0
4,23,scientist,19114.12,3,4,3,4,0,good,0,...,341.489231,good,1,1,0,1,0,0,1,0
5,23,scientist,19114.12,3,4,3,4,4,good,4,...,340.479212,good,1,1,0,1,0,0,1,0


### Dummies

In [81]:
col_dummies = ['Occupation','Credit_Mix']
df_train_features = pd.get_dummies(df_train_features, columns=col_dummies, drop_first=False)
for col in df_train_features.columns:
    if df_train_features[col].dtype == 'bool':
        df_train_features[col] = df_train_features[col].astype(int)
        
df_train_features.head()

Unnamed: 0,Age,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Num_of_Delayed_Payment,Outstanding_Debt,Credit_Utilization_Ratio,Amount_invested_monthly,...,Occupation_media_manager,Occupation_musician,Occupation_scientist,Occupation_sem_informacao,Occupation_teacher,Occupation_writer,Credit_Mix_bad,Credit_Mix_good,Credit_Mix_sem_informacao,Credit_Mix_standard
0,23,19114.12,3,4,3,4,7,7,26.82262,80.415295,...,0,0,1,0,0,0,0,0,1,0
1,23,19114.12,3,4,3,4,0,0,31.94496,118.280222,...,0,0,1,0,0,0,0,1,0,0
3,23,19114.12,3,4,3,4,4,4,31.377862,199.458074,...,0,0,1,0,0,0,0,1,0,0
4,23,19114.12,3,4,3,4,0,0,24.797347,41.420153,...,0,0,1,0,0,0,0,1,0,0
5,23,19114.12,3,4,3,4,4,4,27.262259,62.430172,...,0,0,1,0,0,0,0,1,0,0


### Variavel-alvo

In [82]:
df_train_features['Credit_Score'].unique()

array(['good', 'standard', 'poor'], dtype=object)

In [83]:
mapeamento = {
    'good': 0,
    'standard': 1,
    'poor': 2
}

df_train_features['target_numeric'] = df_train_features['Credit_Score'].map(mapeamento).astype(int)
df_train_features.drop('Credit_Score', axis=1, inplace=True)
df_train_features.head()

Unnamed: 0,Age,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Num_of_Delayed_Payment,Outstanding_Debt,Credit_Utilization_Ratio,Amount_invested_monthly,...,Occupation_musician,Occupation_scientist,Occupation_sem_informacao,Occupation_teacher,Occupation_writer,Credit_Mix_bad,Credit_Mix_good,Credit_Mix_sem_informacao,Credit_Mix_standard,target_numeric
0,23,19114.12,3,4,3,4,7,7,26.82262,80.415295,...,0,1,0,0,0,0,0,1,0,0
1,23,19114.12,3,4,3,4,0,0,31.94496,118.280222,...,0,1,0,0,0,0,1,0,0,0
3,23,19114.12,3,4,3,4,4,4,31.377862,199.458074,...,0,1,0,0,0,0,1,0,0,0
4,23,19114.12,3,4,3,4,0,0,24.797347,41.420153,...,0,1,0,0,0,0,1,0,0,0
5,23,19114.12,3,4,3,4,4,4,27.262259,62.430172,...,0,1,0,0,0,0,1,0,0,0


# Pipeline Train

In [84]:
df_train = pd.read_csv('../data/raw/train.csv', low_memory=False)
colunas_drop = ['ID', 'Customer_ID', 'Month', 'Name', 'SSN', 'Monthly_Inhand_Salary', 'Delay_from_due_date', 'Total_EMI_per_month', 'Changed_Credit_Limit', 
                'Num_Credit_Inquiries', 'Credit_History_Age',  'Payment_Behaviour', 'Payment_of_Min_Amount']
#Create a transformed DataFrame with lowercased strings
df_train_transformed = df_train.astype(str).apply(lambda x: x.str.lower()).copy()
df_train_transformed.drop(colunas_drop, axis=1, inplace=True)
#Create a DataFrame with valid ages
df_train_transformed['Age'] = pd.to_numeric(df_train_transformed['Age'], errors='coerce')
df_train_features = df_train_transformed[(df_train_transformed['Age'] > 0) & (df_train_transformed['Age'] < 110)].copy()
df_train_features['Occupation'] = df_train_features['Occupation'].replace('_______', 'sem_informacao')
df_train_features['Age']                      = pd.to_numeric(df_train_features['Age'], errors='coerce').astype(int)
df_train_features['Annual_Income']            = pd.to_numeric(df_train_features['Annual_Income'], errors='coerce').fillna(0)
df_train_features['Num_Bank_Accounts']        = pd.to_numeric(df_train_features['Num_Bank_Accounts'], errors='coerce').fillna(0).astype(int)
df_train_features['Num_Credit_Card']          = pd.to_numeric(df_train_features['Num_Credit_Card'], errors='coerce').fillna(0).astype(int)
df_train_features['Interest_Rate']            = pd.to_numeric(df_train_features['Interest_Rate'], errors='coerce').fillna(0).astype(int)
df_train_features['Num_of_Loan']              = pd.to_numeric(df_train_features['Num_of_Loan'], errors='coerce').fillna(0).astype(int)
df_train_features['Num_of_Delayed_Payment']   = pd.to_numeric(df_train_features['Num_of_Delayed_Payment'], errors='coerce').fillna(0).astype(int)
df_train_features['Outstanding_Debt']         = pd.to_numeric(df_train_features['Outstanding_Debt'], errors='coerce').fillna(0).astype(int)
df_train_features['Credit_Utilization_Ratio'] = pd.to_numeric(df_train_features['Credit_Utilization_Ratio'], errors='coerce').fillna(0)
df_train_features['Amount_invested_monthly']  = pd.to_numeric(df_train_features['Amount_invested_monthly'], errors='coerce').fillna(0)
df_train_features['Monthly_Balance']          = pd.to_numeric(df_train_features['Monthly_Balance'], errors='coerce').fillna(0)
#Transform 'Type_of_Loan' column
frequencia_Type_of_Loan = df_train_transformed['Type_of_Loan'].value_counts().sort_index()
frequencia_Type_of_Loan
df_train_features['Type_of_Loan_clean'] = (
    df_train_features['Type_of_Loan']
    .str.replace('and', '', regex=False)
    .str.lower()
    .str.replace(r' ', '', regex=True)
    .str.strip()
)
df_train_features['loan_list'] = df_train_features['Type_of_Loan_clean'].str.split(',')
mlb = MultiLabelBinarizer()
loan_dummies = pd.DataFrame(
    mlb.fit_transform(df_train_features['loan_list']),
    columns=mlb.classes_,
    index=df_train_features.index
)
loan_dummies = loan_dummies.add_prefix('TypeLoan_')
df_train_features = pd.concat([df_train_features, loan_dummies], axis=1)
colunas_drop = ['TypeLoan_nan', 'TypeLoan_notspecified', 'loan_list', 'Type_of_Loan_clean', 'Type_of_Loan']
df_train_features.drop(colunas_drop, axis=1, inplace=True)
# Outras varaiveis
df_train_features['Credit_Mix'] = df_train_features['Credit_Mix'].replace('_', 'sem_informacao')
col_dummies = ['Occupation','Credit_Mix']
df_train_features = pd.get_dummies(df_train_features, columns=col_dummies, drop_first=False)
for col in df_train_features.columns:
    if df_train_features[col].dtype == 'bool':
        df_train_features[col] = df_train_features[col].astype(int)

#Normalizacao de colunas continuas
colunas_continuas = ['Annual_Income', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Monthly_Balance']
scaler = StandardScaler()
df_train_features[colunas_continuas] = scaler.fit_transform(df_train_features[colunas_continuas])

#Limpeza de NaNs
df_train_features.dropna(inplace=True)

#Variavel-alvo
mapeamento = {
    'good': 0,
    'standard': 1,
    'poor': 2
}

df_train_features['target_numeric'] = df_train_features['Credit_Score'].map(mapeamento).astype(int)
df_train_features.drop('Credit_Score', axis=1, inplace=True)

df_train_features.head()

Unnamed: 0,Age,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Num_of_Delayed_Payment,Outstanding_Debt,Credit_Utilization_Ratio,Amount_invested_monthly,...,Occupation_musician,Occupation_scientist,Occupation_sem_informacao,Occupation_teacher,Occupation_writer,Credit_Mix_bad,Credit_Mix_good,Credit_Mix_sem_informacao,Credit_Mix_standard,target_numeric
0,23,-0.105759,3,4,3,4,7,-0.52076,-1.067845,80.415295,...,0,1,0,0,0,0,0,1,0,0
1,23,-0.105759,3,4,3,4,0,-0.52076,-0.067121,118.280222,...,0,1,0,0,0,0,1,0,0,0
3,23,-0.105759,3,4,3,4,4,-0.52076,-0.177912,199.458074,...,0,1,0,0,0,0,1,0,0,0
4,23,-0.105759,3,4,3,4,0,-0.52076,-1.463512,41.420153,...,0,1,0,0,0,0,1,0,0,0
5,23,-0.105759,3,4,3,4,4,-0.52076,-0.981955,62.430172,...,0,1,0,0,0,0,1,0,0,0


In [85]:
df_train_features.to_csv("../data/processed/credit_score_train.csv", index=False)

# Pipeline Teste

In [86]:
df_test = pd.read_csv('../data/raw/test.csv', low_memory=False)
colunas_drop = ['ID', 'Customer_ID', 'Month', 'Name', 'SSN', 'Monthly_Inhand_Salary', 'Delay_from_due_date', 'Total_EMI_per_month', 'Changed_Credit_Limit', 
                'Num_Credit_Inquiries', 'Credit_History_Age',  'Payment_Behaviour', 'Payment_of_Min_Amount']
#Create a transformed DataFrame with lowercased strings
df_test_transformed = df_test.astype(str).apply(lambda x: x.str.lower()).copy()
df_test_transformed.drop(colunas_drop, axis=1, inplace=True)
#Create a DataFrame with valid ages
df_test_transformed['Age'] = pd.to_numeric(df_test_transformed['Age'], errors='coerce')
df_test_features = df_test_transformed[(df_test_transformed['Age'] > 0) & (df_test_transformed['Age'] < 110)].copy()
df_test_features['Occupation'] = df_test_features['Occupation'].replace('_______', 'sem_informacao')
df_test_features['Age']                      = pd.to_numeric(df_test_features['Age'], errors='coerce').astype(int)
df_test_features['Annual_Income']            = pd.to_numeric(df_test_features['Annual_Income'], errors='coerce').fillna(0)
df_test_features['Num_Bank_Accounts']        = pd.to_numeric(df_test_features['Num_Bank_Accounts'], errors='coerce').fillna(0).astype(int)
df_test_features['Num_Credit_Card']          = pd.to_numeric(df_test_features['Num_Credit_Card'], errors='coerce').fillna(0).astype(int)
df_test_features['Interest_Rate']            = pd.to_numeric(df_test_features['Interest_Rate'], errors='coerce').fillna(0).astype(int)
df_test_features['Num_of_Loan']              = pd.to_numeric(df_test_features['Num_of_Loan'], errors='coerce').fillna(0).astype(int)
df_test_features['Num_of_Delayed_Payment']   = pd.to_numeric(df_test_features['Num_of_Delayed_Payment'], errors='coerce').fillna(0).astype(int)
df_test_features['Outstanding_Debt']         = pd.to_numeric(df_test_features['Outstanding_Debt'], errors='coerce').fillna(0).astype(int)
df_test_features['Credit_Utilization_Ratio'] = pd.to_numeric(df_test_features['Credit_Utilization_Ratio'], errors='coerce').fillna(0)
df_test_features['Amount_invested_monthly']  = pd.to_numeric(df_test_features['Amount_invested_monthly'], errors='coerce').fillna(0)
df_test_features['Monthly_Balance']          = pd.to_numeric(df_test_features['Monthly_Balance'], errors='coerce').fillna(0)
#Transform 'Type_of_Loan' column
frequencia_Type_of_Loan = df_test_transformed['Type_of_Loan'].value_counts().sort_index()
frequencia_Type_of_Loan
df_test_features['Type_of_Loan_clean'] = (
    df_test_features['Type_of_Loan']
    .str.replace('and', '', regex=False)
    .str.lower()
    .str.replace(r' ', '', regex=True)
    .str.strip()
)
df_test_features['loan_list'] = df_test_features['Type_of_Loan_clean'].str.split(',')
mlb = MultiLabelBinarizer()
loan_dummies = pd.DataFrame(
    mlb.fit_transform(df_test_features['loan_list']),
    columns=mlb.classes_,
    index=df_test_features.index
)
loan_dummies = loan_dummies.add_prefix('TypeLoan_')
df_test_features = pd.concat([df_test_features, loan_dummies], axis=1)
colunas_drop = ['TypeLoan_nan', 'TypeLoan_notspecified', 'loan_list', 'Type_of_Loan_clean', 'Type_of_Loan']
df_test_features.drop(colunas_drop, axis=1, inplace=True)
# Outras varaiveis
df_test_features['Credit_Mix'] = df_test_features['Credit_Mix'].replace('_', 'sem_informacao')
col_dummies = ['Occupation','Credit_Mix']
df_test_features = pd.get_dummies(df_test_features, columns=col_dummies, drop_first=False)
for col in df_test_features.columns:
    if df_test_features[col].dtype == 'bool':
        df_test_features[col] = df_test_features[col].astype(int)

#Normalizacao de colunas continuas
colunas_continuas = ['Annual_Income', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Monthly_Balance']
scaler = StandardScaler()
df_test_features[colunas_continuas] = scaler.fit_transform(df_test_features[colunas_continuas])

#Limpeza de NaNs
df_test_features.dropna(inplace=True)

df_test_features.head()

Unnamed: 0,Age,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Num_of_Delayed_Payment,Outstanding_Debt,Credit_Utilization_Ratio,Amount_invested_monthly,...,Occupation_media_manager,Occupation_musician,Occupation_scientist,Occupation_sem_informacao,Occupation_teacher,Occupation_writer,Credit_Mix_bad,Credit_Mix_good,Credit_Mix_sem_informacao,Credit_Mix_standard
0,23,-0.104545,3,4,3,4,7,-0.519694,0.540461,236.642682,...,0,0,1,0,0,0,0,1,0,0
1,24,-0.104545,3,4,3,4,9,-0.519694,0.153192,21.46538,...,0,0,1,0,0,0,0,1,0,0
2,24,-0.104545,3,4,3,4,4,-0.519694,0.301806,148.233938,...,0,0,1,0,0,0,0,1,0,0
4,28,-0.092363,2,4,6,1,1,-0.695632,-1.242557,39.684018,...,0,0,0,1,0,0,0,1,0,0
5,28,-0.092363,2,4,6,1,3,-0.695632,-0.421951,251.627369,...,0,0,0,0,1,0,0,1,0,0


In [87]:
df_test_features.to_csv("../data/processed/credit_score_test.csv", index=False)