<a href="https://colab.research.google.com/github/igormartins0301/Portfolio/blob/main/Projeto_risco_de_cr%C3%A9dito_Classifica%C3%A7%C3%A3o.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q category_encoders

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Modelling
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid, KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder


In [None]:
# Read the credit dataset from its hard-coded /content path and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='/content/imobiliario.csv')
df.head(n=5)

Unnamed: 0,idade,empregador,renda,education,education_num,estado_civil,ocupacao,relacionamento,race,sex,investimentos,endividamento,horas_por_semana,nacionalidade,aprovacao_credito
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,não aprovado
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,não aprovado
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,não aprovado
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,não aprovado
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,não aprovado


In [None]:
# Dataset overview: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   idade              32561 non-null  int64 
 1   empregador         32561 non-null  object
 2   renda              32561 non-null  int64 
 3   education          32561 non-null  object
 4   education_num      32561 non-null  int64 
 5   estado_civil       32561 non-null  object
 6   ocupacao           32561 non-null  object
 7   relacionamento     32561 non-null  object
 8   race               32561 non-null  object
 9   sex                32561 non-null  object
 10  investimentos      32561 non-null  int64 
 11  endividamento      32561 non-null  int64 
 12  horas_por_semana   32561 non-null  int64 
 13  nacionalidade      32561 non-null  object
 14  aprovacao_credito  32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = df.describe()
summary_stats

Unnamed: 0,idade,renda,education_num,investimentos,endividamento,horas_por_semana
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [None]:
# Class balance: how many rows carry each value of the target column
target_counts = df['aprovacao_credito'].value_counts()
target_counts

não aprovado    24720
APROVADO         7841
Name: aprovacao_credito, dtype: int64

In [None]:
# Number of missing values in each column
null_counts = df.isnull().sum()
null_counts

idade                0
empregador           0
renda                0
education            0
education_num        0
estado_civil         0
ocupacao             0
relacionamento       0
race                 0
sex                  0
investimentos        0
endividamento        0
horas_por_semana     0
nacionalidade        0
aprovacao_credito    0
dtype: int64

In [None]:
# Count fully duplicated rows (every repeat after the first occurrence is flagged)
df.duplicated(keep='first').sum()

24

In [None]:
# Encode the target as integers: 1 = "não aprovado", 0 = every other label.
# The cell overwrites the column it reads, so a second run would compare the
# already-encoded integers against the string and silently set every row to 0.
# Skip the conversion when the column already holds only 0/1.
already_encoded = set(df['aprovacao_credito'].unique()) <= {0, 1}
if not already_encoded:
    df['aprovacao_credito'] = np.where(df["aprovacao_credito"] == "não aprovado", 1, 0)
df.head()

Unnamed: 0,idade,empregador,renda,education,education_num,estado_civil,ocupacao,relacionamento,race,sex,investimentos,endividamento,horas_por_semana,nacionalidade,aprovacao_credito
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,1


### 2. Pré-Processamento dos dados

In [None]:
# Remove duplicated rows, keeping the first occurrence of each.
# Reassignment is used instead of inplace=True so the step reads as an explicit
# data-flow; the original index labels are kept (no reset_index).
df = df.drop_duplicates(keep='first')
# Sanity check: no duplicated rows should remain
df.duplicated().sum()

0

In [None]:
# Columns fed to the models, and the name of the label column.
features = [
    'race', 'ocupacao', 'education', 'empregador',
    'estado_civil', 'relacionamento', 'sex', 'nacionalidade',
    'idade', 'renda', 'education_num',
    'investimentos', 'endividamento', 'horas_por_semana',
]
target = "aprovacao_credito"

In [None]:
# Split before any preprocessing so that nothing is fitted on the held-out rows.
X, y = df[features], df[target]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=999)

In [None]:
# Column names grouped by kind. Only the name lists are built here; no DataFrame is split.
num_features = [
    'idade', 'renda', 'education_num',
    'investimentos', 'endividamento', 'horas_por_semana',
]
cat_features = [
    'race', 'ocupacao', 'education', 'empregador',
    'estado_civil', 'relacionamento', 'sex', 'nacionalidade',
]

In [None]:
# NOTE(review): this import sits mid-notebook and needs the install cell at the top to have run.
import category_encoders as ce
# One-hot encode only the columns listed in cat_features; the preview below shows
# the numeric columns carried through unchanged after the encoded ones.
encoder = ce.OneHotEncoder(cols = cat_features)

# Fit on the training split only, then replace X_train with its encoded version.
# Because X_train is overwritten, re-running this cell hands already-encoded data to the encoder.
X_train = encoder.fit_transform(X_train)

X_train.head()

Unnamed: 0,race_1,race_2,race_3,race_4,race_5,ocupacao_1,ocupacao_2,ocupacao_3,ocupacao_4,ocupacao_5,...,nacionalidade_39,nacionalidade_40,nacionalidade_41,nacionalidade_42,idade,renda,education_num,investimentos,endividamento,horas_por_semana
5595,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,60,338345,9,0,0,45
688,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,41,195124,14,0,0,35
28054,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,34,49469,13,99999,0,50
32382,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,44,150171,9,0,0,40
287,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,50,176609,10,0,0,45


In [None]:
# Encode the test split with the encoder fitted on the training split
# (transform only, nothing is re-fitted here). X_test is replaced by its encoded version.
X_test = encoder.transform(X_test)

X_test.head()

Unnamed: 0,race_1,race_2,race_3,race_4,race_5,ocupacao_1,ocupacao_2,ocupacao_3,ocupacao_4,ocupacao_5,...,nacionalidade_39,nacionalidade_40,nacionalidade_41,nacionalidade_42,idade,renda,education_num,investimentos,endividamento,horas_por_semana
4405,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,45,102096,13,0,0,40
24800,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,37,188563,10,4386,0,50
14000,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,54,96460,9,7688,0,60
15675,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,38,135416,10,0,0,40
16426,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,56,103948,7,0,0,40


### Aplicando o modelo Random Forest

In [None]:
# Build a 1000-tree Random Forest and fit it on the encoded training split.
# random_state is fixed (same seed as the train/test split) because the forest
# is built with random sampling; without a seed each run of this cell produces
# a different model and slightly different metrics.
clf_RF = RandomForestClassifier(n_estimators=1000, random_state=999)
clf_RF.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000)

In [None]:
# Predict class labels for the encoded test split
y_pred = clf_RF.predict(X_test)

y_pred

array([0, 1, 0, ..., 0, 0, 0])

### Avaliando modelo Random Forest

In [None]:
# Accuracy of the Random Forest on the held-out test split
accuracy = clf_RF.score(X_test, y_test)
print(f'Acurácia: {accuracy}')

Acurácia: 0.8545380045072731


In [None]:
# Confusion matrix and classification report for the Random Forest.
# scikit-learn expects (y_true, y_pred) in that order. Passing the predictions
# first transposes the matrix and swaps precision with recall in the report.
prediction = clf_RF.predict(X_test)
cm = confusion_matrix(y_test, prediction)  # rows = true class, columns = predicted class
print(cm)
cr = classification_report(y_test, prediction)
print(cr)


[[1425  569]
 [ 851 6917]]
              precision    recall  f1-score   support

           0       0.63      0.71      0.67      1994
           1       0.92      0.89      0.91      7768

    accuracy                           0.85      9762
   macro avg       0.78      0.80      0.79      9762
weighted avg       0.86      0.85      0.86      9762



### Aplicando o modelo XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
# Gradient-boosted classifier: 1000 estimators with a learning rate of 0.01.
clf_XGB = XGBClassifier(learning_rate=0.01, n_estimators=1000)
# Train on the encoded training split
clf_XGB.fit(X_train, y_train)

XGBClassifier(learning_rate=0.01, n_estimators=1000)

In [None]:
# Accuracy of the XGBoost model on the held-out test split
accuracy = clf_XGB.score(X_test, y_test)
print(f'Acurácia: {accuracy}')

Acurácia: 0.8618111042819094


In [None]:
# Confusion matrix and classification report for the XGBoost model.
# scikit-learn expects (y_true, y_pred) in that order. Passing the predictions
# first transposes the matrix and swaps precision with recall in the report.
prediction = clf_XGB.predict(X_test)
cm = confusion_matrix(y_test, prediction)  # rows = true class, columns = predicted class
print(cm)
cr = classification_report(y_test, prediction)
print(cr)

[[1343  416]
 [ 933 7070]]
              precision    recall  f1-score   support

           0       0.59      0.76      0.67      1759
           1       0.94      0.88      0.91      8003

    accuracy                           0.86      9762
   macro avg       0.77      0.82      0.79      9762
weighted avg       0.88      0.86      0.87      9762



In [None]:
# Rank the encoded columns by the importance the fitted XGBoost model assigns them,
# most important first. Labels come from X_train.columns, so they are the
# one-hot column names rather than the original category values.
feature_imp = pd.Series(
    clf_XGB.feature_importances_, index=X_train.columns
).sort_values(ascending=False)
feature_imp

estado_civil_1      0.370830
education_num       0.088618
investimentos       0.057423
ocupacao_2          0.051409
education_1         0.038257
                      ...   
nacionalidade_11    0.000000
nacionalidade_12    0.000000
nacionalidade_13    0.000000
nacionalidade_14    0.000000
nacionalidade_8     0.000000
Length: 108, dtype: float32

In [None]:
# Distinct marital-status values, in order of first appearance in the full df.
# NOTE(review): the one-hot suffixes (estado_civil_1, estado_civil_2, ...) were assigned by
# the encoder fitted on X_train, whose row order differs from df. Confirm the mapping
# against the fitted encoder before reading estado_civil_1 as the first value listed here.
df['estado_civil'].unique()

array(['Never-married', 'Married-civ-spouse', 'Divorced',
       'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
       'Widowed'], dtype=object)

### Resultado

Entre os dois modelos escolhidos, o que apresentou os melhores resultados foi o modelo de classificação XGB, com os seguintes resultados:

Acurácia: 86% (métrica global, considerando as duas classes)

F1-score: 91% para não aprovados 