# Crédito

## Carregar base de Dados

In [1]:
# Importação dos pacotes
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (accuracy_score, cohen_kappa_score,
                             confusion_matrix, f1_score)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Fix the seed of every random number generator used below so that
# re-running the notebook reproduces the same results
seed = 1
np.random.seed(seed)
random.seed(seed)

In [2]:
# Read the tab-separated credit dataset and preview its first rows
data = pd.read_csv('credito.txt', sep='\t')
data.head()

Unnamed: 0,ESTC,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
0,1,0,360,0,313,9,52,0,25,48,0,1
1,0,0,350,1,468,10,65,0,33,6,0,1
2,0,0,1100,0,829,9,125,0,56,48,0,1
3,0,0,3000,0,552,12,76,1,31,60,0,1
4,1,0,1000,0,809,12,111,0,24,7,0,1


In [3]:
# Dataset dimensions as (rows, columns)
data.shape

(2077, 12)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
data.describe()

Unnamed: 0,ESTC,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
count,2077.0,2077.0,2077.0,2077.0,2077.0,2077.0,2077.0,2077.0,2077.0,2077.0,2077.0,2077.0
mean,0.521907,0.122292,969.541647,0.44728,563.794415,8.395282,102.666346,0.128069,41.204622,30.9013,30.66442,0.474723
std,0.67442,0.556507,897.707359,0.497333,292.36083,3.614923,64.315987,0.334247,13.276082,51.960972,93.356164,0.499481
min,0.0,0.0,300.0,0.0,300.0,1.0,50.0,0.0,18.0,0.0,0.0,0.0
25%,0.0,0.0,470.0,0.0,404.0,6.0,66.0,0.0,31.0,6.0,0.0,0.0
50%,0.0,0.0,640.0,0.0,489.0,10.0,83.0,0.0,39.0,6.0,0.0,0.0
75%,1.0,0.0,1150.0,1.0,618.0,10.0,118.0,0.0,52.0,48.0,0.0,1.0
max,3.0,7.0,9675.0,1.0,6000.0,24.0,719.0,1.0,70.0,420.0,1300.0,1.0


## Separar base em treino e teste

In [5]:
# Separate the input features from the target column, then hold out 20% of
# the rows for testing. `stratify=y` keeps the class proportions equal in both
# partitions; `random_state=seed` makes the split repeatable even when this
# cell is re-run on its own instead of depending on the global NumPy RNG state.
X = data.drop(columns='CLASSE')  # inputs
y = data['CLASSE']               # target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=seed)

In [6]:
# Number of samples in each class of the full dataset
y.value_counts()

CLASSE
0    1091
1     986
Name: count, dtype: int64

In [7]:
# Expected per-class size of the test partition (20% of each class), used to
# confirm that the split was stratified
class_counts = y.value_counts()
for label in (0, 1):
    print(0.2 * class_counts[label])

218.20000000000002
197.20000000000002


In [8]:
# Class counts in the test partition
y_test.value_counts()

CLASSE
0    219
1    197
Name: count, dtype: int64

In [9]:
# Class counts in the training partition
y_train.value_counts()

CLASSE
0    872
1    789
Name: count, dtype: int64

In [29]:
# Show the shapes of the train and test partitions (features, then targets)
for partition in (X_train, X_test, y_train, y_test):
    print(partition.shape)

(1661, 11)
(416, 11)
(1661,)
(416,)


## Machine Learning

In [30]:
# Train the model.
# A decision tree classifier is used here; for a regression problem the
# counterpart would be DecisionTreeRegressor.
def train(X_train, y_train, seed, max_depth=10, min_samples_leaf=3):
    """Fit a decision-tree classifier on the given training data.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to ``fit``.
    y_train : array-like
        Training labels, passed unchanged to ``fit``.
    seed : int
        Value used as the tree's ``random_state``.
    max_depth : int, default 10
        Maximum depth the tree is allowed to reach.
    min_samples_leaf : int, default 3
        Minimum number of samples that must fall in a leaf. Try changing it
        to control overfitting.

    Returns
    -------
    DecisionTreeClassifier
        The fitted model.
    """
    model = DecisionTreeClassifier(random_state=seed,
                                   max_depth=max_depth,
                                   min_samples_leaf=min_samples_leaf)
    model.fit(X_train, y_train)
    return model

# The tree hyperparameters (max_depth and min_samples_leaf) are there to
# avoid overfitting

model = train(X_train, y_train, seed=seed)

In [31]:
# Draw the fitted decision tree and save it to disk.
# The canvas is very large, so the figure is written to 'tree.png' and then
# closed: leaving it open makes the notebook render it a second time inline,
# which is the step that was interrupted in the saved run.
fig, ax = plt.subplots(figsize=(80, 40))
plot_tree(model, class_names=['Não Pagou', 'Pagou'],
          feature_names=list(X.columns),  # input columns only; data.columns also contains the target 'CLASSE'
          filled=True, rounded=True, ax=ax)

fig.savefig('tree.png')  # save the image
plt.close(fig)

KeyboardInterrupt: 

Error in callback <function flush_figures at 0x14053df30> (for post_execute):


KeyboardInterrupt: 

In [None]:
# Depth of the fitted tree
model.get_depth()

### Avaliar modelo treinado na base de teste

In [None]:
def predict_and_evaluate(model, X_test, y_test):
    """Predict with ``model`` and report classification metrics.

    Prints accuracy, Cohen's kappa and F1, then shows the confusion matrix
    as an annotated heatmap.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict``.
    X_test : array-like
        Features to run inference on.
    y_test : array-like
        True labels matching ``X_test``; expected to use 0 / 1, which map to
        the tick labels 'Não Pagou' / 'Pagou'.

    Returns
    -------
    None
    """
    # Inference on the supplied data
    y_pred = model.predict(X_test)

    print('Acurácia: ', accuracy_score(y_test, y_pred))
    print('Kappa: ', cohen_kappa_score(y_test, y_pred))
    print('F1: ', f1_score(y_test, y_pred))

    # Pin the label order so the matrix is always 2x2 and always lines up
    # with the two tick labels, even if one class is absent from the data.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Draw on an explicit Axes instead of relying on pyplot's current axes
    fig, ax = plt.subplots()
    sns.heatmap(conf_matrix, annot=True, fmt=".0f", ax=ax)
    ax.set_xlabel('Previsto')
    ax.set_ylabel('Real')
    ax.set_title('Matriz de Confusão')

    # Replace the numeric ticks with the class names
    ax.xaxis.set_ticklabels(['Não Pagou', 'Pagou'])
    ax.yaxis.set_ticklabels(['Não Pagou', 'Pagou'])
    plt.show()

# Evaluate on the training data first, then on the held-out test data
for heading, features, target in (('Resultados de Treino', X_train, y_train),
                                  ('Resultados de Teste', X_test, y_test)):
    print(heading)
    predict_and_evaluate(model, features, target)

### Normalizar

In [None]:
# Standardize the features with statistics learned from the training rows only.
# The raw rows are re-selected from X through the index kept by y_train / y_test,
# so this cell always starts from unscaled data. Re-running it therefore no
# longer refits the scaler on already-scaled values.
train_rows = X.loc[y_train.index]
test_rows = X.loc[y_test.index]
scaler = StandardScaler().fit(train_rows)
X_train = scaler.transform(train_rows)
X_test = scaler.transform(test_rows)

In [None]:
# Train again on the current X_train and evaluate on both partitions
model = train(X_train, y_train, seed=seed)
for heading, features, target in (('Resultados de Treino', X_train, y_train),
                                  ('Resultados de Teste', X_test, y_test)):
    print(heading)
    predict_and_evaluate(model, features, target)

## Grid Search

In [None]:
# Hyperparameter values to be explored
tuned_parameters = [{'criterion': ['gini', 'entropy'],
                     'max_depth': [2, 4, 5, 6, 8, 10, 12],
                     'min_samples_leaf': [1, 2, 3, 4, 5, 8, 10]}]

# Run the grid search, selecting by F1. The base tree gets a fixed
# random_state so that every candidate is fit deterministically and the chosen
# parameters do not depend on the global RNG state when the cell is re-run.
model = GridSearchCV(DecisionTreeClassifier(random_state=seed),
                     tuned_parameters, scoring='f1')
model.fit(X_train, y_train);

In [None]:
# Parameter combination selected by the grid search
model.best_params_

In [None]:
# Evaluate the grid-search model on the training and test partitions
for heading, features, target in (('Resultados de Treino', X_train, y_train),
                                  ('Resultados de Teste', X_test, y_test)):
    print(heading)
    predict_and_evaluate(model, features, target)

In [None]:
# Draw the best tree found by the grid search and save it to disk.
# The search was fit on the standardized features, so the split thresholds
# shown in the nodes are in standardized units.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(model.best_estimator_, class_names=['Não Pagou', 'Pagou'],
          feature_names=list(X.columns),  # input columns only; data.columns also contains the target 'CLASSE'
          filled=True, rounded=True, ax=ax);

fig.savefig('tree_final.png')  # save the image

# Inferir novos dados com modelo treinado!

In [None]:
# 1. Load the unlabeled data (tab-separated) and preview the first rows
new_data = pd.read_csv('credito_sem_rotulo.txt', sep='\t')
new_data.head(8)

In [None]:
# 2. Apply the same preprocessing used for training.
# Only the training feature columns are selected, in training order, so this
# cell still works when re-run after later cells have appended prediction
# columns to new_data.
new_data_normalized = scaler.transform(new_data[X.columns])

In [None]:
# 3. Run inference on the preprocessed data
inferences = model.predict(new_data_normalized)

In [None]:
# Dimensions of the unlabeled dataset as (rows, columns)
new_data.shape

In [None]:
# 4. View the inferences alongside the original data
new_data['previsões'] = inferences
new_data.head(8)

In [None]:
# Predict the class probabilities once and reuse the result for both columns.
# Column 0 is the probability of the first class and column 1 of the second.
class_probabilities = model.predict_proba(new_data_normalized)
new_data['prob0'] = class_probabilities[:, 0]
new_data['prob1'] = class_probabilities[:, 1]

In [None]:
# Preview the first rows, now including the prediction and probability columns
new_data.head(8)