# Ciclo 02: Aprendizado Supervisionado - Classificação

## 1.0 Import Libraries

In [46]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics as mt

## 1.1 Load Dataset

In [47]:
# Training split location, relative to the directory the notebook runs from.
dataset_path = './dataset/train.csv'

# Read the whole CSV into the base DataFrame that the following cells work on.
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [48]:
# Peek at the first rows to check that the CSV was parsed into the expected columns.
df.head()

Unnamed: 0,id_cliente,idade,saldo_atual,divida_atual,renda_anual,valor_em_investimentos,taxa_utilizacao_credito,num_emprestimos,num_contas_bancarias,num_cartoes_credito,dias_atraso_dt_venc,num_pgtos_atrasados,num_consultas_credito,taxa_juros,investe_exterior,pessoa_polit_exp,limite_adicional
0,1767,21,278.172008,2577.05,24196.89636,104.306544,31.038763,6,5,7,21,14,9,15,Não,Não,Negar
1,11920,40,268.874152,2465.39,19227.37796,69.858778,36.917093,5,8,5,40,23,10,18,Não,Não,Negar
2,8910,36,446.643127,1055.29,42822.28223,134.201478,34.561714,0,3,6,26,13,3,15,Sim,Não,Negar
3,4964,58,321.141267,703.05,51786.826,297.350067,31.493561,0,3,7,12,7,2,1,Sim,Não,Negar
4,10100,35,428.716114,891.29,44626.85346,134.201478,28.028887,2,8,7,24,10,8,20,Sim,Não,Negar


In [49]:
# List the distinct values found in the target column.
df['limite_adicional'].unique()

array(['Negar', 'Conceder'], dtype=object)

## 1.2 Features Selection

In [50]:
# Show every column name so the feature list in the next cell can be picked from it.
df.columns

Index(['id_cliente', 'idade', 'saldo_atual', 'divida_atual', 'renda_anual',
       'valor_em_investimentos', 'taxa_utilizacao_credito', 'num_emprestimos',
       'num_contas_bancarias', 'num_cartoes_credito', 'dias_atraso_dt_venc',
       'num_pgtos_atrasados', 'num_consultas_credito', 'taxa_juros',
       'investe_exterior', 'pessoa_polit_exp', 'limite_adicional'],
      dtype='object')

In [51]:
# Model inputs (x) are the numeric columns listed below; the target (y) is the
# credit-limit decision column.
label = 'limite_adicional'
features = [
    'idade',
    'saldo_atual',
    'divida_atual',
    'renda_anual',
    'valor_em_investimentos',
    'taxa_utilizacao_credito',
    'num_emprestimos',
    'num_contas_bancarias',
    'num_cartoes_credito',
    'dias_atraso_dt_venc',
    'num_pgtos_atrasados',
    'num_consultas_credito',
    'taxa_juros',
]

x_train = df[features]
y_train = df[label]

## 1.3 Training

In [52]:
# Training parameter: number of neighbours used by the classifier.
k = 21

# Create the classifier and train it on the selected features and labels.
# The fit call stays last so the fitted estimator is shown as the cell output.
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X=x_train, y=y_train)

In [53]:
# Predict on the same rows the model was fitted on, so every metric computed from
# y_pred below is a training-set metric.
y_pred = knn_classifier.predict(x_train)

In [54]:
# Copy of the original data with the model's prediction attached to each row.
df_result = df.assign(Classificacao=y_pred)

In [55]:
# Spot-check 20 random rows: true label side by side with the predicted one.
df_result[['id_cliente', 'idade', 'limite_adicional', 'Classificacao']].sample(n=20)

Unnamed: 0,id_cliente,idade,limite_adicional,Classificacao
7988,10122,50,Negar,Negar
9048,6319,25,Conceder,Negar
2295,3394,46,Negar,Negar
8517,1552,23,Negar,Negar
4088,8377,59,Negar,Negar
4644,1457,32,Conceder,Negar
3886,3348,39,Conceder,Negar
7657,6616,24,Negar,Negar
8019,10979,696,Negar,Negar
5634,1300,44,Negar,Negar


## 1.4 Performance

### 1.4.1 Confusion Matrix

In [56]:
# Absolute number of rows in each class of the target column.
df.groupby('limite_adicional')['limite_adicional'].count()

limite_adicional
Conceder    1505
Negar       7995
Name: limite_adicional, dtype: int64

In [57]:
# Share of each class in the target column (fractions that sum to 1).
df['limite_adicional'].value_counts(normalize=True)

limite_adicional
Negar       0.841579
Conceder    0.158421
Name: proportion, dtype: float64

In [58]:
# Flag each row with 1 when the prediction matches the true label and 0 otherwise.
# The two columns are compared directly (vectorized) instead of calling a Python
# lambda once per row through apply(axis=1); the resulting values are the same.
df_result['acertos'] = (
    df_result['limite_adicional'] == df_result['Classificacao']
).astype(int)

In [59]:
# Spot-check 20 random rows together with the hit/miss flag.
df_result[['id_cliente', 'limite_adicional', 'Classificacao', 'acertos']].sample(n=20)


Unnamed: 0,id_cliente,limite_adicional,Classificacao,acertos
6496,11886,Negar,Negar,1
4428,8234,Negar,Negar,1
161,291,Negar,Negar,1
3339,11749,Negar,Negar,1
3702,2007,Conceder,Negar,0
2846,621,Negar,Negar,1
6730,11317,Negar,Negar,1
8585,3555,Negar,Negar,1
2979,10614,Negar,Negar,1
5339,5405,Negar,Negar,1


In [60]:
# Rows are the true classes and columns the predicted ones. No `labels` argument is
# passed, so the classes appear in sorted order: index 0 is 'Conceder', index 1 is 'Negar'.
mt.confusion_matrix(y_train, y_pred)

array([[ 146, 1359],
       [ 102, 7893]])

### 1.4.2 Accuracy

In [61]:
# Accuracy by hand: number of rows flagged as a hit divided by the number of rows.
total = len(df_result['acertos'])
corrects = df_result['acertos'].sum()

accuracy = corrects / total
print('Acuracia: {:.2f}%'.format(100 * accuracy))

Acuracia: 84.62%


In [62]:
# Same accuracy, this time computed by scikit-learn, as a cross-check of the manual value.
accuracy = mt.accuracy_score(y_true=y_train, y_pred=y_pred)
print('Acuracia: {:.2f}%'.format(100 * accuracy))

Acuracia: 84.62%


### 1.4.3 Precision

In [63]:
# Precision with 'Conceder' as the positive class: among the rows predicted as
# 'Conceder', the fraction whose true label is 'Conceder'.
precision = mt.precision_score(y_true=y_train, y_pred=y_pred, pos_label='Conceder')
print('Precision: {:.2f}%'.format(100 * precision))

Precision: 58.87%


### 1.4.4 Recall

In [64]:
# Recall with 'Conceder' as the positive class: among the rows whose true label is
# 'Conceder', the fraction the model also predicted as 'Conceder'.
recall = mt.recall_score(y_true=y_train, y_pred=y_pred, pos_label='Conceder')
print('Recall: {:.2f}%'.format(100 * recall))

Recall: 9.70%


O algoritmo tem uma precisão melhor que o recall!

### 1.4.5 Exercises

2. Retreine o algoritmo com os seguintes valores para K: [3, 5, 7, 9, 11, 13,
15, 17, 19 e 21] e anote a acurácia.

In [65]:
# Collect the three metrics of the current model and print them as percentages,
# together with the K that produced them.
results = [accuracy, precision, recall]
print(
    'K = {} \nAccuracy = {:.2f}%\nPrecision = {:.2f}%\nRecall = {:.2f}%'.format(
        k, *(metric * 100 for metric in results)
    )
)

K = 21 
Accuracy = 84.62%
Precision = 58.87%
Recall = 9.70%


5. Escrever um trecho de código que automatize o treinamento do algoritmo K-NN, a fim de encontrar o melhor valor para K, do exercício 2.

Sendo o objetivo sugerir ao cliente o aumento do seu limite no momento certo em que ele estiver apto, otimizaríamos a precisão!

In [66]:
# Candidate neighbour counts: the odd numbers from 3 to 21 requested in exercise 2.
k_list = [*range(3, 22, 2)]
k_list

[3, 5, 7, 9, 11, 13, 15, 17, 19, 21]

In [67]:

# Search the candidate K values for the one with the highest precision on the
# 'Conceder' class, keeping the other metrics of that same run for the report.
#
# The trackers are initialised here so the cell works on a fresh kernel
# (Restart & Run All) and never compares against a value left over from another
# cell. Starting at -inf guarantees the first evaluated K is always recorded.
max_precision = float('-inf')
best_k = None
accuracy_of_max_precision = None
recall_of_max_precision = None
confusion_matrix = None

for k in k_list:
    # Train and predict on the same data, as in the single-K cells above.
    knn_classifier = KNeighborsClassifier( n_neighbors = k )
    knn_classifier.fit(x_train, y_train)
    y_pred = knn_classifier.predict(x_train)

    # Keep the predictions of the current K next to the original rows.
    df_result = df.copy()
    df_result['Classificacao'] = y_pred

    accuracy = mt.accuracy_score(y_train, y_pred)
    recall = mt.recall_score(y_train, y_pred, pos_label = 'Conceder')
    precision = mt.precision_score(y_train, y_pred, pos_label = 'Conceder')

    # Strictly greater: on a tie the smaller K (seen first) is kept.
    if precision > max_precision:
        confusion_matrix = mt.confusion_matrix(y_train, y_pred)
        accuracy_of_max_precision = accuracy
        recall_of_max_precision = recall
        max_precision = precision
        best_k = k

if best_k is None:
    # Only reachable when k_list is empty.
    print('No K value was evaluated: k_list is empty.')
else:
    print(confusion_matrix)
    print('Best k = {}\nMax Precision = {:.2f}%\nAccuracy = {:.2f}%\nRecall = {:.2f}%'.format(best_k, max_precision*100, accuracy_of_max_precision*100, recall_of_max_precision*100))

[[ 705  800]
 [ 288 7707]]
Best k = 3
Max Precision = 71.00%
Accuracy = 88.55%
Recall = 46.84%


8.2 Classe balanceada: Mantenha a proporção de 50% das linhas da planilha de dados com
exemplos da classe “Conceder” e 50% com a classe “Negar”. Faça a matriz de confusão, calcule a acurácia, recall e precision.

In [68]:
# Build a table in which half of the rows are 'Negar' and half are 'Conceder'.
df_2 = df.copy()

# Split the rows by class.
negar_df = df_2[df_2['limite_adicional'] == 'Negar']
conceder_df = df_2[df_2['limite_adicional'] == 'Conceder']

# Both classes are sampled down to the size of the smaller one; the fixed seed
# makes the draw repeatable.
min_size = min(negar_df.shape[0], conceder_df.shape[0])
negar_sample = negar_df.sample(n=min_size, random_state=42)
conceder_sample = conceder_df.sample(n=min_size, random_state=42)

# Stack the two halves, shuffle the rows and renumber the index from zero.
df_50_50 = (
    pd.concat([negar_sample, conceder_sample])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Confirm the resulting class proportions.
df_50_50['limite_adicional'].value_counts(normalize=True)



limite_adicional
Conceder    0.5
Negar       0.5
Name: proportion, dtype: float64

In [None]:
# Train the model on the balanced 50/50 table and search for the K with the
# highest precision on the 'Conceder' class.
features = ['idade', 'saldo_atual', 'divida_atual', 'renda_anual', 'valor_em_investimentos',
            'taxa_utilizacao_credito', 'num_emprestimos', 'num_contas_bancarias', 'num_cartoes_credito',
             'dias_atraso_dt_venc', 'num_pgtos_atrasados', 'num_consultas_credito', 'taxa_juros']
label = 'limite_adicional'

x_train = df_50_50.loc[:, features]
y_train = df_50_50.loc[:, label]

k_list = list(range(3,22,2))

# Reset the trackers for this experiment. Without this the comparison below would
# start from whatever max_precision an earlier cell left in the kernel, and the
# report could show the confusion matrix and metrics of a different dataset.
max_precision = float('-inf')
best_k = None
accuracy_of_max_precision = None
recall_of_max_precision = None
confusion_matrix = None

for k in k_list:
    # Train and predict on the same (balanced) data.
    knn_classifier = KNeighborsClassifier( n_neighbors = k )
    knn_classifier.fit(x_train, y_train)
    y_pred = knn_classifier.predict(x_train)

    # Keep the predictions of the current K next to the balanced rows.
    df_result = df_50_50.copy()
    df_result['Classificacao'] = y_pred

    accuracy = mt.accuracy_score(y_train, y_pred)
    recall = mt.recall_score(y_train, y_pred, pos_label = 'Conceder')
    precision = mt.precision_score(y_train, y_pred, pos_label = 'Conceder')

    # Strictly greater: on a tie the smaller K (seen first) is kept.
    if precision > max_precision:
        confusion_matrix = mt.confusion_matrix(y_train, y_pred)
        accuracy_of_max_precision = accuracy
        recall_of_max_precision = recall
        max_precision = precision
        best_k = k

if best_k is None:
    # Only reachable when k_list is empty.
    print('No K value was evaluated: k_list is empty.')
else:
    print(confusion_matrix)
    print('Best k = {}\nMax Precision = {:.2f}%\nAccuracy = {:.2f}%\nRecall = {:.2f}%'.format(best_k, max_precision*100, accuracy_of_max_precision*100, recall_of_max_precision*100))

[[1303  202]
 [ 319 1186]]
Best k = 3
Max Precision = 80.33%
Accuracy = 82.69%
Recall = 86.58%


8.3 Classe desbalanceada 90/10: Mantenha a proporção de 90% das linhas da planilha de dados com
exemplos da classe “Conceder” e 10% com a classe “Negar”. Faça a matriz de confusão, calcule a acurácia, recall e precision.