## Default of Credit Card Clients

Dataset

This dataset contains information on default payments, demographic factors, credit data, history of payment, and bill statements of credit card clients in Taiwan from April 2005 to September 2005.

Content
There are 25 variables:

ID: ID of each client

LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)

SEX: Gender (1=male, 2=female)

EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)

MARRIAGE: Marital status (1=married, 2=single, 3=others)

AGE: Age in years

PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)

PAY_2: Repayment status in August, 2005 (scale same as above)

PAY_3: Repayment status in July, 2005 (scale same as above)

PAY_4: Repayment status in June, 2005 (scale same as above)

PAY_5: Repayment status in May, 2005 (scale same as above)

PAY_6: Repayment status in April, 2005 (scale same as above)

BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)

BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)

BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)

BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)

BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)

BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)

PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)

PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)

PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)

PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)

PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)

PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)

default.payment.next.month: Default payment (1=yes, 0=no)

## Librerías

In [None]:
!pip install imbalanced-learn
!pip install joblib
!pip install pydotplus
!pip install pydot
!pip install graphviz
!pip install Pillow
!pip install Image

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import itertools
from datetime import datetime 
import time
import sys
%matplotlib inline

## Carga de Datos

In [None]:
# Location of the Excel workbook that holds the credit-card client data.
DATA_URL = r'https://github.com/joatamayoav/Trabajo_Final_Analitica_Predictiva/blob/main/default_of_credit_card_clients.xls?raw=true'

# header=1: column names are taken from the second row of the sheet.
df0 = pd.read_excel(DATA_URL, header=1)

df0.head()


In [None]:
# Lower-case every column label, then give the target column a short name.
df0 = (
    df0
    .rename(columns=str.lower)
    .rename(columns={"default payment next month": "default"})
)

df0.head()

In [None]:
# Work on an independent copy so the frame loaded into `df0` stays untouched.
df = df0.copy(deep=True)

df.head()

## Variables Dummy

Variable ficticia para explicar valores cualitativos en un modelo de regresión.

In [None]:
# Indicator (0/1) columns derived from the coded categorical columns.
# Mapping: source column -> {new indicator column: category code it flags}.
# Codes without an entry (e.g. education 4-6, marriage 3) end up as all-zero rows.
dummy_specs = {
    "education": {"grad_school": 1, "university": 2, "high_school": 3},
    "sex": {"male": 1, "female": 2},
    "marriage": {"single": 2, "married": 1},
}

for source_col, indicators in dummy_specs.items():
    for indicator_col, code in indicators.items():
        # Values are read from the untouched frame `df0` and written into `df`.
        df[indicator_col] = df0[source_col].eq(code).astype(int)
    # The coded column is replaced by its indicators.
    df.drop(columns=source_col, inplace=True)

df.head(10)

In [None]:
pay_features = ['pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']

# Collapse every non-positive repayment code to 0, so only the positive
# "months of delay" codes remain distinct.
df[pay_features] = df[pay_features].clip(lower=0)

df.head()

## Análisis Descriptivo

In [None]:
# Column names of `df`, in sorted order.
sorted(df.columns)

In [None]:
# Data type of every column in `df`.
df.dtypes

In [None]:
# Dimensions of `df` as (rows, columns).
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

## Análisis Exploratorio

In [None]:
# Missing values per column: absolute count and percentage of rows.

null_counts = df.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / len(df) * 100).sort_values(ascending=False)
pd.concat({'Total': total, 'Percent': percent}, axis=1).transpose()

In [None]:
# Class balance of the target: number of clients for each value of `default`.
temp = df["default"].value_counts()

df1 = pd.DataFrame({'default': temp.index,'values': temp.values})

plt.figure(figsize = (6,6))

# The legend follows the dataset encoding: 1 = default, 0 = no default.
plt.title('Default Credit Card Clients - target value - data unbalance\n (Default = 1, Not Default = 0)')

sns.set_color_codes("pastel")
sns.barplot(x = 'default', y="values", data=df1)

# Tick positions/labels are read back but not used further in this cell.
locs, labels = plt.xticks()

plt.show()
df1

### Distribución Credit Limit 

In [None]:
plt.figure(figsize = (14,6))
plt.title('Amount of credit limit - Density Plot')
sns.set_color_codes("dark")
# `sns.distplot` is deprecated; `histplot` with stat="density" and kde=True draws
# the equivalent density-normalised histogram with a KDE curve on top.
sns.histplot(df['limit_bal'], kde=True, bins=200, color="black", stat="density")
plt.show()


# Frequency table: how many clients share each credit-limit amount.
temp = df["limit_bal"].value_counts()
df1 = pd.DataFrame({'limit_bal': temp.index,'values': temp.values})

df1


## Modelación

### Importación de Librerías

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, precision_recall_curve
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report


In [None]:
def MatrixConfusion(cm, labels = ['pay','default']):
    """Wrap a confusion matrix in a labelled DataFrame with row and column totals.

    Parameters
    ----------
    cm : array-like
        Matrix of counts with one row and one column per entry of ``labels``;
        it is handed directly to ``pd.DataFrame``.
    labels : list of str
        Class names used for both the rows and the columns.

    Returns
    -------
    pandas.DataFrame
        ``cm`` with its index named ``'True'`` and its columns named
        ``'Prediction'``, extended by a ``'Total'`` row (column sums) and a
        ``'Total'`` column (row sums). The column is computed after the row
        was appended, so the bottom-right cell holds the grand total.
    """
    table = pd.DataFrame(cm, index=labels, columns=labels)
    table.index.name, table.columns.name = 'True', 'Prediction'

    # Append the column sums as a new last row ...
    column_totals = table.sum(axis=0)
    table.loc['Total'] = column_totals

    # ... then the row sums (including that new row) as a new last column.
    row_totals = table.sum(axis=1)
    table['Total'] = row_totals
    return table

In [None]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Greys, labels=None):
    """Render a confusion matrix as a heat map with the count written in each cell.

    Parameters
    ----------
    cm : array-like, 2-D
        Matrix of counts. Rows are drawn on the "True label" axis and columns
        on the "Predicted label" axis.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.
    labels : sequence, optional
        Tick labels applied to both axes. When omitted, each axis is labelled
        with its own 0-based indices, so both axes use the same numbering.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    cm = np.asarray(cm)
    n_rows, n_cols = cm.shape

    if labels is None:
        x_labels, y_labels = np.arange(n_cols), np.arange(n_rows)
    else:
        x_labels = y_labels = list(labels)

    plt.figure(figsize=(9, 3), dpi=72)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # Same labelling scheme on both axes (previously the x axis was shifted by +1).
    plt.xticks(np.arange(n_cols), x_labels, rotation=0)
    plt.yticks(np.arange(n_rows), y_labels)

    # Cells above half of the maximum count get white text, the rest black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(n_rows), range(n_cols)):
        plt.text(j, i, format(cm[i, j], '.0f'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are taken into account.
    plt.tight_layout()
    plt.show()
    
# ;

#     plt.figure(figsize=(8, 10))
#     sns.heatmap(mc, annot=True, fmt="d");
#     plt.title("Confusion matrix")
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')
#     plt.show()


In [None]:
# Empty score board: one row per evaluation metric, one column per model.
metric_names = ['accuracy', 'precision', 'recall', 'roc_auc_score']
model_names = ['BernoulliNB', 'GaussianNB']

metrics = pd.DataFrame(index=metric_names, columns=model_names)

metrics

### Conjuntos de entrenamiento y prueba 

In [None]:
target = 'default'

# Unscaled predictors and the binary target.
# NOTE(review): the client `id` column is not dropped in the cells shown here, so it
# presumably still sits in the feature matrix - confirm whether that is intended.
X_raw = df.drop(target, axis = 1)
Y = df[target]

# Split BEFORE scaling so no statistic of the test rows leaks into the scaler.
# The row assignment depends only on Y, test_size and random_state.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X_raw, Y, test_size=0.15, random_state=123, stratify=Y
)

# Fit the scaler on the training rows only, then apply it to both splits.
robust_scaler = RobustScaler()
X_train = robust_scaler.fit_transform(X_train_raw)
X_test = robust_scaler.transform(X_test_raw)

# Full feature matrix, scaled with the training statistics; kept so that `X`
# is still a scaled array for any code that uses it.
X = robust_scaler.transform(X_raw)

### BernoulliNB 

In [None]:
#
# Import the estimator
#
from sklearn.naive_bayes import BernoulliNB


start_time = time.time()

#
# Bernoulli naive Bayes classifier. The variable keeps its original name
# `gnb` so that any other cell referring to it still works.
#
gnb = BernoulliNB(
    alpha=1.0,        # additive (Laplace) smoothing parameter
    binarize=0.0,     # threshold used to binarize the features
    fit_prior=True,
    class_prior=None,
)

#
# Fit the classifier on the training split
#
gnb.fit(X_train, Y_train)

#
# Predict the class of every row in the test split
#
y_pred_test = gnb.predict(X_test)

#
# Evaluation
#

metrics.loc['accuracy','BernoulliNB'] = accuracy_score(y_pred=y_pred_test, y_true=Y_test)
metrics.loc['precision','BernoulliNB'] = precision_score(y_pred=y_pred_test, y_true=Y_test)
metrics.loc['recall','BernoulliNB'] = recall_score(y_pred=y_pred_test, y_true=Y_test)
# roc_auc_score takes (y_true, y_score) in that order; keywords make the roles explicit.
# The score is computed from the hard 0/1 predictions; passing
# gnb.predict_proba(X_test)[:, 1] instead would give a threshold-independent AUC.
metrics.loc['roc_auc_score','BernoulliNB'] = roc_auc_score(y_true=Y_test, y_score=y_pred_test)

#
# Confusion matrix
#

cm = confusion_matrix(y_pred=y_pred_test, y_true=Y_test)
matrix = MatrixConfusion(cm)
print(matrix)
print()
plot_confusion_matrix(cm)

#
# Classification report
#

sk_report = classification_report(
    digits=4,
    y_true=Y_test, 
    y_pred=y_pred_test)
print(sk_report)

# Wall-clock time of the whole cell (fit, predict, evaluation and plotting).
print("--- %s seconds ---" % (time.time() - start_time))