## Observations 
From https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset/discussion/34608
- Sometimes a bill statement amount can be negative. This is most likely due to the customer having a negative (credit) balance on their card, e.g. in a particular month they paid more than they owed on their bill. This might happen if autopay is set up to pay the same amount every month regardless of the balance. 
- `PAY_N == 0` indicates use of a revolving line of credit, which might indicate that the customer is using a credit card rather than taking out a one-time loan. 

In [1]:
import pandas as pd 
import altair as alt

In [2]:
# Load the credit-card default dataset; the relative path is resolved against
# the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='UCI_Credit_Card.csv')

## Customer who pays on time and does not use credit very frequently, but pays an amount quite close to the balance
No consumption for a few months; the bills for the previous months were paid in full.

In [3]:
# First row (as a Series) among the customers whose PAY_0 status is -2.
df.loc[df['PAY_0'].eq(-2)].iloc[0]

ID                               10.0
LIMIT_BAL                     20000.0
SEX                               1.0
EDUCATION                         3.0
MARRIAGE                          2.0
AGE                              35.0
PAY_0                            -2.0
PAY_2                            -2.0
PAY_3                            -2.0
PAY_4                            -2.0
PAY_5                            -1.0
PAY_6                            -1.0
BILL_AMT1                         0.0
BILL_AMT2                         0.0
BILL_AMT3                         0.0
BILL_AMT4                         0.0
BILL_AMT5                     13007.0
BILL_AMT6                     13912.0
PAY_AMT1                          0.0
PAY_AMT2                          0.0
PAY_AMT3                          0.0
PAY_AMT4                      13007.0
PAY_AMT5                       1122.0
PAY_AMT6                          0.0
default.payment.next.month        0.0
Name: 9, dtype: float64

## Most likely using a credit card, paying off only the minimum every month

In [4]:
# First row (as a Series) among the customers whose PAY_0 status is 0.
df.loc[df['PAY_0'].eq(0)].iloc[0]

ID                                3.0
LIMIT_BAL                     90000.0
SEX                               2.0
EDUCATION                         2.0
MARRIAGE                          2.0
AGE                              34.0
PAY_0                             0.0
PAY_2                             0.0
PAY_3                             0.0
PAY_4                             0.0
PAY_5                             0.0
PAY_6                             0.0
BILL_AMT1                     29239.0
BILL_AMT2                     14027.0
BILL_AMT3                     13559.0
BILL_AMT4                     14331.0
BILL_AMT5                     14948.0
BILL_AMT6                     15549.0
PAY_AMT1                       1518.0
PAY_AMT2                       1500.0
PAY_AMT3                       1000.0
PAY_AMT4                       1000.0
PAY_AMT5                       1000.0
PAY_AMT6                       5000.0
default.payment.next.month        0.0
Name: 2, dtype: float64

In [5]:
# Bar chart of the default label, coloured and faceted by SEX.
#
# - The label column is renamed for plotting: Vega-Lite interprets '.' inside a
#   field name as nested-property access, so 'default.payment.next.month' would
#   not resolve to the DataFrame column. The original name is kept as the axis
#   title.
# - The label is encoded as nominal (:N), so no binning is applied.
# - random_state pins the sample so the figure is identical on every run; the
#   sample size is capped at len(df) so a smaller frame does not raise.
#   (The 5000-row sample presumably keeps the chart within Altair's default row
#   limit -- confirm.)
alt.Chart(
    df.sample(n=min(len(df), 5000), random_state=0)
      .rename(columns={'default.payment.next.month': 'default_next_month'})
).mark_bar().encode(
    alt.X('default_next_month:N', title='default.payment.next.month'),
    y='count()',
    color='SEX:N',
    column='SEX:N')

In [6]:
def func(row,
         pay_columns=('PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'),
         duly_paid_code=-1):
    """Count how many repayment-status fields of ``row`` equal ``duly_paid_code``.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by column name, e.g. a row Series produced by
        ``DataFrame.iterrows()`` or a plain ``dict``.
    pay_columns : iterable of str, optional
        Names of the status fields to inspect. Defaults to the six status
        columns used by this notebook (the names skip ``PAY_1``).
    duly_paid_code : optional
        Status value that is counted. Defaults to ``-1``.

    Returns
    -------
    int
        Number of inspected fields whose value equals ``duly_paid_code``
        (between 0 and ``len(pay_columns)``).

    Raises
    ------
    KeyError
        If ``row`` does not contain one of ``pay_columns``.
    """
    # Summing literal 1s (rather than the comparison results) keeps the return
    # value a plain Python int even when `row` holds numpy scalars.
    return sum(1 for column in pay_columns if row[column] == duly_paid_code)

In [7]:
# Per customer, count how many of the six PAY_* status columns equal -1.
# This is a vectorised equivalent of calling func() on every row: it avoids the
# slow Python-level iterrows() loop, and because the result carries df's own
# index it stays correctly aligned even if df is not indexed 0..n-1.
df['num_times_duly_paid'] = (
    df[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]
    .eq(-1)
    .sum(axis=1)
)

In [8]:
import matplotlib.pyplot as plt

In [9]:
# Binned histogram of num_times_duly_paid, coloured and faceted by SEX.
# random_state pins the sample so the figure is identical on every run; the
# sample size is capped at len(df) so a smaller frame does not raise.
alt.Chart(df.sample(n=min(len(df), 5000), random_state=0)).mark_bar().encode(
    alt.X('num_times_duly_paid', bin=True),
    y='count()',
    color='SEX:N',
    column='SEX:N')

In [10]:
# Binned histogram of num_times_duly_paid, coloured and faceted by EDUCATION.
# random_state pins the sample so the figure is identical on every run; the
# sample size is capped at len(df) so a smaller frame does not raise.
alt.Chart(df.sample(n=min(len(df), 5000), random_state=0)).mark_bar().encode(
    alt.X('num_times_duly_paid', bin=True),
    y='count()',
    color='EDUCATION:N',
    column='EDUCATION:N')

In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) of the derived feature.
df.loc[:, 'num_times_duly_paid'].describe()

count    30000.000000
mean         1.154667
std          1.929111
min          0.000000
25%          0.000000
50%          0.000000
75%          2.000000
max          6.000000
Name: num_times_duly_paid, dtype: float64