In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Read the four raw training tables. The tuple order below decides which file
# feeds which name, so keep both sides aligned when editing.
performance, facturation, payments, transactions = map(
    pd.read_csv,
    (
        'performance_train.csv',
        'facturation_train.csv',
        'paiements_train.csv',
        'transactions_train.csv',
    ),
)

# Account identifiers, taken from the performance table.
customer_ids = performance['ID_CPTE']

# Sum of the Default column divided by the number of performance rows.
print('Proportion of clients who default:', sum(performance['Default'])/len(performance))

Proportion of clients who default: 0.19336134453781512


# Raw Data Preview

#### Performance

In [3]:
# Show the first rows of the raw performance table.
performance.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,99690111,2015-12-01,0
1,57427180,2012-12-01,0
2,29617912,2015-12-01,0
3,61632809,2015-12-01,0
4,14117855,2013-12-01,0


#### Facturation

In [4]:
# Show the first rows of the raw facturation table.
facturation.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,99690111,2015-05-01,2015-05-03,8497.84,4293.12,16200.0,0
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
2,99690111,2015-06-01,2015-05-31,10790.95,5224.44,16200.0,0
3,99690111,2015-10-01,2015-10-04,12388.46,4786.08,16200.0,0
4,99690111,2015-11-01,2015-11-02,12746.5,4818.48,16200.0,0


#### Payments

In [5]:
# Show the first rows of the raw payments table.
payments.head()

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
0,99690111,208.0,2015-04-26 00:00:00,Q
1,99690111,176.8,2015-05-28 00:00:00,Q
2,99690111,200.0,2015-03-27 04:00:00,Q
3,99690111,80.8,2015-04-02 00:00:00,Q
4,99690111,250.0,2015-11-24 00:00:00,Q


#### Transactions

In [6]:
# Show the first rows of the raw transactions table.
transactions.head()

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
0,99690111,A,365767,DP,C,5927.0,52.53,E,2015-06-20 12:00:00,F,AN
1,99690111,L,2635650,DP,C,13343.0,28.35,B,2015-01-25 12:00:00,F,AN
2,99690111,L,2635650,DP,C,13343.0,0.0,A,2015-01-26 12:00:00,G,AN
3,99690111,J,680536,AF,C,9430.0,0.0,A,2015-03-25 08:00:00,G,AW
4,99690111,J,680536,AF,C,10600.0,0.0,A,2015-03-03 08:00:00,G,AW


In [7]:
# Report the row count of each raw table, one line per table.
length_labels = (
    'performance length:\t',
    'facturation length:\t',
    'payments length:\t',
    'transactions length:\t',
)
raw_tables = (performance, facturation, payments, transactions)

for length_label, raw_table in zip(length_labels, raw_tables):
    print(length_label, len(raw_table))

performance length:	 11900
facturation length:	 166543
payments length:	 292320
transactions length:	 690730


In [14]:
class customer:
    """Container for everything known about a single account.

    Parameters
    ----------
    customer_id
        Identifier of the account (an ``ID_CPTE`` value).
    performance, facturation, payments, transactions : pandas.DataFrame
        The rows of each raw table that belong to this account. They are
        stored as given, without copying. ``performance`` must hold at least
        one row and expose ``PERIODID_MY`` and ``Default`` columns.

    Attributes
    ----------
    customer_id, performance, facturation, payments, transactions
        The constructor arguments, unchanged.
    assessment
        ``PERIODID_MY`` of the first performance row.
    default
        ``Default`` of the first performance row.

    Raises
    ------
    IndexError
        If ``performance`` has no rows.
    """

    def __init__(self, customer_id, performance, facturation, payments, transactions):

        self.customer_id  = customer_id
        self.performance  = performance
        self.facturation  = facturation
        self.payments     = payments
        self.transactions = transactions

        # Use positional access (.iloc), not label access ([0]): frames that
        # come from a boolean filter (as generate_clients produces) keep the
        # row labels of the source table, so label 0 only exists for the
        # account that happens to sit in the first row of the source.
        self.assessment = performance['PERIODID_MY'].iloc[0]
        self.default    = performance['Default'].iloc[0]

In [15]:
def generate_clients(customer_ids, *dfs):
    """Yield, per customer id, the id followed by its rows from every table.

    Parameters
    ----------
    customer_ids : iterable
        Identifiers to look up; results are yielded in this order.
    *dfs : pandas.DataFrame
        Tables that each contain an ``ID_CPTE`` column.

    Yields
    ------
    list
        ``[customer_id, rows_0, rows_1, ...]`` where ``rows_i`` holds the rows
        of ``dfs[i]`` whose ``ID_CPTE`` equals the id, in their original order
        and with their original index labels. A table without rows for the id
        contributes an empty frame with the same columns.

    Notes
    -----
    Each table is grouped by ``ID_CPTE`` once, when iteration starts, instead
    of building a boolean mask over every table for every customer id.
    """

    # One pass per table: map each ID_CPTE value to the positions of its rows.
    positions_by_table = [df.groupby('ID_CPTE', sort=False).indices for df in dfs]

    for cus in customer_ids:

        record = [cus]
        for df, positions in zip(dfs, positions_by_table):
            row_positions = positions.get(cus)
            # Unknown id: empty slice keeps the table's columns and dtypes.
            record.append(df.iloc[0:0] if row_positions is None else df.iloc[row_positions])

        yield record

In [16]:
# Lazily pair every customer id with its slice of each raw table ...
client_generator = generate_clients(
    customer_ids,
    performance,
    facturation,
    payments,
    transactions,
)

# ... then materialise one `customer` object per id, in `customer_ids` order.
clients = [customer(cus_id, *cus_tables) for cus_id, *cus_tables in client_generator]

## Notes

- All `PERIODID_MY` values fall on the 1st day of a month, i.e. `PERIODID_MY = 201X-XX-01`.

#### Performance

In [31]:
# Performance rows stored on the first customer object.
clients[0].performance

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,99690111,2015-12-01,0


#### Facturation

In [120]:
# Count how many facturation rows carry each DelqCycle value.
facturation['DelqCycle'].value_counts()

0    157173
1      7899
2      1285
3       151
4        30
5         5
Name: DelqCycle, dtype: int64

In [113]:
# First customer's facturation rows ordered by PERIODID_MY; show the top of the sorted frame.
clients[0].facturation.sort_values(by=['PERIODID_MY']).head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
7,99690111,2014-12-01,2014-12-03,1151.85,0.0,12000.0,0
10,99690111,2015-01-01,2015-01-02,2298.96,350.0,12000.0,0
8,99690111,2015-02-01,2015-01-31,4045.67,1148.45,16200.0,0
11,99690111,2015-03-01,2015-03-03,5926.2,2567.25,16200.0,0


In [114]:
# Second customer's facturation rows ordered by PERIODID_MY; show the top of the sorted frame.
clients[1].facturation.sort_values(by=['PERIODID_MY']).head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
26,57427180,2011-11-01,2011-11-04,392.2,0.0,2800.0,0
27,57427180,2011-12-01,2011-12-02,219.0,0.0,2800.0,0
20,57427180,2012-01-01,2012-01-02,233.58,236.08,2800.0,0
25,57427180,2012-02-01,2012-02-03,234.0,0.0,2800.0,0
21,57427180,2012-03-01,2012-03-03,809.12,0.0,2800.0,1


#### Payments

In [115]:
# First customer's payments ordered by TRANSACTION_DTTM; show the top of the sorted frame.
clients[0].payments.sort_values(by=['TRANSACTION_DTTM']).head()

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
12,99690111,262.5,2015-01-26 00:00:00,Q
11,99690111,303.0,2015-02-26 00:00:00,Q
2,99690111,200.0,2015-03-27 04:00:00,Q
3,99690111,80.8,2015-04-02 00:00:00,Q
0,99690111,208.0,2015-04-26 00:00:00,Q


#### Transactions

In [116]:
# First customer's transactions ordered by TRANSACTION_DTTM; show the top of the sorted frame.
clients[0].transactions.sort_values(by=['TRANSACTION_DTTM']).head()

Unnamed: 0,ID_CPTE,MERCHANT_CATEGORY_XCD,MERCHANT_CITY_NAME,MERCHANT_COUNTRY_XCD,DECISION_XCD,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT,TRANSACTION_CATEGORY_XCD,TRANSACTION_DTTM,TRANSACTION_TYPE_XCD,SICGROUP
193,99690111,YZ,365767,DP,C,10196.0,21.63,E,2014-12-31 16:00:00,F,AG
246,99690111,QQ,2122999,DP,C,10174.0,51.5,B,2015-01-01 12:00:00,D,AJ
93,99690111,EE,2269449,DP,C,10124.0,42.84,E,2015-01-04 16:00:00,F,AX
234,99690111,QQ,2122999,DP,C,10082.0,300.0,A,2015-01-05 12:00:00,E,AJ
64,99690111,EE,365767,DP,C,9767.0,60.0,E,2015-01-06 16:00:00,F,AX


In [88]:
# Distinct TRANSACTION_TYPE_XCD codes among this customer's transactions.
# Wrapped in set() so the cell yields a set of codes rather than the full
# column, which is what the output recorded for this cell shows.
set(clients[7].transactions['TRANSACTION_TYPE_XCD'])

{'C', 'F'}

In [117]:
# Payment rows whose PAYMENT_REVERSAL_XFLG is 'N' (the previews above only show 'Q').
payments[payments['PAYMENT_REVERSAL_XFLG'] == 'N'].head()

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
2464,13888046,555.33,2012-01-05 12:00:00,N
4121,16989442,27952.68,2014-07-01 12:00:00,N
33905,60095553,525.0,2016-05-30 00:00:00,N
46169,13461194,1010.0,2014-05-18 12:00:00,N
55058,43175568,1008.06,2015-05-02 12:00:00,N
