In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [444]:
# Client generation functions.
# State codes: 0 - ok, 1 - 1 bucket, 2 - 2 buckets, 3 - 3 buckets, 4 - 4 or more buckets, C - closed.
# rr[i, j] - probability of moving from state i to state j; j = 5 means early repayment.

def states2string(states):
    """Encode numeric payment states as their one-character symbols.

    Mapping: 0..4 -> '0'..'4', 5 -> 'C', 6 -> 'L'.

    Parameters:
        states: a single state, or a list / tuple / 1-D numpy array of states.

    Returns:
        A one-character string for a single state, otherwise the
        concatenation of the symbols in input order ('' for an empty sequence).

    Raises:
        KeyError: if a state is not one of 0..6.
    """
    mp = {0:'0',1:'1',2:'2',3:'3',4:'4',5:'C', 6:'L'}
    # Accept any of the common sequence containers, not only an exact `list`:
    # tuples and numpy arrays previously fell through to the scalar lookup
    # and failed with KeyError / TypeError.
    if isinstance(states, (list, tuple, np.ndarray)):
        return ''.join(mp[x] for x in states)
    return mp[states]

def augmentSymbol(s, pm):
    """Randomly replace one payment symbol with the missing marker '-'.

    Symbols 'C' and 'N' are never replaced. Any other symbol becomes '-'
    with probability `pm` and is kept with probability `1 - pm`.
    """
    protected = ('C', 'N')
    if s not in protected:
        # Two-outcome draw: index 0 is the missing marker, index 1 the symbol.
        return np.random.choice(['-', s], p=[pm, 1 - pm])
    return s

def augmentMissing(pp, pm):
    """Apply `augmentSymbol` to every symbol of `pp` and return the joined string.

    `pm` is passed through unchanged as the per-symbol masking probability.
    """
    masked = []
    for symbol in pp:
        masked.append(augmentSymbol(symbol, pm))
    return ''.join(masked)

def string2states(s):
    """Decode a payment string into a list of numeric states.

    '0'..'4' map to 0..4, 'C' to 5 and 'L' to 6. Any other character
    raises KeyError.
    """
    symbol_to_state = dict(zip('01234CL', range(7)))
    states = []
    for ch in s:
        states.append(symbol_to_state[ch])
    return states

def genPP(age, term, rr, s0, pMissing=0.05):
    """Simulate one credit's payment pattern and return it as a string.

    Parameters:
        age: number of months to simulate; `age <= 0` yields "".
        term: month index from which the transition rule changes (see below).
        rr: transition matrix indexed as rr[state, :]; each row is used as the
            probability vector over the 6 states 0..5.
        s0: probability vector over the 6 states for the first month.
        pMissing: per-symbol probability of masking with '-' (via augmentMissing).

    Returns:
        The simulated states encoded by `states2string`, newest month first,
        with missing markers applied.

    Transition rule for step i (0-based, after the first month):
        * state 5 (closed) is absorbing;
        * while `i < term`, or while the previous state is > 0, the next state
          is drawn from `rr[prev, :]`;
        * once `i >= term`, a previous state of 0 closes the credit, and a
          drawn state of 0 is also turned into 5 (closed).
    """
    if age <= 0:
        return ""
    pp = [np.random.choice(range(6), p=s0)]
    if age <= 1:
        # Single-month history: apply the same missing-value masking as the
        # general path (previously this branch returned the raw symbol, so
        # pMissing was silently ignored for one-month credits).
        return augmentMissing(states2string(pp), pm=pMissing)
    for i in range(age-1):
        prev = pp[-1]
        nxt = 5  # default: closed
        if prev < 5 and (i < term or prev > 0):
            nxt = np.random.choice(range(6), p=rr[prev,:])
            if i >= term and nxt==0:
                # Returning to state 0 after the term closes the credit.
                nxt = 5
        pp.append(nxt)
    # pp was built oldest-first; the string representation is newest-first.
    return augmentMissing(states2string(list(reversed(pp))), pm=pMissing)

def genCreditRR(rr, lamAge=20, lamTerm=10, emu=np.log(1e5), esigma=3, pMissing=0.1):
    """Generate one synthetic credit from a transition matrix.

    The first-month distribution puts mass only on states 0 and 5, in the
    proportion rr[0, 0] : rr[0, 5]. Age and term are Poisson draws, the limit
    is a log-normal draw rounded up to the next multiple of 1000.

    Returns:
        Tuple (limit, term, pp) where pp is the string produced by genPP.
    """
    stay_ok = rr[0, 0]
    close_now = rr[0, 5]
    s0 = np.zeros(6)
    s0[0] = stay_ok / (stay_ok + close_now)
    s0[5] = close_now / (stay_ok + close_now)

    # Draw order (age, term, limit, pattern) is kept as in the original.
    age = np.random.poisson(lam=lamAge)
    term = np.random.poisson(lam=lamTerm)
    raw_limit = np.exp(np.random.normal(loc=emu, scale=esigma))
    limit = np.ceil(raw_limit / 1e3) * 1e3
    pp = genPP(age, term, rr, s0, pMissing=pMissing)
    return (limit, term, pp)

def genCreditSimple(pBad=0.1, pEarlyRepayment=0.1, lamAge=20, pMissing=0.1):
    """Generate one credit from a 5x6 transition matrix built from two rates.

    Row 0 is derived directly from `pBad` and `pEarlyRepayment`; row 1 is a
    weight vector normalised to sum to 1; rows 2-4 are fixed constants.
    The matrix is handed to genCreditRR together with `lamAge` and `pMissing`.
    """
    pGood = 1 - pBad
    row0 = [pGood - pEarlyRepayment, pBad, 0, 0, 0, pEarlyRepayment]

    # Unnormalised weights for state 1, scaled to a probability vector.
    row1_weights = np.array([pGood * 0.33 / 0.9,
                             0.33,
                             pBad * 0.33 / 0.1,
                             0,
                             0,
                             pEarlyRepayment * pGood * 0.2])
    row1_total = np.sum(row1_weights)
    row1 = [w / row1_total for w in row1_weights]

    fixed_rows = [
        [0.10, 0.20, 0.10, 0.60, 0.0, 0.0],
        [0.05, 0.05, 0.05, 0.05, 0.8, 0.0],
        [0.03, 0.03, 0.02, 0.02, 0.9, 0.0],
    ]
    rr = np.array([row0, row1] + fixed_rows)
    return genCreditRR(rr, lamAge=lamAge, pMissing=pMissing)

def getClientTarget(data):
    """Derive a client's bad-probability from its credits and sample a target.

    Each record's third element is a payment string. '0' and 'L' count as
    good months, '1' counts once, '2'/'3'/'4' count with weights 2/3/4.
    pGood = good / (0.1 + good + weighted bad); when nothing is counted at
    all, pGood defaults to 0.5.

    Returns:
        Tuple (target, pBad) with target drawn as Bernoulli(pBad).
    """
    patterns = [record[2] for record in data]
    num0 = sum(p.count('0') + p.count('L') for p in patterns)
    num1 = sum(p.count('1') for p in patterns)
    num2p = sum(2 * p.count('2') + 3 * p.count('3') + 4 * p.count('4') for p in patterns)

    if num0 + num1 + num2p > 0:
        pGood = num0 / (0.1 + num0 + num1 + num2p)
    else:
        pGood = 0.5
    pBad = 1 - pGood
    return (np.random.binomial(1, pBad), pBad)

def genClient(lamNum=2, muBad=0.1, sigmaBad=0.1, pEarlyRepayment=0.05, muAge=20, sigmaAge=5, pMissing=0.1):
    """Generate one synthetic client: a list of credits plus a target.

    The number of credits is Poisson(lamNum), floored at 1. For every credit
    a bad rate (log-normal, capped at 0.5) and a mean age (log-normal) are
    drawn, in that order, before the credit itself is generated.
    Note that `sigmaBad` is used as the log-normal sigma directly, while
    `sigmaAge` is passed through np.log first.

    Returns:
        Tuple (data, target, prob) where data is the list of credits and
        (target, prob) comes from getClientTarget.
    """
    numCredits = max(1, np.random.poisson(lam=lamNum))
    data = []
    for _ in range(numCredits):
        bad_rate = min(0.5, np.random.lognormal(mean=np.log(muBad), sigma=sigmaBad))
        mean_age = np.random.lognormal(mean=np.log(muAge), sigma=np.log(sigmaAge))
        credit = genCreditSimple(pBad=bad_rate,
                                 pEarlyRepayment=pEarlyRepayment,
                                 pMissing=pMissing,
                                 lamAge=mean_age)
        data.append(credit)
    target, prob = getClientTarget(data)
    return (data, target, prob)

In [440]:
#Generate sample (as in RRs)
def genSample(numObs=1000, maxEntries=5, genObs=genClient):
    """Generate a wide DataFrame with one row per synthetic client.

    Parameters:
        numObs: number of clients (rows) to generate.
        maxEntries: number of credit slots per row; extra credits are dropped,
            unused slots are filled with None.
        genObs: zero-argument callable returning (credits, target, prob),
            where each credit is indexable as (limit, term, pp).

    Returns:
        DataFrame with columns accnt_id, trgt, prob, num (the full number of
        credits, even when some were dropped), followed by limitN, termN, ppN
        for N in 0..maxEntries-1. All columns have object dtype.
    """
    columns = (['accnt_id', 'trgt', 'prob', 'num'] +
               [f.format(i) for i in range(maxEntries) for f in ['limit{0}','term{0}','pp{0}']])
    res = []
    for i in range(numObs):
        (obs, trgt, prob) = genObs()
        row = [i, trgt, prob, len(obs)]
        for credit in obs[:maxEntries]:
            row += [credit[0], credit[1], credit[2]]
        # Pad the unused credit slots so every row has the same width.
        row += [None] * (3 * max(0, maxEntries - len(obs)))
        res.append(row)
    # Force an object array: without an explicit dtype numpy coerces rows that
    # mix numbers and strings to a string dtype whenever no None is present
    # (e.g. maxEntries=1), turning limits/terms/targets into text. The reshape
    # keeps the (rows, columns) layout even when numObs == 0.
    values = np.array(res, dtype=object).reshape(len(res), len(columns))
    return pd.DataFrame(values, columns=columns)

In [446]:
# Draw the synthetic training (10000 clients) and validation (1000 clients)
# samples with the default generator.
# NOTE(review): no random seed is set in the visible cells, so every run
# produces different samples and the recorded outputs will not reproduce.
train_sample = genSample(10000)
valid_sample = genSample(1000)
# Preview the first ten training rows.
train_sample[:10]

Unnamed: 0,accnt_id,trgt,prob,num,limit0,term0,pp0,limit1,term1,pp1,limit2,term2,pp2,limit3,term3,pp3,limit4,term4,pp4
0,0,0,0.269103,4,252000.0,8,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,9291000.0,7.0,110000000,345000.0,9.0,,74000.0,12.0,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,,
1,1,0,0.148936,2,606000.0,9,0-00-0,6000.0,10.0,C-0000011000,,,,,,,,,
2,2,0,0.00497512,2,222000.0,19,0000000000000,6000.0,8.0,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,,,,,,,,
3,3,0,0.00621118,3,120000.0,17,00000-000-00,2905000.0,10.0,000000,601000.0,4.0,,,,,,,
4,4,1,0.382716,1,8000.0,6,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,,,,,,,,,,,
5,5,1,0.957537,2,85033000.0,12,CCCCCCCCCCCCCCCCCCCCCCC34444--4444432-10,1724000.0,8.0,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,,,,,,,,
6,6,1,0.879663,2,641000.0,6,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC44...,2953000.0,10.0,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,,,,,,,,
7,7,0,0.5,1,161000.0,10,CCCCCCCCCCCCCC,,,,,,,,,,,,
8,8,0,0.00552486,2,10000.0,11,CCCCCCCCCCCCC0000000000,4000.0,8.0,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC0000000-0,,,,,,,,,
9,9,1,1.0,2,145000.0,7,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,3000.0,13.0,,,,,,,,,,


In [400]:
#Transformation functions
def transformPP(term, pp):
    """Relabel 'C' as 'L' inside the last `term` characters of a payment string.

    Characters before that tail are left untouched. When the string is
    shorter than `term` the whole string is relabelled. None is passed through.
    """
    if pp is None:
        return pp
    split = len(pp) - term
    if split < 0:
        split = 0
    head, tail = pp[:split], pp[split:]
    return head + tail.replace('C', 'L')

def truncPP(term, pp, trlen=60):
    """Return a payment string of exactly `trlen` characters.

    * pp is None and term is None -> 'X' repeated trlen times
    * pp is None, term is not None -> 'N' repeated trlen times
    * otherwise pp is cut to its first trlen characters, or right-padded
      with 'N' when it is shorter.
    """
    if pp is None:
        filler = 'X' if term is None else 'N'
        return filler * trlen
    shortfall = trlen - len(pp)
    if shortfall > 0:
        return pp + 'N' * shortfall
    return pp[:trlen]

def transformDF(df, name='pp{0}t', trlen=60):
    """Add a fixed-length, relabelled payment-pattern column for every credit slot.

    The number of slots is the number of columns named 'pp<digits>'. For each
    slot i the columns 'pp{i}' and 'term{i}' are combined through
    transformPP and truncPP, and the result is stored under `name.format(i)`.

    Parameters:
        df: input frame; it is not modified.
        name: format string for the new column names.
        trlen: target length passed to truncPP.

    Returns:
        A copy of `df` with the extra columns.
    """
    num = int(np.sum([x.replace('pp','').isnumeric() for x in df.columns]))
    res = df.copy()
    for i in range(num):
        pp_col = 'pp{0}'.format(i)
        term_col = 'term{0}'.format(i)
        # Walk the two columns in lockstep instead of iterrows(), which
        # builds a Series per row only to be unpacked again.
        res[name.format(i)] = [truncPP(t, transformPP(t, p), trlen)
                               for p, t in zip(df[pp_col], df[term_col])]
    return res


def transformToTensor(df, pp='pp{0}t', useX=False, numCredits=None):
    """Convert the transformed sample frame into numpy arrays for the model.

    Parameters:
        df: frame with columns 'trgt', 'limit{i}', 'term{i}', 'pp{i}' and the
            fixed-length pattern columns named by `pp`.
        pp: format string of the pattern columns to encode.
        useX: allow the 'X' symbol (encoded as 9) in the patterns.
        numCredits: use only the first `numCredits` credit slots; defaults to
            the number of 'pp<digits>' columns in `df`.

    Returns:
        Tuple (patterns, meta, target):
            patterns: int32 array [row, credit, position], each pattern
                encoded symbol by symbol in reversed string order;
            meta: float32 array [row, credit, 2] holding (limit, term),
                with -1 substituted for None;
            target: int32 array [row] taken from the 'trgt' column.

    Raises:
        ValueError: if `numCredits` exceeds the available slots, if 'X' is
            present while `useX` is False, or if the patterns do not all
            share one length.
        KeyError: if a pattern contains a symbol outside the mapping.
    """
    # Count the credit slots from the raw 'pp<digits>' columns.
    num_credits = int(np.sum([x.replace('pp','').isnumeric() for x in df.columns]))
    if numCredits is not None:
        if num_credits < numCredits:
            # Raising a bare string is invalid in Python 3 (it surfaces as an
            # unrelated TypeError), so use a real exception type.
            raise ValueError("Provided <numCredits> is greater than number of fields in DF")
        num_credits = numCredits

    # Validate that every encoded pattern has the same length across slots.
    num_mobs = None
    for i in range(num_credits):
        present = [x for x in df[pp.format(i)] if x is not None]
        lens = list(set(len(x) for x in present))
        numx = np.sum(['X' in x for x in present])
        if numx > 0 and not useX:
            raise ValueError("Not supposed to use X, but X is found in observations!")
        if len(lens) != 1:
            raise ValueError("Expected same length in all observations!")
        if num_mobs is None:
            num_mobs = lens[0]
        if num_mobs != lens[0]:
            raise ValueError("Expected same length in all observations!")

    mapping = {'0':0,'1':1,'2':2,'3':3,'4':4,'-':5,'L':6,'C':7,'N':8}
    if useX:
        mapping['X'] = 9

    res = []
    res_meta = []
    res_trgt = []
    for _, r in df.iterrows():
        cred = []
        cred_meta = []
        res_trgt.append(r.trgt)
        for i in range(num_credits):
            # Patterns are reversed before encoding.
            cred.append([mapping[x] for x in reversed(r[pp.format(i)])])
            cred_meta.append([-1 if r[f.format(i)] is None else r[f.format(i)] for f in ['limit{0}','term{0}']])
        res.append(cred)
        res_meta.append(cred_meta)
    return np.array(res, dtype=np.int32), np.array(res_meta, dtype=np.float32), np.array(res_trgt, dtype=np.int32)

In [451]:
# Keep clients with at most 5 credits and add the fixed-length pattern
# columns (pp0t, pp1t, ...) produced by transformDF.
# NOTE(review): this rebinds train_sample / valid_sample to the transformed
# frames, so the raw samples are no longer available under these names.
train_sample = transformDF(train_sample[train_sample.num <= 5])
valid_sample = transformDF(valid_sample[valid_sample.num <= 5])

In [587]:
# Model graph: per-position dense layers, softmax weighting across credits,
# a GRU over the resulting sequence and a two-class output layer.

# Hyper-parameters.
# param_MOBI_H1 is used by the 'MOBI-H1' layer below but was not defined in
# any visible cell, so this cell failed with NameError on a fresh kernel.
# NOTE(review): 10 matches the other layer sizes here - confirm the intended value.
param_MOBI_H1  = 10
param_MOBI_O   = 10
param_RNN_size = 10
param_OUT_H1   = 10
param_LR       = 1e-1

# Number of one-hot classes for the pattern symbols and number of meta features.
size_pp_dictionary =  9
size_meta_vars = 2

size_mob_vars = size_pp_dictionary + size_meta_vars

tf.reset_default_graph()

# Inputs: integer targets, integer-encoded patterns (rank 3) and meta features.
tfIn_Trgt = tf.placeholder(shape=(None,), dtype=tf.int32)
tfIn_PP = tf.placeholder(shape=(None, None, None), dtype=tf.int32)
tfIn_Meta = tf.placeholder(shape=(None, None, size_meta_vars), dtype=tf.float32)

with tf.name_scope(name='DATA-TRANSFORMATION'):
    tfY  = tf.one_hot(tfIn_Trgt, 2)
    tfXP = tf.one_hot(tfIn_PP, size_pp_dictionary)
    tfXM = tfIn_Meta
    # Meta features repeated along the third axis of the one-hot patterns.
    tfXMt = tf.reshape(tf.tile(tfXM, multiples=[1,1,tf.shape(tfXP)[2]]),
                       shape=[tf.shape(tfXM)[0], tf.shape(tfXM)[1], tf.shape(tfXP)[2], size_meta_vars])
    # Meta features are currently disabled: only the one-hot patterns are used.
    tfX = tfXP#tf.concat([tfXP, tfXMt], axis=3)

#1 define observation importance
with tf.name_scope(name='CR-MOB-Importance'):
    tfMI1 = tf.layers.dense(tfX, param_MOBI_H1, activation=tf.nn.relu, name='MOBI-H1')
    tfMIO  = tf.layers.dense(tfMI1, param_MOBI_O, activation=tf.nn.relu, name='MOBI-OUT')
    # NOTE(review): tfMIW is built but never consumed below - the softmax
    # weights are computed from tfMIO itself. Confirm whether the weighting
    # was meant to use tfMIW.
    tfMIW  = tf.layers.dense(tfMI1, param_MOBI_O, name='MOBI-WEIGHT')

#2 recombine importance
with tf.name_scope(name='CR-MOB-Weighting'):
    # Softmax over axis 1, then a weighted sum that removes that axis.
    tfWI = tf.nn.softmax(tfMIO, dim=1)
    tfWO = tf.reduce_sum(tfMIO * tfWI, axis=1)

#3 define RNN on these inputs
with tf.name_scope(name='MOB-RNN'):
    rnnCell = tf.nn.rnn_cell.GRUCell(num_units=param_RNN_size)
    # Only the final state is used downstream.
    _, tfMO = tf.nn.dynamic_rnn(rnnCell, inputs=tfWO, dtype=tf.float32)

#4 define output layer
with tf.name_scope(name='OUTPUT-FFNN'):
    tfOH1 = tf.layers.dense(tfMO, param_OUT_H1, activation=tf.nn.relu, name='OUT-H1')
    tfOO = tf.layers.dense(tfOH1, 2, name='OUT-OUT')
    tft = tf.reduce_mean(tfOO)

#5a define loss functions
with tf.name_scope(name='LOSS-OPTIMIZER'):
    tfLoss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tfY,logits=tfOO))
    tfTrain = tf.train.AdamOptimizer(param_LR).minimize(tfLoss)

#5b prediction
# Softmax probability of class 1.
tfOutProb = tf.nn.softmax(tfOO)[:,1]

# NOTE(review): absolute, machine-specific log directory.
tffw = tf.summary.FileWriter('D:/Jupyter/Logs/00_BurPat_RNN', tf.get_default_graph())
print('Graph creation complete')

Graph creation complete


In [576]:
# Simplified single-credit model graph: one-hot patterns fed straight into a
# BasicRNNCell, followed by a two-class output layer.
# NOTE(review): this cell rebinds the same global names (tfIn_*, tfLoss,
# tfTrain, tfOutProb, ...) as the other graph cell; whichever cell ran last
# determines the graph used by the training cell.

# Hyper-parameters. param_MOBI_O and param_OUT_H1 are assigned but not used
# anywhere in this cell.
param_MOBI_O   = 10
param_RNN_size = 10
param_OUT_H1   = 10
param_LR       = 1e-1

# Number of one-hot classes for the pattern symbols and number of meta features.
size_pp_dictionary =  9
size_meta_vars = 2

size_mob_vars = size_pp_dictionary + size_meta_vars

tf.reset_default_graph()

# Inputs: the second axis is fixed to size 1 in this variant.
tfIn_Trgt = tf.placeholder(shape=(None,), dtype=tf.int32)
tfIn_PP = tf.placeholder(shape=(None, 1, None), dtype=tf.int32)
tfIn_Meta = tf.placeholder(shape=(None, 1, size_meta_vars), dtype=tf.float32)

with tf.name_scope(name='DATA-TRANSFORMATION'):
    tfY  = tf.one_hot(tfIn_Trgt, 2)
    tfXP = tf.one_hot(tfIn_PP, size_pp_dictionary)
    tfXM = tfIn_Meta
    # Meta features repeated along the third axis of the one-hot patterns;
    # tfXMt is not consumed while the concat below stays commented out.
    tfXMt = tf.reshape(tf.tile(tfXM, multiples=[1,1,tf.shape(tfXP)[2]]),
                       shape=[tf.shape(tfXM)[0], tf.shape(tfXM)[1], tf.shape(tfXP)[2], size_meta_vars])
    # Disabled alternative input that would append the meta features:
    #tfX = tf.concat([tfXP, tfXMt], axis=3)
    tfX = tfXP
    
    # Summing over axis 1 (size 1 per the placeholder shape) drops that axis.
    tfWO = tf.reduce_sum(tfX, axis=1)
    
#3 define RNN on these inputs
with tf.name_scope(name='MOB-RNN'):
    rnnCell = tf.nn.rnn_cell.BasicRNNCell(num_units=param_RNN_size)
    # Explicit zero initial state sized from the batch dimension of tfY.
    tfMZero = rnnCell.zero_state(batch_size=tf.shape(tfY)[0], dtype=tf.float32)
    tfMHist, tfMO = tf.nn.dynamic_rnn(rnnCell, inputs=tfWO, initial_state=tfMZero)

#4 define output layer
with tf.name_scope(name='OUTPUT-FFNN'):
    # Two logits computed directly from the final RNN state.
    tfOO = tf.layers.dense(tfMO, 2, name='OUT-OUT')
    tft = tf.reduce_mean(tfOO)

#5a define loss functions
with tf.name_scope(name='LOSS-OPTIMIZER'):
    tfLoss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tfY,logits=tfOO))
    tfTrain = tf.train.AdamOptimizer(param_LR).minimize(tfLoss)
    
#5b prediction
# Softmax probability of class 1.
tfOutProb = tf.nn.softmax(tfOO)[:,1]
    
# NOTE(review): absolute, machine-specific log directory.
tffw = tf.summary.FileWriter('D:/Jupyter/Logs/00_BurPat_RNN', tf.get_default_graph())
print('Graph creation complete')

Graph creation complete


In [483]:
# Encode only the clients with exactly one credit, using the first credit
# slot (numCredits=1) and disallowing the 'X' symbol.
train_pp, train_meta, train_trgt = transformToTensor(train_sample[train_sample.num==1], useX=False, numCredits=1)
valid_pp, valid_meta, valid_trgt = transformToTensor(valid_sample[valid_sample.num==1], useX=False, numCredits=1)

In [588]:
# Training schedule: num_epoch reporting rounds of num_step optimizer steps each.
num_epoch = 10
num_step  = 10

# Every step feeds the complete training set; the validation set is only
# used for the loss reported after each round.
train_batch = {tfIn_PP: train_pp, tfIn_Meta: train_meta, tfIn_Trgt: train_trgt}
valid_batch = {tfIn_PP: valid_pp, tfIn_Meta: valid_meta, tfIn_Trgt: valid_trgt}

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    for i in range(num_epoch):
        for j in range(num_step):
            tfs.run(tfTrain, feed_dict=train_batch)
        print('Epoch {0}: loss={1}'.format(i, tfs.run(tfLoss, feed_dict=valid_batch)))
    # Class-1 probabilities from the trained model, computed before the
    # session (and its variables) is closed.
    train_prob = tfs.run(tfOutProb, feed_dict=train_batch)
    valid_prob = tfs.run(tfOutProb, feed_dict=valid_batch)

Epoch 0: loss=0.6302306056022644
Epoch 1: loss=0.6020666360855103
Epoch 2: loss=0.545937180519104
Epoch 3: loss=0.6190164089202881
Epoch 4: loss=0.5081198811531067
Epoch 5: loss=0.49319905042648315
Epoch 6: loss=0.4992390275001526
Epoch 7: loss=0.4954121708869934
Epoch 8: loss=0.5095335245132446
Epoch 9: loss=0.5117131471633911


In [589]:
# First ten predicted class-1 probabilities on the training set.
train_prob[:10]

array([ 0.23962305,  0.43548912,  0.23962305,  0.45882943,  0.4783622 ,
        0.92447424,  0.00269646,  0.23962305,  0.23962305,  0.01044583], dtype=float32)

In [590]:
# Generator-side 'prob' values for the same single-credit clients, for
# comparison with the predictions above.
train_sample[train_sample.num==1].prob[:10]

4       0.382716
7            0.5
11      0.047619
13      0.779006
15      0.683258
16      0.909502
19     0.0909091
21      0.960032
22      0.855453
23    0.00990099
Name: prob, dtype: object

In [575]:
# NOTE(review): leftover debugging cell - `tmp` is not defined in any visible
# cell, so this fails on a fresh kernel.
tmp[3,:]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)