In [28]:
import tensorflow as tf
import numpy as np
import pandas as pd
import datetime
import time

In [3]:
#Client generation functions
#0 - ok, 1 - 1 bucket, 2 - 2, 3 - 3, 4 - 4+, C - closed
#rr[i,j] - p ith to jth, j=5 => Early Repayment

def states2string(states):
    mp = {0:'0',1:'1',2:'2',3:'3',4:'4',5:'C', 6:'L'}
    if type(states) is list:
        return ''.join([mp[x] for x in states])
    return mp[states]

def augmentSymbol(s, pm):
    if s in ('C', 'N'):
        return s
    return np.random.choice(['-',s],p=[pm,1-pm])

def augmentMissing(pp, pm):
    return ''.join([augmentSymbol(x, pm) for x in pp])

def string2states(s):
    mp = {'0':0,'1':1,'2':2,'3':3,'4':4,'C':5, 'L':6}
    return [mp[x] for x in s]

def genPP(age, term, rr, s0, pMissing=0.05):
    if age <= 0:
        return ""
    pp = [np.random.choice(range(6), p=s0)]
    if age <= 1:
        return states2string(pp)
    for i in range(age-1):
        prev = pp[-1]
        nxt = 5
        if prev < 5 and (i < term or prev > 0):
            nxt = np.random.choice(range(6), p=rr[prev,:])
            if i >= term and nxt==0:
                nxt = 5
        pp.append(nxt)
    return augmentMissing(states2string(list(reversed(pp))), pm=pMissing)

def genCreditRR(rr, lamAge=20, lamTerm=10, emu=np.log(1e5), esigma=3, pMissing=0.1):
    s0 = np.zeros(6)
    s0[0] = rr[0,0] / (rr[0,0] + rr[0,5])
    s0[5] = rr[0,5] / (rr[0,0] + rr[0,5])
    
    age = np.random.poisson(lam=lamAge)
    term = np.random.poisson(lam=lamTerm)
    limit = np.ceil(np.exp(np.random.normal(loc=emu, scale=esigma)) / 1e3) * 1e3
    pp = genPP(age, term, rr, s0, pMissing=pMissing)
    return (limit, term, pp)    

def genCreditSimple(pBad=0.1, pEarlyRepayment=0.1, lamAge=20, pMissing=0.1):
    pGood = (1 - pBad) 
    r0 = [(pGood - pEarlyRepayment), pBad, 0, 0, 0, pEarlyRepayment]
    r1x = np.array([pGood * 0.33 / 0.9, 0.33, pBad * 0.33 / 0.1, 0, 0, pEarlyRepayment * pGood * 0.2])
    r1s = np.sum(r1x)
    r1 = [x/r1s for x in r1x]
    r2p = [[0.10, 0.20, 0.10, 0.60, 0.0, 0.0],
      [0.05, 0.05, 0.05, 0.05, 0.8, 0.0],
      [0.03, 0.03, 0.02, 0.02, 0.9, 0.0]]
    rr = np.array([r0] + [r1] + r2p)
    return genCreditRR(rr, lamAge=lamAge, pMissing=pMissing)

def getClientTarget(data):
    num0 = 0
    num1 = 0
    num2p = 0
    for r in data:
        num0 += r[2].count('0') + r[2].count('L')
        num1 += r[2].count('1')
        num2p += 2 * r[2].count('2') + 3 * r[2].count('3') + 4 * r[2].count('4')
    pGood = 0.5
    if num0 + num1 + num2p > 0:
        pGood = num0 / (0.1 + num0 + num1 + num2p)
    pBad = 1 - pGood
    return (np.random.binomial(1, pBad), pBad)

def genClient(lamNum=2, muBad=0.1, sigmaBad=0.1, pEarlyRepayment=0.05, muAge=20, sigmaAge=5, pMissing=0.1):
    numCredits = max(1, np.random.poisson(lam=lamNum))
    data = [genCreditSimple(pBad=min(0.5,np.random.lognormal(mean=np.log(muBad), sigma=sigmaBad)),
                            pEarlyRepayment=pEarlyRepayment, pMissing=pMissing,
                           lamAge=np.random.lognormal(mean=np.log(muAge), sigma=np.log(sigmaAge)))
            for i in range(numCredits)]
    target, prob = getClientTarget(data)
    return (data, target, prob)

In [4]:
#Generate sample (as in RRs)
def genSample(numObs=1000, maxEntries=5, genObs=genClient):
    res = []
    for i in range(numObs):
        (obs, trgt, prob) = genObs()
        row = [i, trgt, prob, len(obs)]
        for j in range(min(len(obs), maxEntries)):
            row += [obs[j][0], obs[j][1], obs[j][2]]
        for j in range(len(obs), maxEntries):
            row += [None, None, None]
        res.append(row)
    return pd.DataFrame(np.array(res),
                        columns=['accnt_id', 'trgt', 'prob', 'num'] +
                        [f.format(i) for i in range(maxEntries) for f in ['limit{0}','term{0}','pp{0}']])

In [5]:
train_sample = genSample(50000)
valid_sample = genSample(5000)
train_sample[:10]

Unnamed: 0,accnt_id,trgt,prob,num,limit0,term0,pp0,limit1,term1,pp1,limit2,term2,pp2,limit3,term3,pp3,limit4,term4,pp4
0,0,1,0.57346,2,9420000.0,8,CCCCCCCCCCCCCCCC00044-210,2265000.0,7.0,CCCCCCCCCCCCCCCCCCCCCCCCCC000-1-00,,,,,,,,,
1,1,0,0.0728477,1,158000.0,14,CCCCCCCCCCCCC000010000000000,,,,,,,,,,,,
2,2,1,0.646302,2,7000.0,13,10000-,105000.0,14.0,4443-211-0000000--,,,,,,,,,
3,3,0,0.047619,1,190000.0,11,CCCCCC00,,,,,,,,,,,,
4,4,0,0.67033,1,633000.0,5,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,,,,,,,,,,,
5,5,0,0.202279,3,19000.0,15,CCCCCCCCCCCC12100-00000000000,1510000.0,14.0,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,3000.0,13.0,0.0,,,,,,
6,6,0,0.00763359,1,2647000.0,15,CCCCCCCCCCCCCCCCCCCCCCCC000-00-0-0000000,,,,,,,,,,,,
7,7,1,0.970326,2,1027000.0,9,CCCCCCCCCCCCCCCCCCCCCCCCC14444444444-444321144...,94000.0,8.0,0,,,,,,,,,
8,8,0,0.00990099,2,241000.0,12,CCCCCCC0000,65000.0,10.0,000000,,,,,,,,,
9,9,1,0.135802,1,1000.0,10,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,,,,,,,,,,,


In [25]:
#Transformation functions
def transformPP(term, pp):
    if pp is None:
        return pp
    pplen = len(pp)
    lst = max(0, pplen - term)
    return pp[:lst] + pp[lst:].replace('C', 'L')

def truncPP(term, pp, trlen=60):
    if pp is None:
        if term is None:
            return 'X'*trlen
        else:
            return 'N'*trlen
    pplen = len(pp)
    if pplen >= trlen:
        return pp[:trlen]
    return pp + 'N'*(trlen - pplen)

def transformDF(df, name='pp{0}t', trlen=60):
    num = np.sum([x.replace('pp','').isnumeric() for x in df.columns])
    res = df.copy()
    for i in range(num):
        cols = ['pp{0}'.format(i), 'term{0}'.format(i)]
        res[name.format(i)] = [truncPP(t, transformPP(t, p), trlen) for _,(p,t) in df[cols].iterrows()]
    return res


def transformToTensor(df, pp='pp{0}t', useX=False, numCredits=None):
    #check dimensions
    #num credits
    num_credits = np.sum([x.replace('pp','').isnumeric() for x in df.columns])
    if numCredits is not None:
        if num_credits < numCredits:
            raise "Provided <numCredits> is greater than number of fields in DF"
        num_credits = numCredits
    num_mobs = None
    for i in range(num_credits):
        lens = list(set(len(x) for x in df[pp.format(i)] if x is not None))
        numx = np.sum(['X' in x for x in df[pp.format(i)] if x is not None])
        if numx > 0 and not useX:
            raise "Not supposed to use X, but X is found in observations!"
        if len(lens) != 1:
            raise "Expected same length in all observations!"
        if num_mobs is None:
            num_mobs = lens[0]
        if num_mobs != lens[0]:
            raise "Expected same length in all observations!"
    mapping = {'0':0,'1':1,'2':2,'3':3,'4':4,'-':5,'L':6,'C':7,'N':8}
    if useX:
        mapping['X'] = 9
    
    res = []
    res_meta = []
    res_trgt = []
    for _, r in df.iterrows():
        cred = []
        cred_meta = []
        res_trgt.append(r.trgt)
        for i in range(num_credits):
            cred.append([mapping[x] for x in reversed(r[pp.format(i)])])
            cred_meta.append([-1 if r[f.format(i)] is None else r[f.format(i)] for f in ['limit{0}','term{0}']])
        res.append(cred)
        res_meta.append(cred_meta)
    return np.array(res, dtype=np.int32), np.array(res_meta, dtype=np.float32), np.array(res_trgt, dtype=np.int32)

def randomBatch(tensorTuple, batchSize=64):
    ids = np.random.choice(range(tensorTuple[0].shape[0]), batchSize)
    return (x[ids,] for x in tensorTuple)

In [7]:
train_sample = transformDF(train_sample[train_sample.num <= 5])
valid_sample = transformDF(valid_sample[valid_sample.num <= 5])

In [38]:
param_MOBI_H1  = 30
param_MOBI_O   = 20
param_RNN_size = 60
param_OUT_H1   = 100
param_LR       = 1e-3

size_pp_dictionary =  9
size_meta_vars = 2

size_mob_vars = size_pp_dictionary + size_meta_vars

tf.reset_default_graph()

tfIn_Trgt = tf.placeholder(shape=(None,), dtype=tf.int32)
tfIn_PP = tf.placeholder(shape=(None, None, None), dtype=tf.int32)
tfIn_Meta = tf.placeholder(shape=(None, None, size_meta_vars), dtype=tf.float32)

with tf.name_scope(name='DATA-TRANSFORMATION'):
    tfY  = tf.one_hot(tfIn_Trgt, 2)
    tfXP = tf.one_hot(tfIn_PP, size_pp_dictionary)
    tfXM = tfIn_Meta
    tfXMt = tf.reshape(tf.tile(tfXM, multiples=[1,1,tf.shape(tfXP)[2]]),
                       shape=[tf.shape(tfXM)[0], tf.shape(tfXM)[1], tf.shape(tfXP)[2], size_meta_vars])
    tfX = tfXP#tf.concat([tfXP, tfXMt], axis=3)

#1 define observation importance
with tf.name_scope(name='CR-MOB-Importance'):
    tfMI1 = tf.layers.dense(tfX, param_MOBI_H1, activation=tf.nn.relu, name='MOBI-H1')
    tfMIO  = tf.layers.dense(tfMI1, param_MOBI_O, activation=tf.nn.relu, name='MOBI-OUT')
    tfMIW  = tf.layers.dense(tfMI1, param_MOBI_O, name='MOBI-WEIGHT')

#2 recombine importance
with tf.name_scope(name='CR-MOB-Weighting'):
    tfWI = tf.nn.softmax(tfMIO, dim=1)
    tfWO = tf.reduce_sum(tfMIO * tfWI, axis=1)
    
#3 define RNN on these inputs
with tf.name_scope(name='MOB-RNN'):
    #rnnCell = tf.nn.rnn_cell.GRUCell(num_units=param_RNN_size)
    rnnCell = tf.nn.rnn_cell.BasicRNNCell(num_units=param_RNN_size, activation=tf.nn.relu)
    _, tfMO = tf.nn.dynamic_rnn(rnnCell, inputs=tfWO, dtype=tf.float32)

#4 define output layer
with tf.name_scope(name='OUTPUT-FFNN'):
    tfOH1 = tf.layers.dense(tfMO, param_OUT_H1, activation=tf.nn.relu, name='OUT-H1')
    tfOO = tf.layers.dense(tfOH1, 2, name='OUT-OUT')
    tft = tf.reduce_mean(tfOO)

#5a define loss functions
with tf.name_scope(name='LOSS-OPTIMIZER'):
    tfLoss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tfY,logits=tfOO))
    tfLossSummary = tf.summary.scalar('Cross-Entropy-Loss', tfLoss)
    tfTrain = tf.train.AdamOptimizer(param_LR).minimize(tfLoss)
    
#5b prediction
tfOutProb = tf.nn.softmax(tfOO)[:,1]

dt_now = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
tffw = tf.summary.FileWriter('D:/Jupyter/Logs/00_BurPat_RNN-{0}'.format(dt_now), tf.get_default_graph())
print('Graph creation complete')

Graph creation complete


In [27]:
train_pp, train_meta, train_trgt = transformToTensor(train_sample[train_sample.num==3], useX=False, numCredits=3)
valid_pp, valid_meta, valid_trgt = transformToTensor(valid_sample[valid_sample.num==3], useX=False, numCredits=3)

In [40]:
num_epoch = 50
num_step  = 20
batch_size = 2048
train_batch = {tfIn_Meta: train_meta, tfIn_PP: train_pp, tfIn_Trgt: train_trgt}
valid_batch = {tfIn_Meta: valid_meta, tfIn_PP: valid_pp, tfIn_Trgt: valid_trgt}
with tf.Session() as tfs:    
    tfs.run(tf.global_variables_initializer())
    for i in range(num_epoch):
        mini_pp, mini_meta, mini_trgt = randomBatch((train_pp, train_meta, train_trgt), batch_size)
        mini_batch = {tfIn_Meta: mini_meta, tfIn_PP: mini_pp, tfIn_Trgt: mini_trgt}
        
        time0 = time.perf_counter()
        loss0 = tfLoss.eval(feed_dict=mini_batch)
        for j in range(num_step):
            tfTrain.run(feed_dict=mini_batch)
        loss1 = tfLoss.eval(feed_dict=mini_batch)
        time1 = time.perf_counter()
        
        valid_loss_str = tfLossSummary.eval(feed_dict=valid_batch)
        tffw.add_summary(valid_loss_str, i)
        print('Epoch {0} ({3:1.2} sec): loss changed from {1:1.3} to {2:1.3}'.format(i, loss0, loss1, time1-time0))
    train_prob = tfOutProb.eval(feed_dict=train_batch)
    valid_prob = tfOutProb.eval(feed_dict=valid_batch)

Epoch 0 (1.4e+01 sec): loss changed from 0.682 to 0.591
Epoch 1 (1.4e+01 sec): loss changed from 0.593 to 0.566
Epoch 2 (1.4e+01 sec): loss changed from 0.529 to 0.523
Epoch 3 (1.4e+01 sec): loss changed from 0.511 to 0.559
Epoch 4 (1.4e+01 sec): loss changed from 0.551 to 0.515
Epoch 5 (1.4e+01 sec): loss changed from 0.512 to 0.498
Epoch 6 (1.4e+01 sec): loss changed from 0.529 to 0.569
Epoch 7 (1.4e+01 sec): loss changed from 0.579 to 0.588
Epoch 8 (1.4e+01 sec): loss changed from 0.587 to 0.59
Epoch 9 (1.4e+01 sec): loss changed from 0.609 to 0.559
Epoch 10 (1.4e+01 sec): loss changed from 0.555 to 0.558
Epoch 11 (1.4e+01 sec): loss changed from 0.549 to 0.512
Epoch 12 (1.4e+01 sec): loss changed from 0.502 to 0.653
Epoch 13 (1.4e+01 sec): loss changed from 0.65 to 0.597
Epoch 14 (1.4e+01 sec): loss changed from 0.605 to 0.593
Epoch 15 (1.4e+01 sec): loss changed from 0.585 to 0.571
Epoch 16 (1.4e+01 sec): loss changed from 0.581 to 0.574
Epoch 17 (1.4e+01 sec): loss changed from 0

In [41]:
train_prob[:10]

array([ 0.31241983,  0.74321616,  0.04921245,  0.07860521,  0.24277399,
        0.1340812 ,  0.19599579,  0.36678466,  0.21368399,  0.1826335 ], dtype=float32)

In [44]:
tr3 = train_sample[train_sample.num==3]

In [75]:
tr3['nnp'] = train_prob
tr3 = tr3[['accnt_id','trgt','prob','nnp','num','pp0','pp1','pp2','pp0t','pp1t','pp2t']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [47]:
import sklearn.metrics

In [53]:
sklearn.metrics.log_loss(np.array(tr3.trgt, dtype=np.float32), np.array(tr3.prob))

0.38132649495811122

In [54]:
sklearn.metrics.log_loss(np.array(tr3.trgt, dtype=np.float32), np.array(tr3.nnp))

0.48112884177981685

In [56]:
sklearn.metrics.roc_auc_score(np.array(tr3.trgt, dtype=np.float32), np.array(tr3.prob))*2-1

0.79544564873872292

In [57]:
sklearn.metrics.roc_auc_score(np.array(tr3.trgt, dtype=np.float32), np.array(tr3.nnp))*2-1

0.64245916203384468

In [76]:
tr3[abs(np.log(np.array(tr3.prob/tr3.nnp, dtype=np.float32)))>1][:10]

Unnamed: 0,accnt_id,trgt,prob,nnp,num,pp0,pp1,pp2,pp0t,pp1t,pp2t
16,16,0,0.00900901,0.049212,3,00,0-0000000,CCCCC0,00NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,0-0000000NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,LLLLL0NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
35,35,1,0.7921,0.242774,3,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCLLLLLNNNNNNNNNNNN...
88,88,0,0.00584795,0.213684,3,,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
93,93,1,0.794721,0.126423,3,0,CCCCCCCCCCCCCCCCCCCCCCCCCC00,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,0NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,CCCCCCCCCCCCCCCCCCLLLLLLLL00NNNNNNNNNNNNNNNNNN...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
112,112,0,0.00826446,0.089642,3,C000-000,CCCCCCCCCCCC000000,CCCCCCCCC,C000-000NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,CCCCCCCLLLLL000000NNNNNNNNNNNNNNNNNNNNNNNNNNNN...,LLLLLLLLLNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
132,132,1,0.968529,0.20068,3,0,000,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,0NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,000NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
148,148,0,0.00414938,0.189764,3,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,CCCCCCCCC00-0000000,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,CCCCCCCCC00-0000000NNNNNNNNNNNNNNNNNNNNNNNNNNN...
151,151,0,0.00763359,0.043753,3,0000,CCCCCCCCCCCCC000-00000,0,0000NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,CCCCCCCCCCCCC000-00000NNNNNNNNNNNNNNNNNNNNNNNN...,0NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
162,162,0,0.0521327,0.199327,3,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
191,191,0,0.00900901,0.077299,3,00000,CCC0,0000-0,00000NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,LLL0NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,0000-0NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...


In [72]:
abs(np.log(np.array(tr3.prob/tr3.nnp, dtype=np.float32)))>1

array([False, False,  True, ..., False, False, False], dtype=bool)

In [68]:
np

<module 'numpy' from 'C:\\Anaconda3\\envs\\pytf\\lib\\site-packages\\numpy\\__init__.py'>