In [116]:
import tensorflow as tf
import numpy as np
import pandas as pd
import datetime
import time

from sklearn.linear_model import LogisticRegression
import sklearn.metrics

In [31]:
#Client generation functions
#0 - ok, 1 - 1 bucket, 2 - 2, 3 - 3, 4 - 4+, C - closed
#rr[i,j] - p ith to jth, j=5 => Early Repayment

def states2string(states):
    mp = {0:'0',1:'1',2:'2',3:'3',4:'4',5:'C', 6:'L'}
    if type(states) is list:
        return ''.join([mp[x] for x in states])
    return mp[states]

def augmentSymbol(s, pm):
    if s in ('C', 'N'):
        return s
    return np.random.choice(['-',s],p=[pm,1-pm])

def augmentMissing(pp, pm):
    return ''.join([augmentSymbol(x, pm) for x in pp])

def string2states(s):
    mp = {'0':0,'1':1,'2':2,'3':3,'4':4,'C':5, 'L':6}
    return [mp[x] for x in s]

def genPP(age, term, rr, s0, pMissing=0.05):
    if age <= 0:
        return ""
    pp = [np.random.choice(range(6), p=s0)]
    if age <= 1:
        return states2string(pp)
    for i in range(age-1):
        prev = pp[-1]
        nxt = 5
        if prev < 5 and (i < term or prev > 0):
            nxt = np.random.choice(range(6), p=rr[prev,:])
            if i >= term and nxt==0:
                nxt = 5
        pp.append(nxt)
    return augmentMissing(states2string(list(reversed(pp))), pm=pMissing)

def genCreditRR(rr, lamAge=20, lamTerm=10, emu=np.log(1e5), esigma=3, pMissing=0.1):
    s0 = np.zeros(6)
    s0[0] = rr[0,0] / (rr[0,0] + rr[0,5])
    s0[5] = rr[0,5] / (rr[0,0] + rr[0,5])
    
    age = np.random.poisson(lam=lamAge)
    term = np.random.poisson(lam=lamTerm)
    limit = np.ceil(np.exp(np.random.normal(loc=emu, scale=esigma)) / 1e3) * 1e3
    pp = genPP(age, term, rr, s0, pMissing=pMissing)
    return (limit, term, pp)    

def genCreditSimple(pBad=0.1, pEarlyRepayment=0.1, lamAge=20, pMissing=0.1):
    pGood = (1 - pBad) 
    r0 = [(pGood - pEarlyRepayment), pBad, 0, 0, 0, pEarlyRepayment]
    r1x = np.array([pGood * 0.33 / 0.9, 0.33, pBad * 0.33 / 0.1, 0, 0, pEarlyRepayment * pGood * 0.2])
    r1s = np.sum(r1x)
    r1 = [x/r1s for x in r1x]
    r2p = [[0.10, 0.20, 0.10, 0.60, 0.0, 0.0],
      [0.05, 0.05, 0.05, 0.05, 0.8, 0.0],
      [0.03, 0.03, 0.02, 0.02, 0.9, 0.0]]
    rr = np.array([r0] + [r1] + r2p)
    return genCreditRR(rr, lamAge=lamAge, pMissing=pMissing)

def getClientTarget(data):
    num0 = 0
    num1 = 0
    num2p = 0
    for r in data:
        num0 += r[2].count('0') + r[2].count('L') + min(1, r[2].count('C'))
        num1 += r[2].count('1')
        num2p += 2 * r[2].count('2') + 3 * r[2].count('3') + 4 * r[2].count('4')
    pGood = 0.5
    if num0 + num1 + num2p > 0:
        pGood = num0 / (0.1 + num0 + num1 + num2p)
    pBad = 1 - pGood
    return (np.random.binomial(1, pBad), pBad)

def genClient(muBad=0.1, sigmaBad=0.1, pEarlyRepayment=0.05, muAge=20, sigmaAge=5, pMissing=0.1):
    numCredits = 1
    data = [genCreditSimple(pBad=min(0.5,np.random.lognormal(mean=np.log(muBad), sigma=sigmaBad)),
                            pEarlyRepayment=pEarlyRepayment, pMissing=pMissing,
                           lamAge=np.random.lognormal(mean=np.log(muAge), sigma=np.log(sigmaAge)))
            for i in range(numCredits)]
    target, prob = getClientTarget(data)
    return (data, target, prob)

#Generate sample (as in RRs)
def genSample(numObs=1000, genObs=genClient):
    res = []
    for i in range(numObs):
        (obs, trgt, prob) = genObs()
        row = [i, trgt, prob] + list(obs[0])
        res.append(row)
    return pd.DataFrame(np.array(res),
                        columns=['accnt_id', 'trgt', 'prob', 'limit0', 'term0', 'pp0'])

In [32]:
train_sample = genSample(10000)
valid_sample = genSample(2000)
train_sample[:10]

Unnamed: 0,accnt_id,trgt,prob,limit0,term0,pp0
0,0,0,0.0476190476190476,978734000.0,10,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
1,1,1,0.9402220324508966,29000.0,17,CCCCCC13444444444444433224324444444321100000-0
2,2,1,0.953198127925117,2000.0,9,C4444--44444-4444--32111100
3,3,0,0.0909090909090909,14000.0,8,0-
4,4,0,0.6208530805687205,178000.0,11,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
5,5,0,0.0243902439024389,66000.0,11,0000
6,6,0,0.0322580645161291,541000.0,16,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
7,7,0,0.0076335877862595,30000.0,12,CCCCCCCCCCCCC000000000000
8,8,1,0.833729216152019,4468000.0,19,444-32212102132211000000
9,9,0,0.0322580645161291,3420000.0,12,CCCCC00-


In [28]:
#Transformation functions
def transformPP(term, pp):
    if pp is None:
        return pp
    pplen = len(pp)
    lst = max(0, pplen - term)
    return pp[:lst] + pp[lst:].replace('C', 'L')

def truncPP(term, pp, trlen=60):
    if pp is None:
        if term is None:
            return 'X'*trlen
        else:
            return 'N'*trlen
    pplen = len(pp)
    if pplen >= trlen:
        return pp[:trlen]
    return pp + 'N'*(trlen - pplen)

def transformDF(df, name='pp{0}t', trlen=60):
    num = np.sum([x.replace('pp','').isnumeric() for x in df.columns])
    res = df.copy()
    for i in range(num):
        cols = ['pp{0}'.format(i), 'term{0}'.format(i)]
        res[name.format(i)] = [truncPP(int(t), transformPP(int(t), p), trlen) for _,(p,t) in df[cols].iterrows()]
    return res


def transformToTensor(df, pp='pp{0}t', useX=False, numCredits=None):
    #check dimensions
    #num credits
    num_credits = np.sum([x.replace('pp','').isnumeric() for x in df.columns])
    if numCredits is not None:
        if num_credits < numCredits:
            raise "Provided <numCredits> is greater than number of fields in DF"
        num_credits = numCredits
    num_mobs = None
    for i in range(num_credits):
        lens = list(set(len(x) for x in df[pp.format(i)] if x is not None))
        numx = np.sum(['X' in x for x in df[pp.format(i)] if x is not None])
        if numx > 0 and not useX:
            raise "Not supposed to use X, but X is found in observations!"
        if len(lens) != 1:
            raise "Expected same length in all observations!"
        if num_mobs is None:
            num_mobs = lens[0]
        if num_mobs != lens[0]:
            raise "Expected same length in all observations!"
    mapping = {'0':0,'1':1,'2':2,'3':3,'4':4,'-':5,'L':6,'C':7,'N':8}
    if useX:
        mapping['X'] = 9
    
    res = []
    res_meta = []
    res_trgt = []
    for _, r in df.iterrows():
        cred = []
        cred_meta = []
        res_trgt.append(r.trgt)
        for i in range(num_credits):
            cred.append([mapping[x] for x in reversed(r[pp.format(i)])])
            cred_meta.append([-1 if r[f.format(i)] is None else r[f.format(i)] for f in ['limit{0}','term{0}']])
        res.append(cred)
        res_meta.append(cred_meta)
    return np.array(res, dtype=np.int32), np.array(res_meta, dtype=np.float32), np.array(res_trgt, dtype=np.int32)

def randomBatch(tensorTuple, batchSize=64):
    ids = np.random.choice(range(tensorTuple[0].shape[0]), batchSize)
    return (x[ids,] for x in tensorTuple)

In [108]:
def safe_logit(x, clampX=1e-3):
    p0 = np.maximum(clampX, x)
    p1 = np.maximum(clampX, 1-x)
    return np.log(p0 / p1)
    
def extractFeatures(df, pp='pp{0}'):
    num_credits = np.sum([x.replace('pp','').isnumeric() for x in df.columns])
    cnt_symbols = ['0', '1', '2', '3', '4', '5', 'L', 'C', '00', '01', '0L', '0C', '10', '11', '12', '1L', '1C']
    
    rr_names = ['f_rr{0}_01{1}', 'f_rr{0}_00{1}', 'f_rr{0}_10{1}', 'f_rr{0}_11{1}', 'f_rr{0}_12{1}']
    rr_types = ['', 'f0', 'f1', 'lgt']
    rr_final = [x.format(i) for x in ['f_rr{0}_0d', 'f_rr{0}_1d'] for i in range(num_credits)] + [x.format(i, y) for x in rr_names for y in rr_types for i in range(num_credits)]
    
    new_features = ['f_num{0}_{1}'.format(i, x) for i in range(num_credits) for x in cnt_symbols] + rr_final
    res = pd.concat([df, pd.DataFrame(columns=new_features)])
    
    for _, r in res.iterrows():
        for i in range(num_credits):
            paypat = r[pp.format(i)]
            cnts = {x:paypat.count(x) for x in cnt_symbols}
            for x,v in cnts.items():
                r['f_num{0}_{1}'.format(i, x)] = v
            r0 = cnts['00'] + cnts['01'] + cnts['0L'] + cnts['0C']
            r1 = cnts['10'] + cnts['11'] + cnts['12'] + cnts['1L'] + cnts['1C']
            r['f_rr{0}_0d'.format(i)] = r0
            r['f_rr{0}_1d'.format(i)] = r0
            r['f_rr{0}_01'.format(i)] = (cnts['01'] / r0 if r0 > 0 else 0) 
            r['f_rr{0}_00'.format(i)] = ((cnts['00'] + cnts['0L'] + cnts['0C']) / r0 if r0 > 0 else 1)
            r['f_rr{0}_12'.format(i)] = (cnts['12'] / r1 if r1 > 0 else 0)
            r['f_rr{0}_11'.format(i)] = (cnts['11'] / r1 if r1 > 0 else 0)
            r['f_rr{0}_10'.format(i)] = ((cnts['10'] + cnts['1L'] + cnts['1C']) / r1 if r1 > 0 else 0)
            for f in ['f_rr{0}_01{1}', 'f_rr{0}_00{1}', 'f_rr{0}_10{1}', 'f_rr{0}_11{1}', 'f_rr{0}_12{1}']:
                r[f.format(i, 'f0')] = (1 if r[f.format(i,'')]==0 else 0)
                r[f.format(i, 'f1')] = (1 if r[f.format(i,'')]==1 else 0)
                r[f.format(i, 'lgt')] = safe_logit(r[f.format(i,'')])   
    return res

def featuresToTensor(df):
    features = [x for x in df.columns if x.find('f_') == 0]
    return np.array(df[features]), np.array(df.trgt)

In [109]:
train_wf = extractFeatures(train_sample)
valid_wf = extractFeatures(valid_sample)

train_sample = transformDF(train_sample)
valid_sample = transformDF(valid_sample)

In [128]:
train_x, train_y = featuresToTensor(train_wf)
valid_x, valid_y = featuresToTensor(valid_wf)

logreg0 = LogisticRegression().fit(train_x, train_y)
train_p = logreg0.predict_proba(train_x)[:,1]
valid_p = logreg0.predict_proba(valid_x)[:,1]

array([ 0.47899599,  0.08283167,  0.1556964 ])

In [155]:
param_RNN_size = 30
param_OUT_H1   = 30
param_LR       = 1e-3

size_pp_dictionary =  9
size_meta_vars = 2

size_mob_vars = size_pp_dictionary + size_meta_vars

tf.reset_default_graph()

tfIn_Train = tf.placeholder(shape=(), dtype=tf.bool)
tfIn_Trgt = tf.placeholder(shape=(None,), dtype=tf.int32)
tfIn_PP = tf.placeholder(shape=(None, 1, None), dtype=tf.int32)

with tf.name_scope(name='DATA-TRANSFORMATION'):
    tfY  = tf.one_hot(tfIn_Trgt, 2)
    tfX  = tf.reduce_sum(tf.one_hot(tfIn_PP, size_pp_dictionary), axis=1)
    tfXn = tfX[:,1:,:]

#3 define RNN on these inputs
with tf.name_scope(name='RNN'):
    #rnnCell = tf.nn.rnn_cell.GRUCell(num_units=param_RNN_size, activation=tf.nn.tanh)
    rnnCell = tf.nn.rnn_cell.LSTMCell(num_units=param_RNN_size)
    #rnnCell = tf.nn.rnn_cell.BasicRNNCell(num_units=param_RNN_size, activation=tf.nn.relu)
    #rnnCell = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.BasicRNNCell(num_units=param_RNN_size, activation=tf.nn.relu) for _ in range(3)])
    _, tfMO = tf.nn.dynamic_rnn(rnnCell, inputs=tfX, dtype=tf.float32)

#4 define output layer
with tf.name_scope(name='OUTPUT-FFNN'):
    tfOH1 = tf.layers.dense(tf.layers.dropout(tfMO, training=tfIn_Train), param_OUT_H1, activation=tf.nn.relu, name='OUT-H1')
    tfOO = tf.layers.dense(tf.layers.dropout(tfOH1, training=tfIn_Train), 2, name='OUT-OUT')
    tft = tf.reduce_mean(tfOO)

#5a define loss functions
with tf.name_scope(name='LOSS-OPTIMIZER'):
    tfLoss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tfY,logits=tfOO))
    tfTrain = tf.train.AdamOptimizer(param_LR).minimize(tfLoss)

tfLossSummary = tf.summary.scalar('Cross-Entropy-Loss', tfLoss)
#5b prediction
tfOutProb = tf.nn.softmax(tfOO)[:,1]

print('Graph creation complete')

Graph creation complete


In [37]:
train_pp, train_meta, train_trgt = transformToTensor(train_sample, useX=False, numCredits=1)
valid_pp, valid_meta, valid_trgt = transformToTensor(valid_sample, useX=False, numCredits=1)

In [152]:
num_epoch = 20
num_step  = 20
batch_size = 2000

dt_now = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
tffw = tf.summary.FileWriter('D:/Jupyter/Logs/00_BurPat_SRNN-{0}'.format(dt_now), tf.get_default_graph())
train_batch = {tfIn_PP: train_pp, tfIn_Trgt: train_trgt, tfIn_Train:True}
valid_batch = {tfIn_PP: valid_pp, tfIn_Trgt: valid_trgt, tfIn_Train:False}
with tf.Session() as tfs:    
    tfs.run(tf.global_variables_initializer())
    for i in range(num_epoch):
        mini_pp, mini_meta, mini_trgt = randomBatch((train_pp, train_meta, train_trgt), batch_size)
        mini_batch = {tfIn_PP: mini_pp, tfIn_Trgt: mini_trgt, tfIn_Train:True}
        
        time0 = time.perf_counter()
        loss0 = tfLoss.eval(feed_dict=mini_batch)
        for j in range(num_step):
            tfTrain.run(feed_dict=mini_batch)
        loss1 = tfLoss.eval(feed_dict=mini_batch)
        time1 = time.perf_counter()
        
        valid_loss_str = tfLossSummary.eval(feed_dict=valid_batch)
        tffw.add_summary(valid_loss_str, i)
        print('Epoch {0} ({3:1.2} sec): loss changed from {1:1.3} to {2:1.3}'.format(i, loss0, loss1, time1-time0))
    train_prob = tfOutProb.eval(feed_dict=train_batch)
    valid_prob = tfOutProb.eval(feed_dict=valid_batch)

InvalidArgumentError: logits and labels must be same size: logits_size=[4000,2] labels_size=[2000,2]
	 [[Node: LOSS-OPTIMIZER/SoftmaxCrossEntropyWithLogits = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](LOSS-OPTIMIZER/Reshape, LOSS-OPTIMIZER/Reshape_1)]]

Caused by op 'LOSS-OPTIMIZER/SoftmaxCrossEntropyWithLogits', defined at:
  File "C:\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "C:\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\Anaconda3\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-150-2701e23cf335>", line 37, in <module>
    tfLoss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tfY,logits=tfOO))
  File "C:\Anaconda3\lib\site-packages\tensorflow\python\ops\nn_ops.py", line 1594, in softmax_cross_entropy_with_logits
    precise_logits, labels, name=name)
  File "C:\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_nn_ops.py", line 2380, in _softmax_cross_entropy_with_logits
    features=features, labels=labels, name=name)
  File "C:\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "C:\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "C:\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1269, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): logits and labels must be same size: logits_size=[4000,2] labels_size=[2000,2]
	 [[Node: LOSS-OPTIMIZER/SoftmaxCrossEntropyWithLogits = SoftmaxCrossEntropyWithLogits[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](LOSS-OPTIMIZER/Reshape, LOSS-OPTIMIZER/Reshape_1)]]


In [144]:
vsmpl = valid_sample.copy()
vsmpl['nnp'] = valid_prob
vsmpl['lrp'] = valid_p
vsmpl = vsmpl[['accnt_id','trgt','prob','nnp','lrp','pp0','pp0t']]

In [44]:
sklearn.metrics.roc_auc_score(np.array(vsmpl.trgt, dtype=np.float32), np.array(vsmpl.prob, dtype=np.float32))*2-1

0.8407087210992128

In [145]:
sklearn.metrics.roc_auc_score(np.array(vsmpl.trgt, dtype=np.float32), np.array(vsmpl.nnp, dtype=np.float32))*2-1

0.74813644419875613

In [131]:
sklearn.metrics.roc_auc_score(np.array(vsmpl.trgt, dtype=np.float32), np.array(vsmpl.lrp, dtype=np.float32))*2-1

0.8087475993184059

In [75]:
tmp = {a:a+'-' for a in 'xyz'}

(10, 39)