In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import datetime
import time

from sklearn.linear_model import LogisticRegression
import sklearn.metrics

In [2]:
#Client generation functions
#0 - ok, 1 - 1 bucket, 2 - 2, 3 - 3, 4 - 4+, C - closed
#rr[i,j] - p ith to jth, j=5 => Early Repayment

def states2string(states):
    mp = {0:'0',1:'1',2:'2',3:'3',4:'4',5:'C', 6:'L'}
    if type(states) is list:
        return ''.join([mp[x] for x in states])
    return mp[states]

def augmentSymbol(s, pm):
    if s in ('C', 'N'):
        return s
    return np.random.choice(['-',s],p=[pm,1-pm])

def augmentMissing(pp, pm):
    return ''.join([augmentSymbol(x, pm) for x in pp])

def string2states(s):
    mp = {'0':0,'1':1,'2':2,'3':3,'4':4,'C':5, 'L':6}
    return [mp[x] for x in s]

def genPP(age, term, rr, s0, pMissing=0.05):
    if age <= 0:
        return ""
    pp = [np.random.choice(range(6), p=s0)]
    if age <= 1:
        return states2string(pp)
    for i in range(age-1):
        prev = pp[-1]
        nxt = 5
        if prev < 5 and (i < term or prev > 0):
            nxt = np.random.choice(range(6), p=rr[prev,:])
            if i >= term and nxt==0:
                nxt = 5
        pp.append(nxt)
    return augmentMissing(states2string(list(reversed(pp))), pm=pMissing)

def genCreditRR(rr, lamAge=20, lamTerm=10, emu=np.log(1e5), esigma=3, pMissing=0.1):
    s0 = np.zeros(6)
    s0[0] = rr[0,0] / (rr[0,0] + rr[0,5])
    s0[5] = rr[0,5] / (rr[0,0] + rr[0,5])
    
    age = np.random.poisson(lam=lamAge)
    term = np.random.poisson(lam=lamTerm)
    limit = np.ceil(np.exp(np.random.normal(loc=emu, scale=esigma)) / 1e3) * 1e3
    pp = genPP(age, term, rr, s0, pMissing=pMissing)
    return (limit, term, pp)    

def genCreditSimple(pBad=0.1, pEarlyRepayment=0.1, lamAge=20, pMissing=0.1):
    pGood = (1 - pBad) 
    r0 = [(pGood - pEarlyRepayment), pBad, 0, 0, 0, pEarlyRepayment]
    r1x = np.array([pGood * 0.33 / 0.9, 0.33, pBad * 0.33 / 0.1, 0, 0, pEarlyRepayment * pGood * 0.2])
    r1s = np.sum(r1x)
    r1 = [x/r1s for x in r1x]
    r2p = [[0.10, 0.20, 0.10, 0.60, 0.0, 0.0],
      [0.05, 0.05, 0.05, 0.05, 0.8, 0.0],
      [0.03, 0.03, 0.02, 0.02, 0.9, 0.0]]
    rr = np.array([r0] + [r1] + r2p)
    return genCreditRR(rr, lamAge=lamAge, pMissing=pMissing)

def getClientTarget(data):
    num0 = 0
    num1 = 0
    num2p = 0
    for r in data:
        num0 += r[2].count('0') + r[2].count('L') + min(1, r[2].count('C'))
        num1 += r[2].count('1')
        num2p += 2 * r[2].count('2') + 3 * r[2].count('3') + 4 * r[2].count('4')
    pGood = 0.5
    if num0 + num1 + num2p > 0:
        pGood = num0 / (0.1 + num0 + num1 + num2p)
    pBad = 1 - pGood
    return (np.random.binomial(1, pBad), pBad)

def genClient(muBad=0.1, sigmaBad=0.1, pEarlyRepayment=0.05, muAge=20, sigmaAge=5, pMissing=0.1):
    numCredits = 1
    data = [genCreditSimple(pBad=min(0.5,np.random.lognormal(mean=np.log(muBad), sigma=sigmaBad)),
                            pEarlyRepayment=pEarlyRepayment, pMissing=pMissing,
                           lamAge=np.random.lognormal(mean=np.log(muAge), sigma=np.log(sigmaAge)))
            for i in range(numCredits)]
    target, prob = getClientTarget(data)
    return (data, target, prob)

#Generate sample (as in RRs)
def genSample(numObs=1000, genObs=genClient):
    res = []
    for i in range(numObs):
        (obs, trgt, prob) = genObs()
        row = [i, trgt, prob] + list(obs[0])
        res.append(row)
    return pd.DataFrame(np.array(res),
                        columns=['accnt_id', 'trgt', 'prob', 'limit0', 'term0', 'pp0'])

In [28]:
#30 sec (20k and 2k)
train_sample_src = genSample(20000)
valid_sample_src = genSample(2000)
train_sample_src[:10]

Unnamed: 0,accnt_id,trgt,prob,limit0,term0,pp0
0,0,1,0.9750623441396508,3976000.0,13,4444444432110
1,1,0,0.0322580645161291,69000.0,12,CCC00
2,2,0,0.2366412213740457,1000.0,15,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
3,3,0,0.099099099099099,4519000.0,9,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
4,4,1,0.4409937888198758,234000.0,11,CCCCC032100001000
5,5,0,0.0909090909090909,1288000.0,6,0
6,6,0,0.0909090909090909,4000.0,14,0
7,7,0,0.0140845070422535,29000.0,11,CCCCCCC0000-00
8,8,0,0.3548387096774194,167000.0,11,CCCCCC-10
9,9,0,0.3693693693693693,10000.0,11,CCCCCCCC000000111-1-


In [4]:
#Transformation functions
def transformPP(term, pp):
    if pp is None:
        return pp
    pplen = len(pp)
    lst = max(0, pplen - term)
    return pp[:lst] + pp[lst:].replace('C', 'L')

def truncPP(term, pp, trlen=60):
    if pp is None:
        if term is None:
            return 'X'*trlen
        else:
            return 'N'*trlen
    pplen = len(pp)
    if pplen >= trlen:
        return pp[:trlen]
    return pp + 'N'*(trlen - pplen)

def transformDF(df, name='pp{0}t', trlen=60):
    num = np.sum([x.replace('pp','').isnumeric() for x in df.columns])
    res = df.copy()
    for i in range(num):
        cols = ['pp{0}'.format(i), 'term{0}'.format(i)]
        res[name.format(i)] = [truncPP(int(t), transformPP(int(t), p), trlen) for _,(p,t) in df[cols].iterrows()]
    return res


def transformToTensor(df, pp='pp{0}t', useX=False, numCredits=None):
    #check dimensions
    #num credits
    num_credits = np.sum([x.replace('pp','').isnumeric() for x in df.columns])
    if numCredits is not None:
        if num_credits < numCredits:
            raise "Provided <numCredits> is greater than number of fields in DF"
        num_credits = numCredits
    num_mobs = None
    for i in range(num_credits):
        lens = list(set(len(x) for x in df[pp.format(i)] if x is not None))
        numx = np.sum(['X' in x for x in df[pp.format(i)] if x is not None])
        if numx > 0 and not useX:
            raise "Not supposed to use X, but X is found in observations!"
        if len(lens) != 1:
            raise "Expected same length in all observations!"
        if num_mobs is None:
            num_mobs = lens[0]
        if num_mobs != lens[0]:
            raise "Expected same length in all observations!"
    mapping = {'0':0,'1':1,'2':2,'3':3,'4':4,'-':5,'L':6,'C':7,'N':8}
    if useX:
        mapping['X'] = 9
    
    res = []
    res_meta = []
    res_trgt = []
    for _, r in df.iterrows():
        cred = []
        cred_meta = []
        res_trgt.append(r.trgt)
        for i in range(num_credits):
            cred.append([mapping[x] for x in reversed(r[pp.format(i)])])
            cred_meta.append([-1 if r[f.format(i)] is None else r[f.format(i)] for f in ['limit{0}','term{0}']])
        res.append(cred)
        res_meta.append(cred_meta)
    return np.array(res, dtype=np.int32), np.array(res_meta, dtype=np.float32), np.array(res_trgt, dtype=np.int32)

def randomBatch(tensorTuple, batchSize=64):
    ids = np.random.choice(range(tensorTuple[0].shape[0]), batchSize)
    return (x[ids,] for x in tensorTuple)

In [5]:
def safe_logit(x, clampX=1e-3):
    p0 = np.maximum(clampX, x)
    p1 = np.maximum(clampX, 1-x)
    return np.log(p0 / p1)
    
def extractFeatures(df, pp='pp{0}'):
    num_credits = np.sum([x.replace('pp','').isnumeric() for x in df.columns])
    cnt_symbols = ['0', '1', '2', '3', '4', '5', 'L', 'C', '00', '01', '0L', '0C', '10', '11', '12', '1L', '1C']
    
    rr_names = ['f_rr{0}_01{1}', 'f_rr{0}_00{1}', 'f_rr{0}_10{1}', 'f_rr{0}_11{1}', 'f_rr{0}_12{1}']
    rr_types = ['', 'f0', 'f1', 'lgt']
    rr_final = [x.format(i) for x in ['f_rr{0}_0d', 'f_rr{0}_1d'] for i in range(num_credits)] + [x.format(i, y) for x in rr_names for y in rr_types for i in range(num_credits)]
    
    new_features = ['f_num{0}_{1}'.format(i, x) for i in range(num_credits) for x in cnt_symbols] + rr_final
    res = pd.concat([df, pd.DataFrame(columns=new_features)])
    
    for _, r in res.iterrows():
        for i in range(num_credits):
            paypat = r[pp.format(i)]
            cnts = {x:paypat.count(x) for x in cnt_symbols}
            for x,v in cnts.items():
                r['f_num{0}_{1}'.format(i, x)] = v
            r0 = cnts['00'] + cnts['01'] + cnts['0L'] + cnts['0C']
            r1 = cnts['10'] + cnts['11'] + cnts['12'] + cnts['1L'] + cnts['1C']
            r['f_rr{0}_0d'.format(i)] = r0
            r['f_rr{0}_1d'.format(i)] = r0
            r['f_rr{0}_01'.format(i)] = (cnts['01'] / r0 if r0 > 0 else 0) 
            r['f_rr{0}_00'.format(i)] = ((cnts['00'] + cnts['0L'] + cnts['0C']) / r0 if r0 > 0 else 1)
            r['f_rr{0}_12'.format(i)] = (cnts['12'] / r1 if r1 > 0 else 0)
            r['f_rr{0}_11'.format(i)] = (cnts['11'] / r1 if r1 > 0 else 0)
            r['f_rr{0}_10'.format(i)] = ((cnts['10'] + cnts['1L'] + cnts['1C']) / r1 if r1 > 0 else 0)
            for f in ['f_rr{0}_01{1}', 'f_rr{0}_00{1}', 'f_rr{0}_10{1}', 'f_rr{0}_11{1}', 'f_rr{0}_12{1}']:
                r[f.format(i, 'f0')] = (1 if r[f.format(i,'')]==0 else 0)
                r[f.format(i, 'f1')] = (1 if r[f.format(i,'')]==1 else 0)
                r[f.format(i, 'lgt')] = safe_logit(r[f.format(i,'')])   
    return res

def featuresToTensor(df):
    features = [x for x in df.columns if x.find('f_') == 0]
    return np.array(df[features]), np.array(df.trgt)

In [29]:
#20 sec (20k and 2k)
train_sample = transformDF(train_sample_src)
valid_sample = transformDF(valid_sample_src)
train_wf = extractFeatures(train_sample, pp='pp{0}t')
valid_wf = extractFeatures(valid_sample, pp='pp{0}t')

In [30]:
train_x, train_y = featuresToTensor(train_wf)
valid_x, valid_y = featuresToTensor(valid_wf)

logreg0 = LogisticRegression().fit(train_x, train_y)
train_p = logreg0.predict_proba(train_x)[:,1]
valid_p = logreg0.predict_proba(valid_x)[:,1]

In [70]:
param_H_size   = [300, 200, 100]
param_RBM_hid  = 90
param_LR       = 1e-3

size_pp_dictionary = 9
size_pp_time = 60
size_input = size_pp_time * size_pp_dictionary

tf.reset_default_graph()

tfIn_PP = tf.placeholder(shape=(None, 1, None), dtype=tf.int32)
tfIn_hrand = tf.placeholder(shape=(None, param_RBM_hid), dtype=tf.float32)
#tfIn_vrand = tf.placeholder(shape=(None, size_input), dtype=tf.float32)

with tf.name_scope(name='DATA-TRANSFORMATION'):
    tfX  = tf.reshape(tf.reduce_sum(tf.one_hot(tfIn_PP, size_pp_dictionary), axis=1), shape=(-1, size_input))
    
with tf.name_scope(name='RBM-STATE'):
    tfW  = tf.Variable(tf.random_normal(mean=0.0, stddev=0.1, shape=(size_input, param_RBM_hid)), name='RBM-W')
    tfBH = tf.Variable(tf.zeros([param_RBM_hid]), name='RBM-BIAS-H')
    tfBV = tf.Variable(tf.zeros([size_input]), name='RBM-BIAS-V')
    
#1-step gibbs sampling:
with tf.name_scope(name='CD-k'):
    tf_hp0 = tf.nn.sigmoid(tf.matmul(tfX, tfW) + tfBH)
    tf_hs0 = tf.nn.relu(tf.sign(tf_hp0 - tfIn_hrand))
    tf_positive = tf.matmul(tf.transpose(tfX), tf_hs0, name='POSITIVE')
    tf_vp = tf.nn.sigmoid(tf.matmul(tf_hp0, tf.transpose(tfW)) + tfBV)
    tf_hp1 = tf.nn.sigmoid(tf.matmul(tf_vp, tfW) + tfBH)
    tf_hs1 = tf.nn.relu(tf.sign(tf_hp1 - tfIn_hrand))
    tf_negative = tf.matmul(tf.transpose(tf_vp), tf_hp1)
    
tfFeatures = tf_hp0

with tf.name_scope(name='TRAINING'):
    tf_train_w = tfW.assign_add(param_LR * (tf_positive - tf_negative))
    tf_train_bh = tfBH.assign_add(param_LR * tf.reduce_mean(tf_hp0 - tf_hp1, axis=0))
    tf_train_bv = tfBV.assign_add(param_LR * tf.reduce_mean(tfX - tf_vp, axis=0))
    tfTrain = tf.group(tf_train_w, tf_train_bh, tf_train_bv)
    tfLoss = tf.sqrt(tf.reduce_mean(tf.square(tfX - tf_vp)))

#tfCostSummary = tf.summary.scalar('RBM-Cost', tfCost)

print('Graph creation complete')

Graph creation complete


In [31]:
train_pp, train_meta, train_trgt = transformToTensor(train_sample, useX=False, numCredits=1)
valid_pp, valid_meta, valid_trgt = transformToTensor(valid_sample, useX=False, numCredits=1)

In [71]:
num_epoch = 1000
num_step  = 20
batch_size = 500


dt_now = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
tffw = tf.summary.FileWriter('D:/Jupyter/Logs/00_A', tf.get_default_graph())
train_batch = {tfIn_PP: train_pp}
valid_batch = {tfIn_PP: valid_pp}
with tf.Session() as tfs:    
    tfs.run(tf.global_variables_initializer())
    for i in range(num_epoch):
        mini_pp, mini_meta, mini_trgt = randomBatch((train_pp, train_meta, train_trgt), batch_size)
        mini_batch = {tfIn_PP: mini_pp, tfIn_hrand: np.random.rand(mini_pp.shape[0], param_RBM_hid)}
        
        time0 = time.perf_counter()
        loss0 = tfLoss.eval(feed_dict=mini_batch)
        for j in range(num_step):
            tfTrain.run(feed_dict=mini_batch)
        loss1 = tfLoss.eval(feed_dict=mini_batch)
        time1 = time.perf_counter()
        
        valid_loss = tfLoss.eval(feed_dict=valid_batch)
        #valid_loss_str = tfLossSummary.eval(feed_dict=valid_batch)
        #tffw.add_summary(valid_loss_str, i)
        if i % 10 == 0:
            print('Epoch {0} ({3:1.2f} sec): loss changed from {1:1.3f} to {2:1.3f}\t\t{4:1.3f}'.format(i, loss0, loss1, time1-time0,valid_loss))
    train_dbnf = tfFeatures.eval(feed_dict=train_batch)
    valid_dbnf = tfFeatures.eval(feed_dict=valid_batch)

Epoch 0 (0.34 sec): loss changed from 0.509 to 0.126		0.124
Epoch 10 (0.33 sec): loss changed from 0.096 to 0.093		0.095
Epoch 20 (0.33 sec): loss changed from 0.090 to 0.087		0.087
Epoch 30 (0.33 sec): loss changed from 0.082 to 0.079		0.082
Epoch 40 (0.33 sec): loss changed from 0.075 to 0.072		0.079
Epoch 50 (0.33 sec): loss changed from 0.077 to 0.074		0.075
Epoch 60 (0.43 sec): loss changed from 0.070 to 0.066		0.073
Epoch 70 (0.33 sec): loss changed from 0.076 to 0.073		0.071
Epoch 80 (0.34 sec): loss changed from 0.068 to 0.065		0.069
Epoch 90 (0.33 sec): loss changed from 0.066 to 0.063		0.069
Epoch 100 (0.32 sec): loss changed from 0.068 to 0.065		0.067
Epoch 110 (0.33 sec): loss changed from 0.062 to 0.059		0.066
Epoch 120 (0.35 sec): loss changed from 0.062 to 0.059		0.065
Epoch 130 (0.33 sec): loss changed from 0.059 to 0.056		0.064
Epoch 140 (0.32 sec): loss changed from 0.062 to 0.058		0.063
Epoch 150 (0.33 sec): loss changed from 0.063 to 0.060		0.062
Epoch 160 (0.33 sec

In [72]:
logreg1 = LogisticRegression().fit(train_dbnf, train_y)
train_nn = logreg1.predict_proba(train_dbnf)[:,1]
valid_nn = logreg1.predict_proba(valid_dbnf)[:,1]

vsmpl = valid_sample.copy()
vsmpl['nnp'] = valid_nn
vsmpl['lrp'] = valid_p
vsmpl = vsmpl[['accnt_id','trgt','prob','nnp','lrp', 'pp0','pp0t']]

In [34]:
sklearn.metrics.roc_auc_score(np.array(vsmpl.trgt, dtype=np.float32), np.array(vsmpl.prob, dtype=np.float32))*2-1

0.83261771341241553

In [73]:
sklearn.metrics.roc_auc_score(np.array(vsmpl.trgt, dtype=np.float32), np.array(vsmpl.nnp, dtype=np.float32))*2-1

0.68266573058158597

In [68]:
sklearn.metrics.roc_auc_score(np.array(vsmpl.trgt, dtype=np.float32), np.array(vsmpl.lrp, dtype=np.float32))*2-1

0.70034717911617017

In [75]:
tmp = {a:a+'-' for a in 'xyz'}

In [23]:
train_dbnf[:10]

array([[  5.41255005e-08,   2.24236214e-06,   9.99820173e-01, ...,
          1.65283825e-06,   7.56276535e-08,   9.98151362e-01],
       [  3.54402630e-07,   1.12323498e-04,   7.45301776e-08, ...,
          3.61397746e-04,   7.63305507e-05,   5.18086134e-04],
       [  6.03610943e-07,   4.99429552e-05,   6.28965964e-08, ...,
          3.13086173e-04,   2.11163133e-04,   1.00003462e-03],
       ..., 
       [  1.10576241e-06,   8.29852434e-05,   8.05648170e-08, ...,
          3.61195911e-04,   5.51577832e-04,   1.13630970e-03],
       [  6.62167281e-07,   6.95560593e-05,   1.22061323e-07, ...,
          3.96315387e-04,   4.25364822e-04,   1.07475719e-03],
       [  1.05501829e-06,   1.67262813e-04,   8.49903427e-06, ...,
          1.10150264e-04,   8.94813274e-05,   4.08689346e-04]], dtype=float32)