In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from functools import reduce

import os

  from ._conv import register_converters as _register_converters


In [2]:
performance  = pd.read_csv('train/performance_train.csv')
facturation  = pd.read_csv('train/facturation_train.csv')
payments     = pd.read_csv('train/paiements_train.csv')
transactions = pd.read_csv('train/transactions_train.csv')

customer_ids = performance['ID_CPTE']

print('Proportion of clients who default:', sum(performance['Default'])/len(performance))

Proportion of clients who default: 0.19336134453781512


In [22]:
class ManualError(Exception):
    '''
    For any undesirable results that I feel like throwing
    '''
    pass


def duplicate_first(array):
    assert len(array) > 0

    return np.array([array[0]] + list(array))
        
        
def generate_clients(customer_ids, *dfs):

    for cus in customer_ids:
        yield [cus] + [df[df['ID_CPTE'] == cus] for df in dfs]

        
def normalize(array, cus):
    std = np.std(array)
    
    if all(elem == 0 for elem in array):
        return np.zeros(shape=[len(array)])
    
    return (array - np.mean(array))/std


def next_batch(batch_size, *data):

    start_index  = 0
    num_elements = len(x)
    
    while True:

        try:
            if ((start_index + batch_size) // num_elements) == 0:
                yield tuple(d[start_index: start_index + batch_size] for d in data)

            elif ((start_index + batch_size) // num_elements) == 1:
                raise ManualError
                
            elif ((start_index + batch_size)// num_elements) > 1:
                print('start_index', start_index)
                print('bs', batch_size)
                print('n', num_elements)
                print((start_index + batch_size)// num_elements)
                
                raise ValueError
                
            
            start_index = start_index + batch_size

        except (IndexError, ManualError) as e:
            yield tuple(np.array(list(d[start_index:]) + list(d[: (start_index + batch_size) % num_elements])) for d in data)
            start_index =  (start_index + batch_size) % num_elements
            
def create_batch_tuples(length, batch_size):
    output = np.arange(0, length, batch_size)

    return list(zip(output[:-1], output[1:]))

def inc_or_dec(array):
    prev_value = array[0]
    
    for elem in array:
        if elem > prev_value:
            return 1
        elif elem < prev_value:
            return -1
    
    return 0

In [4]:
class customer:
    
    def __init__(self, customer_id, performance, facturation, payments, transactions):
        
        self.customer_id  = customer_id
        self.performance  = performance
        self.facturation  = facturation.sort_values(by='PERIODID_MY')
        self.payments     = payments.sort_values(by='TRANSACTION_DTTM')
        self.transactions = transactions.sort_values(by=['TRANSACTION_DTTM'])
        
        self.assessment = list(performance['PERIODID_MY'])[0]
        self.default    = list(performance['Default'])[0]
        
        self.total_balance_change = (facturation['CurrentTotalBalance'].values[1:] - facturation['CurrentTotalBalance'].values[:-1])/100
        self.total_balance_rel    = normalize(facturation['CurrentTotalBalance'].values/facturation['CreditLimit'].values, self.customer_id)
        self.cash_balance_change  = (facturation['CashBalance'].values[1:] - facturation['CashBalance'].values[:-1])/100
        self.prop_to_interest     = 0
        
        self.total_balance_change = duplicate_first(self.total_balance_change)
        self.cash_balance_change  = duplicate_first(self.cash_balance_change)
        
        self.max_length = len(self.total_balance_change)
        assert self.max_length == len(self.total_balance_rel)
        assert self.max_length == len(self.cash_balance_change)
        
        self.max_length = len(self.total_balance_change)
        self.input_data = np.vstack((self.total_balance_change,
                                    self.total_balance_rel,
                                    self.cash_balance_change)).T
        
        self.inc_credit_limit  = inc_or_dec(self.facturation['CreditLimit'].values)
        self.delq_cycle        = (self.facturation['DelqCycle'] > 0).any()
        self.over_credit_limit = ((self.facturation['CurrentTotalBalance'].values - self.facturation['CreditLimit'].values) > 0).any()
        self.binary_data = np.array([self.inc_credit_limit,
                                     self.delq_cycle,
                                     self.over_credit_limit], dtype=float)
        
    def pad_input_data(self, length):
        if len(self.input_data) == length:
            pass
        else:
            '''
            len(self.input_data) in {13, 14} for all clients
            
            a bit awkward, but it works
            '''
            print('a')
            self.input_data = np.array([np.array([0, 0, 0])] + list(self.input_data) )
            assert len(self.input_data) == length
            
            self.max_length = len(self.input_data)

In [5]:
def get_customer(client_list, client_id):
    return [cl for cl in client_list if cl.customer_id == client_id][0]

In [6]:
'''
clients is a list of client objects, with their information collected
into instances
'''

client_generator = generate_clients(customer_ids, performance, facturation, payments, transactions)
clients = [customer(*client_info) for client_info in client_generator]
#test_client = customer(*next(client_generator))

In [7]:
max_length = 0

for cl in clients:
    if cl.max_length > max_length:
        print(max_length)
        max_length = cl.max_length

for cl in clients:
    '''
    Paddings for LSTM
    '''
    cl.pad_input_data(max_length)

0
63095782
a
89632033
a
19572191
a
13937077
a
13674165
a
40655521
a
52271077
a
93944458
a
58041541
a
92123955
a
16239838
a
76492913
a
53935809
a
76968418
a
77137298
a
75870127
a
70801413
a
54953280
a
94366450
a
74544693
a
79623132
a
57032740
a
18671987
a
96720381
a
12354480
a
94282476
a
64003419
a
55483984
a
10792372
a
52667024
a
59148803
a
75117297
a
63621034
a
38788092
a
19777498
a
42604423
a
89226857
a
92470612
a
94506444
a
85254607
a
92446268
a
67360911
a
35236172
a
88064779
a
30717593
a
61291709
a
16440317
a
73429064
a
87795476
a
76748158
a
75655385
a
27164284
a
56583098
a
88734393
a
50599840
a
25786707
a
88161490
a


In [8]:
def new_variable(shape, name):
    return tf.get_variable(name=name, 
                           shape=shape, 
                           dtype=tf.float32, 
                           initializer=tf.truncated_normal_initializer(stddev=0.02))
    
def flatten_layer(layer):
    layer_shape  = layer.get_shape().as_list()
    num_features = reduce(lambda x, y: x*y, layer_shape[1:4])
    layer_flat   = tf.reshape(layer, [-1, num_features])

    return layer_flat, num_features


def new_fc_layer(_input,          
                 num_inputs,     
                 num_outputs,
                 prefix,
                 use_relu=True,
                 batch_norm=False):
    
    with tf.variable_scope(prefix):
        weights = new_variable(shape=[num_inputs, num_outputs], name='fc_weights')
        biases  = new_variable(shape=[num_outputs], name='fc_bias')
    
    if batch_norm:
        layer = tf.contrib.layers.batch_norm(layer, epsilon=1e-5)
    
    if use_relu:
        layer = tf.nn.leaky_relu(tf.add(tf.matmul(_input, weights), biases))
    else:
        layer = tf.add(tf.matmul(_input, weights), biases)

    return layer

def conv_pool_1d_layer(_input,
                       filter_size, 
                       num_input_channels, 
                       num_output,
                       prefix,
                       max_pooling=True,
                       use_relu=True):
    
    '''
    If this function is called multiple times with the same variable names,
    ValueErrors will rise, hence the need of a prefix.
    '''
    assert isinstance(prefix, str)
    assert prefix != ''
    
    CONV_STRIDES = 1
    POOL_STRIDES = 1
    POOL_SIZE    = 2
    LEAKY_ALPHA  = 0.2

    with tf.variable_scope(prefix):
        conv_matrix = new_variable(name='conv_matrix', shape=[filter_size, num_input_channels, num_output])
        bias        = new_variable(name='bias', shape=[num_output])
        

    conv_layer = tf.nn.conv1d(value=_input,
                              filters=conv_matrix,
                              stride=CONV_STRIDES,
                              padding='SAME',
                              name='conv_layer')

    conv_layer += bias

    if max_pooling:
        pooled_layer = tf.layers.max_pooling1d(inputs=conv_layer,
                                               pool_size=POOL_SIZE,
                                               strides=POOL_STRIDES,
                                               name='pooled_layer')
    else:
        pooled_layer = conv_layer

    
    if use_relu:
        return tf.nn.leaky_relu(pooled_layer, alpha=LEAKY_ALPHA)
    
    return pooled_layer

### Definition of Train and Test Data

In [10]:
x_train = np.array([c.input_data for c in clients])
y_train = performance['Default'].values
y_train = np.vstack((y_train, np.ones(shape=[len(y_train)]) - y_train)).T

binary_x_train = np.array([c.binary_data for c in clients])

In [26]:
def predictor(x_train, y_train, binary_x_train, batch_size):
    '''
    1 convolution layer + LSTM + 2 fc layers
    '''
    
    NUM_FEEDS = 30000
    
    _, max_length_x, num_features  = x_train.shape
    _, num_binary = binary_x_train.shape
    num_hidden_cells = 4
    
    max_length_cb = 3
    
    tf.reset_default_graph()
    
    x = tf.placeholder(dtype=tf.float32, shape=[batch_size, max_length_x, num_features])
    y = tf.placeholder(dtype=tf.float32, shape=[batch_size, 2])
    binary = tf.placeholder(dtype=tf.float32, shape=[batch_size, num_binary])
    
    
    #cash_balance = tf.placeholder(dtype=tf.float32, shape=[batch_size, max_length_cb, 2])
    
    conv_x = conv_pool_1d_layer(_input=x,
                                filter_size=3, 
                                num_input_channels=num_features,
                                num_output=8,
                                prefix='conv_x')
    

    
    #conv_input = tf.concat([conv_x, conv_cb], axis=1)
    
    lstm_cell     = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.BasicLSTMCell(num_units=num_hidden_cells)])
    initial_state = lstm_cell.zero_state(batch_size, dtype=tf.float32)
    output, state = tf.nn.dynamic_rnn(lstm_cell, inputs=x, initial_state=initial_state)
    
    flattened_output, num_features_flatten = flatten_layer(output)
    
    
    with_binary = tf.concat([flattened_output, binary], axis=1)
    '''
    can add other discrete/binary variables here
    '''
    fc_1 = new_fc_layer(with_binary, num_features_flatten + num_binary, 16, 'fc_1')
    fc   = new_fc_layer(fc_1, 16, 2, 'fc')
    
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=fc)
    cost          = tf.add(tf.reduce_mean(cross_entropy), 0.7*tf.reduce_mean(tf.slice(fc, [0, 1], [batch_size, 1])))
    optimizer     = tf.train.AdamOptimizer(learning_rate=1e-5).minimize(cost)
    
    predicted = tf.argmax(fc, axis=1)
    accuracy  = tf.reduce_mean(tf.cast(tf.equal(predicted, tf.argmax(y, axis=1)), dtype='float'))
    
    batch_generator    = next_batch(batch_size, (x_train, y_train, binary_x_train))
    train_batch_tuples = create_batch_tuples(len(x_train), batch_size)
    
    predicted_train = []
    
    with tf.Session() as sess:
        
        sess.run(tf.global_variables_initializer())

        for i in range(NUM_FEEDS):
            x_next, y_next, binary_next = next(batch_generator)
                
            _ = sess.run(optimizer, feed_dict={x: x_next, 
                                               y: y_next,
                                               binary: binary_next})
                
            if i % 1000 == 0:

                batch_accuracies = []

                for (start, end) in train_batch_tuples:
                    acc = sess.run(accuracy, feed_dict={x: x_train[start: end], 
                                                        y: y_train[start: end],
                                                        binary: binary_x_train[start: end]})

                    batch_accuracies.append(acc)



                print('Step {}: accuracy = {}%'.format(i, np.mean(acc)))
            
            
        for (start, end) in train_batch_tuples:
            y_pred = sess.run(predicted, feed_dict={x: x_train[start: end], 
                                                    y: y_train[start: end],
                                                    binary: binary_x_train[start: end]})
            predicted_train.append(y_pred)
            
    print('\nFinished Training')
    return np.array(predicted_train)

In [27]:
predictor(x_train, y_train, binary_x_train, 256)

TypeError: object of type 'Tensor' has no len()