## Variables Chosen

The four predictor variables are as followed:
- If there exists a ```DelqCycle``` $>0$;
- If there exists an instance where ```CurrentTotalBalance``` $>$ ```CreditLimit```;
- 2nd largest decrease in ```CurrentTotalBalance``` from statement to statement;
- Max number of days where ```CurrentTotalBalance``` is strictly increasing

In [233]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.model_selection import train_test_split

import os

%matplotlib inline

In [2]:
performance  = pd.read_csv('performance_train.csv')
facturation  = pd.read_csv('facturation_train.csv')
payments     = pd.read_csv('paiements_train.csv')
transactions = pd.read_csv('transactions_train.csv')

customer_ids = performance['ID_CPTE']

print('Proportion of clients who default:', sum(performance['Default'])/len(performance))

Proportion of clients who default: 0.19336134453781512


In [20]:
class customer:
    
    def __init__(self, customer_id, performance, facturation, payments, transactions):
        
        self.customer_id  = customer_id
        self.performance  = performance
        self.facturation  = facturation
        self.payments     = payments
        self.transactions = transactions
        
        self.assessment = list(performance['PERIODID_MY'])[0]
        self.default    = list(performance['Default'])[0]
        
def generate_clients(customer_ids, *dfs):

    for cus in customer_ids:
        yield [cus] + [df[df['ID_CPTE'] == cus] for df in dfs]
        
client_generator = generate_clients(customer_ids, performance, facturation, payments, transactions)
clients = [customer(*client_info) for client_info in client_generator]

In [200]:
y_data = performance['Default'].values.reshape(-1, 1)

In [312]:
def get_max_consecutive_true(array):
    try:
        output = max([sum(g) for b, g in itertools.groupby(array) if b])
    except ValueError:
        return 0
    return output


def create_x_train(performance, facturation, payments, transactions):
    '''
    can use for 
    '''
    second_biggest_payment  = []
    relative_cash_balance   = []
    longest_streak_increase = []
    
    customer_ids = performance['ID_CPTE']

    client_generator = generate_clients(customer_ids, performance, facturation, payments, transactions)
    clients = [customer(*client_info) for client_info in client_generator]
    
    for cl in clients:
        ctb = cl.facturation.sort_values(by='PERIODID_MY')['CurrentTotalBalance'].values
        second_biggest_payment.append(np.sort(ctb[:-1] - ctb[1:])[-2])

        rel_cb = max(cl.facturation['CashBalance']/cl.facturation['CreditLimit'])
        relative_cash_balance.append(rel_cb > 0)

        longest_streak_increase.append(get_max_consecutive_true((ctb[1:] - ctb[:-1]) > 0))

    over_limit_people = set(facturation[facturation['CurrentTotalBalance'] > facturation['CreditLimit']]['ID_CPTE'])
    has_delqcycle     = set(facturation[facturation['DelqCycle'] > 0]['ID_CPTE'])
        
    over_limit_one_hot = np.array([cl in over_limit_people for cl in performance['ID_CPTE']], dtype=np.float32)
    has_delq_one_hot   = np.array([cl in has_delqcycle for cl in performance['ID_CPTE']], dtype=np.float32)
    
    x_data = np.vstack((second_biggest_payment,
                     relative_cash_balance,
                     longest_streak_increase,
                     over_limit_one_hot,
                     has_delq_one_hot)).T
    
    return x_data

In [313]:
train_data, test_data = train_test_split(list(zip(x_data, y_data)))

x_train, y_train = zip(*train_data)
x_test, y_test   = zip(*test_data)

x_train = np.array(x_train)
y_train = np.array(y_train)
x_test  = np.array(x_test)
y_test  = np.array(y_test)

In [314]:
y_predicted = {}

logistic_regression_classifier = LR()
lrc = logistic_regression_classifier.fit(x_train, y_train.reshape(-1))

gradient_boosting_classifier = GBC()
gbc = gradient_boosting_classifier.fit(x_train, y_train.reshape(-1))

decision_tree_classifier = DTC()
dtc = decision_tree_classifier.fit(x_train, y_train.reshape(-1))

In [315]:
performance_test  = pd.read_csv('../test/performance_test.csv')
facturation_test  = pd.read_csv('../test/facturation_test.csv')
payments_test     = pd.read_csv('../test/paiements_test.csv')
transactions_test = pd.read_csv('../test/transactions_test.csv')

submission_test_data = create_x_train(performance_test, facturation_test, payments_test, transactions_test)

In [316]:
submission = pd.read_csv('sample_solution.csv')

In [317]:
y_test_lrc = lrc.predict(submission_test_data)
y_test_gbc = gbc.predict(submission_test_data)
y_test_dtc = dtc.predict(submission_test_data)

In [329]:
submission['Default'] = np.array((y_test_lrc + y_test_gbc + y_test_dtc)/3 > 0.5, dtype=int)
submission.to_csv('submission_1.csv')

### Neural Network

Training the neural network below doesn't really seem to work...

In [195]:
def create_batch_tuples(length, batch_size):
    output = np.arange(0, length, batch_size)
    
    if output[-1] != length:
        output = np.append(output, length)
        
    return list(zip(output[:-1], output[1:]))


def simple_neural_network(x_train, y_train, x_test, batch_size):
    tf.reset_default_graph()
    
    _, x_dim = np.shape(x_train)
    _, y_dim = np.shape(y_train)
    
    NUM_HIDDEN = 10
    EPOCHS     = 50
    
    x = tf.placeholder(dtype=tf.float32, shape=[None, x_dim])
    y = tf.placeholder(dtype=tf.float32, shape=[None, y_dim])
    
    w_1 = tf.get_variable(name='w_1', shape=[x_dim, NUM_HIDDEN], initializer=tf.truncated_normal_initializer(stddev=0.02))
    b_1 = tf.get_variable(name='b_1', shape=[NUM_HIDDEN], initializer=tf.truncated_normal_initializer(stddev=0.02))
    
    hidden = tf.matmul(x, w_1) + b_1
    hidden = tf.nn.leaky_relu(hidden)
    
    w_2 = tf.get_variable(name='w_2', shape=[NUM_HIDDEN, y_dim], initializer=tf.truncated_normal_initializer(stddev=0.02))
    b_2 = tf.get_variable(name='b_2', shape=[y_dim], initializer=tf.truncated_normal_initializer(stddev=0.02))
    
    predicted = tf.matmul(hidden, w_2) + b_2
    y_pred = tf.sigmoid(predicted)
    
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_pred, labels=y)
    cost          = tf.reduce_mean(cross_entropy)
    optimizer     = tf.train.AdamOptimizer(1e-2).minimize(cost)
    
    batches = create_batch_tuples(len(x_train), batch_size)
    batches_test = create_batch_tuples(len(x_test), batch_size)
    predicted_values = []
    
    with tf.Session() as sess:
        
        sess.run(tf.global_variables_initializer())
        
        for e in range(EPOCHS):
            
            for (start, end) in batches:
                _ = sess.run(optimizer, feed_dict={x: x_train[start: end],
                                                   y: y_train[start: end]})
            
            predicted_values = []
            
            for (start, end) in batches:
                predicted_values.append(y_pred.eval({x: x_train[start: end]}))
                
            predicted_values = np.around(np.concatenate(predicted_values))
            print('Epoch {}, accuracy = {}%'.format(e, np.mean(predicted_values == y_train.reshape(-1,))*100))
            
        predicted_values = []    
        
        for (start, end) in batches_test:
            predicted_values.append(y_pred.eval({x: x_test[start: end]}))
        
        predicted_values = np.around(np.concatenate(predicted_values))

        
    return predicted_values

In [196]:
y_pred = simple_neural_network(x_train, y_train, x_train, 128)

Epoch 0, accuracy = 80.31342136854742%
Epoch 1, accuracy = 80.35980368618036%
Epoch 2, accuracy = 80.45256832144622%
Epoch 3, accuracy = 80.45772191229433%
Epoch 4, accuracy = 80.45256832144622%
Epoch 5, accuracy = 80.45256832144622%
Epoch 6, accuracy = 80.44741473059813%
Epoch 7, accuracy = 80.44741473059813%
Epoch 8, accuracy = 80.44741473059813%
Epoch 9, accuracy = 80.44741473059813%
Epoch 10, accuracy = 80.44741473059813%
Epoch 11, accuracy = 80.45256832144622%
Epoch 12, accuracy = 80.45772191229433%
Epoch 13, accuracy = 80.45772191229433%


KeyboardInterrupt: 

In [207]:
train_data, test_data = train_test_split(list(zip(x_data, y_data)))

In [220]:
x_train, y_train = zip(*train_data)
x_test, y_test   = zip(*test_data)

x_train = np.array(x_train)
y_train = np.array(y_train)
x_test  = np.array(x_test)
y_test  = np.array(y_test)