In [None]:
# LIBRARIES

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import h5py
import scipy

from scipy import ndimage
from sklearn import preprocessing
import math


import tensorflow as tf
from tensorflow.python.framework import ops

from sklearn.model_selection import train_test_split

In [None]:
# CREATE DATASETS
# Load the raw credit-card data, min-max scale the feature columns, split it
# 80/20 into train/test and export the four arrays as header-less CSV files.

path = "./Data/Taiwan_credit_card.csv"
data_set = pd.read_csv(path, header=0)
# Vectorised conversion; values that cannot be parsed become NaN (errors='coerce').
data_set['Default'] = pd.to_numeric(data_set['Default'], downcast='integer', errors='coerce')


# Normalize columns to [0,1] range
cols_to_norm = ['LIMIT_BAL','SEX','EDUCATION','MARRIAGE','AGE','PAY_1','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6',
                'BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6',
                'PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']

# NOTE(review): min/max are computed on the full data set *before* the split,
# so test-set statistics leak into the scaling of the training data.
data_set[cols_to_norm] = data_set[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

train_df, test_df = train_test_split(data_set, test_size=0.2, random_state=2018, shuffle=True)


# Split the data
# X keeps the 'ID' column in position 0; it is sliced off when the model is called.
# `.values` replaces DataFrame.as_matrix, which no longer exists in pandas >= 1.0.
X_test = test_df.loc[:, 'ID':'PAY_AMT6'].values
X_train = train_df.loc[:, 'ID':'PAY_AMT6'].values
Y_test = test_df.loc[:, 'Default'].values.reshape(-1, 1)
Y_train = train_df.loc[:, 'Default'].values.reshape(-1, 1)


# Export datasets (np.savetxt writes no header row)
np.savetxt("./Data/X_train.csv", X_train, delimiter=",")
np.savetxt("./Data/Y_train.csv", Y_train, delimiter=",")
np.savetxt("./Data/X_test.csv", X_test, delimiter=",")
np.savetxt("./Data/Y_test.csv", Y_test, delimiter=",")


In [None]:
# read in data
# The CSV files are written with np.savetxt and therefore have no header row.
# header=None keeps every line as data; header=0 would silently consume the
# first sample of each file as column names.

path_X_train = "./Data/X_train.csv"
X_train = pd.read_csv(path_X_train, header=None)
path_Y_train = "./Data/Y_train.csv"
Y_train = pd.read_csv(path_Y_train, header=None)

path_X_test = "./Data/X_test.csv"
X_test = pd.read_csv(path_X_test, header=None)
path_Y_test = "./Data/Y_test.csv"
Y_test = pd.read_csv(path_Y_test, header=None)


In [None]:
# Convert the loaded frames to NumPy arrays and force the labels into column vectors.
# np.asarray replaces DataFrame.as_matrix (no longer available in pandas >= 1.0)
# and also accepts an ndarray, so re-running this cell is harmless.
X_test = np.asarray(X_test)
X_train = np.asarray(X_train)
Y_test = np.asarray(Y_test)
Y_train = np.asarray(Y_train)
Y_test = np.reshape(Y_test, [Y_test.shape[0], 1])
Y_train = np.reshape(Y_train, [Y_train.shape[0], 1])


In [None]:
def create_placeholders(n_x, n_y):
    """
    Builds the two float32 placeholders that feed data into the graph.

    Arguments:
    n_x -- scalar, number of input features
    n_y -- scalar, width of the label vector

    Returns:
    X -- tf.float32 placeholder of shape (None, n_x) for the input data
    Y -- tf.float32 placeholder of shape (None, n_y) for the labels
    """

    # The leading None leaves the batch dimension open.
    feature_shape, label_shape = (None, n_x), (None, n_y)

    X = tf.placeholder(tf.float32, shape=feature_shape)
    Y = tf.placeholder(tf.float32, shape=label_shape)
    return X, Y

In [None]:
def initialize_parameters(layer_dims):
    """
    Creates one weight matrix and one bias row per layer transition.

    Arguments:
    layer_dims -- python list with the width of every layer, input layer first

    Returns:
    parameters -- python dictionary with keys "W1", "b1", ..., "WL", "bL":
                    Wl -- Xavier-initialised variable of shape (layer_dims[l-1], layer_dims[l])
                    bl -- zero-initialised variable of shape (1, layer_dims[l])
    """

    parameters = {}

    # Walk over consecutive (input width, output width) pairs, numbering layers from 1.
    for idx, (fan_in, fan_out) in enumerate(zip(layer_dims[:-1], layer_dims[1:]), start=1):
        w_name, b_name = 'W' + str(idx), 'b' + str(idx)

        weights = tf.get_variable(w_name, [fan_in, fan_out],
                                  initializer=tf.contrib.layers.xavier_initializer(), dtype=tf.float32)
        biases = tf.get_variable(b_name, [1, fan_out],
                                 initializer=tf.zeros_initializer(), dtype=tf.float32)

        assert(weights.shape == (fan_in, fan_out))
        assert(biases.shape == (1, fan_out))

        parameters[w_name] = weights
        parameters[b_name] = biases

    return parameters

In [None]:
def forward_propagation(X, parameters, rate):
    """
    Implements the forward pass [LINEAR -> RELU -> DROPOUT] x (L-1) -> LINEAR.

    The number of layers L is derived from `parameters` (one W and one b per
    layer), so the function works for any depth produced by
    initialize_parameters, including the four-layer network used by model().

    Arguments:
    X -- input placeholder/tensor of shape (number of examples, n_x)
    parameters -- python dictionary containing "W1", "b1", ..., "WL", "bL";
                  the shapes are given in initialize_parameters
    rate -- indexable collection/tensor of dropout rates, rate[l-1] being the
            fraction of units dropped after hidden layer l (0.0 disables dropout)

    Returns:
    Z -- output of the last LINEAR unit (logits; no sigmoid applied)
    """

    num_layers = len(parameters) // 2

    activation = X
    # Hidden layers: affine transform, ReLU, then dropout with the layer's own rate.
    for l in range(1, num_layers):
        Z = tf.add(tf.matmul(activation, parameters['W' + str(l)]), parameters['b' + str(l)])
        activation = tf.nn.dropout(tf.nn.relu(Z), rate = rate[l - 1])

    # Output layer stays linear; the sigmoid is applied inside the loss / at prediction time.
    return tf.add(tf.matmul(activation, parameters['W' + str(num_layers)]),
                  parameters['b' + str(num_layers)])


In [None]:
def compute_cost(Z4, Y):
    """
    Mean sigmoid cross-entropy between the network logits and the labels.

    Arguments:
    Z4 -- logits produced by forward propagation (output of the last LINEAR unit)
    Y -- "true" labels placeholder, same shape as Z4

    Returns:
    cost -- scalar tensor holding the mean of the element-wise loss
    """

    elementwise_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels = Y, logits = Z4)
    return tf.reduce_mean(elementwise_loss)

In [None]:
def random_mini_batches(X, Y, mini_batch_size = 64, seed = 0):
    """
    Shuffles (X, Y) row-wise with a seeded permutation and cuts them into mini-batches.

    Arguments:
    X -- input data, of shape (number of examples, input size)
    Y -- label array, of shape (number of examples, number of label columns)
    mini_batch_size -- size of the mini-batches, integer
    seed -- value passed to np.random.seed (the global NumPy seed) so the shuffle is repeatable

    Returns:
    mini_batches -- list of (mini_batch_X, mini_batch_Y) tuples whose rows stay aligned;
                    the last batch is smaller when the example count is not a
                    multiple of mini_batch_size
    """

    n_examples = X.shape[0]
    np.random.seed(seed)

    # Apply one shared permutation to X and Y so every row keeps its label.
    order = list(np.random.permutation(n_examples))
    X_shuffled = X[order, :]
    Y_shuffled = Y[order, :].reshape((n_examples, Y.shape[1]))

    # Full-size batches.
    n_full = math.floor(n_examples / mini_batch_size)
    batches = [
        (X_shuffled[start : start + mini_batch_size, :],
         Y_shuffled[start : start + mini_batch_size, :])
        for start in range(0, n_full * mini_batch_size, mini_batch_size)
    ]

    # Left-over rows form a final, shorter batch.
    if n_examples % mini_batch_size != 0:
        tail_start = n_full * mini_batch_size
        batches.append((X_shuffled[tail_start : n_examples, :],
                        Y_shuffled[tail_start : n_examples, :]))

    return batches

In [None]:
def model(X_train, Y_train, X_test, Y_test, learning_rate = 0.001,
          num_epochs = 100, minibatch_size = 32, print_cost = True):
    """
    Trains a fully connected tensorflow network
    [LINEAR -> RELU -> DROPOUT] x 3 -> LINEAR -> SIGMOID with Adam and
    reports train/test accuracy.

    Arguments:
    X_train -- training features, of shape (number of training examples, n_x)
    Y_train -- training labels, of shape (number of training examples, n_y)
    X_test -- test features, of shape (number of test examples, n_x)
    Y_test -- test labels, of shape (number of test examples, n_y)
    learning_rate -- learning rate of the optimization
    num_epochs -- number of epochs of the optimization loop
    minibatch_size -- size of a minibatch
    print_cost -- True to print the cost every 100 epochs

    Returns:
    parameters -- dictionary of the learnt weights/biases as NumPy arrays
    test_predictions -- sigmoid scores for X_test (dropout disabled), shape (number of test examples, n_y)
    """

    ops.reset_default_graph()                         # to be able to rerun the model without overwriting tf variables
    seed = 3                                          # to keep consistent results
    tf.set_random_seed(seed)                          # seed weight init and dropout, not just the minibatch shuffling
    (m, n_x) = X_train.shape                          # (m : number of examples in the train set, n_x: input size)
    n_y = Y_train.shape[1]                            # n_y : output size
    rate_test = [0.0, 0.0, 0.0]                       # dropout disabled for evaluation
    rate_train = [1-0.35, 0.5, 1-0.75]                # per-layer dropout rates used while training
    costs = []                                        # To keep track of the cost

    # Create placeholders for the features, the labels and the dropout rates
    X, Y = create_placeholders(n_x, n_y)
    rate = tf.placeholder(tf.float32)

    # Initialize parameters
    layer_dims = [n_x, 100, 50, 10, n_y]
    parameters = initialize_parameters(layer_dims)

    # Forward propagation: Build the forward propagation in the tensorflow graph
    Z4 = forward_propagation(X, parameters, rate)

    # Cost function: Add cost function to tensorflow graph
    cost = compute_cost(Z4, Y)

    # Backpropagation: Define the tensorflow optimizer. Use an AdamOptimizer.
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)
    gvs = optimizer.compute_gradients(cost)
    train_op = optimizer.apply_gradients(gvs)

    # Evaluation ops: threshold the sigmoid score at 0.5 and compare with the labels
    raw_prediction = tf.sigmoid(Z4)
    prediction = raw_prediction > 0.5
    correct_prediction = tf.equal(tf.cast(prediction, tf.int32), tf.cast(Y, tf.int32))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    # Initialize all the variables
    init = tf.global_variables_initializer()

    # Start the session to compute the tensorflow graph
    with tf.Session() as sess:

        # Run the initialization
        sess.run(init)

        # Do the training loop
        for epoch in range(num_epochs):
            epoch_cost = 0.                       # Defines a cost related to an epoch

            # A new seed per epoch gives a different shuffle each time
            seed = seed + 1
            minibatches = random_mini_batches(X_train, Y_train, minibatch_size, seed)

            for (minibatch_X, minibatch_Y) in minibatches:
                # Run one optimisation step and fetch the cost of this minibatch
                _ , minibatch_cost = sess.run([train_op, cost], feed_dict={X: minibatch_X, Y: minibatch_Y, rate:rate_train})

                # Average over the batches actually processed (includes the final partial batch)
                epoch_cost += minibatch_cost / len(minibatches)

            # Print the cost every 100 epochs
            if print_cost and epoch % 100 == 0:
                print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
            # Record the cost every 5 epochs so the learning curve below is never empty
            if epoch % 5 == 0:
                costs.append(epoch_cost)

        # plot the cost
        plt.plot(np.squeeze(costs))
        plt.ylabel('cost')
        plt.xlabel('epochs (per fives)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

        # lets save the parameters in a variable
        parameters = sess.run(parameters)
        print ("Parameters have been trained!")

        # Accuracy is measured with dropout switched off for BOTH sets; evaluating the
        # training set with the training rates would score a randomly thinned network.
        print ("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train, rate:rate_test}))
        print ("Test Accuracy:", accuracy.eval({X: X_test, Y: Y_test, rate:rate_test}))

        return parameters, raw_prediction.eval({X: X_test, Y: Y_test, rate:rate_test})

In [None]:
# Train on every column except column 0 (the ID) and keep the learnt weights plus the test-set sigmoid scores.
parameters, test_predictions = model(
    X_train=X_train[:, 1:],
    Y_train=Y_train,
    X_test=X_test[:, 1:],
    Y_test=Y_test,
)

In [None]:
# SAVE PREDICTIONS
# Write the test-set sigmoid scores to the comma-delimited text file 'test_nn'.

np.savetxt(fname='test_nn', X=test_predictions, delimiter=',')


In [None]:
# ROC CURVE AND AUC FOR THE TEST-SET PREDICTIONS

import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
# `scipy.interp` was a deprecated alias of numpy.interp and cannot be imported
# from recent SciPy releases; importing it from NumPy keeps the name available.
from numpy import interp


# Binarize the output; with two classes label_binarize yields a single column
y = label_binarize(Y_test, classes=[0, 1])
y_score = test_predictions
n_classes = y.shape[1]


# Compute ROC curve and ROC area for each class (column of y)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _thresholds = roc_curve(y[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])




In [None]:
# CONFUSION MATRIX

from sklearn.metrics import confusion_matrix

# Threshold the sigmoid scores at 0.5 into a NEW array. Assigning
# `ypreds = test_predictions` and editing it in place would also overwrite
# test_predictions, which later cells still need as raw scores.
ypreds = (test_predictions > 0.5).astype(test_predictions.dtype)

# labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted,
# so the four-way unpacking below cannot fail.
tn, fp, fn, tp = confusion_matrix(Y_test, ypreds, labels=[0, 1]).ravel()

In [None]:
# PRECISION: fraction of predicted positives that are true positives, TP / (TP + FP).
# tp and fp come from the unpacked confusion matrix of the CONFUSION MATRIX cell.

tp/(tp+fp)

In [None]:
# RECALL: fraction of actual positives that were predicted positive, TP / (TP + FN).
# tp and fn come from the unpacked confusion matrix of the CONFUSION MATRIX cell.

tp/(tp+fn)

In [None]:
# ACCURACY: correct predictions over all predictions.
# The denominator is taken from the confusion matrix itself instead of a
# hard-coded sample count, so it stays right if the test-set size changes.

(tp+tn)/(tp+tn+fp+fn)

In [None]:
# ROC, AUC, GINI
# Draw the ROC curve stored under key 0 against the diagonal of a random classifier
# and annotate the figure with AUC and Gini (= 2 * AUC - 1).

lw = 1
summary_text = 'AUC = %0.3f\nGini = %0.3f' % (roc_auc[0], 2*roc_auc[0]-1)

plt.figure()
plt.plot(fpr[0], tpr[0], color='navy', lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='red', lw=lw, linestyle='-', label='Random guess benchmark')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')

plt.text(0.25, 0.5, summary_text, color='navy')
plt.legend(loc="lower right")
plt.show()


In [None]:
# AUC: area under the ROC curve, stored under key 0 of the roc_auc dictionary.

roc_auc[0]

In [None]:
# OCCLUSION TEST #4526, 641

In [None]:
# TP AND TN EXAMPLES
# Display one test row (ID in column 0, features after it) for inspection.

X_test[5736, :] # per the author: row 5736 is a true positive, row 4669 a true negative

In [None]:
# MEAN VALUES TO REPLACE EACH FEATURE IN TURN WITH
# Column means over train and test rows together, kept as a 1 x 24 row vector.

X = np.concatenate((X_train, X_test), axis=0)
xm = X.mean(axis=0).reshape(1, 24)
xm

In [None]:
# MODIFIED SIGMOID SCORE FOR EACH FEATURE
# For one test example, replace each feature in turn with its mean (xm) and
# record the network's sigmoid output for the perturbed row.

EXAMPLE_ROW = 641                       # row of X_test being explained
n_columns = X_test.shape[1]             # column 0 is the ID, the remaining columns are model inputs
n_layers = len(parameters) // 2         # `parameters` holds one W and one b per layer

score = []

# The inference ops live in their own throw-away graph and are built once, so
# re-running this cell does not keep appending copies of the network to the
# default graph.
with tf.Graph().as_default():
    features = tf.placeholder(tf.float32, shape=(None, n_columns - 1))

    # Same forward pass as in training, without dropout: [LINEAR -> RELU] x (L-1) -> LINEAR -> SIGMOID
    activation = features
    for l in range(1, n_layers):
        activation = tf.nn.relu(tf.add(tf.matmul(activation, parameters['W' + str(l)]), parameters['b' + str(l)]))
    probability = tf.sigmoid(tf.add(tf.matmul(activation, parameters['W' + str(n_layers)]), parameters['b' + str(n_layers)]))

    with tf.Session() as sess:
        for i in range(1, n_columns):
            # Work on a fresh copy of the row so X_test itself is never modified
            occluded = np.array(X_test[EXAMPLE_ROW, :], dtype=float).reshape(1, n_columns)
            occluded[0, i] = xm[0, i]
            score.append(sess.run(probability, feed_dict={features: occluded[:, 1:]}))

# One score per occluded feature, in feature order
flat_list = [value for scores in score for value in np.ravel(scores)]
print(flat_list)

In [None]:
# PLOT THE TP AND TN EXAMPLES ON THE HEATMAP
# One-row heatmap of the per-feature occlusion scores held in flat_list.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

feature_labels = [
    "LIMIT_BAL", "GENDER", "EDUCATION", "MARITAL_STATUS", "AGE",
    "PAY_1", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6",
    "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
    "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6",
]
df = pd.DataFrame({"TN [0.227]": flat_list}, index=feature_labels)

fig, ax = plt.subplots(figsize=(20,2))
sns.heatmap(df.T, annot=True, annot_kws={"size":10}, fmt=".3f", cmap='Blues',
            linewidth=2, linecolor='white', cbar=True, ax=ax)


# PROFILE

In [None]:
# MEAN VALUES TO REPLACE EACH FEATURE IN TURN WITH

X = np.concatenate([X_train, X_test], axis=0)
xm = np.mean(X, axis = 0)
xm = np.reshape(xm, [1, X.shape[1]])

# MODIFIED SIGMOID SCORE OF EVERY TEST EXAMPLE FOR EACH OCCLUDED FEATURE
n_columns = X_test.shape[1]             # column 0 is the ID, the remaining columns are model inputs
n_layers = len(parameters) // 2         # `parameters` holds one W and one b per layer

score = []

# Build the dropout-free forward pass once in its own graph and feed it the
# occluded test matrix for every feature.
with tf.Graph().as_default():
    features = tf.placeholder(tf.float32, shape=(None, n_columns - 1))

    activation = features
    for l in range(1, n_layers):
        activation = tf.nn.relu(tf.add(tf.matmul(activation, parameters['W' + str(l)]), parameters['b' + str(l)]))
    probability = tf.sigmoid(tf.add(tf.matmul(activation, parameters['W' + str(n_layers)]), parameters['b' + str(n_layers)]))

    with tf.Session() as sess:
        for i in range(1, n_columns):
            # Copy first: `X = X_test` only creates an alias, so writing into it would
            # permanently alter X_test. The mean must replace feature i in EVERY row
            # (column assignment), not just in row 0.
            occluded = X_test.copy()
            occluded[:, i] = xm[0, i]
            score.append(sess.run(probability, feed_dict={features: occluded[:, 1:]}))

# Feature-major flat list: all examples for feature 1, then all examples for feature 2, ...
flat_list = [value for scores in score for value in np.ravel(scores)]
print(len(flat_list))

In [None]:
# ADD IDs to MODIFIED SIGMOID SCORES FOR EACH FEATURE

# flat_list is feature-major (all examples of feature 1, then feature 2, ...).
# It therefore has to be reshaped to (n_features, n_samples) and transposed;
# reshaping straight to (n_samples, n_features) would mix scores of different
# examples within one row.
n_samples = X_test.shape[0]
new_var = np.reshape(flat_list, [-1, n_samples]).T
X = X_test
# Column 0 of X_test is the ID; keep it 2-D so it can be concatenated column-wise.
new_array = np.concatenate([X[:, [0]], new_var], axis=1)

In [None]:
# COMPUTE MODIFIED QII: per-feature difference between the occluded score and the original score.
# NOTE(review): the expression evaluates MODIFIED SIGMOID - SIGMOID_ORIGINAL, the opposite sign of
# the "SIGMOID_ORIGINAL - MODIFIED SIGMOID" definition this cell was originally labelled with — confirm which is intended.
# NOTE(review): assumes test_predictions still holds raw sigmoid scores of shape (n_samples, 1) — verify
# it has not been thresholded in place by an earlier cell.

p = new_array[:,1:] - test_predictions


In [None]:
# ASSIGN CLUSTERS, K=3

from sklearn.cluster import KMeans

# random_state fixes the centroid initialisation so cluster labels are
# reproducible between runs (the elbow-method cell seeds KMeans the same way).
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(p)
y_kmeans = kmeans.predict(p)

In [None]:
# PLOT CLUSTERS FOR 2 FEATURES

# Columns of p shown on the x and y axes; the cluster centres must be drawn
# from the SAME two columns, otherwise they are plotted in a different plane
# than the points.
FEATURE_X, FEATURE_Y = 6, 10

plt.scatter(p[:, FEATURE_X], p[:, FEATURE_Y], c=y_kmeans, s=50, cmap='viridis')

centers = kmeans.cluster_centers_
plt.scatter(centers[:, FEATURE_X], centers[:, FEATURE_Y], c='red', s=200, alpha=0.5);

In [None]:
# ELBOW METHOD
# Within-cluster sum of squares (inertia) for k = 1..10.

wcss = []
for i in range(1, 11):
    # Use a separate name so the fitted k=3 model stored in `kmeans` is not
    # overwritten by the last (k=10) model of this sweep.
    elbow_model = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    elbow_model.fit(p)
    wcss.append(elbow_model.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# TSNE on DATASET

from sklearn.manifold import TSNE
import random

# Append the cluster label as the last column; shapes are derived from the
# data instead of being hard-coded.
comp_array = np.concatenate([p, np.reshape(y_kmeans, [-1, 1])], axis=1)

# Embed only the score columns (everything except the trailing cluster label).
# random_state makes the embedding reproducible.
tsneData = TSNE(n_components=2, random_state=0).fit_transform(comp_array[:, :-1])

In [None]:
# PLOT RESULTS
# Scatter plot of the 2-D t-SNE embedding in a single colour.

import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 5, 5
fig, ax = plt.subplots()
# No cmap here: a colormap is ignored when `c` is a literal colour and only
# produces a matplotlib warning.
ax.scatter(tsneData[:,0],
           tsneData[:,1],
           c = 'g',
           s=100);

In [None]:
# COLOURCODE BY CLUSTER ASSIGNMENT
# Map every cluster label (last column of comp_array) to an RGBA colour on the viridis scale.

import matplotlib.cm as cm

colvals = list(comp_array[:, -1])
minima, maxima = min(colvals), max(colvals)

norm = matplotlib.colors.Normalize(vmin=minima, vmax=maxima, clip=True)
mapper = cm.ScalarMappable(norm=norm, cmap=cm.viridis)
mycolors = list(map(mapper.to_rgba, colvals))

In [None]:
# t-SNE embedding coloured by cluster: one scatter call per cluster label.
rcParams['figure.figsize'] = 15, 15
fig, ax = plt.subplots()

cluster_labels = comp_array[:, -1]
for cluster, colour in zip((0, 1, 2), ('b', 'g', 'r')):
    members = cluster_labels == cluster        # boolean mask selecting the rows of this cluster
    # No cmap here: a colormap is ignored when `c` is a literal colour.
    ax.scatter(tsneData[members, 0],
               tsneData[members, 1],
               c = colour,
               s=40)


In [None]:
# Prepend the ID column (kept 2-D via list indexing, no hard-coded row count) to the
# per-feature scores and the cluster label.
final = np.concatenate([new_array[:, [0]], comp_array], axis=1)

In [None]:
# IDs (column 0 of `final`) of the rows whose cluster label (column 24) is 0, 1 and 2 respectively.
final_0, final_1, final_2 = (
    final[np.where(final[:, 24] == cluster), 0] for cluster in (0, 1, 2)
)

In [None]:
# Export the ID list of each cluster to its own comma-delimited file.
np.savetxt(fname="./Data/fianl0.csv", X=final_0, delimiter=",")
np.savetxt(fname="./Data/fianl1.csv", X=final_1, delimiter=",")
np.savetxt(fname="./Data/fianl2.csv", X=final_2, delimiter=",")

In [None]:
# Select the rows of `final` whose column 24 (cluster label) equals 0.
# NOTE(review): despite the name, no averaging is performed here; indexing with the tuple
# returned by np.where also appears to add a leading axis to the result — confirm intent.
average_0 = final[np.where(final[:,24]==0),:]

In [None]:
# Export the complete `final` table (IDs, per-feature scores, cluster label) as CSV.
# NOTE(review): the file is named av0 but the whole table is written, not average_0 — confirm.
np.savetxt(fname="./Data/av0.csv", X=final, delimiter=",")