In [None]:
!curl https://raw.githubusercontent.com/jamesandersen/aws-machine-learning-demo/master/keras-deeplearning/train-model/lc-2015-loans.zip -o lc_data.zip
!unzip lc_data.zip
!mv lc-2015-loans.csv train_data.csv

In [None]:
import os
import json
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn
from tensorflow.python.estimator.export.export_output import PredictOutput

In [None]:
# Filled in by read_csv_data(): each becomes a dict whose 'features' and 'labels' entries hold ndarrays
train_data = None
eval_data = None

# Columns to extract from the CSV, grouped by kind.
APPLICANT_NUMERIC = [
    "annual_inc",
    "dti",
    "age_earliest_cr",
    "loan_amnt",
    "installment",
]

APPLICANT_CATEGORICAL = [
    "application_type",
    "home_ownership",
    "addr_state",
    "term",
]

CREDIT_NUMERIC = [
    "acc_now_delinq", "acc_open_past_24mths", "avg_cur_bal",
    "bc_open_to_buy", "bc_util", "delinq_2yrs",
    "delinq_amnt", "fico_range_high", "fico_range_low",
    "last_fico_range_high", "last_fico_range_low", "open_acc",
    "pub_rec", "revol_util", "revol_bal",
    "tot_coll_amt", "tot_cur_bal", "total_acc",
    "total_rev_hi_lim", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl",
    "num_il_tl", "num_rev_tl_bal_gt_0", "pct_tl_nvr_dlq",
    "percent_bc_gt_75", "tot_hi_cred_lim", "total_bal_ex_mort",
    "total_bc_limit", "total_il_high_credit_limit", "all_util",
    "loan_to_income", "installment_pct_inc", "il_util",
    "il_util_ex_mort", "total_bal_il", "total_cu_tl",
]

# Model inputs, the target column, and the full list of columns read from the CSV
FEATURES = APPLICANT_NUMERIC + APPLICANT_CATEGORICAL + CREDIT_NUMERIC
LABEL = "grade"
COLUMNS = FEATURES + [LABEL]

# Key under which the feature matrix is handed to the model
INPUT_TENSOR_NAME = "inputs"

In [None]:
# Model definition for the TensorFlow Estimator. Builds the network and returns the EstimatorSpec that
# matches the requested mode (PREDICT, TRAIN or EVAL).
def model_fn(features, labels, mode, params = None):
    """Build the grade classifier graph and wrap it in an EstimatorSpec.

    The network is dense(100, relu) -> dropout(0.2) -> dense(60, relu) -> dropout(0.2) -> dense(7),
    with a max-norm(3) constraint on the two hidden kernels. The 7 output units are the class logits.

    Args:
        features: dict holding the input tensor under the INPUT_TENSOR_NAME key.
        labels: label tensor; it is fed to the loss as `onehot_labels` and reduced with argmax over
            axis 1 for the accuracy metric, so it is expected to be one-hot encoded. Not used in
            PREDICT mode.
        mode: one of the tf.estimator.ModeKeys values.
        params: optional hyperparameters; 'learning_rate' overrides the default of 0.001.

    Returns:
        A tf.estimator.EstimatorSpec carrying predictions and export outputs (PREDICT),
        loss and train_op (TRAIN), or loss and an accuracy metric (EVAL).
    """
    print ("Creating EstimatorSpec...")

    # Avoid a shared mutable default argument.
    if params is None:
        params = {}

    learning_rate = 0.001
    if 'learning_rate' in params:
        learning_rate = params['learning_rate']

    print ('Generating layers with input {}...'.format (features[INPUT_TENSOR_NAME]))
    print ('Using a learning rate of {}'.format (learning_rate))

    # tf.layers.dropout defaults to training=False, which makes it a no-op; it must be told
    # explicitly when the graph is being built for training.
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    hidden1 = tf.layers.dense (features[INPUT_TENSOR_NAME], 100, activation = tf.nn.relu,
                               kernel_constraint = tf.keras.constraints.max_norm (3))
    dropped1 = tf.layers.dropout (hidden1, rate = 0.2, training = is_training)
    hidden2 = tf.layers.dense (dropped1, 60, activation = tf.nn.relu,
                               kernel_constraint = tf.keras.constraints.max_norm (3))
    dropped2 = tf.layers.dropout (hidden2, rate = 0.2, training = is_training)
    logits = tf.layers.dense (inputs = dropped2, units = 7)

    print ("Output layer: {}".format (logits))
    predictions = {
        # Index of the highest-scoring class (used in PREDICT and EVAL mode)
        "classes": tf.argmax (input = logits, axis = 1),
        # Per-class probabilities; the op is named so it can be looked up in the graph
        "probabilities": tf.nn.softmax (logits, name = "softmax_tensor")
    }

    print ("Labels data type: {}".format (labels))
    print ("Predictions: {}".format (predictions))

    # 1. PREDICT: no labels are available, so return before any loss is built.
    if mode == tf.estimator.ModeKeys.PREDICT:
        print ('Returning prediction EstimatorSpec')
        print ("Prediction classes are {}".format (predictions['classes']))
        print ("Prediction probs are {}".format (predictions['probabilities']))
        return tf.estimator.EstimatorSpec (
            mode = mode,
            predictions = predictions,
            export_outputs = {"grade_prediction": PredictOutput (predictions)})

    # 2. Define the loss function shared by training and evaluation.
    print ('Generate loss function...')
    # The labels are passed straight through as the one-hot targets (only cast to int32).
    loss = tf.losses.softmax_cross_entropy (
        onehot_labels = tf.cast (labels, tf.int32),
        logits = logits
    )

    print ('Test for train mode...')
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer (learning_rate = learning_rate)
        train_op = optimizer.minimize (
            loss = loss,
            global_step = tf.train.get_global_step ()
        )

        print ("Returning EstimatorSpec for TRAIN mode")
        return tf.estimator.EstimatorSpec (
            mode = mode,
            loss = loss,
            train_op = train_op
        )

    print ('Generate evaluation metrics...')
    # 3. EVAL: compare the predicted class index with the argmax of the label rows.
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy (
            labels = tf.argmax (input = labels, axis = 1),
            predictions = predictions["classes"]
        )
    }

    print ("Returning EstimatorSpec")
    return tf.estimator.EstimatorSpec (
        mode = mode,
        loss = loss,
        eval_metric_ops = eval_metric_ops
    )


In [None]:
# Training data:              epochs = None and shuffle_flag = True
# Evaluation/prediction data: epochs = 1    and shuffle_flag = False
def gen_input_fn(data, epochs = 1, shuffle_flag = False):
    """Wrap an in-memory data set in an Estimator input function.

    Args:
        data: dict with 'features' and 'labels' entries.
        epochs: forwarded as `num_epochs` to numpy_input_fn.
        shuffle_flag: forwarded as `shuffle` to numpy_input_fn.

    Returns:
        The input function produced by tf.estimator.inputs.numpy_input_fn; the features are
        exposed under the INPUT_TENSOR_NAME key.
    """
    feature_dict = {INPUT_TENSOR_NAME: data['features']}
    label_array = data['labels']
    input_fn = tf.estimator.inputs.numpy_input_fn(
        x = feature_dict,
        y = label_array,
        num_epochs = epochs,
        shuffle = shuffle_flag,
    )
    return input_fn

In [None]:
# Parse the data CSV and fill the module-level train_data / eval_data dicts using an 80 / 20 split
def read_csv_data (training_dir):
    """Load train_data.csv from `training_dir` and prepare the training and evaluation arrays.

    Steps: read the columns listed in COLUMNS, shuffle with a fixed seed, drop every row that
    contains a null (this includes rows whose grade is not one of A-G), one-hot encode the
    categorical features, standardize the numeric features, one-hot encode the label, then split
    the rows 80 / 20.

    Args:
        training_dir: directory that contains 'train_data.csv'.

    Side effects:
        Sets the globals `train_data` and `eval_data` to dicts with float32 ndarrays under
        'features' and 'labels'. Nothing is returned.
    """
    global train_data, eval_data

    grade_categories = list ("ABCDEFG")
    csv_path = os.path.join (training_dir, 'train_data.csv')

    print ("Reading training data from {}".format (csv_path))

    lg_data = pd.read_csv (csv_path, usecols = COLUMNS)
    # pd.Categorical replaces the removed `astype('category', categories=..., ordered=...)` form.
    # Values outside grade_categories become NaN and are dropped with the other bad rows below.
    lg_data[LABEL] = pd.Categorical (lg_data[LABEL], categories = grade_categories, ordered = True)
    # shuffle the data set (fixed seed keeps the split reproducible)
    lg_data = lg_data.sample (frac = 1, random_state = 2501)

    bad_rows = lg_data.isnull ().any (axis = 1).sum ()
    if bad_rows > 0:
        print("Rows with null/NaN values: {}".format(bad_rows))
        print("Columns with null/NaN values:")
        print(lg_data.isnull().sum() > 0)
        print("Dropping bad rows...")
        lg_data = lg_data.dropna (axis = 0, how = 'any')
        print("Rows with null/NaN values: {}".format(lg_data.isnull().any(axis = 1).sum()))

    # Subset to get feature data: numeric columns first, categorical columns last
    numeric_cols = APPLICANT_NUMERIC + CREDIT_NUMERIC
    x_df = lg_data.loc[:, numeric_cols + APPLICANT_CATEGORICAL]

    # Replace each categorical column by its one-hot columns, appended after the numeric columns
    # in APPLICANT_CATEGORICAL order. The dummy columns are prefixed with the source column name,
    # so values shared between two categorical columns cannot collide.
    x_df = pd.get_dummies (x_df, columns = APPLICANT_CATEGORICAL)

    # Ensure all numeric features are on the same scale (zero mean, unit standard deviation).
    # A constant column has std 0; dividing by 1 instead keeps it at 0 rather than NaN.
    numeric = x_df[numeric_cols]
    x_df[numeric_cols] = (numeric - numeric.mean ()) / numeric.std ().replace (0, 1)
    x_df = x_df.astype (np.float32)

    # One-hot encode the target; a categorical label yields one column per grade category
    y = pd.get_dummies (lg_data[LABEL]).astype (np.float32)

    # First 80% of the shuffled rows train the model, the remaining 20% evaluate it
    eval_start = int (x_df.shape[0] * .8)

    # `.values` replaces the removed DataFrame.as_matrix()
    train_data = {
        'features': x_df.iloc[:eval_start, :].values,
        'labels': y.iloc[:eval_start, :].values
    }
    eval_data = {
        'features': x_df.iloc[eval_start:, :].values,
        'labels': y.iloc[eval_start:, :].values
    }
    print ("Training data set feature shape {}, label shape {}, read complete".format (train_data['features'].shape, train_data['labels'].shape))
    

In [None]:
# Load the CSV from the working directory, then report the size of both splits
read_csv_data('.')
for split_name, split in (("Training", train_data), ("Eval", eval_data)):
    print("{} data shape: {}".format(split_name, split['features'].shape))

In [None]:
# Sanity check: display the first prepared feature vector of the training split
train_data['features'][0]

In [None]:
# Create the Estimator around model_fn; model_params is handed to model_fn as its `params` argument
model_params = dict(learning_rate = 0.001)
estimator = tf.estimator.Estimator(
    model_fn = model_fn,
    params = model_params,
)

In [None]:
# Fit the model on shuffled training data (epochs = None); accuracy will be very low until 100k+ training steps
train_input_fn = gen_input_fn(train_data, epochs = None, shuffle_flag = True)
estimator.train(input_fn = train_input_fn, steps = 250000)

In [None]:
# Measure how well training worked: a single unshuffled pass over the evaluation data
eval_input_fn = gen_input_fn(eval_data, epochs = 1, shuffle_flag = False)
estimator.evaluate(input_fn = eval_input_fn)

In [None]:
# Run predictions over the evaluation data (single unshuffled pass) to examine the model further
pred = estimator.predict(input_fn = gen_input_fn(eval_data, epochs = 1, shuffle_flag = False))
# predict() yields one dict per sample; materialize them so they can be traversed more than once
predictions = list(pred)
pred_classes = [sample['classes'] for sample in predictions]
pred_probs = [sample['probabilities'] for sample in predictions]

print("Predictions len: {}".format(len(pred_classes)))
print("Probabilities shape: {}".format(pd.DataFrame(pred_probs).shape))
print(predictions[:3])

In [None]:

# Compare the predicted class indices with the true ones and compute the accuracy by hand
print("labels type: {}".format(type(eval_data['labels'])))
print(pred_classes[:3])
print("Length: {}".format(len(pred_classes)))
print("-----")
# The label rows are one-hot, so the position of the maximum is the class index
label_classes = eval_data['labels'].argmax(axis = 1).tolist()
print(label_classes[:3])
print("Length: {}".format(len(label_classes)))

num_right = sum(1 for predicted, actual in zip(pred_classes, label_classes) if predicted == actual)
num_wrong = len(pred_classes) - num_right

total = num_right + num_wrong
print("Right {}, Wrong {}, Accuracy: {}".format(num_right, num_wrong, float(num_right) / total))


In [None]:
import matplotlib.pyplot as plt
import itertools 
from sklearn.metrics import accuracy_score, confusion_matrix

# Grade names in class-index order (index 0 -> 'A', ..., index 6 -> 'G')
classes = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
class_indices = list (range (len (classes)))

accuracy = accuracy_score (label_classes, pred_classes)
# Passing `labels` pins the matrix to one row/column per grade. Without it, a grade that occurs in
# neither the labels nor the predictions is left out and the A-G tick labels no longer line up.
cf_matrix = confusion_matrix (label_classes, pred_classes, labels = class_indices)

plt.imshow (cf_matrix, interpolation = 'nearest', cmap = plt.cm.Oranges)
plt.title ('Endpoint Confusion Matrix')
plt.colorbar ()
tick_marks = np.arange (len (classes))
plt.yticks (tick_marks, classes)
plt.xticks (tick_marks, classes)
plt.ylabel ('True label')
plt.xlabel ('Predicted label')

# Write the count into every cell, switching to white text on the darker half of the color scale
thresh = cf_matrix.max() / 2.
for i, j in itertools.product(range(cf_matrix.shape[0]), range(cf_matrix.shape[1])):
    plt.text(j, i, format(cf_matrix[i, j], 'd'),
             horizontalalignment="center",
             color="white" if cf_matrix[i, j] > thresh else "black")

plt.show ()
print ("Overall accuracy over {} samples: {}".format (len(pred_classes), accuracy))