In [1]:
#Importing the necessary packages
import tensorflow as tf #This code is with Python3.4 Tensorflow1.0
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import datetime

In [2]:
tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
#Loading the data
#This is 2015 file
data = pd.read_csv('./LoanStats3d.csv', low_memory=False)

In [5]:
len(data) #421095

421095

In [6]:
data['loan_status'].unique()

array(['Current', 'Fully Paid', 'Default', 'Charged Off',
       'Late (16-30 days)', 'Late (31-120 days)', 'In Grace Period'], dtype=object)

In [7]:
data['safe_loans'] = data['loan_status'].apply(lambda x : 0 
                                               if x in ["Charged Off ",
                                                        "Default",
                                                        "In Grace Period", 
                                                        "Late (16-30 days)",
                                                        "Late (31-120 days)"] 
                                               else +1)
#create two ratios
data['debt_ratio'] = data['loan_amnt']/(data['annual_inc'])
data['current_ratio'] = data['funded_amnt']/(data['loan_amnt'])

In [8]:
col_subset = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'emp_length',                # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
            'acc_now_delinq',            # The number of accounts on which the borrower is now delinquent 
            'debt_ratio',
            'current_ratio',
            'safe_loans'      
           ]

In [9]:
#include all bad loans and 60000 good_loans
# if I chose a random selection there are very few bad loans
good_loans = data[data['safe_loans']==1]
bad_loans = data[data['safe_loans']==0]
good_loans_subset = good_loans.sample(n=60000)
data_clean = pd.concat([good_loans_subset,bad_loans])
len(data_clean)

75184

In [11]:
#remove data with N/A
data_clean = data_clean[col_subset].dropna()

array(['C', 'A', 'B', 'D', 'F', 'E', 'G'], dtype=object)

In [12]:
#data_clean = data_clean.sample(n=100000)
print(sum(data_clean['safe_loans']==0))
print(sum(data_clean['safe_loans']==1))

15175
59978


In [13]:
data_clean = data_clean[data_clean['dti']<100]
data_clean['dti'].mean()

19.408858180753676

In [14]:
data_clean.is_copy=False
#Transform term into float
data_clean['term'] = data_clean['term'].apply(lambda x : x.split(' ')[1])
data_clean['term'] = data_clean['term'].apply(lambda x : float(x) *1.0)

In [15]:
# Reduce term and dti
mamoy = data_clean['term'].mean()
data_clean['term'] = data_clean['term'] / mamoy
mamoy = data_clean['dti'].mean()
data_clean['dti'] = data_clean['dti'] / mamoy

In [16]:
#Transform revol_util to float
data_clean['revol_util'] = data_clean['revol_util'].str.replace('%','')
data_clean['revol_util'] = data_clean['revol_util'].apply(lambda x : float(x) * 1.0)
data_clean['revol_util'] = data_clean['revol_util']/100.0

In [17]:
#Transform emp_length to float
data_clean['emp_length'] = data_clean['emp_length'].str.replace('< ','0')
data_clean['emp_length'] = data_clean['emp_length'].str.replace('n/a','0')
data_clean['emp_length'] = data_clean['emp_length'].apply(lambda x : x.split(' ')[0])
data_clean['emp_length'] = data_clean['emp_length'].str.replace('+','')
data_clean['emp_length'] = data_clean['emp_length'].apply(lambda x : int(x) * 1.0)
data_clean.tail()
data_clean['emp_length'].unique()

array([  3.,   2.,  10.,   6.,   9.,   5.,   8.,   7.,   4.,   1.,   0.])

In [18]:
#Transform grade - can also do the same with sub_grade
grade_dict = {'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6}
data_clean['grade'] = data_clean['grade'].apply(lambda x : grade_dict[x])

In [19]:
#show data for information
data_clean.tail()

Unnamed: 0,grade,sub_grade,emp_length,home_ownership,dti,purpose,term,revol_util,total_rec_late_fee,acc_now_delinq,debt_ratio,current_ratio,safe_loans
421002,5,F2,5.0,MORTGAGE,0.543051,home_improvement,1.359204,0.292,90.6,0,0.2,1.0,0
421010,1,B4,10.0,MORTGAGE,1.237064,debt_consolidation,1.359204,0.246,0.0,0,0.144907,1.0,0
421015,3,D1,1.0,OWN,1.13814,debt_consolidation,1.359204,0.882,0.0,0,0.307018,1.0,0
421025,4,E5,5.0,MORTGAGE,1.212333,credit_card,1.359204,0.862,0.0,0,0.416667,1.0,0
421074,3,D2,3.0,RENT,0.271526,debt_consolidation,1.359204,0.924,0.0,0,0.242718,1.0,0


In [20]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'emp_length',                # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
            'acc_now_delinq',            # The number of accounts on which the borrower is now delinquent
            'debt_ratio',
            'current_ratio'
           ]

LABEL_COLUMN = 'safe_loans'      

In [21]:
tensor_features = {}
CATEGORICAL_COLUMNS = []
CONTINUOUS_COLUMNS = []

for i in features:
    if (data_clean[i].dtype == 'int64') or (data_clean[i].dtype == 'float64'):
        CONTINUOUS_COLUMNS.append(i) 
        tensor_features[i] = tf.contrib.layers.real_valued_column(i)
    else:
        CATEGORICAL_COLUMNS.append(i)
        tensor_features[i] = tf.contrib.layers.sparse_column_with_keys(
                              column_name=i, keys=list(data[i].unique()), combiner="sqrtn")

In [22]:
CATEGORICAL_COLUMNS

['sub_grade', 'home_ownership', 'purpose']

In [23]:
deep_columns = [tf.contrib.layers.embedding_column(tensor_features['sub_grade'], dimension=35),
                tf.contrib.layers.embedding_column(tensor_features['purpose'], dimension=12),
                tf.contrib.layers.embedding_column(tensor_features['home_ownership'], dimension=4)
               ]

In [24]:
CONTINUOUS_COLUMNS

['grade',
 'emp_length',
 'dti',
 'term',
 'revol_util',
 'total_rec_late_fee',
 'acc_now_delinq',
 'debt_ratio',
 'current_ratio']

In [25]:
wide_columns = [tensor_features['emp_length'],
                tensor_features['dti'], 
                tensor_features['term'],                 
                tensor_features['revol_util'], 
                tensor_features['total_rec_late_fee'],
                tensor_features['acc_now_delinq'],
                tensor_features['debt_ratio'],
                tensor_features['current_ratio']]

In [26]:
def merge_two_dicts(x, y):
    z = {}
    z.update(x)
    z.update(y)
    return z


def input_fn(df):
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values)
                       for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        dense_shape=[df[k].size, 1])
                        for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one.
    #feature_cols = dict(continuous_cols.items() + categorical_cols.items())
    feature_cols = merge_two_dicts(continuous_cols.items(), categorical_cols.items())
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label

def train_input_fn():
    return input_fn(training_set)

def eval_input_fn():
    return input_fn(test_set)

In [27]:
training_set, test_set = train_test_split(data_clean, test_size = 0.2)

In [28]:
import tempfile
model_dir = tempfile.mkdtemp()
model = tf.contrib.learn.DNNLinearCombinedClassifier(
    model_dir=model_dir,
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[20, 30, 20],
    dnn_optimizer=tf.train.ProximalAdagradOptimizer(learning_rate=0.001,
    l1_regularization_strength=0.001,
    l2_regularization_strength=0.001))

In [29]:
import datetime
print(datetime.datetime.now())
model.fit(input_fn=train_input_fn, steps=5000)
print(datetime.datetime.now())

2017-04-02 18:16:42.026308
2017-04-02 18:25:21.910603


In [30]:
results = model.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

accuracy: 0.739937
accuracy/baseline_label_mean: 0.804404
accuracy/threshold_0.500000_mean: 0.739937
auc: 0.577493
global_step: 5002
labels/actual_label_mean: 0.804404
labels/prediction_mean: 0.812532
loss: 0.760423
precision/positive_threshold_0.500000_mean: 0.81723
recall/positive_threshold_0.500000_mean: 0.87164


In [31]:
pred_labels = model.predict(input_fn=eval_input_fn, as_iterable=False)

In [32]:
pred_labels

array([1, 0, 1, ..., 1, 1, 1])

In [33]:
print(sum(pred_labels==0))
print(sum(pred_labels==1))

2135
12896


In [34]:
true_labels = np.array(test_set['safe_loans']).astype(int)
pred_labels = np.array(pred_labels).astype(int)

In [35]:
true_positive = sum((pred_labels == 1) & (true_labels == 1)) 
true_negative = sum((pred_labels == 0) & (true_labels == 0))
false_positive = sum((pred_labels == 1) & (true_labels == 0))
false_negative = sum((pred_labels == 0) & (true_labels == 1))
print('True Positive: ',true_positive)
print('False Positive: ',false_positive)
print('True Negative: ',true_negative)
print('False Negative: ',false_negative)

True Positive:  10539
False Positive:  2357
True Negative:  583
False Negative:  1552


Too many False positive and False Negatives
Model is not efficient