In [1]:
import os
from os.path import normpath, join
import csv
import random

import pandas as pd
import numpy as np
import sklearn
import statsmodels.api as sm

from sklearn.metrics import roc_curve, auc

  from pandas.core import datetools


## Load data from csv
**Load the data from the provided CSV.** Assume the CSV files are in the same directory as this Jupyter notebook.

In [2]:
# Resolve both CSV files relative to the notebook's working directory so the
# paths printed below are exactly the paths that get loaded.
CURRENT_DIRECTORY = os.getcwd()
TRAIN_CSV_NAME = "LoanStats3a-split-train.csv"
TEST_CSV_NAME = "LoanStats3a-split-test.csv"
train_filepath = normpath(join(CURRENT_DIRECTORY, TRAIN_CSV_NAME))
test_filepath = normpath(join(CURRENT_DIRECTORY, TEST_CSV_NAME))

print( "train_filepath: %s" % train_filepath)
print( "test_filepath: %s" % test_filepath)

# Read through the resolved paths instead of repeating the filename literals,
# so renaming a CSV only requires changing the constants above.
training = pd.read_csv(train_filepath)
test = pd.read_csv(test_filepath)

train_filepath: /Users/jgroob/Documents/Resume/2017/Assignments/Octane Lending/lending-club-problem-11-07-2017/LoanStats3a-split-train.csv
test_filepath: /Users/jgroob/Documents/Resume/2017/Assignments/Octane Lending/lending-club-problem-11-07-2017/LoanStats3a-split-test.csv


## Performance Metric

In [3]:
def calculate_auc(y, y_pred):
    """Return the area under the ROC curve for a set of predictions.

    y:      array of labels marking the actual result as default (1) or
            non-default (0)
    y_pred: predicted probability that each row has defaulted
    """
    # roc_curve yields (false-positive rate, true-positive rate, thresholds);
    # only the two rate arrays are needed to integrate the curve.
    false_positive_rate, true_positive_rate, _ = roc_curve(y, y_pred)
    return auc(false_positive_rate, true_positive_rate)

## Setup default (y) variable

In [4]:
def is_default(row):
    """Map a single ``loan_status`` value to a binary default label.

    row: the loan status string for one loan (despite the name, this is a
         scalar value, not a DataFrame row).

    Returns 1.0 for "Charged Off" / "Default" and 0.0 for "Fully Paid".
    Raises ValueError (an Exception subclass) for any other status.
    """
    if row in {"Charged Off", "Default"}:
        return 1.0
    elif row == "Fully Paid":
        return 0.0
    # Report the offending value itself; the previous message referenced an
    # undefined name and therefore surfaced as a NameError.
    raise ValueError("Invalid status: %s" % row)

# Derive the binary target for both splits from the raw loan status.
training['is_default'] = training['loan_status'].map(is_default)
test['is_default'] = test['loan_status'].map(is_default)

# Row counts of each split, as a quick sanity check on the load.
print("train_total: %s" % training.shape[0])
print("test_total: %s" % test.shape[0])

train_total: 15598
test_total: 2198


# Feature Engineering and Initial EDA

I want to quickly assess the correlation between the raw data and the default rates.  Any variable with a high correlation (positive or negative) is a good candidate for use in a model.

Some initial ideas:
1. Look at comparison of raw data directly
2. Look at comparisons between factor variables.
2a. Factor variables need to be converted to a one-hot encoding (OHE), so it's reasonable to only include the top values (i.e. those with enough support so that the results have predictive power).
3. Computed variables based on raw data.  
3a. Maybe look at time to time variables (i.e. time since last payment)
3b. Convert numerical values to OHE. (i.e. did the user default vs. time since last default)

In [7]:
# Converting time variables to epoch times

from datetime import datetime
from time import mktime

def date_to_epoch(df,col):
    """Convert a 'Mon-YYYY' string column (e.g. 'Dec-2011') to epoch seconds.

    df:  DataFrame holding the column
    col: name of the column to convert

    Returns a Series aligned with ``df``: seconds since the epoch for the
    first day of the month, or NaN where the source value is missing.
    """
    def _to_epoch(value):
        if pd.isna(value):
            # np.nan rather than np.NaN: the capitalised alias no longer
            # exists in NumPy 2.0.
            return np.nan
        # mktime interprets the struct_time as local time, so the result
        # depends on the timezone of the machine running the notebook.
        return mktime(datetime.strptime(str(value), '%b-%Y').timetuple())

    # Only one column is needed, so map over it directly instead of building
    # a full row object per record with DataFrame.apply(axis=1).
    return df[col].apply(_to_epoch)

# (new epoch column, source 'Mon-YYYY' column) pairs, in creation order.
for epoch_col, date_col in [
    ('last_credit_pull_epoch', 'last_credit_pull_d'),
    ('last_pymnt_epoch', 'last_pymnt_d'),
    ('issue_epoch', 'issue_d'),
    ('earliest_cr_line_epoch', 'earliest_cr_line'),
    ('next_pymnt_epoch', 'next_pymnt_d'),
]:
    training[epoch_col] = date_to_epoch(training, date_col)

In [8]:
def term_to_month(df,col):
    """Extract the leading integer from a loan-term string such as '36 months'.

    df:  DataFrame holding the column
    col: name of the term column

    Returns an integer Series aligned with ``df``. Missing or non-numeric
    terms raise, as no fallback is defined for them.
    """
    # Vectorized string ops on the single column instead of a Python-level
    # row-wise apply over the whole frame. split() with no argument splits on
    # any whitespace, so leading spaces in the raw value are ignored.
    return df[col].str.split().str[0].astype('int64')

# Numeric loan term (in months) parsed from the raw 'term' string.
training['term_months'] = term_to_month(training, 'term')   

In [9]:
# Clean-up percent variables.  convert to decimal.
def fix_percent(df,col):
    """Convert a percent column (e.g. '16%') to a decimal fraction (0.16).

    df:  DataFrame holding the column
    col: name of the percent column

    Returns a float Series aligned with ``df``. Missing values stay NaN
    because str(nan) parses back to a float nan.
    """
    def _to_fraction(value):
        # Keep everything before the first '%' and scale to a fraction.
        return float(str(value).split('%')[0]) / 100

    # Map over the one column rather than applying row-wise to the frame.
    return df[col].apply(_to_fraction)

# NOTE: this overwrites the column it reads from, so the cell is not
# idempotent -- running it a second time divides the values by 100 again.
training['revol_util'] = fix_percent(training,'revol_util')

In [10]:
# Round / bucket numeric values to look at correlation to defaulting.
from math import floor

def trunc_decimal(df,col,trunc_value):
    """Floor a fractional column down to multiples of ``1 / trunc_value``.

    df:          DataFrame holding the column
    col:         name of the numeric column
    trunc_value: number of buckets per unit (10 -> 0.0, 0.1, 0.2, ...)

    Returns a Series aligned with ``df``; missing values stay NaN.
    """
    def _bucket(value):
        if pd.isna(value):
            # np.nan rather than np.NaN: the capitalised alias no longer
            # exists in NumPy 2.0.
            return np.nan
        return floor(float(value)*trunc_value)/trunc_value

    # Map over the one column rather than applying row-wise to the frame.
    return df[col].apply(_bucket)

def trunc_number(df,col,trunc_value):
    """Floor a numeric column down to multiples of ``trunc_value``.

    df:          DataFrame holding the column
    col:         name of the numeric column
    trunc_value: bucket width (5 -> 0, 5, 10, ...)

    Returns a Series aligned with ``df``; missing values stay NaN.
    """
    def _bucket(value):
        if pd.isna(value):
            # np.nan rather than np.NaN: the capitalised alias no longer
            # exists in NumPy 2.0.
            return np.nan
        # math.floor returns an int, so buckets stay integers whenever
        # trunc_value is an integer and the column has no missing values.
        return floor(float(value)/trunc_value)*trunc_value

    # Map over the one column rather than applying row-wise to the frame.
    return df[col].apply(_bucket)

# Bucketed versions of the continuous variables, used for the grouped EDA.
training['debtToIncome_trunc'] = trunc_number(training, 'dti', 5)
training['revol_util_trunc'] = trunc_decimal(training, 'revol_util', 10)

# Revolving balance in 5000-wide buckets; every bucket above 50000 is
# collapsed into the 50000 bucket.
training['revol_bal_trunc'] = (
    trunc_number(training, 'revol_bal', 5000)
    .pipe(lambda buckets: buckets.mask(buckets > 50000, 50000))
)



In [11]:
def value_exists(df,col):
    """Flag whether each row has a value in ``col``.

    df:  DataFrame holding the column
    col: name of the column to inspect

    Returns an integer Series aligned with ``df``: 1 where the value is
    present, 0 where it is null.
    """
    # Vectorized null check on the single column instead of a Python-level
    # row-wise apply over the whole frame.
    return df[col].notnull().astype('int64')
    
# 1 when 'mths_since_last_delinq' is populated, 0 when it is null.
training['previous_delinq'] = value_exists(training,'mths_since_last_delinq')

In [12]:
# Lift the column display limit so every engineered feature is visible.
pd.set_option('display.max_columns', None)
training.head()

Unnamed: 0,id,member_id,term,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,last_pymnt_d,next_pymnt_d,last_credit_pull_d,policy_code,random,is_default,last_credit_pull_epoch,last_pymnt_epoch,issue_epoch,earliest_cr_line_epoch,next_pymnt_epoch,term_months,debtToIncome_trunc,revol_util_trunc,revol_bal_trunc,previous_delinq
0,1069971,1304884,36 months,Duracell,10+ years,MORTGAGE,110000.0,not verified,Dec-2011,Fully Paid,n,https://www.lendingclub.com/browse/loanDetail....,Borrower added on 12/15/11 > Payoff other le...,major_purchase,Holiday,067xx,CT,10.52,0,Aug-1993,0,,,20,0,22836,0.16,42,f,0.0,0.0,May-2013,,May-2014,1,6,0.0,1398917000.0,1367381000.0,1322716000.0,744177600.0,,36,10,0.1,20000,0
1,1069742,1304855,36 months,Network Interpreting Service,6 years,RENT,77385.19,not verified,Dec-2011,Fully Paid,n,https://www.lendingclub.com/browse/loanDetail....,,debt_consolidation,lowerratemeanseasiertogetoutofdebt!,921xx,CA,9.86,0,Jan-2001,0,,,8,0,7314,0.231,28,f,0.0,0.0,Jul-2012,,Jul-2012,1,5,0.0,1341115000.0,1341115000.0,1322716000.0,978325200.0,,36,5,0.2,5000,0
2,1069469,1304526,36 months,"Stewart Enterprises, Inc.",10+ years,MORTGAGE,45600.0,not verified,Dec-2011,Fully Paid,n,https://www.lendingclub.com/browse/loanDetail....,Borrower added on 12/16/11 > Debt Consolidat...,debt_consolidation,Debt-Consolidation,700xx,LA,5.34,0,Nov-1995,1,,,6,0,3378,0.325,28,f,0.0,0.0,Jul-2012,,Jun-2012,1,3,0.0,1338523000.0,1341115000.0,1322716000.0,815202000.0,,36,5,0.3,0,0
3,1069287,1304171,36 months,Helicoil,10+ years,RENT,60000.0,not verified,Dec-2011,Fully Paid,n,https://www.lendingclub.com/browse/loanDetail....,,credit_card,Credit Card Loan,067xx,CT,12.74,0,Sep-1992,1,,,11,0,14019,0.195,18,f,0.0,0.0,Jul-2012,,Sep-2015,1,3,0.0,1441080000.0,1341115000.0,1322716000.0,715320000.0,,36,10,0.1,10000,0
4,1068967,1303403,36 months,CaseStack,4 years,RENT,53000.0,VERIFIED - income source,Dec-2011,Fully Paid,n,https://www.lendingclub.com/browse/loanDetail....,,major_purchase,Auto Loan,908xx,CA,4.44,0,Oct-2000,0,,,7,0,8630,0.139,17,f,0.0,0.0,Jan-2015,,Dec-2014,1,5,0.0,1417410000.0,1420088000.0,1322716000.0,970372800.0,,36,0,0.1,5000,0


In [13]:
# Quick grouping by default status.

# Record count plus the mean of a few headline variables for each class.
# Each aggregation is given as a list so the result keeps two-level columns.
training.groupby('is_default').agg({
    'member_id': ['count'],
    'annual_inc': ['mean'],
    'open_acc': ['mean'],
    'total_acc': ['mean'],
    'revol_bal': ['mean'],
})

Unnamed: 0_level_0,member_id,annual_inc,open_acc,total_acc,revol_bal
Unnamed: 0_level_1,count,mean,mean,mean,mean
is_default,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0.0,13923,69130.13251,9.331466,22.653092,12639.30719
1.0,1675,60833.222042,9.26209,22.192239,13279.952836


In [14]:
# Mean of every numeric column for each class. numeric_only=True is passed
# explicitly: the frame contains string columns, and pandas >= 2.0 raises a
# TypeError instead of silently dropping them from the aggregation.
(training
 .groupby('is_default')
 .mean(numeric_only=True)
)

Unnamed: 0_level_0,id,member_id,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,policy_code,random,last_credit_pull_epoch,last_pymnt_epoch,issue_epoch,earliest_cr_line_epoch,next_pymnt_epoch,term_months,debtToIncome_trunc,revol_util_trunc,revol_bal_trunc,previous_delinq
is_default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
0.0,683982.258421,852156.272499,69130.13251,12.649244,0.091719,0.745744,37.341149,66.270655,9.331466,0.036486,12639.30719,0.396455,22.653092,0.0,0.0,1.0,3.973138,1394487000.0,1362964000.0,1289156000.0,836811400.0,,38.821806,10.165553,0.349098,9569.058393,0.272642
1.0,709365.280597,882135.790448,60833.222042,13.757982,0.140299,0.924179,37.304104,71.532258,9.26209,0.059104,13279.952836,0.483932,22.192239,0.053701,0.053701,1.0,4.100299,1357574000.0,1341358000.0,1291637000.0,855153300.0,1443672000.0,44.009552,11.229851,0.435467,10340.298507,0.32


# Simple EDA

Looking for any basic correlations between variable and defaulting

In [15]:
def quick_eda(df,col):
    """Summarise default rate and a few borrower stats for each value of ``col``.

    df:  DataFrame containing ``col`` plus 'is_default', 'annual_inc',
         'delinq_2yrs' and 'open_acc'
    col: name of the column to group by

    Missing values of ``col`` are grouped under -1 and missing 'delinq_2yrs'
    counts as 0. These fills are applied to a private copy, so ``df`` itself
    is left untouched.

    Returns a DataFrame indexed by the values of ``col`` with two-level
    columns (record count and default rate, mean income, mean/max
    delinquencies, mean open accounts) and a 'percent_records' column giving
    each group's share of all rows.
    """
    # Copy only the columns that are needed. dict.fromkeys removes duplicates
    # (col may itself be one of the aggregated columns) while keeping order.
    needed = list(dict.fromkeys(
        [col, 'is_default', 'annual_inc', 'delinq_2yrs', 'open_acc']))
    work = df[needed].copy()

    # Order matters when col == 'delinq_2yrs': its NaNs become -1 first, so
    # the second fill is then a no-op for that column.
    work[col] = work[col].fillna(-1)
    work['delinq_2yrs'] = work['delinq_2yrs'].fillna(0)

    n_records = len(work)

    df_return = (work
                .groupby(col)
                .agg({'is_default': ['count','mean']
                     , 'annual_inc': ['mean']
                     , 'delinq_2yrs': ['mean', 'max']
                     , 'open_acc': ['mean']})
                )
    # Share of all records that fall into each group.
    df_return['percent_records'] = df_return[('is_default', 'count')] / n_records

    return(df_return)


### Homeownership

This looks interesting.  Convert to OHE and bucket 'NONE', 'OTHER', and any previously unseen values into one group.

In [16]:
# Default rate and record share for each home-ownership category.
quick_eda(training,'home_ownership')

Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
home_ownership,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
MORTGAGE,7211,0.099293,81546.557766,0.101789,4,10.180003,0.462303
NONE,2,0.0,71100.0,0.0,0,7.5,0.000128
OTHER,40,0.175,76016.225,0.15,3,8.7,0.002564
OWN,1249,0.108086,59969.723819,0.098479,4,9.156125,0.080074
RENT,7096,0.115135,56127.003912,0.09146,8,8.48774,0.45493


### Purpose of Loan

Interesting, but lots of unique values, with little support.  Only OHE the top variables (>4% of users) and bucket everything else into 'other'

In [17]:
# Default rate per loan purpose, largest groups first.
quick_eda(training,'purpose').sort_values('percent_records', ascending=False)

Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
purpose,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
debt_consolidation,6827,0.11352,65504.091482,0.08774,4,9.648455,0.437684
credit_card,2158,0.077386,68956.577873,0.073216,3,9.646895,0.138351
other,1605,0.121495,64223.947414,0.119003,5,8.71028,0.102898
home_improvement,1266,0.082148,89198.778523,0.116114,4,9.483412,0.081164
major_purchase,991,0.066599,66842.812038,0.108981,4,8.597376,0.063534
car,718,0.086351,64541.818315,0.103064,8,8.771588,0.046032
small_business,644,0.226708,74082.159565,0.135093,4,8.675466,0.041287
wedding,372,0.091398,68839.751183,0.11828,6,8.741935,0.023849
medical,289,0.110727,66012.067405,0.121107,3,9.020761,0.018528
moving,251,0.14741,64051.027888,0.091633,3,8.665339,0.016092


### Debt to Income

This looks good!
 
Also, on the lending club website, they mention that any DTI ratio > 20% is 'high risk'.  Maybe create a factor variable if the DTI is high.

In [18]:
# Default rate per debt-to-income bucket (width 5).
quick_eda(training,'debtToIncome_trunc')

Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
debtToIncome_trunc,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,2328,0.085481,82788.797126,0.093213,4,7.185997,0.14925
5,3429,0.092447,74223.666556,0.101487,5,8.564013,0.219836
10,3854,0.105345,67730.589533,0.106642,8,9.596523,0.247083
15,3346,0.124925,62120.942292,0.097729,4,10.216975,0.214515
20,2311,0.128083,56648.282371,0.085677,6,10.627867,0.14816
25,330,0.118182,52559.587879,0.033333,2,10.936364,0.021157


### Revolving Utilization

Looks good!

In [19]:
# Default rate per revolving-utilization bucket (width 0.1).
quick_eda(training,'revol_util_trunc')

Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
revol_util_trunc,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
-1.0,7,0.428571,55057.142857,0.285714,2,3.285714,0.000449
0.0,2376,0.072811,70392.565901,0.112795,6,8.91835,0.152327
0.1,1850,0.075676,68579.986924,0.087027,3,10.0,0.118605
0.2,1971,0.096905,65958.651745,0.112633,4,9.991375,0.126362
0.3,1899,0.09584,65532.732027,0.105845,5,9.720906,0.121746
0.4,1791,0.093244,67429.439693,0.08431,4,9.629816,0.114822
0.5,1625,0.117538,68196.478074,0.099692,5,9.282462,0.10418
0.6,1484,0.133423,66320.654036,0.069407,8,9.126011,0.09514
0.7,1138,0.149385,70768.396854,0.084359,4,8.941125,0.072958
0.8,889,0.143982,71942.37604,0.092238,2,8.183352,0.056994


### Previous Delinquency

Interesting, but low support with high values. Same goes with months since last delinquency.

The factorized version (previous_delinq) might be more promising.

In [20]:
# Default rate per 'delinq_2yrs' value.
quick_eda(training,'delinq_2yrs')

Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
delinq_2yrs,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,14408,0.104456,67895.222998,0,0,9.295183,0.923708
1,960,0.133333,72681.050687,1,1,9.667708,0.061546
2,163,0.171779,74159.43092,2,2,9.828221,0.01045
3,50,0.2,59842.26,3,3,9.06,0.003206
4,13,0.153846,83876.923077,4,4,10.846154,0.000833
5,2,0.5,41800.0,5,5,5.5,0.000128
6,1,0.0,65000.0,6,6,16.0,6.4e-05
8,1,1.0,67200.0,8,8,7.0,6.4e-05


In [21]:
# Default rate for the 0/1 'previous_delinq' flag.
quick_eda(training,'previous_delinq')

Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
previous_delinq,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,11266,0.101101,67690.78083,0.0,0,9.176993,0.722272
1,4332,0.12373,69665.315104,0.34903,8,9.706371,0.277728


In [22]:
# Default rate per months-since-last-delinquency value; -1 groups the rows
# where the value is missing.
quick_eda(training,'mths_since_last_delinq')

Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
mths_since_last_delinq,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
-1.0,11266,0.101101,67690.780830,0.000000,0,9.176993,0.722272
0.0,185,0.102703,71015.037838,0.000000,0,9.594595,0.011860
1.0,6,0.166667,62860.666667,1.333333,3,8.166667,0.000385
2.0,26,0.153846,96838.307692,1.384615,3,8.769231,0.001667
3.0,39,0.128205,76764.461538,1.564103,4,10.871795,0.002500
4.0,31,0.161290,62992.709677,1.354839,4,11.806452,0.001987
5.0,39,0.102564,87027.487179,1.179487,3,10.102564,0.002500
6.0,52,0.192308,72938.930769,1.461538,4,9.615385,0.003334
7.0,33,0.242424,65847.722424,1.181818,2,9.878788,0.002116
8.0,42,0.095238,86680.114286,1.404762,4,9.571429,0.002693


In [23]:
# Default rate per revolving-balance bucket (width 5000, capped at 50000).
quick_eda(training,'revol_bal_trunc')


Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
revol_bal_trunc,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,5423,0.096072,58117.699924,0.130186,8,7.815047,0.347673
5000,3543,0.110641,60285.111688,0.09173,4,9.162574,0.227145
10000,2410,0.107884,66158.597627,0.082158,3,9.919917,0.154507
15000,1427,0.119131,73211.74836,0.066573,3,10.308339,0.091486
20000,859,0.138533,81134.812666,0.065192,2,11.0,0.055071
25000,567,0.097002,88136.522681,0.049383,2,11.294533,0.036351
30000,332,0.123494,94809.021807,0.078313,4,11.599398,0.021285
35000,236,0.080508,100046.818814,0.080508,2,11.529661,0.01513
40000,170,0.129412,96171.417706,0.029412,4,11.752941,0.010899
45000,119,0.201681,112099.781513,0.058824,2,11.487395,0.007629


In [24]:
# Default rate per 'open_acc' value.
quick_eda(training,'open_acc')

Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
open_acc,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2,173,0.156069,47844.267514,0.086705,5,2,0.011091
3,408,0.147059,48269.467549,0.095588,4,3,0.026157
4,824,0.117718,54327.037961,0.07767,3,4,0.052827
5,1219,0.09516,55593.098302,0.086957,4,5,0.078151
6,1714,0.108518,60930.432596,0.085181,3,6,0.109886
7,1679,0.097677,63982.530256,0.097677,8,7,0.107642
8,1587,0.109641,64676.281834,0.100819,3,8,0.101744
9,1502,0.108522,68223.138269,0.100533,5,9,0.096294
10,1304,0.119632,71643.144425,0.09816,4,10,0.0836
11,1101,0.101726,74882.47891,0.098093,4,11,0.070586


In [25]:
# Default rate per employer title, largest groups first; -1 groups the rows
# where the title is missing.
quick_eda(training,'emp_title').sort_values('percent_records', ascending=False)

Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
emp_title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
-1,1074,0.148976,65272.332886,0.085661,3,8.639665,0.068855
US Army,42,0.095238,71916.751429,0.119048,2,8.380952,0.002693
Bank of America,41,0.097561,72169.194878,0.048780,1,9.658537,0.002629
IBM,26,0.115385,112010.615385,0.076923,1,10.653846,0.001667
AT&T,20,0.200000,85457.450000,0.100000,1,9.200000,0.001282
Lockheed Martin,20,0.000000,159946.350000,0.400000,2,11.000000,0.001282
UPS,20,0.000000,70896.000000,0.200000,1,8.450000,0.001282
Wells Fargo,19,0.052632,61316.000000,0.000000,0,8.315789,0.001218
US Air Force,18,0.111111,74116.291111,0.000000,0,9.388889,0.001154
Kaiser Permanente,17,0.176471,79564.941176,0.058824,1,11.411765,0.001090


In [26]:
# Default rate per loan term.
quick_eda(training,'term')

Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
term,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
36 months,13402,0.083271,67152.288106,0.090807,6,9.245859,0.859213
60 months,2196,0.254554,74872.275346,0.134335,8,9.801002,0.140787


In [27]:
# Default rate per 'mths_since_last_record' value; -1 groups the rows where
# the value is missing.
quick_eda(training,'mths_since_last_record')

Unnamed: 0_level_0,is_default,is_default,annual_inc,delinq_2yrs,delinq_2yrs,open_acc,percent_records
Unnamed: 0_level_1,count,mean,mean,mean,max,mean,Unnamed: 7_level_1
mths_since_last_record,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
-1.0,14772,0.104996,68359.171174,0.096534,6,9.324871,0.947044
0.0,234,0.115385,73637.384615,0.094017,3,9.679487,0.015002
7.0,1,1.000000,60000.000000,0.000000,0,20.000000,0.000064
17.0,1,1.000000,45000.000000,0.000000,0,10.000000,0.000064
21.0,1,0.000000,50000.000000,1.000000,1,5.000000,0.000064
22.0,1,0.000000,103000.000000,0.000000,0,7.000000,0.000064
23.0,1,0.000000,25000.000000,0.000000,0,4.000000,0.000064
25.0,1,0.000000,61200.000000,0.000000,0,6.000000,0.000064
27.0,2,0.000000,65750.000000,0.500000,1,7.500000,0.000128
28.0,1,0.000000,31600.000000,0.000000,0,4.000000,0.000064
