In [15]:
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype
pd.set_option('max_columns', 999)

# script version

In [27]:
%%writefile ../../lendingclub/data_and_eval_preparation/06_data_and_eval_preparation.py
'''
renames loan_info to match what is received through the api
makes a scaled pmt_history and other various targets for evaluation of models
'''

import os

import numpy as np
import pandas as pd
import pickle
from pandas.api.types import is_string_dtype, is_numeric_dtype
from tqdm import tqdm

import j_utils.munging as mg
import lendingclub.config as config
import lendingclub.investing.investing_utils as investing_utils
# custom imports
import user_creds.account_info as acc_info

# set paths
ppath = config.prj_dir
dpath = config.data_dir

# load in dataframes
loan_info = pd.read_feather(os.path.join(dpath, 'clean_loan_info.fth'))
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history.fth'))
strings = pd.read_feather(os.path.join(dpath, 'strings_loan_info.fth'))
strings = strings[strings['id'].isin(loan_info['id'])]

# sort rows by loan_id (and date)
loan_info = loan_info.sort_values('id')
pmt_hist = pmt_hist.sort_values(['loan_id', 'date'])
strings = strings.sort_values('id')

# rename loan_id to id to match what comes through API
pmt_hist = pmt_hist.rename({'loan_id': 'id'}, axis=1)

# check how fields come in through API _______________________________________
# constants and setup for various accounts and APIs
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)

# checking the fields from csv vs API
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

# rename some loan_info fields to match those coming through api
licsv_to_api_rename_dict = {
    'acc_open_past_24mths':'acc_open_past_24_mths',
    'zip_code': 'addr_zip',
    'delinq_2yrs': 'delinq_2_yrs',
    'funded_amnt': 'funded_amount',
    'il_util': 'i_l_util',
    'inq_last_6mths': 'inq_last_6_mths',
    # 'installment_at_funded': 'installment',
    'verification_status': 'is_inc_v',
    'verification_status_joint': 'is_inc_v_joint',
    'loan_amnt': 'loan_amount',
    'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
    'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
    'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
}
loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)

loan_info.reset_index(drop=True, inplace=True)

# split loan info into dataframes for training off of and evaluating__________
eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time',
             'maturity_time_stat_adj', 'maturity_paid_stat_adj',
             'rem_to_be_paid', 'roi_simple', 'target_loose',
             'target_strict', 'loan_status', 'id']
strb_flds = ['desc', 'emp_title', 'id']
base_loan_info = loan_info[list(common_flds)]
eval_loan_info = loan_info[eval_flds + ['grade', 'sub_grade', 'term', 'int_rate']]
str_loan_info = strings[strb_flds]


# save it at bottom of script
# eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
# str_loan_info.reset_index(drop=True, inplace=True)
# str_loan_info.to_feather(os.path.join(dpath, 'str_loan_info.fth'))

# make a version of pmt_history where each loan is scaled to be equal size____
pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]
loan_funded_amts = loan_info.set_index('id')['funded_amount'].to_dict()
loan_dollar_cols = [
    'outs_princp_beg',
    'princp_paid',
    'int_paid',
    'fee_paid',
    'amt_due',
    'amt_paid',
    'outs_princp_end',
    'charged_off_amt',
    'monthly_pmt',
    'recovs',
    'recov_fees',
    'all_cash_to_inv', ]
id_grouped = pmt_hist.groupby('id', sort=False)
funded_amts = []
for ids, group in tqdm(id_grouped):
    funded_amt = loan_funded_amts[ids]
    funded_amts.extend([funded_amt]*len(group))
for col in loan_dollar_cols:
    pmt_hist[col] = pmt_hist[col]/funded_amts


# make npv_rois (using various discount rates and actual/known cashflows)_____
interesting_cols_over_time = [
    'outs_princp_beg',
    'all_cash_to_inv',
    'date',
    'fico_last',
    'm_on_books',
    'status_period_end',
    'id',
]
pmt_hist = pmt_hist[interesting_cols_over_time]
npv_roi_holder = {}
disc_rates = np.arange(.05, .36, .01)
id_grouped = pmt_hist.groupby('id')
for ids, group in tqdm(id_grouped):
    npv_roi_dict = {}
    funded = group.iat[0, 0]
    cfs = [-funded] + group['all_cash_to_inv'].tolist()
    for rate in disc_rates:
        npv_roi_dict[rate] = np.npv(rate/12, cfs)/funded
    npv_roi_holder[ids] = npv_roi_dict

npv_roi_df = pd.DataFrame(npv_roi_holder).T
npv_roi_df.columns = npv_roi_df.columns.values.round(2)
npv_roi_df.index.name = 'id'
npv_roi_df.reset_index(inplace=True)

eval_loan_info = pd.merge(eval_loan_info, npv_roi_df, how='left', on='id')
# some current loans I have no target_strict for and were not in pmt history.
# Fill with negatives on npv_roi.
eval_loan_info['target_strict'] = eval_loan_info['target_strict'].fillna(0)
eval_loan_info.fillna(-1, inplace=True)


# SAVE this version of loan info
loan_info.to_feather(os.path.join(dpath, 'clean_loan_info_api_name_matched.fth'))
# SAVE
base_loan_info.to_feather(os.path.join(dpath, 'base_loan_info.fth'))
with open(os.path.join(dpath, 'base_loan_info_dtypes.pkl'), 'wb') as f:
    pickle.dump(base_loan_info.dtypes.to_dict(), f)
    
# SAVE
pmt_hist.reset_index(drop=True, inplace=True)
_, pmt_hist = mg.reduce_memory(pmt_hist)
pmt_hist.to_feather(os.path.join(dpath, 'scaled_pmt_hist.fth'))

# SAVE
# feather must have string column names
eval_loan_info.columns = [str(col) for col in eval_loan_info.columns]
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))

Overwriting ../../lendingclub/data_and_eval_preparation/06_data_and_eval_preparation.py


In [3]:
# examine how to ensure that data coming from api and loan_info LOOK the same
# e.g. percents are all XX.XX and all strings used in training are UPPERed

In [20]:
def check_api_csv_data(df1, df2, common_flds):
    '''
    Checks that data from LC API and LC CSVs come in same form
    e.g. percents are all XX.XX and all strings used in training are UPPERed
    
    NOTE:
        emp_length will always be an exception and must be dealt with in the
        investing script
    '''
    common_flds = [n for n in common_flds if n != 'emp_length']
    mismatched_type = []
    magnitude_incorrect = []
    for col in common_flds:
        if is_numeric_dtype(df1[col]) != is_numeric_dtype(df2[col]):
            print(col)

In [21]:
check_api_csv_data(api_loans, base_loan_info, common_flds)

revol_util


In [22]:
api_loans.head()[common_flds]

Unnamed: 0,mo_sin_old_il_acct,revol_util,sec_app_mths_since_last_major_derog,all_util,term,mo_sin_old_rev_tl_op,num_tl_30dpd,num_bc_tl,open_acc,num_il_tl,num_tl_90g_dpd_24m,open_il_24m,num_actv_rev_tl,sec_app_chargeoff_within_12_mths,sub_grade,sec_app_open_acc,mort_acc,sec_app_earliest_cr_line,mths_since_last_major_derog,int_rate,mths_since_last_delinq,mo_sin_rcnt_rev_tl_op,emp_length,chargeoff_within_12_mths,mths_since_recent_bc,purpose,home_ownership,tot_hi_cred_lim,sec_app_revol_util,num_op_rev_tl,total_bal_il,id,annual_inc_joint,percent_bc_gt_75,mths_since_recent_revol_delinq,dti_joint,num_rev_tl_bal_gt_0,mths_since_recent_bc_dlq,revol_bal_joint,pct_tl_nvr_dlq,mo_sin_rcnt_tl,num_actv_bc_tl,dti,avg_cur_bal,pub_rec_bankruptcies,addr_state,tot_coll_amt,grade,sec_app_fico_range_high,num_bc_sats,total_cu_tl,application_type,sec_app_collections_12_mths_ex_med,sec_app_open_act_il,mths_since_rcnt_il,num_sats,open_acc_6m,open_act_il,total_bc_limit,total_bal_ex_mort,mths_since_recent_inq,total_rev_hi_lim,earliest_cr_line,bc_util,acc_now_delinq,inq_fi,sec_app_fico_range_low,num_rev_accts,collections_12_mths_ex_med,open_rv_24m,open_il_12m,bc_open_to_buy,tax_liens,tot_cur_bal,open_rv_12m,num_tl_op_past_12m,sec_app_mort_acc,total_il_high_credit_limit,revol_bal,sec_app_num_rev_accts,pub_rec,fico_range_low,annual_inc,max_bal_bc,initial_list_status,mths_since_last_record,inq_last_12m,total_acc,fico_range_high,delinq_amnt
0,77.0,70.3,,86.6,36,141,0,6,18,14,0,5,8,,B5,,2,,,13.08,,3,12.0,0,3.0,debt_consolidation,MORTGAGE,438237,,8,108316.0,162942942,,33.3,,,8,,,100,3,6,20.94,21840,0,MD,0,B,,6,6,INDIVIDUAL,,,5.0,18,2,9,33000,142049,2.0,48000,2007-05-05T17:00:00.000-07:00,64.9,0,0,,9,0,6,1,11598.0,0,393119,3,4,,115937,33733.0,,0,660,108000.0,5364.0,F,,5,25,664,0.0
1,163.0,19.0,,18.8,60,195,0,4,11,8,0,3,6,0.0,B5,10.0,1,2001-06-25T17:00:00.000-07:00,,13.08,,21,,0,21.0,debt_consolidation,MORTGAGE,423065,91.4,7,94742.0,162493693,138000.0,0.0,,27.75,6,,56338.0,100,6,3,9999.0,33805,1,CA,0,B,679.0,3,1,JOINT,0.0,4.0,6.0,11,1,3,17400,100050,1.0,27900,2003-08-25T17:00:00.000-07:00,1.2,0,0,675.0,13,0,3,2,17190.0,0,371851,0,2,1.0,115328,5308.0,14.0,1,725,0.0,136.0,F,97.0,2,22,729,0.0
2,140.0,48.7,,22.6,36,170,0,5,10,11,0,0,4,,D3,,0,,,23.05,10.0,61,120.0,0,67.0,debt_consolidation,RENT,66914,,8,4156.0,162440644,,40.0,10.0,,4,10.0,,71,54,3,19.14,1512,0,CA,0,D,,5,1,INDIVIDUAL,,,54.0,10,0,2,13500,15118,3.0,22500,2005-09-24T17:00:00.000-07:00,59.1,0,2,,10,0,0,0,5526.0,0,15118,0,0,,44414,10962.0,,0,670,85000.0,4457.0,F,,2,21,674,0.0
3,128.0,0.0,,94.6,36,211,0,14,4,9,4,1,0,,B3,,0,,23.0,11.71,23.0,5,24.0,0,5.0,debt_consolidation,RENT,62734,,2,45080.0,163038266,,0.0,23.0,,0,23.0,,75,5,0,16.12,11270,0,IN,0,B,,2,0,INDIVIDUAL,,,8.0,4,2,2,1000,45080,15.0,1000,2002-05-07T17:00:00.000-07:00,0.0,0,0,,20,0,2,1,1000.0,0,45080,2,3,,61734,0.0,,0,695,98000.0,0.0,F,,0,29,699,0.0
4,149.0,77.8,,75.2,60,224,0,10,13,14,0,2,8,,A4,,4,,,8.19,,16,24.0,0,48.0,debt_consolidation,MORTGAGE,347196,,9,81706.0,162899885,,60.0,,,8,,,100,7,4,22.25,22802,0,TX,0,A,,5,9,INDIVIDUAL,,,7.0,13,0,3,41600,173093,,102600,2001-04-04T17:00:00.000-07:00,77.1,0,0,,20,0,1,1,9532.0,0,296424,0,1,,103003,91387.0,,0,710,160000.0,13966.0,F,,0,38,714,0.0


In [26]:
base_loan_info.query('id > 140000000').head()[common_flds]

Unnamed: 0,mo_sin_old_il_acct,revol_util,sec_app_mths_since_last_major_derog,all_util,term,mo_sin_old_rev_tl_op,num_tl_30dpd,num_bc_tl,open_acc,num_il_tl,num_tl_90g_dpd_24m,open_il_24m,num_actv_rev_tl,sec_app_chargeoff_within_12_mths,sub_grade,sec_app_open_acc,mort_acc,sec_app_earliest_cr_line,mths_since_last_major_derog,int_rate,mths_since_last_delinq,mo_sin_rcnt_rev_tl_op,emp_length,chargeoff_within_12_mths,mths_since_recent_bc,purpose,home_ownership,tot_hi_cred_lim,sec_app_revol_util,num_op_rev_tl,total_bal_il,id,annual_inc_joint,percent_bc_gt_75,mths_since_recent_revol_delinq,dti_joint,num_rev_tl_bal_gt_0,mths_since_recent_bc_dlq,revol_bal_joint,pct_tl_nvr_dlq,mo_sin_rcnt_tl,num_actv_bc_tl,dti,avg_cur_bal,pub_rec_bankruptcies,addr_state,tot_coll_amt,grade,sec_app_fico_range_high,num_bc_sats,total_cu_tl,application_type,sec_app_collections_12_mths_ex_med,sec_app_open_act_il,mths_since_rcnt_il,num_sats,open_acc_6m,open_act_il,total_bc_limit,total_bal_ex_mort,mths_since_recent_inq,total_rev_hi_lim,earliest_cr_line,bc_util,acc_now_delinq,inq_fi,sec_app_fico_range_low,num_rev_accts,collections_12_mths_ex_med,open_rv_24m,open_il_12m,bc_open_to_buy,tax_liens,tot_cur_bal,open_rv_12m,num_tl_op_past_12m,sec_app_mort_acc,total_il_high_credit_limit,revol_bal,sec_app_num_rev_accts,pub_rec,fico_range_low,annual_inc,max_bal_bc,initial_list_status,mths_since_last_record,inq_last_12m,total_acc,fico_range_high,delinq_amnt
84341,132.0,66.1%,,105.0,36,193.0,0.0,4.0,6.0,16.0,0.0,0.0,4.0,,A1,,1.0,NaT,51.0,6.11,51.0,11.0,3 years,0.0,11.0,credit_card,RENT,70018.0,,4.0,58745.0,140001276,,66.699997,,,4.0,,,94.699997,11.0,3.0,12.72,11380.0,0.0,FL,0.0,A,,3.0,0.0,INDIVIDUAL,,,78.0,6.0,0.0,2.0,12700.0,68278.0,11.0,14700.0,2002-08-01,66.099998,0.0,0.0,,7.0,0.0,1.0,0.0,4311.0,0.0,68278.0,1.0,1.0,,55318.0,9533.0,,0.0,685.0,55000.0,5208.0,W,,1.0,24.0,689.0,0.0
84342,98.0,79.9%,,83.0,36,131.0,0.0,4.0,13.0,17.0,0.0,3.0,7.0,,C3,,2.0,NaT,,15.02,,42.0,10+ years,0.0,61.0,debt_consolidation,MORTGAGE,564225.0,,8.0,117425.0,140005832,,100.0,,,7.0,,,100.0,10.0,3.0,26.42,40825.0,1.0,NV,0.0,C,,3.0,4.0,INDIVIDUAL,,,10.0,13.0,0.0,4.0,18600.0,143080.0,13.0,32100.0,2007-10-01,99.800003,0.0,3.0,,12.0,0.0,0.0,1.0,37.0,0.0,530731.0,0.0,1.0,,136125.0,25655.0,,1.0,675.0,140000.0,11187.0,W,114.0,0.0,31.0,679.0,0.0
84343,183.0,30.1%,,30.0,60,142.0,0.0,10.0,11.0,3.0,0.0,0.0,8.0,,D3,,0.0,NaT,,19.92,,3.0,7 years,0.0,3.0,credit_card,RENT,42700.0,,10.0,0.0,140007806,,25.0,,,8.0,,,100.0,3.0,4.0,7.28,1428.0,0.0,KY,0.0,D,,5.0,0.0,INDIVIDUAL,,,79.0,11.0,6.0,0.0,17400.0,12849.0,3.0,42700.0,2003-06-01,45.099998,0.0,0.0,,21.0,0.0,7.0,0.0,9547.0,0.0,12849.0,6.0,6.0,,0.0,12849.0,,0.0,710.0,74000.0,4976.0,W,,3.0,25.0,714.0,0.0
84344,91.0,59.5%,,58.0,60,170.0,0.0,9.0,8.0,6.0,0.0,1.0,4.0,,B4,,2.0,NaT,,11.55,,41.0,4 years,0.0,46.0,home_improvement,OWN,88865.0,,6.0,37732.0,140011736,,33.299999,,,4.0,,,100.0,23.0,3.0,11.36,7403.0,0.0,CA,0.0,B,,4.0,1.0,INDIVIDUAL,,,23.0,8.0,0.0,2.0,18500.0,51823.0,23.0,23700.0,2004-07-01,74.699997,0.0,1.0,,14.0,0.0,0.0,0.0,4675.0,0.0,51823.0,0.0,0.0,,65165.0,14091.0,,0.0,695.0,135000.0,10970.0,W,,0.0,22.0,699.0,0.0
84345,133.0,51.2%,38.0,75.0,36,224.0,0.0,6.0,17.0,9.0,0.0,1.0,5.0,0.0,C1,19.0,0.0,2006-10-01,102.0,13.56,102.0,6.0,6 years,0.0,6.0,debt_consolidation,MORTGAGE,80600.0,85.900002,11.0,37598.0,140012127,84000.0,50.0,102.0,16.43,5.0,102.0,30260.0,77.800003,6.0,2.0,33.669998,3569.0,0.0,NC,0.0,C,634.0,4.0,2.0,JOINT,0.0,18.0,13.0,17.0,1.0,6.0,16500.0,60678.0,13.0,45100.0,2000-01-01,71.300003,0.0,3.0,630.0,18.0,0.0,4.0,0.0,4742.0,0.0,60678.0,3.0,3.0,0.0,35500.0,23080.0,5.0,0.0,685.0,41000.0,7302.0,W,,0.0,27.0,689.0,0.0


In [None]:
# acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
#     str(inv_acc_id) + '/summary'
# order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
#     str(inv_acc_id) + '/orders'
# creds = service_account.Credentials.from_service_account_file(os.path.join(ppath, 'user_creds', 'credentials.json'))
# scope = ['https://spreadsheets.google.com/feeds']
# creds = creds.with_scopes(scope)
# gc = gspread.Client(auth=creds)
# gc.session = AuthorizedSession(creds)
# sheet = gc.open_by_key(invest_ss_key).sheet1
# sheetins = gc.open_by_key(investins_ss_key).sheet1

# Old stuff

In [21]:
%%writefile ../../scripts/data_and_eval_preparation/data_and_eval_preparation.py
import pandas as pd
import numpy as np
import sys
import os
from tqdm import tqdm
import requests
import datetime
import gspread
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
# custom imports
sys.path.append(os.path.join(os.path.expanduser('~'), 'projects'))
import j_utils.munging as mg
import lendingclub.user_creds.account_info as acc_info
import lendingclub.scripts.investing.investing_utils as investing_utils

# set paths
ppath = os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', )
dpath = os.path.join(ppath,'data')

# load in dataframes
loan_info = pd.read_feather(os.path.join(dpath, 'loan_info.fth'))
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history_3.fth'))
strings = pd.read_feather(os.path.join(dpath, 'strings_loan_info_df.fth'))
strings = strings[strings['id'].isin(loan_info['id'])]

# sort rows by loan_id (and date)
loan_info.sort_values('id', inplace=True)
pmt_hist.sort_values(['loan_id', 'date'], inplace=True)
strings.sort_values('id', inplace=True)

# rename loan_id to id to match what comes through API
pmt_hist.rename({'loan_id': 'id'}, axis=1, inplace = True)

# check how fields come in through API _______________________________________
# constants and setup for various accounts and APIs
now = datetime.datetime.now()
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
my_gmail_account = acc_info.from_email_throwaway
my_gmail_password = acc_info.password_throwaway+'!@'
my_recipients = acc_info.to_emails_throwaway
invest_ss_key = acc_info.invest_ss_key
investins_ss_key = acc_info.investins_ss_key
header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}
acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/summary'
order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/orders'
creds = service_account.Credentials.from_service_account_file(os.path.join(ppath, 'user_creds', 'credentials.json'))
scope = ['https://spreadsheets.google.com/feeds']
creds = creds.with_scopes(scope)
gc = gspread.Client(auth=creds)
gc.session = AuthorizedSession(creds)
sheet = gc.open_by_key(invest_ss_key).sheet1
sheetins = gc.open_by_key(investins_ss_key).sheet1

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)

# checking the fields from csv vs API
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

# rename some loan_info fields to match those coming through api
licsv_to_api_rename_dict = {
    'acc_open_past_24mths':'acc_open_past_24_mths',
    'zip_code': 'addr_zip',
    'delinq_2yrs': 'delinq_2_yrs',
    'funded_amnt': 'funded_amount',
    'il_util': 'i_l_util',
    'inq_last_6mths': 'inq_last_6_mths',
#     'installment_at_funded': 'installment',
    'verification_status': 'is_inc_v',
    'verification_status_joint': 'is_inc_v_joint',
    'loan_amnt': 'loan_amount',
    'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
    'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
    'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
}
loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)

# save this version of loan info
loan_info.reset_index(drop=True, inplace=True)
loan_info.to_feather(os.path.join(dpath, 'loan_info_api_name_matched.fth'))

# split loan info into dataframes for training off of and evaluating__________
eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time', 'maturity_time_stat_adj', 'maturity_paid_stat_adj', 'rem_to_be_paid', 'roi_simple',
             'target_loose', 'target_strict', 'loan_status', 'id']
strb_flds = ['desc', 'emp_title', 'id']
base_loan_info = loan_info[list(common_flds)]
eval_loan_info = loan_info[eval_flds]
str_loan_info = strings[strb_flds]

# save
base_loan_info.to_feather(os.path.join(dpath, 'base_loan_info.fth'))
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
str_loan_info.reset_index(drop=True, inplace=True)
str_loan_info.to_feather(os.path.join(dpath, 'str_loan_info.fth'))

# make a version of pmt_history where each loan is scaled to be equal size____
pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]
loan_funded_amts = loan_info.set_index('id')['funded_amount'].to_dict()
loan_dollar_cols = [
    'outs_princp_beg',
    'princp_paid',
    'int_paid',
    'fee_paid',
    'amt_due',
    'amt_paid',
    'outs_princp_end',
    'charged_off_amt',
    'monthly_pmt',
    'recovs',
    'recov_fees',
    'all_cash_to_inv', ]
id_grouped = pmt_hist.groupby('id', sort=False)
funded_amts = []
for ids, group in tqdm(id_grouped):
    funded_amt = loan_funded_amts[ids]
    funded_amts.extend([funded_amt]*len(group))    
for col in loan_dollar_cols:
    pmt_hist[col] = pmt_hist[col]/funded_amts
    
# save
pmt_hist.reset_index(drop=True, inplace=True)
_, pmt_hist = mg.reduce_memory(pmt_hist)
pmt_hist.to_feather(os.path.join(dpath,'scaled_pmt_hist.fth'))

# make npv_rois (using various discount rates and actual/known cashflows)_____
interesting_cols_over_time = [
    'outs_princp_beg',
    'all_cash_to_inv',
    'date',
    'fico_last',
    'm_on_books',
    'status_period_end',
    'id',
]
pmt_hist = pmt_hist[interesting_cols_over_time]
npv_roi_holder = {}
disc_rates = np.arange(.05,.36,.01)
id_grouped = pmt_hist.groupby('id')
for ids, group in tqdm(id_grouped):
    npv_roi_dict = {}
    funded = group.iat[0,0]
    cfs = [-funded] + group['all_cash_to_inv'].tolist()
    for rate in disc_rates:
        npv_roi_dict[rate] = np.npv(rate/12, cfs)/funded
    npv_roi_holder[ids] = npv_roi_dict
    
npv_roi_df = pd.DataFrame(npv_roi_holder).T
npv_roi_df.columns = npv_roi_df.columns.values.round(2)
npv_roi_df.index.name = 'id'
npv_roi_df.reset_index(inplace=True)

eval_loan_info = pd.merge(eval_loan_info, npv_roi_df, how='left', on='id')
# some current loans I have no target_strict for and were not in pmt history.
# Fill with negatives on npv_roi.
eval_loan_info['target_strict'] = eval_loan_info['target_strict'].fillna(0)
eval_loan_info.fillna(-1, inplace=True)

# save
# feather must have string column names
eval_loan_info.columns = [str(col) for col in eval_loan_info.columns]
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))

Overwriting ../../scripts/data_and_eval_preparation/data_and_eval_preparation.py


# notebook version

In [1]:
# this is run right after clean_loan_info

In [2]:
pd.options.display.max_columns = 999
pd.options.display.max_rows = 60
pd.options.display.max_seq_items = None

In [3]:
import sys
import os
from tqdm import tqdm
sys.path.append(os.path.join(os.path.expanduser('~'), 'projects'))
import j_utils.munging as mg

ppath = os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', )
dpath = os.path.join(ppath,'data')

In [4]:
loan_info = pd.read_feather(os.path.join(dpath, 'loan_info.fth'))
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history_3.fth'))
strings = pd.read_feather(os.path.join(dpath, 'strings_loan_info_df.fth'))
strings = strings[strings['id'].isin(loan_info['id'])]

In [5]:
loan_info.sort_values('id', inplace=True)
pmt_hist.sort_values(['loan_id', 'date'], inplace=True)
strings.sort_values('id', inplace=True)

In [6]:
# rename loan_id to id to match what comes through API
pmt_hist.rename({'loan_id': 'id'}, axis=1, inplace = True)

In [7]:
print('loan_info shape: {0}\n pmt_hist shape: {1}\n strings shape: {2}'.format(loan_info.shape, pmt_hist.shape, strings.shape))

loan_info shape: (2376196, 160)
 pmt_hist shape: (47284665, 40)
 strings shape: (2376196, 12)


# rename fields to match what comes through api, save

In [10]:
import requests
import lendingclub.user_creds.account_info as acc_info
import lendingclub.scripts.investing.investing_utils as investing_utils

# constants and setup for various accounts and APIs
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
my_gmail_account = acc_info.from_email_throwaway
my_gmail_password = acc_info.password_throwaway+'!@'
my_recipients = acc_info.to_emails_throwaway
invest_ss_key = acc_info.invest_ss_key
investins_ss_key = acc_info.investins_ss_key

header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}

#lendingclub api urls
acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/summary'
order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/orders'

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)

In [13]:
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

In [15]:
# api_flds_not_in_licsv
# licsv_flds_not_in_api
# api_loans[['service_fee_rate']]
# loan_info['loan']

In [33]:
licsv_to_api_rename_dict = {
    'acc_open_past_24mths':'acc_open_past_24_mths',
    'zip_code': 'addr_zip',
    'delinq_2yrs': 'delinq_2_yrs',
    'funded_amnt': 'funded_amount',
    'il_util': 'i_l_util',
    'inq_last_6mths': 'inq_last_6_mths',
    'installment_at_funded': 'installment',
    'verification_status': 'is_inc_v',
    'verification_status_joint': 'is_inc_v_joint',
    'loan_amnt': 'loan_amount',
    'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
    'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
    'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
}

In [34]:
# rename loan_info columns to match api columns
loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)

In [35]:
# save renamed loan_info
loan_info.reset_index(drop=True, inplace=True)
loan_info.to_feather(os.path.join(dpath, 'loan_info_api_name_matched.fth'))

In [42]:
#2019-08-19 11:08:56, a list of api_flds not being used in model
new_licsv_flds = set(loan_info.columns)
unused_api_flds = api_flds.difference(new_licsv_flds)
unused_api_flds

{'accept_d',
 'credit_pull_d',
 'desc',
 'disbursement_method',
 'emp_title',
 'exp_d',
 'exp_default_rate',
 'housing_payment',
 'ils_exp_d',
 'investor_count',
 'list_d',
 'member_id',
 'mtg_payment',
 'review_status',
 'review_status_d',
 'service_fee_rate'}

In [44]:
# can compare to the raw loan_info_csvs and make sure none of the
# above are used
# raw_loan_info = pd.read_feather('../../data/raw_loan_info.fth')

# Cut loan_info to api fields

In [51]:
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

In [172]:
eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time', 'maturity_time_stat_adj', 'maturity_paid_stat_adj', 'rem_to_be_paid', 'roi_simple',
             'target_loose', 'target_strict', 'loan_status', 'id', 'grade', 'int_rate', 'term']
strb_flds = ['desc', 'emp_title', 'id']

In [173]:
base_loan_info = loan_info[list(common_flds)]#+['id']
eval_loan_info = loan_info[eval_flds]#+['id']
str_loan_info = strings[strb_flds]#+['id']

In [174]:
print(base_loan_info.shape, eval_loan_info.shape, str_loan_info.shape)

(2376196, 103) (2376196, 15) (2376196, 3)


In [61]:
# save
base_loan_info.to_feather(os.path.join(dpath, 'base_loan_info.fth'))
# eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
str_loan_info.reset_index(drop=True, inplace=True)
str_loan_info.to_feather(os.path.join(dpath, 'str_loan_info.fth'))

# scale pmt_hist to be independent of loan_size (so we can treat loans as us investing the same amount in each)

In [24]:
pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]
loan_funded_amts = loan_info.set_index('id')['funded_amount'].to_dict()
loan_dollar_cols = [
    'outs_princp_beg',
    'princp_paid',
    'int_paid',
    'fee_paid',
    'amt_due',
    'amt_paid',
    'outs_princp_end',
    'charged_off_amt',
    'monthly_pmt',
    'recovs',
    'recov_fees',
    'all_cash_to_inv', ]
id_grouped = pmt_hist.groupby('id', sort=False)
funded_amts = []
for ids, group in tqdm(id_grouped):
    funded_amt = loan_funded_amts[ids]
    funded_amts.extend([funded_amt]*len(group))    
for col in loan_dollar_cols:
    pmt_hist[col] = pmt_hist[col]/funded_amts

100%|██████████| 2376196/2376196 [09:48<00:00, 4038.02it/s]


In [25]:
pmt_hist.reset_index(drop=True, inplace=True)
_, pmt_hist = mg.reduce_memory(pmt_hist)
pmt_hist.to_feather(os.path.join(dpath,'scaled_pmt_hist.fth'))

trying to change columns to smaller dtypes when possible
original dataframe is 25592.282068252563 MB or 24.992462957277894 GB


100%|██████████| 13/13 [00:14<00:00,  1.26s/it]


changed dtypes of 13 cols
reduced dataframe is 23247.607069015503 MB or 22.702741278335452 GB


# make npv_rois and add to eval_loan_info

In [62]:
pmt_hist = pd.read_feather(os.path.join(dpath,'scaled_pmt_hist.fth'))

In [63]:
interesting_cols_over_time = [
    'outs_princp_beg',
    'all_cash_to_inv',
    'date',
    'fico_last',
    'm_on_books',
    'status_period_end',
    'id',
]
pmt_hist = pmt_hist[interesting_cols_over_time]

In [64]:
npv_roi_holder = {}
disc_rates = np.arange(.05,.36,.01)
id_grouped = pmt_hist.groupby('id')
for ids, group in tqdm(id_grouped):
    npv_roi_dict = {}
    funded = group.iat[0,0]
    cfs = [-funded] + group['all_cash_to_inv'].tolist()
    for rate in disc_rates:
        npv_roi_dict[rate] = np.npv(rate/12, cfs)/funded
    npv_roi_holder[ids] = npv_roi_dict

100%|██████████| 2376196/2376196 [27:33<00:00, 1436.93it/s]


In [65]:
npv_roi_df = pd.DataFrame(npv_roi_holder).T
npv_roi_df.columns = npv_roi_df.columns.values.round(2)
npv_roi_df.index.name = 'id'
npv_roi_df.reset_index(inplace=True)

In [66]:
npv_roi_df.head()

Unnamed: 0,id,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.3,0.31,0.32,0.33,0.34,0.35
0,54734,0.095902,0.081254,0.066865,0.052729,0.038841,0.025198,0.011792,-0.001379,-0.014321,-0.027038,-0.039534,-0.051815,-0.063884,-0.075745,-0.087403,-0.098861,-0.110124,-0.121195,-0.132078,-0.142776,-0.153294,-0.163634,-0.1738,-0.183796,-0.193624,-0.203289,-0.212792,-0.222138,-0.231329,-0.240368,-0.249259
1,55521,0.13769,0.124301,0.111112,0.098121,0.085324,0.072718,0.060299,0.048065,0.036013,0.024139,0.012441,0.000915,-0.010441,-0.02163,-0.032654,-0.043517,-0.054222,-0.06477,-0.075164,-0.085407,-0.095501,-0.105449,-0.115253,-0.124916,-0.134439,-0.143825,-0.153077,-0.162196,-0.171184,-0.180044,-0.188778
2,55716,0.274963,0.259472,0.244227,0.229225,0.214461,0.19993,0.185629,0.171554,0.157699,0.144063,0.13064,0.117426,0.104419,0.091615,0.079009,0.0666,0.054382,0.042352,0.030509,0.018847,0.007364,-0.003943,-0.015077,-0.026041,-0.036838,-0.047472,-0.057944,-0.068258,-0.078416,-0.088421,-0.098276
3,55742,0.08777,0.071643,0.055847,0.040373,0.025214,0.010362,-0.00419,-0.018448,-0.03242,-0.046112,-0.059532,-0.072685,-0.085577,-0.098215,-0.110604,-0.122751,-0.13466,-0.146337,-0.157788,-0.169018,-0.180031,-0.190833,-0.201429,-0.211822,-0.222018,-0.232022,-0.241836,-0.251467,-0.260917,-0.270191,-0.279292
4,56121,-0.385032,-0.390533,-0.395965,-0.40133,-0.406627,-0.411859,-0.417025,-0.422128,-0.427167,-0.432145,-0.437061,-0.441916,-0.446713,-0.45145,-0.45613,-0.460753,-0.46532,-0.469831,-0.474288,-0.478691,-0.483042,-0.48734,-0.491586,-0.495782,-0.499928,-0.504025,-0.508072,-0.512072,-0.516025,-0.519931,-0.523792


In [163]:
# save npv_roi_df
npv_roi_df.columns = [str(col) for col in npv_roi_df.columns]
npv_roi_df.to_feather(os.path.join(dpath,'npv_roi_df.fth'))

In [175]:
eval_loan_info = pd.merge(eval_loan_info, npv_roi_df, how='left', on='id')

In [176]:
# some current loans I have no target_strict for and were not in pmt history. Fill with
# negatives on npv_roi.
eval_loan_info['target_strict'] = eval_loan_info['target_strict'].fillna(0)
eval_loan_info.fillna(-1, inplace=True)

In [177]:
eval_loan_info.head()

Unnamed: 0,end_d,issue_d,maturity_paid,maturity_time,maturity_time_stat_adj,maturity_paid_stat_adj,rem_to_be_paid,roi_simple,target_loose,target_strict,loan_status,id,grade,int_rate,term,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.3,0.31,0.32,0.33,0.34,0.35
0,2011-10-01,2009-08-01,1.0,1.0,1.0,1.0,0.0,1.173214,0,0,paid,54734,B,0.1189,36,0.095902,0.081254,0.066865,0.052729,0.038841,0.025198,0.011792,-0.001379,-0.014321,-0.027038,-0.039534,-0.051815,-0.063884,-0.075745,-0.087403,-0.098861,-0.110124,-0.121195,-0.132078,-0.142776,-0.153294,-0.163634,-0.1738,-0.183796,-0.193624,-0.203289,-0.212792,-0.222138,-0.231329,-0.240368,-0.249259
1,2010-03-01,2008-07-01,1.0,1.0,1.0,1.0,0.0,1.207769,0,0,paid,55521,F,0.1608,36,0.13769,0.124301,0.111112,0.098121,0.085324,0.072718,0.060299,0.048065,0.036013,0.024139,0.012441,0.000915,-0.010441,-0.02163,-0.032654,-0.043517,-0.054222,-0.06477,-0.075164,-0.085407,-0.095501,-0.105449,-0.115253,-0.124916,-0.134439,-0.143825,-0.153077,-0.162196,-0.171184,-0.180044,-0.188778
2,2018-06-01,2016-08-01,1.0,0.944444,1.0,1.0,0.0,1.353502,0,0,paid,55716,E,0.2499,36,0.274963,0.259472,0.244227,0.229225,0.214461,0.19993,0.185629,0.171554,0.157699,0.144063,0.13064,0.117426,0.104419,0.091615,0.079009,0.0666,0.054382,0.042352,0.030509,0.018847,0.007364,-0.003943,-0.015077,-0.026041,-0.036838,-0.047472,-0.057944,-0.068258,-0.078416,-0.088421,-0.098276
3,2011-06-01,2008-05-01,1.0,1.0,1.0,1.0,0.0,1.173648,0,0,paid,55742,B,0.1071,36,0.08777,0.071643,0.055847,0.040373,0.025214,0.010362,-0.00419,-0.018448,-0.03242,-0.046112,-0.059532,-0.072685,-0.085577,-0.098215,-0.110604,-0.122751,-0.13466,-0.146337,-0.157788,-0.169018,-0.180031,-0.190833,-0.201429,-0.211822,-0.222018,-0.232022,-0.241836,-0.251467,-0.260917,-0.270191,-0.279292
4,2018-04-01,2016-01-01,0.583209,1.0,1.0,1.0,3677.225098,0.643185,1,1,charged_off,56121,A,0.0649,36,-0.385032,-0.390533,-0.395965,-0.40133,-0.406627,-0.411859,-0.417025,-0.422128,-0.427167,-0.432145,-0.437061,-0.441916,-0.446713,-0.45145,-0.45613,-0.460753,-0.46532,-0.469831,-0.474288,-0.478691,-0.483042,-0.48734,-0.491586,-0.495782,-0.499928,-0.504025,-0.508072,-0.512072,-0.516025,-0.519931,-0.523792


In [178]:
# feather must have string column names
eval_loan_info.columns = [str(col) for col in eval_loan_info.columns]
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))

# make the training/test datasets

In [161]:
# see gen_datasets in lc_utils

In [162]:
# print(eval_loan_info.shape, base_loan_info.shape, str_loan_info.shape)

# # create a dict of test_ids for up to 24 mths?
# max_date = eval_loan_info['issue_d'].max()

# train_test_ids_dict = {}
# for month in range(1,25):
#     test_ids = set(eval_loan_info[eval_loan_info['issue_d'] >= max_date+pd.DateOffset(months=-month)]['id'])
#     train_ids = set(eval_loan_info[~eval_loan_info['id'].isin(test_ids)]['id'])
#     train_test_ids_dict[month] = (train_ids, test_ids)

# train_ids, test_ids = train_test_ids_dict[1]

# train = eval_loan_info[eval_loan_info['id'].isin(train_ids)]
# test = eval_loan_info[eval_loan_info['id'].isin(test_ids)]

# import pickle

# # save
# with open(os.path.join(dpath, 'train_test_ids_dict.pkl'), 'wb') as handle:
#     pickle.dump(train_test_ids_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# set(train['id']).intersection(set(test['id']))

# from lendingclub.lc_utils import gen_datasets

# train_x, train_y, valid_x, valid_y, train_ids, valid_ids = gen_datasets('2016-01-01', '2016-02-01', base_loan_info, eval_loan_info, '0.07')

# train_x.shape

# look at the dfs

In [34]:
base_loan_info.head()

Unnamed: 0,num_tl_30dpd,mths_since_last_major_derog,annual_inc,open_act_il,num_rev_accts,is_inc_v,num_tl_op_past_12m,mo_sin_old_rev_tl_op,sec_app_inq_last_6_mths,total_cu_tl,sec_app_chargeoff_within_12_mths,percent_bc_gt_75,pct_tl_nvr_dlq,num_op_rev_tl,inq_last_6_mths,application_type,total_bal_ex_mort,int_rate,home_ownership,max_bal_bc,i_l_util,sec_app_fico_range_high,sec_app_collections_12_mths_ex_med,sec_app_open_act_il,delinq_2_yrs,open_rv_12m,delinq_amnt,mths_since_rcnt_il,mo_sin_rcnt_rev_tl_op,earliest_cr_line,addr_state,purpose,total_il_high_credit_limit,mths_since_recent_revol_delinq,bc_open_to_buy,mths_since_last_record,mths_since_last_delinq,initial_list_status,revol_util,sec_app_earliest_cr_line,dti,inq_last_12m,acc_now_delinq,avg_cur_bal,total_acc,id,num_tl_90g_dpd_24m,fico_range_high,sec_app_num_rev_accts,open_acc,open_il_24m,num_rev_tl_bal_gt_0,term,collections_12_mths_ex_med,num_actv_rev_tl,open_il_12m,sec_app_mths_since_last_major_derog,num_sats,num_bc_sats,pub_rec_bankruptcies,pub_rec,mths_since_recent_bc,dti_joint,chargeoff_within_12_mths,tot_hi_cred_lim,acc_open_past_24_mths,funded_amount,tax_liens,mo_sin_old_il_acct,bc_util,all_util,sec_app_fico_range_low,annual_inc_joint,sec_app_mort_acc,revol_bal,grade,num_accts_ever_12_0_ppd,num_actv_bc_tl,sec_app_revol_util,total_rev_hi_lim,mo_sin_rcnt_tl,mths_since_recent_inq,mths_since_recent_bc_dlq,tot_cur_bal,num_tl_12_0dpd_2m,total_bal_il,sec_app_open_acc,inq_fi,emp_length,revol_bal_joint,tot_coll_amt,mort_acc,total_bc_limit,open_acc_6m,addr_zip,num_il_tl,num_bc_tl,sub_grade,open_rv_24m,fico_range_low,loan_amount,is_inc_v_joint
0,,,85000.0,,,platform,,,,,,,,,0.0,individual,,0.1189,rent,,,,,,0.0,,0.0,,,1994-02-01,CA,debt_consolidation,,,,,,f,0.521,NaT,0.1948,,0.0,,42.0,54734,,739.0,,10.0,,,36,0.0,,,,,,0.0,0.0,,,0.0,,,25000.0,0.0,,,,,,,28854.0,B,,,,,,,,,,,,,< 1 year,,,,,,941xx,,,B4,,735.0,25000.0,
1,,,30000.0,,,none,,,,,,,,,1.0,individual,,0.1608,rent,,,,,,0.0,,0.0,,,2001-08-01,IL,debt_consolidation,,,,,,f,0.904,NaT,0.2384,,0.0,,15.0,55521,,644.0,,9.0,,,36,0.0,,,,,,0.0,0.0,,,0.0,,,1000.0,0.0,,,,,,,10125.0,F,,,,,,,,,,,,,< 1 year,,,,,,600xx,,,F2,,640.0,1000.0,
2,0.0,31.0,30784.0,1.0,25.0,none,4.0,127.0,,4.0,,0.857,0.69,13.0,4.0,individual,8351.0,0.2499,rent,1158.0,0.08,,,,0.0,4.0,0.0,50.0,7.0,2003-04-01,WA,debt_consolidation,5929.0,,907.0,28.0,,w,0.74,NaT,0.1696,6.0,0.0,596.0,36.0,55716,0.0,669.0,,14.0,0.0,13.0,36,0.0,13.0,0.0,,14.0,14.0,1.0,1.0,7.0,,0.0,16579.0,13.0,3500.0,0.0,160.0,0.79,0.74,,,,7849.0,E,1.0,7.0,,10650.0,7.0,2.0,,8351.0,,502.0,,2.0,< 1 year,,0.0,0.0,6350.0,0.0,984xx,11.0,17.0,E4,13.0,665.0,3500.0,
3,,,65000.0,,,none,,,,,,,,,0.0,individual,,0.1071,rent,,,,,,0.0,,0.0,,,2000-10-01,NY,credit_card,,,,,,f,0.767,NaT,0.1429,,0.0,,7.0,55742,,709.0,,7.0,,,36,0.0,,,,,,0.0,0.0,,,0.0,,,7000.0,0.0,,,,,,,33623.0,B,,,,,,,,,,,,,< 1 year,,,,,,112xx,,,B5,,705.0,7000.0,
4,0.0,,65000.0,2.0,3.0,none,1.0,35.0,,0.0,,0.0,1.0,2.0,0.0,individual,20302.0,0.0649,mortgage,1569.0,0.73,,,,0.0,0.0,0.0,15.0,16.0,2002-03-01,NE,home_improvement,23822.0,,5488.0,42.0,,w,0.354,NaT,0.1084,1.0,0.0,14826.0,11.0,56121,0.0,709.0,,5.0,1.0,2.0,36,0.0,2.0,0.0,,5.0,2.0,1.0,1.0,16.0,,0.0,87102.0,3.0,8000.0,0.0,166.0,0.354,0.63,,,,3012.0,A,0.0,2.0,,8500.0,12.0,3.0,,74131.0,0.0,17290.0,,2.0,10+ years,,0.0,3.0,8500.0,0.0,686xx,5.0,2.0,A2,1.0,705.0,8000.0,


In [35]:
eval_loan_info.head()

Unnamed: 0,end_d,issue_d,maturity_paid,maturity_time,maturity_time_stat_adj,maturity_paid_stat_adj,rem_to_be_paid,roi_simple,target_loose,target_strict,loan_status,id,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.3,0.31,0.32,0.33,0.34,0.35
0,2011-10-01,2009-08-01,1.0,1.0,-1.0,-1.0,0.0,1.173214,0,0,paid,54734,0.095902,0.081254,0.066865,0.052729,0.038841,0.025198,0.011792,-0.001379,-0.014321,-0.027038,-0.039534,-0.051815,-0.063884,-0.075745,-0.087403,-0.098861,-0.110124,-0.121195,-0.132078,-0.142776,-0.153294,-0.163634,-0.1738,-0.183796,-0.193624,-0.203289,-0.212792,-0.222138,-0.231329,-0.240368,-0.249259
1,2010-03-01,2008-07-01,1.0,1.0,-1.0,-1.0,0.0,1.207769,0,0,paid,55521,0.13769,0.124301,0.111112,0.098121,0.085324,0.072718,0.060299,0.048065,0.036013,0.024139,0.012441,0.000915,-0.010441,-0.02163,-0.032654,-0.043517,-0.054222,-0.06477,-0.075164,-0.085407,-0.095501,-0.105449,-0.115253,-0.124916,-0.134439,-0.143825,-0.153077,-0.162196,-0.171184,-0.180044,-0.188778
2,2018-06-01,2016-08-01,1.0,0.944444,-1.0,-1.0,0.0,1.353502,0,0,paid,55716,0.274963,0.259472,0.244227,0.229225,0.214461,0.19993,0.185629,0.171554,0.157699,0.144063,0.13064,0.117426,0.104419,0.091615,0.079009,0.0666,0.054382,0.042352,0.030509,0.018847,0.007364,-0.003943,-0.015077,-0.026041,-0.036838,-0.047472,-0.057944,-0.068258,-0.078416,-0.088421,-0.098276
3,2011-06-01,2008-05-01,1.0,1.0,-1.0,-1.0,0.0,1.173648,0,0,paid,55742,0.08777,0.071643,0.055847,0.040373,0.025214,0.010362,-0.00419,-0.018448,-0.03242,-0.046112,-0.059532,-0.072685,-0.085577,-0.098215,-0.110604,-0.122751,-0.13466,-0.146337,-0.157788,-0.169018,-0.180031,-0.190833,-0.201429,-0.211822,-0.222018,-0.232022,-0.241836,-0.251467,-0.260917,-0.270191,-0.279292
4,2018-04-01,2016-01-01,0.583209,1.0,1.0,1.0,3677.225098,0.643185,1,1,charged_off,56121,-0.385032,-0.390533,-0.395965,-0.40133,-0.406627,-0.411859,-0.417025,-0.422128,-0.427167,-0.432145,-0.437061,-0.441916,-0.446713,-0.45145,-0.45613,-0.460753,-0.46532,-0.469831,-0.474288,-0.478691,-0.483042,-0.48734,-0.491586,-0.495782,-0.499928,-0.504025,-0.508072,-0.512072,-0.516025,-0.519931,-0.523792


In [36]:
str_loan_info.head()

Unnamed: 0,desc,emp_title,id
0,Due to a lack of personal finance education an...,,54734
1,Looking to sure up a few debts for consolidati...,best buy,55521
2,I currently have a loan out with CashCall. The...,receptionist,55716
3,Just want to pay off the last bit of credit ca...,cnn,55742
4,I recently married and since this was the seco...,maintenance,56121


In [37]:
pmt_hist.head()

Unnamed: 0,outs_princp_beg,all_cash_to_inv,date,fico_last,m_on_books,status_period_end,id
0,1.0,0.033164,2009-09-01,757,1,current,54734
1,0.976748,0.033164,2009-10-01,757,2,current,54734
2,0.953266,0.033164,2009-11-01,787,3,current,54734
3,0.929551,0.033164,2009-12-01,782,4,current,54734
4,0.9056,0.033164,2010-01-01,802,5,current,54734
