In [7]:
# %%writefile ../../lendingclub/data_and_eval_preparation/06_data_and_eval_preparation.py
'''
renames loan_info to match what is received through the api
makes a scaled pmt_history and other various targets for evaluation of models
'''

import os

import numpy as np
# %load ../../lendingclub/data_and_eval_preparation/data_and_eval_preparation.py
import pandas as pd
from tqdm import tqdm

# import gspread
# from google.oauth2 import service_account
# from google.auth.transport.requests import AuthorizedSession
import j_utils.munging as mg
import lendingclub.config as config
import lendingclub.investing.investing_utils as investing_utils
# custom imports
import user_creds.account_info as acc_info
pd.set_option('display.max_columns', 999)

# set paths
ppath = config.prj_dir
dpath = config.data_dir

# load in dataframes
loan_info = pd.read_feather(os.path.join(dpath, 'clean_loan_info.fth'))
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history.fth'))
strings = pd.read_feather(os.path.join(dpath, 'strings_loan_info.fth'))
strings = strings[strings['id'].isin(loan_info['id'])]

# sort rows by loan_id (and date)
loan_info = loan_info.sort_values('id')
pmt_hist = pmt_hist.sort_values(['loan_id', 'date'])
strings = strings.sort_values('id')

# rename loan_id to id to match what comes through API
pmt_hist = pmt_hist.rename({'loan_id': 'id'}, axis=1)

# check how fields come in through API _______________________________________
# constants and setup for various accounts and APIs
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)

In [10]:
sorted(api_loans['sub_grade'].unique())

['A1',
 'A2',
 'A3',
 'A4',
 'A5',
 'B1',
 'B3',
 'B4',
 'B5',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5']

In [11]:
api_loans[api_loans['grade'] == 'D']

Unnamed: 0,id,member_id,loan_amount,funded_amount,term,int_rate,exp_default_rate,service_fee_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,is_inc_v,accept_d,exp_d,list_d,credit_pull_d,review_status_d,review_status,desc,purpose,addr_zip,addr_state,investor_count,ils_exp_d,initial_list_status,emp_title,acc_now_delinq,acc_open_past_24_mths,bc_open_to_buy,percent_bc_gt_75,bc_util,dti,delinq_2_yrs,delinq_amnt,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6_mths,mths_since_last_delinq,mths_since_last_record,mths_since_recent_inq,mths_since_recent_revol_delinq,mths_since_recent_bc,mort_acc,open_acc,pub_rec,total_bal_ex_mort,revol_bal,revol_util,total_bc_limit,total_acc,total_il_high_credit_limit,num_rev_accts,mths_since_recent_bc_dlq,pub_rec_bankruptcies,num_accts_ever_12_0_ppd,chargeoff_within_12_mths,collections_12_mths_ex_med,tax_liens,mths_since_last_major_derog,num_sats,num_tl_op_past_12m,mo_sin_rcnt_tl,tot_hi_cred_lim,tot_cur_bal,avg_cur_bal,num_bc_tl,num_actv_bc_tl,num_bc_sats,pct_tl_nvr_dlq,num_tl_90g_dpd_24m,num_tl_30dpd,num_tl_12_0dpd_2m,num_il_tl,mo_sin_old_il_acct,num_actv_rev_tl,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,total_rev_hi_lim,num_rev_tl_bal_gt_0,num_op_rev_tl,tot_coll_amt,application_type,annual_inc_joint,dti_joint,is_inc_v_joint,open_acc_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,i_l_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,inq_fi,total_cu_tl,inq_last_12m,mtg_payment,housing_payment,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6_mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,revol_bal_joint,disbursement_method,open_act_il,sec_app_open_act_il
0,159543586,199163910,10000.0,9975.0,36,20.55,12.86,1.52,374.45,D,D2,24.0,RENT,40065.0,VERIFIED,2019-09-28T14:54:38.000-07:00,2019-11-02T14:00:00.000-07:00,2019-10-03T14:00:00.000-07:00,2019-09-28T14:47:46.000-07:00,2019-10-03T09:54:48.000-07:00,APPROVED,,credit_card,018xx,MA,,2019-10-03T14:00:00.000-07:00,F,Program Coordinator,0,6,1589,0.0,50.3,25.73,0,0.0,2012-08-27T17:00:00.000-07:00,670,674,1,55.0,,3.0,,14,0,7,0,29948,2785.0,32.8,3200,8,31576,5,,0,1,0,0,0,55.0,7,3,2,40076,29948,4278,3,3,3,87,0,0,0,2,19,5,29,2,8500,5,5,0,JOINT,75425.0,16.08,VERIFIED,2,1,2,4,27163.0,86.0,2,4,1062.0,74.7,3,0,2,0.0,,690.0,694.0,2018-02-27T16:00:00.000-08:00,0.0,0.0,5.0,64.8,5.0,0.0,0.0,,18395.0,CASH,2,0.0
1,159156516,198356344,24850.0,14875.0,36,25.65,12.86,1.52,996.6,D,D4,0.0,RENT,42000.0,SOURCE_VERIFIED,2019-09-20T23:30:55.000-07:00,2019-10-27T18:00:00.000-07:00,2019-09-27T18:00:00.000-07:00,2019-09-20T23:23:02.000-07:00,2019-09-27T14:00:41.000-07:00,APPROVED,,debt_consolidation,312xx,GA,,2019-09-27T18:00:00.000-07:00,F,General Manager,0,0,5453,88.9,87.7,29.34,0,0.0,2007-06-20T17:00:00.000-07:00,675,679,0,,,,,95,0,12,0,40419,40419.0,76.0,44200,17,0,15,,0,0,0,0,0,,12,0,95,53200,40419,3674,11,9,9,100,0,0,0,2,147,10,138,95,53200,10,12,0,INDIVIDUAL,,,,0,0,0,133,0.0,,0,0,8759.0,76.0,0,0,0,0.0,,,,,,,,,,,,,,CASH,0,
5,159308454,198788962,35000.0,32600.0,36,28.8,12.86,1.52,1462.9,D,D5,120.0,MORTGAGE,169000.0,NOT_VERIFIED,2019-09-28T19:33:23.000-07:00,2019-10-29T10:00:00.000-07:00,2019-09-29T10:00:00.000-07:00,2019-09-24T06:56:51.000-07:00,2019-09-29T08:01:49.000-07:00,APPROVED,,credit_card,956xx,CA,,2019-09-29T10:00:00.000-07:00,F,IT Consultant,0,13,8816,40.0,69.8,20.96,0,0.0,2008-05-23T17:00:00.000-07:00,685,689,2,,,1.0,,1,2,11,0,106523,20384.0,54.5,29200,27,120000,18,,0,0,0,0,0,,11,9,1,516300,462699,42064,11,4,5,100,0,0,0,7,34,4,136,1,37400,4,8,0,JOINT,229000.0,19.06,NOT_VERIFIED,2,2,4,8,86139.0,86.7,5,7,9943.0,65.7,3,0,6,2650.68,,585.0,589.0,2016-01-28T16:00:00.000-08:00,3.0,0.0,7.0,35.7,6.0,0.0,3.0,2.0,107353.0,CASH,2,1.0
10,159058491,198136381,26575.0,9775.0,36,20.55,12.86,1.52,995.09,D,D2,36.0,RENT,130000.0,SOURCE_VERIFIED,2019-09-25T06:41:04.000-07:00,2019-10-26T18:00:00.000-07:00,2019-09-26T18:00:00.000-07:00,2019-09-19T06:44:47.000-07:00,2019-09-26T13:58:52.000-07:00,APPROVED,,other,460xx,IN,,2019-09-26T18:00:00.000-07:00,F,Finance manager,0,8,7954,25.0,68.9,12.52,0,0.0,2011-04-18T17:00:00.000-07:00,670,674,0,68.0,41.0,0.0,,4,0,10,1,47532,20849.0,62.2,25600,24,36732,16,,1,0,0,0,0,,10,3,4,70232,47532,4753,7,4,4,95,0,0,0,8,82,6,101,4,33500,6,8,0,INDIVIDUAL,,,,1,1,3,11,26683.0,72.6,2,5,11302.0,67.7,1,2,9,0.0,,,,,,,,,,,,,,CASH,2,
11,158929280,197906082,22550.0,16025.0,36,25.65,12.86,1.52,904.36,D,D4,36.0,RENT,65000.0,SOURCE_VERIFIED,2019-09-28T06:11:09.000-07:00,2019-10-28T10:00:00.000-07:00,2019-09-28T10:00:00.000-07:00,2019-09-17T06:40:17.000-07:00,2019-09-28T06:14:23.000-07:00,APPROVED,,debt_consolidation,301xx,GA,,2019-09-28T10:00:00.000-07:00,F,Maintenance Supervisor,0,5,10155,66.7,71.6,31.0,0,0.0,2008-06-16T17:00:00.000-07:00,675,679,1,,,4.0,,17,0,14,0,60140,32357.0,65.5,35700,18,28800,14,,0,0,0,0,0,,14,1,4,78200,60140,4626,9,9,9,100,0,0,0,4,70,12,135,17,49400,12,13,0,INDIVIDUAL,,,,1,1,1,4,27783.0,96.5,0,4,5675.0,76.9,1,0,1,0.0,,,,,,,,,,,,,,CASH,1,
18,159521296,199100515,30075.0,13325.0,36,20.55,12.86,1.52,1126.15,D,D2,,OWN,24650.0,VERIFIED,2019-09-27T21:35:35.000-07:00,2019-10-28T14:00:00.000-07:00,2019-09-28T14:00:00.000-07:00,2019-09-27T21:27:52.000-07:00,2019-09-28T10:23:34.000-07:00,APPROVED,,debt_consolidation,856xx,AZ,,2019-09-28T14:00:00.000-07:00,F,,0,5,18866,0.0,9.3,3.8,0,0.0,2011-05-27T17:00:00.000-07:00,700,704,0,,102.0,12.0,,1,0,6,1,1934,1934.0,8.9,20800,8,0,6,,1,0,0,0,0,,6,2,1,21800,1934,322,5,3,5,100,0,0,0,2,100,3,52,1,21800,3,6,0,JOINT,65386.0,19.47,SOURCE_VERIFIED,1,0,0,75,0.0,,2,5,1513.0,8.9,0,0,1,0.0,,720.0,724.0,2003-01-27T16:00:00.000-08:00,2.0,0.0,15.0,11.4,18.0,0.0,0.0,,24473.0,CASH,0,1.0
26,159814127,199595154,5300.0,4175.0,36,23.05,12.86,1.52,205.3,D,D3,12.0,MORTGAGE,60500.0,SOURCE_VERIFIED,2019-10-03T13:08:39.000-07:00,2019-11-03T17:00:00.000-08:00,2019-10-04T18:00:00.000-07:00,2019-10-03T13:00:38.000-07:00,2019-10-04T13:56:56.000-07:00,APPROVED,,debt_consolidation,152xx,PA,,2019-10-04T18:00:00.000-07:00,F,Caretaker,0,4,3478,0.0,26.0,36.94,1,0.0,2005-04-02T16:00:00.000-08:00,700,704,1,7.0,,4.0,7.0,27,3,9,0,47262,2553.0,38.7,4700,27,62123,10,7.0,0,0,0,0,0,,9,2,4,284723,248569,27619,6,4,4,84,0,0,0,14,88,5,78,6,6600,5,5,0,INDIVIDUAL,,,,2,1,2,4,44709.0,72.0,1,2,1058.0,68.8,1,2,2,1271.0,1272.0,,,,,,,,,,,,,CASH,3,
33,159849975,199720188,25000.0,15350.0,36,28.8,12.86,1.52,1044.93,D,D5,120.0,RENT,165000.0,SOURCE_VERIFIED,2019-10-04T08:32:33.000-07:00,2019-11-03T13:00:00.000-08:00,2019-10-04T14:00:00.000-07:00,2019-10-04T08:27:32.000-07:00,2019-10-04T10:01:30.000-07:00,APPROVED,,debt_consolidation,023xx,MA,,2019-10-04T14:00:00.000-07:00,F,SVP,0,13,9326,12.5,46.4,11.78,0,0.0,2006-10-03T17:00:00.000-07:00,680,684,1,72.0,,2.0,74.0,3,1,19,0,141329,15354.0,45.0,17400,26,132389,14,74.0,0,1,0,0,0,74.0,19,6,3,171479,141329,7438,10,5,8,92,0,0,0,10,123,8,127,3,34100,8,12,0,INDIVIDUAL,,,,4,1,7,10,121202.0,72.1,4,5,2405.0,64.2,1,2,3,0.0,2500.0,,,,,,,,,,,,,CASH,6,
35,159871416,199743671,25000.0,12750.0,36,23.05,12.86,1.52,968.4,D,D3,192.0,RENT,70000.0,NOT_VERIFIED,2019-10-04T13:54:58.000-07:00,2019-11-03T17:00:00.000-08:00,2019-10-04T18:00:00.000-07:00,2019-10-04T13:48:36.000-07:00,2019-10-04T13:57:49.000-07:00,APPROVED,,debt_consolidation,940xx,CA,,2019-10-04T18:00:00.000-07:00,F,office manager,0,3,7230,71.4,77.3,34.0,0,0.0,1999-03-03T16:00:00.000-08:00,700,704,0,,,7.0,,13,0,15,0,48805,28562.0,65.4,31900,20,30600,17,,0,0,0,0,0,,15,1,7,74300,48805,3254,7,7,7,100,0,0,0,3,69,12,247,7,43700,12,14,0,INDIVIDUAL,,,,0,0,1,16,20243.0,66.2,1,2,7609.0,65.7,1,0,1,0.0,,,,,,,,,,,,,,CASH,1,
38,159766373,199454160,30000.0,12775.0,36,20.55,12.86,1.52,1123.34,D,D2,36.0,RENT,130000.0,NOT_VERIFIED,2019-10-02T16:57:01.000-07:00,2019-11-02T14:00:00.000-07:00,2019-10-03T14:00:00.000-07:00,2019-10-02T16:48:42.000-07:00,2019-10-03T09:56:06.000-07:00,APPROVED,,debt_consolidation,774xx,TX,,2019-10-03T14:00:00.000-07:00,F,Owner,0,4,20586,9.1,48.5,9.53,0,0.0,2011-03-01T16:00:00.000-08:00,705,709,2,,,1.0,,2,0,14,0,35861,19414.0,46.7,40000,17,28064,14,,0,0,0,0,0,,14,1,2,69664,35861,2562,12,4,11,100,0,0,0,3,45,4,103,2,41600,4,13,0,INDIVIDUAL,,,,1,0,1,22,16447.0,58.6,1,3,8588.0,51.5,6,0,8,0.0,,,,,,,,,,,,,,CASH,1,


# script version

In [3]:
%%writefile ../../lendingclub/data_and_eval_preparation/06_data_and_eval_preparation.py
'''
renames loan_info to match what is received through the api
makes a scaled pmt_history and other various targets for evaluation of models
'''

import os

import numpy as np
# %load ../../lendingclub/data_and_eval_preparation/data_and_eval_preparation.py
import pandas as pd
from tqdm import tqdm

# import gspread
# from google.oauth2 import service_account
# from google.auth.transport.requests import AuthorizedSession
import j_utils.munging as mg
import lendingclub.config as config
import lendingclub.investing.investing_utils as investing_utils
# custom imports
import user_creds.account_info as acc_info

# set paths
ppath = config.prj_dir
dpath = config.data_dir

# load in dataframes
loan_info = pd.read_feather(os.path.join(dpath, 'clean_loan_info.fth'))
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history.fth'))
strings = pd.read_feather(os.path.join(dpath, 'strings_loan_info.fth'))
strings = strings[strings['id'].isin(loan_info['id'])]

# sort rows by loan_id (and date)
loan_info = loan_info.sort_values('id')
pmt_hist = pmt_hist.sort_values(['loan_id', 'date'])
strings = strings.sort_values('id')

# rename loan_id to id to match what comes through API
pmt_hist = pmt_hist.rename({'loan_id': 'id'}, axis=1)

# check how fields come in through API _______________________________________
# constants and setup for various accounts and APIs
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)

# checking the fields from csv vs API
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

# rename some loan_info fields to match those coming through api
licsv_to_api_rename_dict = {
    'acc_open_past_24mths':'acc_open_past_24_mths',
    'zip_code': 'addr_zip',
    'delinq_2yrs': 'delinq_2_yrs',
    'funded_amnt': 'funded_amount',
    'il_util': 'i_l_util',
    'inq_last_6mths': 'inq_last_6_mths',
    # 'installment_at_funded': 'installment',
    'verification_status': 'is_inc_v',
    'verification_status_joint': 'is_inc_v_joint',
    'loan_amnt': 'loan_amount',
    'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
    'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
    'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
}
loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)

# SAVE this version of loan info
loan_info.reset_index(drop=True, inplace=True)
loan_info.to_feather(os.path.join(dpath, 'clean_loan_info_api_name_matched.fth'))

# split loan info into dataframes for training off of and evaluating__________
eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time',
             'maturity_time_stat_adj', 'maturity_paid_stat_adj',
             'rem_to_be_paid', 'roi_simple', 'target_loose',
             'target_strict', 'loan_status', 'id']
strb_flds = ['desc', 'emp_title', 'id']
base_loan_info = loan_info[list(common_flds)]
eval_loan_info = loan_info[eval_flds + ['grade']]
str_loan_info = strings[strb_flds]

# SAVE
base_loan_info.to_feather(os.path.join(dpath, 'base_loan_info.fth'))
# save it at bottom of script
# eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
# str_loan_info.reset_index(drop=True, inplace=True)
# str_loan_info.to_feather(os.path.join(dpath, 'str_loan_info.fth'))

# make a version of pmt_history where each loan is scaled to be equal size____
pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]
loan_funded_amts = loan_info.set_index('id')['funded_amount'].to_dict()
loan_dollar_cols = [
    'outs_princp_beg',
    'princp_paid',
    'int_paid',
    'fee_paid',
    'amt_due',
    'amt_paid',
    'outs_princp_end',
    'charged_off_amt',
    'monthly_pmt',
    'recovs',
    'recov_fees',
    'all_cash_to_inv', ]
id_grouped = pmt_hist.groupby('id', sort=False)
funded_amts = []
for ids, group in tqdm(id_grouped):
    funded_amt = loan_funded_amts[ids]
    funded_amts.extend([funded_amt]*len(group))
for col in loan_dollar_cols:
    pmt_hist[col] = pmt_hist[col]/funded_amts


# make npv_rois (using various discount rates and actual/known cashflows)_____
interesting_cols_over_time = [
    'outs_princp_beg',
    'all_cash_to_inv',
    'date',
    'fico_last',
    'm_on_books',
    'status_period_end',
    'id',
]
pmt_hist = pmt_hist[interesting_cols_over_time]
npv_roi_holder = {}
disc_rates = np.arange(.05, .36, .01)
id_grouped = pmt_hist.groupby('id')
for ids, group in tqdm(id_grouped):
    npv_roi_dict = {}
    funded = group.iat[0, 0]
    cfs = [-funded] + group['all_cash_to_inv'].tolist()
    for rate in disc_rates:
        npv_roi_dict[rate] = np.npv(rate/12, cfs)/funded
    npv_roi_holder[ids] = npv_roi_dict

npv_roi_df = pd.DataFrame(npv_roi_holder).T
npv_roi_df.columns = npv_roi_df.columns.values.round(2)
npv_roi_df.index.name = 'id'
npv_roi_df.reset_index(inplace=True)

eval_loan_info = pd.merge(eval_loan_info, npv_roi_df, how='left', on='id')
# some current loans I have no target_strict for and were not in pmt history.
# Fill with negatives on npv_roi.
eval_loan_info['target_strict'] = eval_loan_info['target_strict'].fillna(0)
eval_loan_info.fillna(-1, inplace=True)


# SAVE
pmt_hist.reset_index(drop=True, inplace=True)
_, pmt_hist = mg.reduce_memory(pmt_hist)
pmt_hist.to_feather(os.path.join(dpath, 'scaled_pmt_hist.fth'))

# SAVE
# feather must have string column names
eval_loan_info.columns = [str(col) for col in eval_loan_info.columns]
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))

Overwriting ../../lendingclub/data_and_eval_preparation/06_data_and_eval_preparation.py


In [None]:
# acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
#     str(inv_acc_id) + '/summary'
# order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
#     str(inv_acc_id) + '/orders'
# creds = service_account.Credentials.from_service_account_file(os.path.join(ppath, 'user_creds', 'credentials.json'))
# scope = ['https://spreadsheets.google.com/feeds']
# creds = creds.with_scopes(scope)
# gc = gspread.Client(auth=creds)
# gc.session = AuthorizedSession(creds)
# sheet = gc.open_by_key(invest_ss_key).sheet1
# sheetins = gc.open_by_key(investins_ss_key).sheet1

In [21]:
%%writefile ../../scripts/data_and_eval_preparation/data_and_eval_preparation.py
import pandas as pd
import numpy as np
import sys
import os
from tqdm import tqdm
import requests
import datetime
import gspread
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
# custom imports
sys.path.append(os.path.join(os.path.expanduser('~'), 'projects'))
import j_utils.munging as mg
import lendingclub.user_creds.account_info as acc_info
import lendingclub.scripts.investing.investing_utils as investing_utils

# set paths
ppath = os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', )
dpath = os.path.join(ppath,'data')

# load in dataframes
loan_info = pd.read_feather(os.path.join(dpath, 'loan_info.fth'))
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history_3.fth'))
strings = pd.read_feather(os.path.join(dpath, 'strings_loan_info_df.fth'))
strings = strings[strings['id'].isin(loan_info['id'])]

# sort rows by loan_id (and date)
loan_info.sort_values('id', inplace=True)
pmt_hist.sort_values(['loan_id', 'date'], inplace=True)
strings.sort_values('id', inplace=True)

# rename loan_id to id to match what comes through API
pmt_hist.rename({'loan_id': 'id'}, axis=1, inplace = True)

# check how fields come in through API _______________________________________
# constants and setup for various accounts and APIs
now = datetime.datetime.now()
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
my_gmail_account = acc_info.from_email_throwaway
my_gmail_password = acc_info.password_throwaway+'!@'
my_recipients = acc_info.to_emails_throwaway
invest_ss_key = acc_info.invest_ss_key
investins_ss_key = acc_info.investins_ss_key
header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}
acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/summary'
order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/orders'
creds = service_account.Credentials.from_service_account_file(os.path.join(ppath, 'user_creds', 'credentials.json'))
scope = ['https://spreadsheets.google.com/feeds']
creds = creds.with_scopes(scope)
gc = gspread.Client(auth=creds)
gc.session = AuthorizedSession(creds)
sheet = gc.open_by_key(invest_ss_key).sheet1
sheetins = gc.open_by_key(investins_ss_key).sheet1

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)

# checking the fields from csv vs API
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

# rename some loan_info fields to match those coming through api
licsv_to_api_rename_dict = {
    'acc_open_past_24mths':'acc_open_past_24_mths',
    'zip_code': 'addr_zip',
    'delinq_2yrs': 'delinq_2_yrs',
    'funded_amnt': 'funded_amount',
    'il_util': 'i_l_util',
    'inq_last_6mths': 'inq_last_6_mths',
#     'installment_at_funded': 'installment',
    'verification_status': 'is_inc_v',
    'verification_status_joint': 'is_inc_v_joint',
    'loan_amnt': 'loan_amount',
    'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
    'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
    'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
}
loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)

# save this version of loan info
loan_info.reset_index(drop=True, inplace=True)
loan_info.to_feather(os.path.join(dpath, 'loan_info_api_name_matched.fth'))

# split loan info into dataframes for training off of and evaluating__________
eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time', 'maturity_time_stat_adj', 'maturity_paid_stat_adj', 'rem_to_be_paid', 'roi_simple',
             'target_loose', 'target_strict', 'loan_status', 'id']
strb_flds = ['desc', 'emp_title', 'id']
base_loan_info = loan_info[list(common_flds)]
eval_loan_info = loan_info[eval_flds]
str_loan_info = strings[strb_flds]

# save
base_loan_info.to_feather(os.path.join(dpath, 'base_loan_info.fth'))
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
str_loan_info.reset_index(drop=True, inplace=True)
str_loan_info.to_feather(os.path.join(dpath, 'str_loan_info.fth'))

# make a version of pmt_history where each loan is scaled to be equal size____
pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]
loan_funded_amts = loan_info.set_index('id')['funded_amount'].to_dict()
loan_dollar_cols = [
    'outs_princp_beg',
    'princp_paid',
    'int_paid',
    'fee_paid',
    'amt_due',
    'amt_paid',
    'outs_princp_end',
    'charged_off_amt',
    'monthly_pmt',
    'recovs',
    'recov_fees',
    'all_cash_to_inv', ]
id_grouped = pmt_hist.groupby('id', sort=False)
funded_amts = []
for ids, group in tqdm(id_grouped):
    funded_amt = loan_funded_amts[ids]
    funded_amts.extend([funded_amt]*len(group))    
for col in loan_dollar_cols:
    pmt_hist[col] = pmt_hist[col]/funded_amts
    
# save
pmt_hist.reset_index(drop=True, inplace=True)
_, pmt_hist = mg.reduce_memory(pmt_hist)
pmt_hist.to_feather(os.path.join(dpath,'scaled_pmt_hist.fth'))

# make npv_rois (using various discount rates and actual/known cashflows)_____
interesting_cols_over_time = [
    'outs_princp_beg',
    'all_cash_to_inv',
    'date',
    'fico_last',
    'm_on_books',
    'status_period_end',
    'id',
]
pmt_hist = pmt_hist[interesting_cols_over_time]
npv_roi_holder = {}
disc_rates = np.arange(.05,.36,.01)
id_grouped = pmt_hist.groupby('id')
for ids, group in tqdm(id_grouped):
    npv_roi_dict = {}
    funded = group.iat[0,0]
    cfs = [-funded] + group['all_cash_to_inv'].tolist()
    for rate in disc_rates:
        npv_roi_dict[rate] = np.npv(rate/12, cfs)/funded
    npv_roi_holder[ids] = npv_roi_dict
    
npv_roi_df = pd.DataFrame(npv_roi_holder).T
npv_roi_df.columns = npv_roi_df.columns.values.round(2)
npv_roi_df.index.name = 'id'
npv_roi_df.reset_index(inplace=True)

eval_loan_info = pd.merge(eval_loan_info, npv_roi_df, how='left', on='id')
# some current loans I have no target_strict for and were not in pmt history.
# Fill with negatives on npv_roi.
eval_loan_info['target_strict'] = eval_loan_info['target_strict'].fillna(0)
eval_loan_info.fillna(-1, inplace=True)

# save
# feather must have string column names
eval_loan_info.columns = [str(col) for col in eval_loan_info.columns]
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))

Overwriting ../../scripts/data_and_eval_preparation/data_and_eval_preparation.py


# notebook version

In [1]:
# this is run right after clean_loan_info

In [2]:
pd.options.display.max_columns = 999
pd.options.display.max_rows = 60
pd.options.display.max_seq_items = None

In [3]:
import sys
import os
from tqdm import tqdm
sys.path.append(os.path.join(os.path.expanduser('~'), 'projects'))
import j_utils.munging as mg

ppath = os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', )
dpath = os.path.join(ppath,'data')

In [4]:
loan_info = pd.read_feather(os.path.join(dpath, 'loan_info.fth'))
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history_3.fth'))
strings = pd.read_feather(os.path.join(dpath, 'strings_loan_info_df.fth'))
strings = strings[strings['id'].isin(loan_info['id'])]

In [5]:
loan_info.sort_values('id', inplace=True)
pmt_hist.sort_values(['loan_id', 'date'], inplace=True)
strings.sort_values('id', inplace=True)

In [6]:
# rename loan_id to id to match what comes through API
pmt_hist.rename({'loan_id': 'id'}, axis=1, inplace = True)

In [7]:
print('loan_info shape: {0}\n pmt_hist shape: {1}\n strings shape: {2}'.format(loan_info.shape, pmt_hist.shape, strings.shape))

loan_info shape: (2376196, 160)
 pmt_hist shape: (47284665, 40)
 strings shape: (2376196, 12)


# rename fields to match what comes through api, save

In [10]:
import requests
import lendingclub.user_creds.account_info as acc_info
import lendingclub.scripts.investing.investing_utils as investing_utils

# constants and setup for various accounts and APIs
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
my_gmail_account = acc_info.from_email_throwaway
my_gmail_password = acc_info.password_throwaway+'!@'
my_recipients = acc_info.to_emails_throwaway
invest_ss_key = acc_info.invest_ss_key
investins_ss_key = acc_info.investins_ss_key

header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}

#lendingclub api urls
acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/summary'
order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/orders'

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)

In [13]:
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

In [15]:
# api_flds_not_in_licsv
# licsv_flds_not_in_api
# api_loans[['service_fee_rate']]
# loan_info['loan']

In [33]:
licsv_to_api_rename_dict = {
    'acc_open_past_24mths':'acc_open_past_24_mths',
    'zip_code': 'addr_zip',
    'delinq_2yrs': 'delinq_2_yrs',
    'funded_amnt': 'funded_amount',
    'il_util': 'i_l_util',
    'inq_last_6mths': 'inq_last_6_mths',
    'installment_at_funded': 'installment',
    'verification_status': 'is_inc_v',
    'verification_status_joint': 'is_inc_v_joint',
    'loan_amnt': 'loan_amount',
    'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
    'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
    'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
}

In [34]:
# rename loan_info columns to match api columns
loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)

In [35]:
# save renamed loan_info
loan_info.reset_index(drop=True, inplace=True)
loan_info.to_feather(os.path.join(dpath, 'loan_info_api_name_matched.fth'))

In [42]:
#2019-08-19 11:08:56, a list of api_flds not being used in model
new_licsv_flds = set(loan_info.columns)
unused_api_flds = api_flds.difference(new_licsv_flds)
unused_api_flds

{'accept_d',
 'credit_pull_d',
 'desc',
 'disbursement_method',
 'emp_title',
 'exp_d',
 'exp_default_rate',
 'housing_payment',
 'ils_exp_d',
 'investor_count',
 'list_d',
 'member_id',
 'mtg_payment',
 'review_status',
 'review_status_d',
 'service_fee_rate'}

In [44]:
# can compare to the raw loan_info_csvs and make sure none of the
# above are used
# raw_loan_info = pd.read_feather('../../data/raw_loan_info.fth')

# Cut loan_info to api fields

In [51]:
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

In [172]:
eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time', 'maturity_time_stat_adj', 'maturity_paid_stat_adj', 'rem_to_be_paid', 'roi_simple',
             'target_loose', 'target_strict', 'loan_status', 'id', 'grade', 'int_rate', 'term']
strb_flds = ['desc', 'emp_title', 'id']

In [173]:
base_loan_info = loan_info[list(common_flds)]#+['id']
eval_loan_info = loan_info[eval_flds]#+['id']
str_loan_info = strings[strb_flds]#+['id']

In [174]:
print(base_loan_info.shape, eval_loan_info.shape, str_loan_info.shape)

(2376196, 103) (2376196, 15) (2376196, 3)


In [61]:
# save
base_loan_info.to_feather(os.path.join(dpath, 'base_loan_info.fth'))
# eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
str_loan_info.reset_index(drop=True, inplace=True)
str_loan_info.to_feather(os.path.join(dpath, 'str_loan_info.fth'))

# scale pmt_hist to be independent of loan_size (so we can treat loans as us investing the same amount in each)

In [24]:
pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]
loan_funded_amts = loan_info.set_index('id')['funded_amount'].to_dict()
loan_dollar_cols = [
    'outs_princp_beg',
    'princp_paid',
    'int_paid',
    'fee_paid',
    'amt_due',
    'amt_paid',
    'outs_princp_end',
    'charged_off_amt',
    'monthly_pmt',
    'recovs',
    'recov_fees',
    'all_cash_to_inv', ]
id_grouped = pmt_hist.groupby('id', sort=False)
funded_amts = []
for ids, group in tqdm(id_grouped):
    funded_amt = loan_funded_amts[ids]
    funded_amts.extend([funded_amt]*len(group))    
for col in loan_dollar_cols:
    pmt_hist[col] = pmt_hist[col]/funded_amts

100%|██████████| 2376196/2376196 [09:48<00:00, 4038.02it/s]


In [25]:
pmt_hist.reset_index(drop=True, inplace=True)
_, pmt_hist = mg.reduce_memory(pmt_hist)
pmt_hist.to_feather(os.path.join(dpath,'scaled_pmt_hist.fth'))

trying to change columns to smaller dtypes when possible
original dataframe is 25592.282068252563 MB or 24.992462957277894 GB


100%|██████████| 13/13 [00:14<00:00,  1.26s/it]


changed dtypes of 13 cols
reduced dataframe is 23247.607069015503 MB or 22.702741278335452 GB


# make npv_rois and add to eval_loan_info

In [62]:
pmt_hist = pd.read_feather(os.path.join(dpath,'scaled_pmt_hist.fth'))

In [63]:
interesting_cols_over_time = [
    'outs_princp_beg',
    'all_cash_to_inv',
    'date',
    'fico_last',
    'm_on_books',
    'status_period_end',
    'id',
]
pmt_hist = pmt_hist[interesting_cols_over_time]

In [64]:
npv_roi_holder = {}
disc_rates = np.arange(.05,.36,.01)
id_grouped = pmt_hist.groupby('id')
for ids, group in tqdm(id_grouped):
    npv_roi_dict = {}
    funded = group.iat[0,0]
    cfs = [-funded] + group['all_cash_to_inv'].tolist()
    for rate in disc_rates:
        npv_roi_dict[rate] = np.npv(rate/12, cfs)/funded
    npv_roi_holder[ids] = npv_roi_dict

100%|██████████| 2376196/2376196 [27:33<00:00, 1436.93it/s]


In [65]:
npv_roi_df = pd.DataFrame(npv_roi_holder).T
npv_roi_df.columns = npv_roi_df.columns.values.round(2)
npv_roi_df.index.name = 'id'
npv_roi_df.reset_index(inplace=True)

In [66]:
npv_roi_df.head()

Unnamed: 0,id,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.3,0.31,0.32,0.33,0.34,0.35
0,54734,0.095902,0.081254,0.066865,0.052729,0.038841,0.025198,0.011792,-0.001379,-0.014321,-0.027038,-0.039534,-0.051815,-0.063884,-0.075745,-0.087403,-0.098861,-0.110124,-0.121195,-0.132078,-0.142776,-0.153294,-0.163634,-0.1738,-0.183796,-0.193624,-0.203289,-0.212792,-0.222138,-0.231329,-0.240368,-0.249259
1,55521,0.13769,0.124301,0.111112,0.098121,0.085324,0.072718,0.060299,0.048065,0.036013,0.024139,0.012441,0.000915,-0.010441,-0.02163,-0.032654,-0.043517,-0.054222,-0.06477,-0.075164,-0.085407,-0.095501,-0.105449,-0.115253,-0.124916,-0.134439,-0.143825,-0.153077,-0.162196,-0.171184,-0.180044,-0.188778
2,55716,0.274963,0.259472,0.244227,0.229225,0.214461,0.19993,0.185629,0.171554,0.157699,0.144063,0.13064,0.117426,0.104419,0.091615,0.079009,0.0666,0.054382,0.042352,0.030509,0.018847,0.007364,-0.003943,-0.015077,-0.026041,-0.036838,-0.047472,-0.057944,-0.068258,-0.078416,-0.088421,-0.098276
3,55742,0.08777,0.071643,0.055847,0.040373,0.025214,0.010362,-0.00419,-0.018448,-0.03242,-0.046112,-0.059532,-0.072685,-0.085577,-0.098215,-0.110604,-0.122751,-0.13466,-0.146337,-0.157788,-0.169018,-0.180031,-0.190833,-0.201429,-0.211822,-0.222018,-0.232022,-0.241836,-0.251467,-0.260917,-0.270191,-0.279292
4,56121,-0.385032,-0.390533,-0.395965,-0.40133,-0.406627,-0.411859,-0.417025,-0.422128,-0.427167,-0.432145,-0.437061,-0.441916,-0.446713,-0.45145,-0.45613,-0.460753,-0.46532,-0.469831,-0.474288,-0.478691,-0.483042,-0.48734,-0.491586,-0.495782,-0.499928,-0.504025,-0.508072,-0.512072,-0.516025,-0.519931,-0.523792


In [163]:
# save npv_roi_df
npv_roi_df.columns = [str(col) for col in npv_roi_df.columns]
npv_roi_df.to_feather(os.path.join(dpath,'npv_roi_df.fth'))

In [175]:
eval_loan_info = pd.merge(eval_loan_info, npv_roi_df, how='left', on='id')

In [176]:
# some current loans I have no target_strict for and were not in pmt history. Fill with
# negatives on npv_roi.
eval_loan_info['target_strict'] = eval_loan_info['target_strict'].fillna(0)
eval_loan_info.fillna(-1, inplace=True)

In [177]:
eval_loan_info.head()

Unnamed: 0,end_d,issue_d,maturity_paid,maturity_time,maturity_time_stat_adj,maturity_paid_stat_adj,rem_to_be_paid,roi_simple,target_loose,target_strict,loan_status,id,grade,int_rate,term,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.3,0.31,0.32,0.33,0.34,0.35
0,2011-10-01,2009-08-01,1.0,1.0,1.0,1.0,0.0,1.173214,0,0,paid,54734,B,0.1189,36,0.095902,0.081254,0.066865,0.052729,0.038841,0.025198,0.011792,-0.001379,-0.014321,-0.027038,-0.039534,-0.051815,-0.063884,-0.075745,-0.087403,-0.098861,-0.110124,-0.121195,-0.132078,-0.142776,-0.153294,-0.163634,-0.1738,-0.183796,-0.193624,-0.203289,-0.212792,-0.222138,-0.231329,-0.240368,-0.249259
1,2010-03-01,2008-07-01,1.0,1.0,1.0,1.0,0.0,1.207769,0,0,paid,55521,F,0.1608,36,0.13769,0.124301,0.111112,0.098121,0.085324,0.072718,0.060299,0.048065,0.036013,0.024139,0.012441,0.000915,-0.010441,-0.02163,-0.032654,-0.043517,-0.054222,-0.06477,-0.075164,-0.085407,-0.095501,-0.105449,-0.115253,-0.124916,-0.134439,-0.143825,-0.153077,-0.162196,-0.171184,-0.180044,-0.188778
2,2018-06-01,2016-08-01,1.0,0.944444,1.0,1.0,0.0,1.353502,0,0,paid,55716,E,0.2499,36,0.274963,0.259472,0.244227,0.229225,0.214461,0.19993,0.185629,0.171554,0.157699,0.144063,0.13064,0.117426,0.104419,0.091615,0.079009,0.0666,0.054382,0.042352,0.030509,0.018847,0.007364,-0.003943,-0.015077,-0.026041,-0.036838,-0.047472,-0.057944,-0.068258,-0.078416,-0.088421,-0.098276
3,2011-06-01,2008-05-01,1.0,1.0,1.0,1.0,0.0,1.173648,0,0,paid,55742,B,0.1071,36,0.08777,0.071643,0.055847,0.040373,0.025214,0.010362,-0.00419,-0.018448,-0.03242,-0.046112,-0.059532,-0.072685,-0.085577,-0.098215,-0.110604,-0.122751,-0.13466,-0.146337,-0.157788,-0.169018,-0.180031,-0.190833,-0.201429,-0.211822,-0.222018,-0.232022,-0.241836,-0.251467,-0.260917,-0.270191,-0.279292
4,2018-04-01,2016-01-01,0.583209,1.0,1.0,1.0,3677.225098,0.643185,1,1,charged_off,56121,A,0.0649,36,-0.385032,-0.390533,-0.395965,-0.40133,-0.406627,-0.411859,-0.417025,-0.422128,-0.427167,-0.432145,-0.437061,-0.441916,-0.446713,-0.45145,-0.45613,-0.460753,-0.46532,-0.469831,-0.474288,-0.478691,-0.483042,-0.48734,-0.491586,-0.495782,-0.499928,-0.504025,-0.508072,-0.512072,-0.516025,-0.519931,-0.523792


In [178]:
# feather must have string column names
eval_loan_info.columns = [str(col) for col in eval_loan_info.columns]
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))

# make the training/test datasets

In [161]:
# see gen_datasets in lc_utils

In [162]:
# print(eval_loan_info.shape, base_loan_info.shape, str_loan_info.shape)

# # create a dict of test_ids for up to 24 mths?
# max_date = eval_loan_info['issue_d'].max()

# train_test_ids_dict = {}
# for month in range(1,25):
#     test_ids = set(eval_loan_info[eval_loan_info['issue_d'] >= max_date+pd.DateOffset(months=-month)]['id'])
#     train_ids = set(eval_loan_info[~eval_loan_info['id'].isin(test_ids)]['id'])
#     train_test_ids_dict[month] = (train_ids, test_ids)

# train_ids, test_ids = train_test_ids_dict[1]

# train = eval_loan_info[eval_loan_info['id'].isin(train_ids)]
# test = eval_loan_info[eval_loan_info['id'].isin(test_ids)]

# import pickle

# # save
# with open(os.path.join(dpath, 'train_test_ids_dict.pkl'), 'wb') as handle:
#     pickle.dump(train_test_ids_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# set(train['id']).intersection(set(test['id']))

# from lendingclub.lc_utils import gen_datasets

# train_x, train_y, valid_x, valid_y, train_ids, valid_ids = gen_datasets('2016-01-01', '2016-02-01', base_loan_info, eval_loan_info, '0.07')

# train_x.shape

# look at the dfs

In [34]:
base_loan_info.head()

Unnamed: 0,num_tl_30dpd,mths_since_last_major_derog,annual_inc,open_act_il,num_rev_accts,is_inc_v,num_tl_op_past_12m,mo_sin_old_rev_tl_op,sec_app_inq_last_6_mths,total_cu_tl,sec_app_chargeoff_within_12_mths,percent_bc_gt_75,pct_tl_nvr_dlq,num_op_rev_tl,inq_last_6_mths,application_type,total_bal_ex_mort,int_rate,home_ownership,max_bal_bc,i_l_util,sec_app_fico_range_high,sec_app_collections_12_mths_ex_med,sec_app_open_act_il,delinq_2_yrs,open_rv_12m,delinq_amnt,mths_since_rcnt_il,mo_sin_rcnt_rev_tl_op,earliest_cr_line,addr_state,purpose,total_il_high_credit_limit,mths_since_recent_revol_delinq,bc_open_to_buy,mths_since_last_record,mths_since_last_delinq,initial_list_status,revol_util,sec_app_earliest_cr_line,dti,inq_last_12m,acc_now_delinq,avg_cur_bal,total_acc,id,num_tl_90g_dpd_24m,fico_range_high,sec_app_num_rev_accts,open_acc,open_il_24m,num_rev_tl_bal_gt_0,term,collections_12_mths_ex_med,num_actv_rev_tl,open_il_12m,sec_app_mths_since_last_major_derog,num_sats,num_bc_sats,pub_rec_bankruptcies,pub_rec,mths_since_recent_bc,dti_joint,chargeoff_within_12_mths,tot_hi_cred_lim,acc_open_past_24_mths,funded_amount,tax_liens,mo_sin_old_il_acct,bc_util,all_util,sec_app_fico_range_low,annual_inc_joint,sec_app_mort_acc,revol_bal,grade,num_accts_ever_12_0_ppd,num_actv_bc_tl,sec_app_revol_util,total_rev_hi_lim,mo_sin_rcnt_tl,mths_since_recent_inq,mths_since_recent_bc_dlq,tot_cur_bal,num_tl_12_0dpd_2m,total_bal_il,sec_app_open_acc,inq_fi,emp_length,revol_bal_joint,tot_coll_amt,mort_acc,total_bc_limit,open_acc_6m,addr_zip,num_il_tl,num_bc_tl,sub_grade,open_rv_24m,fico_range_low,loan_amount,is_inc_v_joint
0,,,85000.0,,,platform,,,,,,,,,0.0,individual,,0.1189,rent,,,,,,0.0,,0.0,,,1994-02-01,CA,debt_consolidation,,,,,,f,0.521,NaT,0.1948,,0.0,,42.0,54734,,739.0,,10.0,,,36,0.0,,,,,,0.0,0.0,,,0.0,,,25000.0,0.0,,,,,,,28854.0,B,,,,,,,,,,,,,< 1 year,,,,,,941xx,,,B4,,735.0,25000.0,
1,,,30000.0,,,none,,,,,,,,,1.0,individual,,0.1608,rent,,,,,,0.0,,0.0,,,2001-08-01,IL,debt_consolidation,,,,,,f,0.904,NaT,0.2384,,0.0,,15.0,55521,,644.0,,9.0,,,36,0.0,,,,,,0.0,0.0,,,0.0,,,1000.0,0.0,,,,,,,10125.0,F,,,,,,,,,,,,,< 1 year,,,,,,600xx,,,F2,,640.0,1000.0,
2,0.0,31.0,30784.0,1.0,25.0,none,4.0,127.0,,4.0,,0.857,0.69,13.0,4.0,individual,8351.0,0.2499,rent,1158.0,0.08,,,,0.0,4.0,0.0,50.0,7.0,2003-04-01,WA,debt_consolidation,5929.0,,907.0,28.0,,w,0.74,NaT,0.1696,6.0,0.0,596.0,36.0,55716,0.0,669.0,,14.0,0.0,13.0,36,0.0,13.0,0.0,,14.0,14.0,1.0,1.0,7.0,,0.0,16579.0,13.0,3500.0,0.0,160.0,0.79,0.74,,,,7849.0,E,1.0,7.0,,10650.0,7.0,2.0,,8351.0,,502.0,,2.0,< 1 year,,0.0,0.0,6350.0,0.0,984xx,11.0,17.0,E4,13.0,665.0,3500.0,
3,,,65000.0,,,none,,,,,,,,,0.0,individual,,0.1071,rent,,,,,,0.0,,0.0,,,2000-10-01,NY,credit_card,,,,,,f,0.767,NaT,0.1429,,0.0,,7.0,55742,,709.0,,7.0,,,36,0.0,,,,,,0.0,0.0,,,0.0,,,7000.0,0.0,,,,,,,33623.0,B,,,,,,,,,,,,,< 1 year,,,,,,112xx,,,B5,,705.0,7000.0,
4,0.0,,65000.0,2.0,3.0,none,1.0,35.0,,0.0,,0.0,1.0,2.0,0.0,individual,20302.0,0.0649,mortgage,1569.0,0.73,,,,0.0,0.0,0.0,15.0,16.0,2002-03-01,NE,home_improvement,23822.0,,5488.0,42.0,,w,0.354,NaT,0.1084,1.0,0.0,14826.0,11.0,56121,0.0,709.0,,5.0,1.0,2.0,36,0.0,2.0,0.0,,5.0,2.0,1.0,1.0,16.0,,0.0,87102.0,3.0,8000.0,0.0,166.0,0.354,0.63,,,,3012.0,A,0.0,2.0,,8500.0,12.0,3.0,,74131.0,0.0,17290.0,,2.0,10+ years,,0.0,3.0,8500.0,0.0,686xx,5.0,2.0,A2,1.0,705.0,8000.0,


In [35]:
eval_loan_info.head()

Unnamed: 0,end_d,issue_d,maturity_paid,maturity_time,maturity_time_stat_adj,maturity_paid_stat_adj,rem_to_be_paid,roi_simple,target_loose,target_strict,loan_status,id,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.3,0.31,0.32,0.33,0.34,0.35
0,2011-10-01,2009-08-01,1.0,1.0,-1.0,-1.0,0.0,1.173214,0,0,paid,54734,0.095902,0.081254,0.066865,0.052729,0.038841,0.025198,0.011792,-0.001379,-0.014321,-0.027038,-0.039534,-0.051815,-0.063884,-0.075745,-0.087403,-0.098861,-0.110124,-0.121195,-0.132078,-0.142776,-0.153294,-0.163634,-0.1738,-0.183796,-0.193624,-0.203289,-0.212792,-0.222138,-0.231329,-0.240368,-0.249259
1,2010-03-01,2008-07-01,1.0,1.0,-1.0,-1.0,0.0,1.207769,0,0,paid,55521,0.13769,0.124301,0.111112,0.098121,0.085324,0.072718,0.060299,0.048065,0.036013,0.024139,0.012441,0.000915,-0.010441,-0.02163,-0.032654,-0.043517,-0.054222,-0.06477,-0.075164,-0.085407,-0.095501,-0.105449,-0.115253,-0.124916,-0.134439,-0.143825,-0.153077,-0.162196,-0.171184,-0.180044,-0.188778
2,2018-06-01,2016-08-01,1.0,0.944444,-1.0,-1.0,0.0,1.353502,0,0,paid,55716,0.274963,0.259472,0.244227,0.229225,0.214461,0.19993,0.185629,0.171554,0.157699,0.144063,0.13064,0.117426,0.104419,0.091615,0.079009,0.0666,0.054382,0.042352,0.030509,0.018847,0.007364,-0.003943,-0.015077,-0.026041,-0.036838,-0.047472,-0.057944,-0.068258,-0.078416,-0.088421,-0.098276
3,2011-06-01,2008-05-01,1.0,1.0,-1.0,-1.0,0.0,1.173648,0,0,paid,55742,0.08777,0.071643,0.055847,0.040373,0.025214,0.010362,-0.00419,-0.018448,-0.03242,-0.046112,-0.059532,-0.072685,-0.085577,-0.098215,-0.110604,-0.122751,-0.13466,-0.146337,-0.157788,-0.169018,-0.180031,-0.190833,-0.201429,-0.211822,-0.222018,-0.232022,-0.241836,-0.251467,-0.260917,-0.270191,-0.279292
4,2018-04-01,2016-01-01,0.583209,1.0,1.0,1.0,3677.225098,0.643185,1,1,charged_off,56121,-0.385032,-0.390533,-0.395965,-0.40133,-0.406627,-0.411859,-0.417025,-0.422128,-0.427167,-0.432145,-0.437061,-0.441916,-0.446713,-0.45145,-0.45613,-0.460753,-0.46532,-0.469831,-0.474288,-0.478691,-0.483042,-0.48734,-0.491586,-0.495782,-0.499928,-0.504025,-0.508072,-0.512072,-0.516025,-0.519931,-0.523792


In [36]:
str_loan_info.head()

Unnamed: 0,desc,emp_title,id
0,Due to a lack of personal finance education an...,,54734
1,Looking to sure up a few debts for consolidati...,best buy,55521
2,I currently have a loan out with CashCall. The...,receptionist,55716
3,Just want to pay off the last bit of credit ca...,cnn,55742
4,I recently married and since this was the seco...,maintenance,56121


In [37]:
pmt_hist.head()

Unnamed: 0,outs_princp_beg,all_cash_to_inv,date,fico_last,m_on_books,status_period_end,id
0,1.0,0.033164,2009-09-01,757,1,current,54734
1,0.976748,0.033164,2009-10-01,757,2,current,54734
2,0.953266,0.033164,2009-11-01,787,3,current,54734
3,0.929551,0.033164,2009-12-01,782,4,current,54734
4,0.9056,0.033164,2010-01-01,802,5,current,54734
