# script version

In [21]:
%%writefile ../../scripts/data_and_eval_preparation/data_and_eval_preparation.py
import pandas as pd
import numpy as np
import sys
import os
from tqdm import tqdm
import requests
import datetime
import gspread
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
# custom imports
sys.path.append(os.path.join(os.path.expanduser('~'), 'projects'))
import j_utils.munging as mg
import lendingclub.user_creds.account_info as acc_info
import lendingclub.scripts.investing.investing_utils as investing_utils

# set paths
ppath = os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', )
dpath = os.path.join(ppath,'data')

# load in dataframes
loan_info = pd.read_feather(os.path.join(dpath, 'loan_info.fth'))
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history_3.fth'))
strings = pd.read_feather(os.path.join(dpath, 'strings_loan_info_df.fth'))
strings = strings[strings['id'].isin(loan_info['id'])]

# sort rows by loan_id (and date)
loan_info.sort_values('id', inplace=True)
pmt_hist.sort_values(['loan_id', 'date'], inplace=True)
strings.sort_values('id', inplace=True)

# rename loan_id to id to match what comes through API
pmt_hist.rename({'loan_id': 'id'}, axis=1, inplace = True)

# check how fields come in through API _______________________________________
# constants and setup for various accounts and APIs
now = datetime.datetime.now()
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
my_gmail_account = acc_info.from_email_throwaway
my_gmail_password = acc_info.password_throwaway+'!@'
my_recipients = acc_info.to_emails_throwaway
invest_ss_key = acc_info.invest_ss_key
investins_ss_key = acc_info.investins_ss_key
header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}
acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/summary'
order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/orders'
creds = service_account.Credentials.from_service_account_file(os.path.join(ppath, 'user_creds', 'credentials.json'))
scope = ['https://spreadsheets.google.com/feeds']
creds = creds.with_scopes(scope)
gc = gspread.Client(auth=creds)
gc.session = AuthorizedSession(creds)
sheet = gc.open_by_key(invest_ss_key).sheet1
sheetins = gc.open_by_key(investins_ss_key).sheet1

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)

# checking the fields from csv vs API
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

# rename some loan_info fields to match those coming through api
licsv_to_api_rename_dict = {
    'acc_open_past_24mths':'acc_open_past_24_mths',
    'zip_code': 'addr_zip',
    'delinq_2yrs': 'delinq_2_yrs',
    'funded_amnt': 'funded_amount',
    'il_util': 'i_l_util',
    'inq_last_6mths': 'inq_last_6_mths',
#     'installment_at_funded': 'installment',
    'verification_status': 'is_inc_v',
    'verification_status_joint': 'is_inc_v_joint',
    'loan_amnt': 'loan_amount',
    'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
    'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
    'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
}
loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)

# save this version of loan info
loan_info.reset_index(drop=True, inplace=True)
loan_info.to_feather(os.path.join(dpath, 'loan_info_api_name_matched.fth'))

# split loan info into dataframes for training off of and evaluating__________
eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time', 'maturity_time_stat_adj', 'maturity_paid_stat_adj', 'rem_to_be_paid', 'roi_simple',
             'target_loose', 'target_strict', 'loan_status', 'id']
strb_flds = ['desc', 'emp_title', 'id']
base_loan_info = loan_info[list(common_flds)]
eval_loan_info = loan_info[eval_flds]
str_loan_info = strings[strb_flds]

# save
base_loan_info.to_feather(os.path.join(dpath, 'base_loan_info.fth'))
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
str_loan_info.reset_index(drop=True, inplace=True)
str_loan_info.to_feather(os.path.join(dpath, 'str_loan_info.fth'))

# make a version of pmt_history where each loan is scaled to be equal size____
pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]
loan_funded_amts = loan_info.set_index('id')['funded_amount'].to_dict()
loan_dollar_cols = [
    'outs_princp_beg',
    'princp_paid',
    'int_paid',
    'fee_paid',
    'amt_due',
    'amt_paid',
    'outs_princp_end',
    'charged_off_amt',
    'monthly_pmt',
    'recovs',
    'recov_fees',
    'all_cash_to_inv', ]
id_grouped = pmt_hist.groupby('id', sort=False)
funded_amts = []
for ids, group in tqdm(id_grouped):
    funded_amt = loan_funded_amts[ids]
    funded_amts.extend([funded_amt]*len(group))    
for col in loan_dollar_cols:
    pmt_hist[col] = pmt_hist[col]/funded_amts
    
# save
pmt_hist.reset_index(drop=True, inplace=True)
_, pmt_hist = mg.reduce_memory(pmt_hist)
pmt_hist.to_feather(os.path.join(dpath,'scaled_pmt_hist.fth'))

# make npv_rois (using various discount rates and actual/known cashflows)_____
interesting_cols_over_time = [
    'outs_princp_beg',
    'all_cash_to_inv',
    'date',
    'fico_last',
    'm_on_books',
    'status_period_end',
    'id',
]
pmt_hist = pmt_hist[interesting_cols_over_time]
npv_roi_holder = {}
disc_rates = np.arange(.05,.36,.01)
id_grouped = pmt_hist.groupby('id')
for ids, group in tqdm(id_grouped):
    npv_roi_dict = {}
    funded = group.iat[0,0]
    cfs = [-funded] + group['all_cash_to_inv'].tolist()
    for rate in disc_rates:
        npv_roi_dict[rate] = np.npv(rate/12, cfs)/funded
    npv_roi_holder[ids] = npv_roi_dict
    
npv_roi_df = pd.DataFrame(npv_roi_holder).T
npv_roi_df.columns = npv_roi_df.columns.values.round(2)
npv_roi_df.index.name = 'id'
npv_roi_df.reset_index(inplace=True)

eval_loan_info = pd.merge(eval_loan_info, npv_roi_df, how='left', on='id')
# some current loans I have no target_strict for and were not in pmt history.
# Fill with negatives on npv_roi.
eval_loan_info['target_strict'] = eval_loan_info['target_strict'].fillna(0)
eval_loan_info.fillna(-1, inplace=True)

# save
# feather must have string column names
eval_loan_info.columns = [str(col) for col in eval_loan_info.columns]
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))

Overwriting ../../scripts/data_and_eval_preparation/data_and_eval_preparation.py


# notebook version

In [1]:
# this is run right after clean_loan_info

In [2]:
pd.options.display.max_columns = 999
pd.options.display.max_rows = 60
pd.options.display.max_seq_items = None

In [29]:
import sys
import os
from tqdm import tqdm
sys.path.append(os.path.join(os.path.expanduser('~'), 'projects'))
import j_utils.munging as mg

ppath = os.path.join(os.path.expanduser('~'), 'projects', 'lendingclub', )
dpath = os.path.join(ppath,'data')

In [5]:
loan_info = pd.read_feather(os.path.join(dpath, 'loan_info.fth'))
pmt_hist = pd.read_feather(os.path.join(dpath, 'clean_pmt_history_3.fth'))
strings = pd.read_feather(os.path.join(dpath, 'strings_loan_info_df.fth'))
strings = strings[strings['id'].isin(loan_info['id'])]

In [6]:
loan_info.sort_values('id', inplace=True)
pmt_hist.sort_values(['loan_id', 'date'], inplace=True)
strings.sort_values('id', inplace=True)

In [7]:
# rename loan_id to id to match what comes through API
# loan_info.rename({'loan_id': 'id'}, axis=1, inplace=True)
pmt_hist.rename({'loan_id': 'id'}, axis=1, inplace = True)
# strings.rename({'loan_id': 'id'}, axis=1, inplace = True)

In [8]:
print('loan_info shape: {0}\n pmt_hist shape: {1}\n strings shape: {2}'.format(loan_info.shape, pmt_hist.shape, strings.shape))

loan_info shape: (2376196, 160)
 pmt_hist shape: (47284665, 40)
 strings shape: (2376196, 12)


# rename fields to match what comes through api, save

In [12]:
# sys.path.append('/home/justin/lendingclub/investing/')

In [17]:
import requests
# import json
import lendingclub.user_creds.account_info as acc_info
# import re
# from sklearn.externals import joblib
# import lendingclub.dataprep_and_modeling.modeling_utils.data_prep_new as data_prep
import lendingclub.scripts.investing.investing_utils as investing_utils
# from lendingclub.scripts.investing.investing_utils import StandardScalerJustin
# import pandas as pd
# import numpy as np
# import math as math
# import torch
# import pickle as pickle
import datetime
# import smtplib
import gspread
# import google.auth
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession


# constants and setup for various accounts and APIs
now = datetime.datetime.now()
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
my_gmail_account = acc_info.from_email_throwaway
my_gmail_password = acc_info.password_throwaway+'!@'
my_recipients = acc_info.to_emails_throwaway
invest_ss_key = acc_info.invest_ss_key
investins_ss_key = acc_info.investins_ss_key

header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}

acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/summary'
order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/orders'
# min_score = -0.02  # -0.04599714276994965  # -0.035764345824470828
# inv_amt = 25.00
# cash_limit = 0.00
creds = service_account.Credentials.from_service_account_file(os.path.join(ppath, 'user_creds', 'credentials.json'))
scope = ['https://spreadsheets.google.com/feeds']
creds = creds.with_scopes(scope)
gc = gspread.Client(auth=creds)
gc.session = AuthorizedSession(creds)
sheet = gc.open_by_key(invest_ss_key).sheet1
sheetins = gc.open_by_key(investins_ss_key).sheet1

# # First check if I have enough money that I want to invest. min 10 notes so 250
# summary_dict = json.loads(requests.get(
#     acc_summary_url, headers=header).content)
# cash_to_invest = summary_dict['availableCash']

# Load models and things for models
# RF
# rf = investing_utils.load_RF()
# with open(f'{investing_utils.data_save_path}/for_proc_df_model_loading.pkl', 'rb') as handle:
#     nas_all_train, embeddings_all_train, train_cols_meds_all_train, use_cols, cols_all_train, col_cat_dict, mean_stdev_mapper_all_train, dl_df_train, dl_ys_train, cat_vars, emb_szs = pickle.load(handle)
    
# process the dataframe before I'm able to set up the neural net _____________
# wait until it is time to do the api call. I'm rate limited to 1 call a second
# investing_utils.pause_until_time(test=True)

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)



In [18]:
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

In [19]:
# api_flds_not_in_licsv
# licsv_flds_not_in_api
# api_loans[['service_fee_rate']]
# loan_info['loan']

In [20]:
licsv_to_api_rename_dict = {
    'acc_open_past_24mths':'acc_open_past_24_mths',
    'zip_code': 'addr_zip',
    'delinq_2yrs': 'delinq_2_yrs',
    'funded_amnt': 'funded_amount',
    'il_util': 'i_l_util',
    'inq_last_6mths': 'inq_last_6_mths',
#     'installment_at_funded': 'installment',
    'verification_status': 'is_inc_v',
    'verification_status_joint': 'is_inc_v_joint',
    'loan_amnt': 'loan_amount',
    'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
    'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
    'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
}

In [21]:
os.path.join(dpath, 'loan_info_api_name_matched.fth')

'/home/justin/projects/lendingclub/data/loan_info_api_name_matched.fth'

In [22]:
# rename loan_info columns to match api columns
loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)
# save renamed loan_info
# loan_info['id'] = loan_info['id'].astype(int)
loan_info.reset_index(drop=True, inplace=True)
loan_info.to_feather(os.path.join(dpath, 'loan_info_api_name_matched.fth'))

# Cut loan_info to api fields

In [23]:
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

In [24]:
eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time', 'maturity_time_stat_adj', 'maturity_paid_stat_adj', 'rem_to_be_paid', 'roi_simple',
             'target_loose', 'target_strict', 'loan_status', 'id']
strb_flds = ['desc', 'emp_title', 'id']

In [25]:
base_loan_info = loan_info[list(common_flds)]#+['id']
eval_loan_info = loan_info[eval_flds]#+['id']
str_loan_info = strings[strb_flds]#+['id']

In [26]:
print(base_loan_info.shape, eval_loan_info.shape, str_loan_info.shape)
base_loan_info.select_dtypes('object')

(2376196, 102) (2376196, 12) (2376196, 3)


Unnamed: 0,is_inc_v_joint,is_inc_v,addr_zip,purpose,emp_length,home_ownership,addr_state,initial_list_status,sub_grade,application_type,grade
0,,platform,941xx,debt_consolidation,< 1 year,rent,CA,f,B4,individual,B
1,,none,600xx,debt_consolidation,< 1 year,rent,IL,f,F2,individual,F
2,,none,984xx,debt_consolidation,< 1 year,rent,WA,w,E4,individual,E
3,,none,112xx,credit_card,< 1 year,rent,NY,f,B5,individual,B
4,,none,686xx,home_improvement,10+ years,mortgage,NE,w,A2,individual,A
5,,platform,956xx,debt_consolidation,10+ years,mortgage,CA,f,F2,individual,F
6,,source,658xx,debt_consolidation,10+ years,mortgage,MO,w,B3,individual,B
7,,platform,100xx,debt_consolidation,1 year,rent,NY,f,D3,individual,D
8,,none,777xx,debt_consolidation,10+ years,own,TX,f,C2,individual,C
9,,none,067xx,debt_consolidation,6 years,rent,CT,f,C3,individual,C


In [27]:
# save
base_loan_info.to_feather(os.path.join(dpath, 'base_loan_info.fth'))
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))
str_loan_info.reset_index(drop=True, inplace=True)
str_loan_info.to_feather(os.path.join(dpath, 'str_loan_info.fth'))

# scale pmt_hist to be independent of loan_size (so we can treat loans as us investing the same amount in each)

In [30]:
pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]
loan_funded_amts = loan_info.set_index('id')['funded_amount'].to_dict()
loan_dollar_cols = [
    'outs_princp_beg',
    'princp_paid',
    'int_paid',
    'fee_paid',
    'amt_due',
    'amt_paid',
    'outs_princp_end',
    'charged_off_amt',
    'monthly_pmt',
    'recovs',
    'recov_fees',
    'all_cash_to_inv', ]
id_grouped = pmt_hist.groupby('id', sort=False)
funded_amts = []
for ids, group in tqdm(id_grouped):
    funded_amt = loan_funded_amts[ids]
    funded_amts.extend([funded_amt]*len(group))    
for col in loan_dollar_cols:
    pmt_hist[col] = pmt_hist[col]/funded_amts

100%|██████████| 2376196/2376196 [09:38<00:00, 4106.43it/s]


In [31]:
pmt_hist.reset_index(drop=True, inplace=True)
_, pmt_hist = mg.reduce_memory(pmt_hist)
pmt_hist.to_feather(os.path.join(dpath,'scaled_pmt_hist.fth'))

trying to change columns to smaller dtypes when possible
original dataframe is 25592.282068252563 MB or 24.992462957277894 GB


100%|██████████| 13/13 [00:02<00:00,  5.95it/s]


changed dtypes of 13 cols
reduced dataframe is 23247.607069015503 MB or 22.702741278335452 GB


# make npv_rois and add to eval_loan_info

In [42]:
interesting_cols_over_time = [
    'outs_princp_beg',
    'all_cash_to_inv',
    'date',
    'fico_last',
    'm_on_books',
    'status_period_end',
    'id',
]
pmt_hist = pmt_hist[interesting_cols_over_time]

In [43]:
pmt_hist.head()

Unnamed: 0,outs_princp_beg,all_cash_to_inv,date,fico_last,m_on_books,status_period_end,id
0,1.0,0.033164,2009-09-01,757,1,current,54734
1,0.976748,0.033164,2009-10-01,757,2,current,54734
2,0.953266,0.033164,2009-11-01,787,3,current,54734
3,0.929551,0.033164,2009-12-01,782,4,current,54734
4,0.9056,0.033164,2010-01-01,802,5,current,54734


In [44]:
npv_roi_holder = {}
disc_rates = np.arange(.05,.36,.01)
id_grouped = pmt_hist.groupby('id')
for ids, group in tqdm(id_grouped):
    npv_roi_dict = {}
    funded = group.iat[0,0]
    cfs = [-funded] + group['all_cash_to_inv'].tolist()
    for rate in disc_rates:
        npv_roi_dict[rate] = np.npv(rate/12, cfs)/funded
    npv_roi_holder[ids] = npv_roi_dict

100%|██████████| 2376196/2376196 [28:40<00:00, 1381.13it/s]


In [45]:
npv_roi_df = pd.DataFrame(npv_roi_holder).T
npv_roi_df.columns = npv_roi_df.columns.values.round(2)
npv_roi_df.index.name = 'id'
npv_roi_df.reset_index(inplace=True)

In [46]:
npv_roi_df.head()

Unnamed: 0,id,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.3,0.31,0.32,0.33,0.34,0.35
0,54734,0.095902,0.081254,0.066865,0.052729,0.038841,0.025198,0.011792,-0.001379,-0.014321,-0.027038,-0.039534,-0.051815,-0.063884,-0.075745,-0.087403,-0.098861,-0.110124,-0.121195,-0.132078,-0.142776,-0.153294,-0.163634,-0.1738,-0.183796,-0.193624,-0.203289,-0.212792,-0.222138,-0.231329,-0.240368,-0.249259
1,55521,0.13769,0.124301,0.111112,0.098121,0.085324,0.072718,0.060299,0.048065,0.036013,0.024139,0.012441,0.000915,-0.010441,-0.02163,-0.032654,-0.043517,-0.054222,-0.06477,-0.075164,-0.085407,-0.095501,-0.105449,-0.115253,-0.124916,-0.134439,-0.143825,-0.153077,-0.162196,-0.171184,-0.180044,-0.188778
2,55716,0.274963,0.259472,0.244227,0.229225,0.214461,0.19993,0.185629,0.171554,0.157699,0.144063,0.13064,0.117426,0.104419,0.091615,0.079009,0.0666,0.054382,0.042352,0.030509,0.018847,0.007364,-0.003943,-0.015077,-0.026041,-0.036838,-0.047472,-0.057944,-0.068258,-0.078416,-0.088421,-0.098276
3,55742,0.08777,0.071643,0.055847,0.040373,0.025214,0.010362,-0.00419,-0.018448,-0.03242,-0.046112,-0.059532,-0.072685,-0.085577,-0.098215,-0.110604,-0.122751,-0.13466,-0.146337,-0.157788,-0.169018,-0.180031,-0.190833,-0.201429,-0.211822,-0.222018,-0.232022,-0.241836,-0.251467,-0.260917,-0.270191,-0.279292
4,56121,-0.385032,-0.390533,-0.395965,-0.40133,-0.406627,-0.411859,-0.417025,-0.422128,-0.427167,-0.432145,-0.437061,-0.441916,-0.446713,-0.45145,-0.45613,-0.460753,-0.46532,-0.469831,-0.474288,-0.478691,-0.483042,-0.48734,-0.491586,-0.495782,-0.499928,-0.504025,-0.508072,-0.512072,-0.516025,-0.519931,-0.523792


In [47]:
eval_loan_info = pd.merge(eval_loan_info, npv_roi_df, how='left', on='id')

In [53]:
# some current loans I have no target_strict for and were not in pmt history. Fill with
# negatives on npv_roi.
eval_loan_info['target_strict'] = eval_loan_info['target_strict'].fillna(0)
eval_loan_info.fillna(-1, inplace=True)

In [54]:
# feather must have string column names
eval_loan_info.columns = [str(col) for col in eval_loan_info.columns]
eval_loan_info.to_feather(os.path.join(dpath, 'eval_loan_info.fth'))