In [1]:
# this is run right after clean_loan_info

In [2]:
import dir_constants as dc
from tqdm import tqdm
from j_utils.munging import compress_memory
pd.options.display.max_columns = 999
pd.options.display.max_rows = 60
pd.options.display.max_seq_items = None
# np.set_printoptions(threshold=100)

In [3]:
PATH = dc.data_path
project = 'lendingclub'
fname_loan = 'loan_info_clean.fth'
fname_pmt = 'pmt_hist_clean.fth'
fname_strings = 'strings_df_clean.fth'
data_path = f'{PATH}{project}'

In [4]:
loan_info = pd.read_feather(f'{data_path}/{fname_loan}')
pmt_hist = pd.read_feather(f'{data_path}/{fname_pmt}')
strings = pd.read_feather(f'{data_path}/{fname_strings}')
strings = strings[strings['id'].isin(loan_info['id'])]

In [5]:
loan_info.sort_values('id', inplace=True)
pmt_hist.sort_values(['loan_id', 'date'], inplace=True)
strings.sort_values('id', inplace=True)

In [6]:
# rename loan_id to id to match what comes through API
# loan_info.rename({'loan_id': 'id'}, axis=1, inplace=True)
pmt_hist.rename({'loan_id': 'id'}, axis=1, inplace = True)
# strings.rename({'loan_id': 'id'}, axis=1, inplace = True)

In [7]:
print('loan_info shape: {0}\n pmt_hist shape: {1}\n strings shape: {2}'.format(loan_info.shape, pmt_hist.shape, strings.shape))

loan_info shape: (2003915, 161)
 pmt_hist shape: (37318236, 41)
 strings shape: (2003915, 12)


# rename fields to match what comes through api, save

In [8]:
# sys.path.append('/home/justin/lendingclub/investing/')

In [9]:
import requests
import json
import lendingclub.account_info as acc_info
import re
from sklearn.externals import joblib
# import lendingclub.dataprep_and_modeling.modeling_utils.data_prep_new as data_prep
import lendingclub.investing.investing_utils as investing_utils
from lendingclub.investing.investing_utils import StandardScalerJustin
# import pandas as pd
# import numpy as np
import math as math
import torch
import pickle as pickle
import datetime
import smtplib
import gspread
import google.auth
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession


# constants and setup for various accounts and APIs
now = datetime.datetime.now()
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
my_gmail_account = acc_info.from_email_throwaway
my_gmail_password = acc_info.password_throwaway+'!@'
my_recipients = acc_info.to_emails_throwaway
invest_ss_key = acc_info.invest_ss_key
investins_ss_key = acc_info.investins_ss_key

header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}

acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/summary'
order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/orders'
min_score = -0.02  # -0.04599714276994965  # -0.035764345824470828
inv_amt = 25.00
cash_limit = 0.00
creds = service_account.Credentials.from_service_account_file(acc_info.project_path+'credentials.json')
scope = ['https://spreadsheets.google.com/feeds']
creds = creds.with_scopes(scope)
gc = gspread.Client(auth=creds)
gc.session = AuthorizedSession(creds)
sheet = gc.open_by_key(invest_ss_key).sheet1
sheetins = gc.open_by_key(investins_ss_key).sheet1

# First check if I have enough money that I want to invest. min 10 notes so 250
summary_dict = json.loads(requests.get(
    acc_summary_url, headers=header).content)
cash_to_invest = summary_dict['availableCash']

# Load models and things for models
# RF
rf = investing_utils.load_RF()
with open(f'{investing_utils.data_save_path}/for_proc_df_model_loading.pkl', 'rb') as handle:
    nas_all_train, embeddings_all_train, train_cols_meds_all_train, use_cols, cols_all_train, col_cat_dict, mean_stdev_mapper_all_train, dl_df_train, dl_ys_train, cat_vars, emb_szs = pickle.load(handle)
    
# process the dataframe before I'm able to set up the neural net _____________
# wait until it is time to do the api call. I'm rate limited to 1 call a second
investing_utils.pause_until_time(test=True)

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)

In [10]:
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)

In [11]:
licsv_to_api_rename_dict = {
    'acc_open_past_24mths':'acc_open_past_24_mths',
    'zip_code': 'addr_zip',
    'delinq_2yrs': 'delinq_2_yrs',
    'il_util': 'i_l_util',
    'inq_last_6mths': 'inq_last_6_mths',
    'installment_amount': 'installment',
    'verification_status': 'is_inc_v',
    'verification_status_joint': 'is_inc_v_joint',
    'loan_amnt': 'loan_amount',
    'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
    'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
    'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
}

In [12]:
# rename loan_info columns to match api columns
loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)
# save renamed loan_info
loan_info['id'] = loan_info['id'].astype(int)
loan_info.reset_index(drop=True, inplace=True)
loan_info.to_feather(f'{data_path}/loan_info_clean_api_name_matched.fth')

# Cut loan_info to api fields

In [13]:
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

In [14]:
eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time', 'maturity_time_stat_adj', 'maturity_paid_stat_adj', 'rem_to_be_paid', 'roi_simple',
             'target_loose', 'target_strict', 'loan_status', 'id']
strb_flds = ['desc', 'emp_title', 'id']

In [15]:
base_loan_info = loan_info[list(common_flds)]#+['id']
eval_loan_info = loan_info[eval_flds]#+['id']
str_loan_info = strings[strb_flds]#+['id']

In [16]:
base_loan_info.head()

Unnamed: 0,application_type,inq_last_12m,num_bc_tl,revol_bal_joint,sec_app_mths_since_last_major_derog,num_rev_accts,tot_coll_amt,bc_util,sec_app_revol_util,open_rv_24m,num_bc_sats,mths_since_rcnt_il,mths_since_last_delinq,max_bal_bc,num_il_tl,mo_sin_rcnt_tl,purpose,sub_grade,total_bal_ex_mort,acc_now_delinq,id,num_tl_90g_dpd_24m,dti,sec_app_chargeoff_within_12_mths,mths_since_recent_bc,open_act_il,delinq_amnt,inq_last_6_mths,mths_since_last_major_derog,mo_sin_old_il_acct,open_acc,acc_open_past_24_mths,annual_inc,total_bc_limit,addr_state,revol_bal,is_inc_v_joint,open_acc_6m,is_inc_v,mths_since_recent_revol_delinq,sec_app_open_acc,annual_inc_joint,sec_app_num_rev_accts,all_util,total_il_high_credit_limit,num_accts_ever_12_0_ppd,open_il_24m,num_tl_12_0dpd_2m,total_bal_il,i_l_util,open_rv_12m,mths_since_recent_inq,num_sats,tax_liens,sec_app_collections_12_mths_ex_med,mths_since_recent_bc_dlq,num_actv_bc_tl,num_actv_rev_tl,total_acc,pct_tl_nvr_dlq,total_rev_hi_lim,mths_since_last_record,sec_app_fico_range_high,num_tl_30dpd,sec_app_fico_range_low,sec_app_open_act_il,pub_rec,tot_hi_cred_lim,member_id,delinq_2_yrs,term,loan_amount,total_cu_tl,fico_range_low,sec_app_earliest_cr_line,sec_app_mort_acc,fico_range_high,avg_cur_bal,chargeoff_within_12_mths,sec_app_inq_last_6_mths,grade,open_il_12m,tot_cur_bal,collections_12_mths_ex_med,pub_rec_bankruptcies,installment,num_tl_op_past_12m,dti_joint,mo_sin_rcnt_rev_tl_op,int_rate,emp_length,inq_fi,disbursement_method,num_op_rev_tl,revol_util,percent_bc_gt_75,num_rev_tl_bal_gt_0,addr_zip,earliest_cr_line,bc_open_to_buy,home_ownership,mort_acc,initial_list_status,mo_sin_old_rev_tl_op
0,individual,,,,,,,,,,,,,,,,debt_consolidation,B4,,0.0,54734,,0.1948,,,,0.0,0.0,,,10.0,,85000.0,,CA,28854.0,,,platform,,,,,,,,,,,,,,,0.0,,,,,42.0,,,,,,,,0.0,,,0.0,36,25000.0,,735.0,NaT,,739.0,,0.0,,B,,,0.0,0.0,829.044904,,,,0.1189,< 1 year,,Cash,,0.521,,,941xx,1994-02-01,,rent,,f,
1,individual,,,,,,,,,,,,,,,,debt_consolidation,F2,,0.0,55521,,0.2384,,,,0.0,1.0,,,9.0,,30000.0,,IL,10125.0,,,none,,,,,,,,,,,,,,,0.0,,,,,15.0,,,,,,,,0.0,,,0.0,36,1000.0,,640.0,NaT,,644.0,,0.0,,F,,,0.0,0.0,35.196542,,,,0.1608,< 1 year,,Cash,,0.904,,,600xx,2001-08-01,,rent,,f,
2,individual,6.0,17.0,,,25.0,0.0,0.79,,13.0,14.0,50.0,,1158.0,11.0,7.0,debt_consolidation,E4,8351.0,0.0,55716,0.0,0.1696,,7.0,1.0,0.0,4.0,31.0,160.0,14.0,13.0,30784.0,6350.0,WA,7849.0,,0.0,none,,,,,0.74,5929.0,1.0,0.0,,502.0,0.08,4.0,2.0,14.0,0.0,,,7.0,13.0,36.0,0.69,10650.0,28.0,,0.0,,,1.0,16579.0,,0.0,36,3500.0,4.0,665.0,NaT,,669.0,596.0,0.0,,E,0.0,8351.0,0.0,1.0,139.14088,4.0,,7.0,0.2499,< 1 year,2.0,Cash,13.0,0.74,0.857,13.0,984xx,2003-04-01,907.0,rent,0.0,w,127.0
3,individual,,,,,,,,,,,,,,,,credit_card,B5,,0.0,55742,,0.1429,,,,0.0,0.0,,,7.0,,65000.0,,NY,33623.0,,,none,,,,,,,,,,,,,,,0.0,,,,,7.0,,,,,,,,0.0,,,0.0,36,7000.0,,705.0,NaT,,709.0,,0.0,,B,,,0.0,0.0,228.210882,,,,0.1071,< 1 year,,Cash,,0.767,,,112xx,2000-10-01,,rent,,f,
4,individual,1.0,2.0,,,3.0,0.0,0.354,,1.0,2.0,15.0,,1569.0,5.0,12.0,home_improvement,A2,20302.0,0.0,56121,0.0,0.1084,,16.0,2.0,0.0,0.0,,166.0,5.0,3.0,65000.0,8500.0,NE,3012.0,,0.0,none,,,,,0.63,23822.0,0.0,1.0,0.0,17290.0,0.73,0.0,3.0,5.0,0.0,,,2.0,2.0,11.0,1.0,8500.0,42.0,,0.0,,,1.0,87102.0,,0.0,36,8000.0,0.0,705.0,NaT,,709.0,14826.0,0.0,,A,0.0,74131.0,0.0,1.0,245.155612,1.0,,16.0,0.0649,10+ years,2.0,Cash,2.0,0.354,0.0,2.0,686xx,2002-03-01,5488.0,mortgage,3.0,w,35.0


In [17]:
eval_loan_info.head()

Unnamed: 0,end_d,issue_d,maturity_paid,maturity_time,maturity_time_stat_adj,maturity_paid_stat_adj,rem_to_be_paid,roi_simple,target_loose,target_strict,loan_status,id
0,2011-10-01,2009-08-01,1.0,1.0,1.0,1.0,0.0,1.173214,0,0.0,paid,54734
1,2010-03-01,2008-07-01,1.0,1.0,1.0,1.0,0.0,1.207769,0,0.0,paid,55521
2,2018-06-01,2016-08-01,1.0,0.666667,0.666667,1.0,0.0,1.353502,0,0.0,paid,55716
3,2011-06-01,2008-05-01,1.0,1.0,1.0,1.0,0.0,1.173648,0,0.0,paid,55742
4,2018-04-01,2016-01-01,0.583209,0.861111,0.861111,0.583209,3677.227849,0.643185,1,1.0,charged_off,56121


In [18]:
str_loan_info.head()

Unnamed: 0,desc,emp_title,id
1576268,Due to a lack of personal finance education an...,,54734
1581966,Looking to sure up a few debts for consolidati...,best buy,55521
1256984,I currently have a loan out with CashCall. The...,receptionist,55716
1579171,Just want to pay off the last bit of credit ca...,cnn,55742
1530230,I recently married and since this was the seco...,maintenance,56121


In [19]:
print(base_loan_info.shape, eval_loan_info.shape, str_loan_info.shape)

(2003915, 104) (2003915, 12) (2003915, 3)


In [20]:
# a bit of cleanup
base_loan_info.drop('member_id', axis=1, inplace=True)
# base_loan_info.fillna(np.nan, inplace=True)
# base_loan_info.replace(['None', 'none', None], value=np.nan, inplace=True)
base_loan_info['sec_app_earliest_cr_line']=pd.to_datetime(base_loan_info['sec_app_earliest_cr_line'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [21]:
# save
base_loan_info.to_feather(f'{data_path}/base_loan_info.fth')
eval_loan_info.to_feather(f'{data_path}/eval_loan_info.fth')
str_loan_info.reset_index(drop=True, inplace=True)
str_loan_info.to_feather(f'{data_path}/str_loan_info.fth')

# scale pmt_hist to be independent of loan_size (so we can treat loans as us investing the same amount in each)

In [52]:
pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]

In [53]:
loan_funded_amts = loan_info.set_index('id')['funded_amnt'].to_dict()

In [54]:
loan_dollar_cols = ['all_cash_to_inv', 'amt_due', 'amt_paid', 'charged_off_amt', 'fee_paid', 'int_paid', 'outs_princp_beg', 'outs_princp_end', 'princp_paid', 'recov_fees', 'recovs', ]

In [55]:
id_grouped = pmt_hist.groupby('id', sort=False)

In [56]:
funded_amts = []
for ids, group in tqdm(id_grouped):
    funded_amt = loan_funded_amts[ids]
    funded_amts.extend([funded_amt]*len(group))    

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/home/justin/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/justin/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 109, in run
    if instance.miniters > 1 and \
AttributeError: 'tqdm' object has no attribute 'miniters'

100%|██████████| 2003523/2003523 [06:09<00:00, 5420.82it/s]


In [57]:
for col in loan_dollar_cols:
    pmt_hist[col] = pmt_hist[col]/funded_amts

In [58]:
pmt_hist.reset_index(drop=True, inplace=True)
_, pmt_hist = compress_memory(pmt_hist)

100%|██████████| 16/16 [00:13<00:00,  1.28it/s]


changed dtypes of 16 cols


In [59]:
pmt_hist.to_feather(f'{data_path}/scaled_pmt_hist.fth')