In [37]:
import dir_constants as dc
from tqdm import tqdm
from j_utils.munging import compress_memory
pd.options.display.max_columns = 999
pd.options.display.max_rows = 60
pd.options.display.max_seq_items = None
# np.set_printoptions(threshold=100)

In [38]:
PATH = dc.data_path
project = 'lendingclub'
fname_loan = 'loan_info_clean.fth'
fname_pmt = 'pmt_hist_clean.fth'
fname_strings = 'strings_df_clean.fth'
data_path = f'{PATH}{project}'

In [39]:
loan_info = pd.read_feather(f'{data_path}/{fname_loan}')
pmt_hist = pd.read_feather(f'{data_path}/{fname_pmt}')
strings = pd.read_feather(f'{data_path}/{fname_strings}')
strings = strings[strings['loan_id'].isin(loan_info['id'])]

In [4]:
loan_info.sort_values('id', inplace=True)
pmt_hist.sort_values(['loan_id', 'date'], inplace=True)
strings.sort_values('loan_id', inplace=True)

In [5]:
# rename loan_id to id to match what comes through API
# loan_info.rename({'loan_id': 'id'}, axis=1, inplace=True)
pmt_hist.rename({'loan_id': 'id'}, axis=1, inplace = True)
strings.rename({'loan_id': 'id'}, axis=1, inplace = True)

In [6]:
print('loan_info shape: {0}\n pmt_hist shape: {1}\n strings shape: {2}'.format(loan_info.shape, pmt_hist.shape, strings.shape))

loan_info shape: (2003915, 161)
 pmt_hist shape: (37318236, 41)
 strings shape: (0, 10)


# rename fields to match what comes through api, save

In [7]:
sys.path.append('/home/justin/lendingclub/investing/')

In [8]:
import requests
import json
import lendingclub.account_info as acc_info
import re
from sklearn.externals import joblib
# import lendingclub.dataprep_and_modeling.modeling_utils.data_prep_new as data_prep
import lendingclub.investing.investing_utils as investing_utils
from investing_utils import StandardScalerJustin
# import pandas as pd
# import numpy as np
import math as math
import torch
import pickle as pickle
import datetime
import smtplib
import gspread
import google.auth
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession


# constants and setup for various accounts and APIs
now = datetime.datetime.now()
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
my_gmail_account = acc_info.from_email_throwaway
my_gmail_password = acc_info.password_throwaway+'!@'
my_recipients = acc_info.to_emails_throwaway
invest_ss_key = acc_info.invest_ss_key
investins_ss_key = acc_info.investins_ss_key

header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}

acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/summary'
order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/orders'
min_score = -0.02  # -0.04599714276994965  # -0.035764345824470828
inv_amt = 25.00
cash_limit = 0.00
creds = service_account.Credentials.from_service_account_file(acc_info.project_path+'credentials.json')
scope = ['https://spreadsheets.google.com/feeds']
creds = creds.with_scopes(scope)
gc = gspread.Client(auth=creds)
gc.session = AuthorizedSession(creds)
sheet = gc.open_by_key(invest_ss_key).sheet1
sheetins = gc.open_by_key(investins_ss_key).sheet1

# First check if I have enough money that I want to invest. min 10 notes so 250
summary_dict = json.loads(requests.get(
    acc_summary_url, headers=header).content)
cash_to_invest = summary_dict['availableCash']

# Load models and things for models
# RF
rf = investing_utils.load_RF()
with open(f'{investing_utils.data_save_path}/for_proc_df_model_loading.pkl', 'rb') as handle:
    nas_all_train, embeddings_all_train, train_cols_meds_all_train, use_cols, cols_all_train, col_cat_dict, mean_stdev_mapper_all_train, dl_df_train, dl_ys_train, cat_vars, emb_szs = pickle.load(handle)
    
# process the dataframe before I'm able to set up the neural net _____________
# wait until it is time to do the api call. I'm rate limited to 1 call a second
investing_utils.pause_until_time(test=True)

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)

In [9]:
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)

In [10]:
licsv_to_api_rename_dict = {
    'acc_open_past_24mths':'acc_open_past_24_mths',
    'zip_code': 'addr_zip',
    'delinq_2yrs': 'delinq_2_yrs',
    'il_util': 'i_l_util',
    'inq_last_6mths': 'inq_last_6_mths',
    'installment_amount': 'installment',
    'verification_status': 'is_inc_v',
    'verification_status_joint': 'is_inc_v_joint',
    'loan_amnt': 'loan_amount',
    'num_accts_ever_120_pd': 'num_accts_ever_12_0_ppd',
    'num_tl_120dpd_2m': 'num_tl_12_0dpd_2m',
    'sec_app_inq_last_6mths': 'sec_app_inq_last_6_mths',
}

In [11]:
# rename loan_info columns to match api columns
loan_info.rename(licsv_to_api_rename_dict, axis=1, inplace=True)
# save renamed loan_info
loan_info['id'] = loan_info['id'].astype(int)
loan_info.reset_index(drop=True, inplace=True)
loan_info.to_feather(f'{data_path}/{fname_loan}')

# Cut loan_info to api fields

In [12]:
api_flds = set(api_loans.columns)
licsv_flds = set(loan_info.columns)
common_flds = api_flds.intersection(licsv_flds)
api_flds_not_in_licsv = api_flds.difference(licsv_flds)
licsv_flds_not_in_api = licsv_flds.difference(api_flds)

In [17]:
eval_flds = ['end_d', 'issue_d', 'maturity_paid', 'maturity_time', 'rem_to_be_paid', 'roi_simple',
             'target_loose', 'target_strict', 'loan_status', 'id']
strb_flds = ['desc', 'emp_title', 'id']

In [18]:
base_loan_info = loan_info[list(common_flds)]#+['id']
eval_loan_info = loan_info[eval_flds]#+['id']
str_loan_info = strings[strb_flds]#+['id']

In [19]:
# a bit of cleanup
base_loan_info.drop('member_id', axis=1, inplace=True)
# base_loan_info.fillna(np.nan, inplace=True)
base_loan_info.replace(['None', 'none', None], value=np.nan, inplace=True)
base_loan_info['sec_app_earliest_cr_line']=pd.to_datetime(base_loan_info['sec_app_earliest_cr_line'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [34]:
'''
TARGET STRICT IS BROKEN. EVENTUALLY FIX IT IN clean_loan_info.ipynb
'''

'\nTARGET STRICT IS BROKEN. EVENTUALLY FIX IT IN clean_loan_info.ipynb\n'

In [36]:
loan_info['member_id'].value_counts(dropna=False)

NaN    2003915
Name: member_id, dtype: int64

In [20]:
# save
base_loan_info.to_feather(f'{data_path}/base_loan_info.fth')
eval_loan_info.to_feather(f'{data_path}/eval_loan_info.fth')
str_loan_info.reset_index(drop=True, inplace=True)
str_loan_info.to_feather(f'{data_path}/str_loan_info.fth')

# scale pmt_hist to be independent of loan_size (so we can treat loans as us investing the same amount in each)

In [17]:
pmt_hist = pmt_hist[pmt_hist['id'].isin(loan_info['id'])]

In [18]:
loan_funded_amts = loan_info.set_index('id')['funded_amnt'].to_dict()

In [19]:
loan_dollar_cols = ['all_cash_to_inv', 'amt_due', 'amt_paid', 'charged_off_amt', 'fee_paid', 'int_paid', 'outs_princp_beg', 'outs_princp_end', 'princp_paid', 'recov_fees', 'recovs', ]

In [20]:
id_grouped = pmt_hist.groupby('id', sort=False)

In [24]:
funded_amts = []
for ids, group in tqdm(id_grouped):
    funded_amt = loan_funded_amts[ids]
    funded_amts.extend([funded_amt]*len(group))    

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/home/justin/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/justin/anaconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 109, in run
    if instance.miniters > 1 and \
AttributeError: 'tqdm' object has no attribute 'miniters'

100%|██████████| 2003376/2003376 [05:09<00:00, 6482.56it/s]


In [27]:
for col in loan_dollar_cols:
    pmt_hist[col] = pmt_hist[col]/funded_amts

In [33]:
pmt_hist.reset_index(drop=True, inplace=True)
_, pmt_hist = compress_memory(pmt_hist)

100%|██████████| 16/16 [00:12<00:00,  1.29it/s]


changed dtypes of 16 cols


In [38]:
pmt_hist.to_feather(f'{data_path}/scaled_pmt_hist.fth')