In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
# from tqdm.notebook import tqdm
pd.options.display.max_columns = 999
pd.options.display.max_rows = 50
# from pandas.testing import assert_frame_equal

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 2_13_2020 adding timings, writing timings to new table in database

In [8]:
a, b, c, d, e, f = 'a', 'b', 'c', 'd', 'e', 'f'
test_msg_content = f"test investment round \n LC API Response: {a} \n Response Contents: {b} \n {c} \n {d} \
funny"

In [10]:
start, t1, t2, t3, t4, t5 = 0, 1, 2, 3, 4, 5

In [9]:
timing_df = pd.DataFrame({'start': start,
                          'api_get_loans': t1 - start,
                          'munge_api_loans': t2 - t1,
                          'finish_scoring': t3 - t2,
                          'get_investable': t4 - t3,
                          'ass'})

'test investment round \n LC API Response: a \n Response Contents: b \n c \n d funny'

In [41]:
%%writefile ../../lendingclub/investing/invest_script.py
'''
Script to run every time there is an investment round
'''
import os
import sys
import argparse
import requests
import math
import datetime
import timeit
import pytz
import pickle
import json
import numpy as np
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
# trying to embed matplotlib plots into emails
from email.message import EmailMessage
from email.utils import make_msgid
import mimetypes

# LC imports
import user_creds.account_info as acc_info
from lendingclub.investing import investing_utils as inv_util
from lendingclub.modeling import score_utils as scr_util
from lendingclub import config
from lendingclub.modeling.models import Model

parser = argparse.ArgumentParser()
parser.add_argument('--test', '-t', help='Boolean, if True will invest fast and not wait', action='store_true')
args = parser.parse_args()
test = args.test
    
def handle_new_cols_to_sql(df, table_name, con):
    '''
    If new columns are added, bring in existing sql table and combine with
    pandas, then rewrite out new dataframe
    '''
    try:
        #this will fail if there is a new column
        df.to_sql(name=table_name, con=con, if_exists = 'append', index=False)
    except sqlalchemy.exc.OperationalError:
        data = pd.read_sql(f'SELECT * FROM {table_name}', con)
        df2 = pd.concat([data,df])
        df2.to_sql(name=table_name, con=con, if_exists = 'replace', index=False)
    

# lendingclub account + API related constants
inv_amt = 250.00
cash_limit = 0.00

token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
my_gmail_account = acc_info.from_email_throwaway
my_gmail_password = acc_info.password_throwaway
my_recipients = acc_info.to_emails_throwaway
header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}
acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/summary'
order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/orders'

# check account money, how much money to deploy in loans
summary_dict = json.loads(requests.get(
    acc_summary_url, headers=header).content)
cash_to_invest = summary_dict['availableCash']
n_to_pick = int(math.floor(cash_to_invest / inv_amt))

# other constants
western = pytz.timezone('US/Pacific')
now = datetime.datetime.now(tz=pytz.UTC)


# setup for model
with open(os.path.join(config.data_dir, 'base_loan_info_dtypes.pkl'), 'rb') as f:
    base_loan_dtypes = pickle.load(f)
cb_both = Model('catboost_both')
# clf_wt_scorer will combine the regr and clf scores, with clf wt of 20%
clf_wt_scorer = scr_util.combined_score(scr_util.clf_wt)

# WAIT UNTIL LOANS RELEASED. I'm rate limited to 1 call a second
inv_util.pause_until_time(test=test)    

# Start timings
start = timeit.default_timer()

# get loans from API, munge them to a form that matches training data
api_loans, api_ids = inv_util.get_loans_and_ids(
    header, exclude_already=True)

# time for getting loans
t1 = timeit.default_timer()

# match format of cr_line dates and emp_length, dti, dti_joint
api_loans['earliest_cr_line'] = pd.to_datetime(api_loans['earliest_cr_line'].str[:10])
api_loans['sec_app_earliest_cr_line'] = pd.to_datetime(api_loans['sec_app_earliest_cr_line'].str[:10])
bins = [12*k for k in range(1,11)]
bins = [-np.inf] + bins + [np.inf]
labels = ['< 1 year','1 year','2 years','3 years','4 years','5 years','6 years','7 years','8 years','9 years','10+ years',]
api_loans['emp_length'] = pd.cut(api_loans['emp_length'], bins=bins, labels=labels, right=False).astype(str).replace({'nan':'None'})
# I think 9999 is supposed to be their value for nan. Not entirely sure
api_loans['dti'] = api_loans['dti'].replace({9999:np.nan})
api_loans['dti_joint'] = api_loans['dti_joint'].replace({9999:np.nan})
api_loans = api_loans.astype(base_loan_dtypes)

# time for finishing munging data to correct form
t2 = timeit.default_timer()

# make raw scores and combined scores
_, api_loans['catboost_regr'], api_loans['catboost_clf'] = cb_both.score(api_loans, return_all=True)
api_loans['catboost_regr_scl'] = scr_util.scale_cb_regr_score(api_loans)
catboost_comb_col = f'catboost_comb_{int(scr_util.clf_wt*100)}'
api_loans[catboost_comb_col] = clf_wt_scorer('catboost_clf', 'catboost_regr_scl', api_loans)

# time for finishing the entire scorer
t3 = timeit.default_timer()

# get loans that pass the investing criteria
investable_loans = api_loans.query(f"{catboost_comb_col} >= {scr_util.min_comb_score}")
# investable_loans = investable_loans.sort_values('catboost_comb', ascending=False)

# time for getting investable loans
t4 = timeit.default_timer()

# Set up order and submit order
to_order_loan_ids = investable_loans.nlargest(n_to_pick, catboost_comb_col)['id']
orders_dict = {'aid': inv_acc_id}
orders_list = [{'loanId': int(loan_ids),
                        'requestedAmount': int(inv_amt),
                        'portfolioId': int(portfolio_id)} for loan_ids in to_order_loan_ids]
orders_dict['orders'] = orders_list
payload = json.dumps(orders_dict)
# place order
order_resp = inv_util.submit_lc_order(cash_to_invest, cash_limit, order_url, header, payload)

# time for assembling and placing orders
t5 = timeit.default_timer()

# some date related columns to add before writing to db
# convert existing date cols
to_datify = [col for col in api_loans.columns if '_d' in col and api_loans[col].dtype == 'object']
for col in to_datify:
    api_loans[col] = pd.to_datetime(api_loans[col], utc=True).dt.tz_convert(western)

# add date cols: date, year, month, week of year, day, hour
api_loans['last_seen_list_d'] = now
api_loans['list_d_year'] = api_loans['list_d'].dt.year
api_loans['list_d_month'] = api_loans['list_d'].dt.month
api_loans['list_d_day'] = api_loans['list_d'].dt.day
api_loans['list_d_week'] = api_loans['list_d'].dt.week
api_loans['list_d_hour'] = api_loans['list_d'].dt.hour
api_loans['last_seen_list_d_year'] = api_loans['last_seen_list_d'].dt.year
api_loans['last_seen_list_d_month'] = api_loans['last_seen_list_d'].dt.month
api_loans['last_seen_list_d_day'] = api_loans['last_seen_list_d'].dt.day
api_loans['last_seen_list_d_week'] = api_loans['last_seen_list_d'].dt.week
api_loans['last_seen_list_d_hour'] = api_loans['last_seen_list_d'].dt.hour

msg = EmailMessage()
order_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
# email headers
email_cols = ['id', 'int_rate', 'term', 'catboost_clf', 'catboost_regr', 'catboost_regr_scl', catboost_comb_col]
msg['Subject'] = order_time + ' Investment Round'
msg['From'] = 'justindlrig <{0}>'.format(my_gmail_account)
msg['To'] = 'self <{0}>'.format(my_recipients[0])
# set the plain text body
msg_content = f"investment round \n LC API Response: {order_resp} \n Response Contents: {order_resp.content} \
\n Time to get loans: {t1 - start} \n Time to munge loans: {t2 - t1} \n Time to finish scoring process: {t3 - t2} \
\n Time to get investable loans: {t4 - t3} \n Time to assemble and place order {t5 - t4} \
\n Time whole process {t5 - start} \n {investable_loans[email_cols]} \n {api_loans[email_cols]}"
msg.set_content(msg_content)

inv_util.send_emails(now, my_gmail_account, my_gmail_password, msg)

# make the timing_df
timing_df = pd.DataFrame({'start': start,
                          'api_get_loans': t1 - start,
                          'munge_api_loans': t2 - t1,
                          'finish_scoring': t3 - t2,
                          'get_investable': t4 - t3,
                          'assemble_place_order': t5 - t4,
                          'order_date': order_time,
                          'whole_process': t5 - start,
}, index=[0])

# write dataframes out to db
disk_engine = create_engine(f'sqlite:///{config.lc_api_db}')
handle_new_cols_to_sql(api_loans, 'lc_api_loans', disk_engine)
handle_new_cols_to_sql(timing_df, 'order_timings', disk_engine)
#api_loans.to_sql('lc_api_loans', disk_engine, if_exists='append', index=False,)
#timing_df.to_sql('order_timings', disk_engine, if_exists='append', index=False,)

Overwriting ../../lendingclub/investing/invest_script.py


In [None]:
# if wanted to send image in emails
# now create a Content-ID for the image
# image_cid = make_msgid(domain='xyz.com')#
# if `domain` argument isn't provided, it will 
# use your computer's name

# set an alternative html body
# msg.add_alternative("""\
# <html>
#     <body>
#         <p>This is an HTML body.<br>
#            It also has an image.
#         </p>
#         <img src="cid:{image_cid}">
#     </body>
# </html>
# """.format(image_cid=image_cid[1:-1]), subtype='html')
# # image_cid looks like <long.random.number@xyz.com>
# # to use it as the img src, we don't need `<` or `>`
# # so we use [1:-1] to strip them off
# with open('/home/justin/projects/nst_star_app/images/ex1.png', 'rb') as img:
#     # know the Content-Type of the image
#     maintype, subtype = mimetypes.guess_type(img.name)[0].split('/')
#     # attach it
#     msg.get_payload()[1].add_related(img.read(), 
#                                          maintype=maintype, 
#                                          subtype=subtype, 
#                                          cid=image_cid)

# below for google account stuff

#google sheet keys
# invest_ss_key = acc_info.invest_ss_key
# investins_ss_key = acc_info.investins_ss_key


# creds = service_account.Credentials.from_service_account_file(os.path.join(config.prj_dir, 'user_creds', 'credentials.json'))
# scope = ['https://spreadsheets.google.com/feeds']
# creds = creds.with_scopes(scope)
# gc = gspread.Client(auth=creds)
# gc.session = AuthorizedSession(creds)
# sheet = gc.open_by_key(invest_ss_key).sheet1
# sheetins = gc.open_by_key(investins_ss_key).sheet1

# 12_19_2019 script writing

In [2]:
# import os
# import requests
# import json

# from lendingclub import config
# from lendingclub.modeling.models import Model
# from j_utils import munging as mg
# import re
# from sklearn.externals import joblib
# # import lendingclub.dataprep_and_modeling.modeling_utils.data_prep_new as data_prep
# import lendingclub.investing.investing_utils as investing_utils
# from lendingclub.modeling import score_utils as scr_util
# # from investing_utils import StandardScalerJustin
# import pandas as pd
# import numpy as np
# import math as math
# import torch
# import pickle
# import datetime
# import smtplib
# import gspread
# import google.auth
# from google.oauth2 import service_account
# from google.auth.transport.requests import AuthorizedSession

In [37]:
scr_util.min_comb_16_score

0.7923743712451782

In [3]:
%%writefile ../../lendingclub/investing/investing_utils.py
# %load ../../lendingclub/investing/investing_utils
import requests
import json
import re
import pandas as pd
import numpy as np
import datetime as dt
import user_creds.account_info as acc_info
import pause
import smtplib
from sklearn.base import TransformerMixin, BaseEstimator
from pandas_summary import DataFrameSummary
# from sklearn.externals import joblib

class StandardScalerJustin(TransformerMixin, BaseEstimator):
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy
    
    def fit(self, X, y=None):
        if type(X) == np.ndarray:
            X = pd.Series(X.reshape(-1))
        self.mean_ = X.dropna().mean()
        self.var_ = X.dropna().var()
        return self

    def transform(self, X):
        mean = self.mean_
        std_dev = np.sqrt(self.var_)
        if std_dev == 0:
            return X
        return (X-mean)/std_dev
    
def fit_scalers(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n],StandardScalerJustin()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    return mapper    

def proc_df_justin(df, y_fld, valid_test, skip_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None, train_cols_meds=None, cols=None):

    """ proc_df takes a data frame df and splits off the response variable, and
    changes the df into an entirely numeric dataframe.

    Parameters:
    -----------
    df: The data frame you wish to process.

    y_fld: The name of the response variable
    
    valid_test: boolean indicating if this is a df to match to train columns.

    skip_flds: A list of fields that dropped from df.

    do_scale: Standardizes each column in df,Takes Boolean Values(True,False)

    na_dict: a dictionary of na columns to add. Na columns are also added if there
        are any missing values.

    preproc_fn: A function that gets applied to df.

    max_n_cat: The maximum number of categories to break into dummy values, instead
        of integer codes.

    subset: Takes a random subset of size subset from df.

    mapper: If do_scale is set as True, the mapper variable
        calculates the values used for scaling of variables during training time(mean and standard deviation).
        
    train_cols_meds: dict where keys are columns from training and values are medians, use for values to fill an entire missing column (shouldn't be needed when used to actually pick loans, was needed for train/valid/test due to new fields being added over the timeframe and missing in certain datasets while existing in others)
    
    cols: Just to compare column order and ensure the variables are in the right order.

    Returns:
    --------
    [x, y, nas, mapper(optional)]:

        x: x is the transformed version of df. x will not have the response variable
            and is entirely numeric.

        y: y is the response variable

        nas: returns a dictionary of which nas it created, and the associated median.

        mapper: A DataFrameMapper which stores the mean and standard deviation of the corresponding continous
        variables which is then used for scaling of during test-time."""        
    assert type(valid_test) == bool, print('must indiciate if this is test/valid set to match columns with train')
    
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    y = df[y_fld].values
    df.drop(skip_flds+[y_fld], axis=1, inplace=True)

    # fit the scalers
    if do_scale: mapper = fit_scalers(df, mapper)
    if na_dict is None: na_dict = {}      
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    df[mapper.transformed_names_] = mapper.transform(df)
    embeddings=[]
    for n,c in df.items():
        numericalize(df, c, n, max_n_cat)
        if not is_numeric_dtype(c):
            embeddings.append(prep_embeddings(c, n))
    df = pd.get_dummies(df, dummy_na=True)
    # fix the nas
    if valid_test:
        for col, med in train_cols_meds.items():
            try:
                df[col].fillna(med, inplace=True)
            except KeyError:
                print(col)
                df[col] = med
        df = df[cols]
        
    res = [df, y, na_dict, embeddings]
    if not valid_test: res += [res[0].median(), res[0].columns]
    if do_scale: res = res + [mapper]
    return res

def prep_embeddings(c, n):
    # allocate in embeddings for a null
    return (n, len(c.cat.categories)+1)

def eval_models(trials, port_size, available_loans, regr_version, X_test, y_test,
                default_series, yhat_test): #regr, 
    results = {}
    pct_default = {}
    test_copy = X_test.copy()
    
    default_series = default_series.loc[X_test.index]
    yhats_ys_defs = pd.DataFrame([yhat_test, y_test, default_series.values]).T
    yhats_ys_defs.rename(columns={0:'yhat', 1:'y', 2:'defaults'}, inplace=True)
    for trial in tqdm_notebook(np.arange(trials)):
        # of all test loans, grab a batch of n=available_loans
        available_idx = np.random.choice(
            np.arange(len(test_copy)), available_loans, replace=False)
        available_loans_df = yhats_ys_defs.ix[available_idx,:]
        available_loans_df.sort_values('yhat', inplace=True, ascending=False)
        picks = available_loans_df[:port_size]
        results[trial] = picks['y'].mean()
        pct_default[trial] = picks['defaults'].sum()/port_size
    pct_default_series = pd.Series(pct_default)
    results_df = pd.DataFrame(pd.Series(results))
    results_df['pct_def'] = pct_default_series
    results_df.columns = pd.MultiIndex(levels=[[regr_version], [0.07, 'pct_def']],
           labels=[[0, 0,], [0, 1,]],
           names=['discount_rate', 'model'])
    return results_df

# def load_RF():
#     return joblib.load(f'{PATH_RF}{regr_version_RF}_{training_type}.pkl')
    
def add_dateparts(df):
    '''Uses the fastai add_datepart to turn datetimes into numbers to process
       does not do it for issue_d'''
    date_cols = df.select_dtypes(['datetime64']).columns
    for date_col in date_cols:
        if date_col not in special_cols:
            add_datepart(df, date_col, drop=True)
    return [col for col in date_cols if col not in special_cols]    

def pause_until_time(test=False):
    now = dt.datetime.now()
    if not test:
        pause_until = dt.datetime(
            now.year, now.month, now.day, now.hour + 1, 0, 0)
    if test:
        # if testing, wait 2 seconds and print('will pause 2 seconds')
        pause_until = dt.datetime(
            now.year, now.month, now.day, now.hour, now.minute, now.second + 2)
    pause.until(pause_until)

def convert_to_underscore(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([0-9A-Z])', r'\1_\2', s1).lower()

def get_already_invested_filter_id(header):
    filters_list = json.loads(requests.get(
        'https://api.lendingclub.com/api/investor/v1/accounts/' + str(inv_acc_id) + '/filters', headers=header).content)
    filters_df = pd.DataFrame(filters_list['filters'])
    # I manually made a single filter that excludes loans already invested in.
    # Not sure if there is a way to do this entirely through the api.
    return filters_df[filters_df['name'] == 'exclude_already_invested'].iloc[0, 0]

def get_loans_and_ids(header, exclude_already=True):
    '''Gets loans from lendingclub with the single filter of exclude loans already invested in.'''
    if exclude_already:
        filter_id = get_already_invested_filter_id(header)
        payload = {'showAll': 'true', 'filterId': filter_id}
        resp = requests.get(
            'https://api.lendingclub.com/api/investor/v1/loans/listing', headers=header, params=payload) #'https://api.lendingclub.com/api/investor/v1/loans/listing'
        loans_list = json.loads(resp.content)['loans']
    if not exclude_already:
        payload = {'showAll': 'true'}
        resp = requests.get(
            'https://api.lendingclub.com/api/investor/v1/loans/listing', headers=header, params=payload)
        loans_list = json.loads(resp.content)['loans']

    api_loans = pd.DataFrame(loans_list)
    api_loans.columns = np.array(
        [convert_to_underscore(col) for col in api_loans.columns.values])
    # save the loan ids
    loan_ids = api_loans['id']
    return api_loans, loan_ids

def match_col_names(api_loans):
    # cols to add
    # make a col of nans so cols match up exactly
    api_loans['issue_d'] = 0
    api_loans['line_history_m'] = 0
    api_loans['maturity_paid'] = 0
    api_loans['maturity_time'] = 0
    api_loans['npv_roi_10'] = 0
    api_loans['orig_amt_due'] = 0
    api_loans['target_loose'] = 0
    api_loans['target_strict'] = 0
    api_loans['fico'] = 0

    cols_to_drop_immediately = [
        'accept_d',
        'credit_pull_d',
        'desc',
        'emp_title',
        'exp_d',
        'exp_default_rate',
        'funded_amount',
        'housing_payment',
        'id',
        'ils_exp_d',
        'initial_list_status',
        'investor_count',
        'list_d',
        'member_id',
        'mtg_payment',
        'review_status',
        'review_status_d',
        'sec_app_earliest_cr_line',
        'sec_app_fico_range_high',
        'sec_app_fico_range_low',
        'service_fee_rate',
    ]
    api_loans.drop(cols_to_drop_immediately, axis=1, inplace=True)
    rename_dict = {
        'acc_open_past_24_mths': 'acc_open_past_24mths',
        'addr_zip': 'zip_code',
        'delinq_2_yrs': 'delinq_2yrs',
        'i_l_util': 'il_util',
        'inq_last_6_mths': 'inq_last_6mths',
        'installment': 'installment_amount',
        'is_inc_v': 'verification_status',
        'is_inc_v_joint': 'verification_status_joint',
        'loan_amount': 'loan_amnt',
        'num_accts_ever_12_0_ppd': 'num_accts_ever_120_pd',
        'num_tl_12_0dpd_2m': 'num_tl_120dpd_2m',
        'sec_app_inq_last_6_mths': 'sec_app_inq_last_6mths',
    }
    api_loans.rename(columns=rename_dict, inplace=True)
    return api_loans

def match_existing_cols_to_csv(api_loans):
    api_loans.fillna(value=np.nan, inplace=True)
    api_loans['all_util'] = api_loans['all_util'] / 100.0
    api_loans['application_type'] = api_loans['application_type'].str.lower()

    # turn employment length into categorical
    emp_len_dict = {np.nan: 'n/a',
                    0.0: '< 1 year',
                    12.0: '1 year',
                    24.0: '2 years',
                    36.0: '3 years',
                    48.0: '4 years',
                    60.0: '5 years',
                    72.0: '6 years',
                    84.0: '7 years',
                    96.0: '8 years',
                    108.0: '9 years',
                    120.0: '10+ years', }
    api_loans['emp_length'] = api_loans['emp_length'].replace(emp_len_dict)
    api_loans['home_ownership'] = api_loans['home_ownership'].str.lower()
    api_loans['int_rate'] = api_loans['int_rate'] / 100.0

    # verification status
    dic_veri_status = {'NOT_VERIFIED': 'none',
                       'SOURCE_VERIFIED': 'source',
                       'VERIFIED': 'platform'}
    api_loans['verification_status'] = api_loans[
        'verification_status'].replace(dic_veri_status)
    api_loans['verification_status_joint'] = api_loans[
        'verification_status_joint'].replace(dic_veri_status)
    api_loans['pct_tl_nvr_dlq'] = api_loans['pct_tl_nvr_dlq'] / 100.0
    api_loans['percent_bc_gt_75'] = api_loans['percent_bc_gt_75'] / 100.0
    api_loans['revol_util'] = api_loans['revol_util'] / 100.0
    return api_loans

def make_missing_cols_and_del_dates(api_loans):
    # probably something with earliest credit line, fico range high/low
    # need to add line_history_m, orig_amt_due, fico
    api_loans['fico'] = (api_loans['fico_range_high'] +
                         api_loans['fico_range_low']) / 2
    # line_history_m depends on issue_d, which doesn't exist for listed loans.
    # Assume it takes one month to issue so increase the number compared to
    # the csvs by 1
    today = pd.to_datetime(dt.date.today())
    api_loans['earliest_cr_line'] = pd.to_datetime(
        api_loans['earliest_cr_line'])
    line_hist_d = (today - api_loans['earliest_cr_line']) / np.timedelta64(
        1, 'D')
    api_loans['line_history_m'] = (line_hist_d * (12 / 365.25)).round() + 1
    api_loans['orig_amt_due'] = api_loans[
        'term'] * api_loans['installment_amount']

    api_loans.drop(['earliest_cr_line', 'fico_range_high',
                    'fico_range_low'], axis=1, inplace=True)
    return api_loans

def verify_df_base_cols(api_loans, test_loans):
    api_cols = api_loans.columns.values.copy()
    api_cols.sort()
    csv_cols = test_loans.columns.values.copy()
    csv_cols.sort()
    assert len(api_cols) == len(csv_cols)
    examine = dict(zip(api_cols, csv_cols))
    for key, val in examine.iteritems():
        if key != val:
            print(key, val)
            return None
    return True

def make_CIs(preds):
    means = np.mean(preds, axis=0)
    std_devs = np.std(preds, axis=0)
    df = pd.DataFrame(np.zeros((preds.shape[1],2)), columns=['mean', 'std_dev'])
    df['mean'] = means
    df['std_dev'] = std_devs
    return df

def submit_lc_order(cash_to_invest, cash_limit, order_url, header, payload):
    if cash_to_invest >= cash_limit:
        order_response = requests.post(order_url, headers=header, data=payload)
        return order_response
    return None

def send_emails(now, my_gmail_account, my_gmail_password, msg): #, my_recipients
#     subject = now.strftime("%Y-%m-%d %H:%M:%S.%f") + ' Investment Round'
    smtpserver = smtplib.SMTP('smtp.gmail.com',587)
    smtpserver.ehlo()
    smtpserver.starttls()
    smtpserver.login(my_gmail_account, my_gmail_password)
#     msg = """From: %s\nTo: %s\nSubject: %s\n\n%s""" % (my_gmail_account, my_recipients, subject, message)
#     smtpserver.sendmail(msg)#my_gmail_account, my_recipients, 
    smtpserver.send_message(msg)
    smtpserver.close()

# # constants
# inv_acc_id = acc_info.investor_id
# special_cols = []
# platform = 'lendingclub'
# datapath = '/home/justin/all_data/'
# PATH_NN = f'{datapath}{platform}/NN/'
# PATH_RF = f'{datapath}{platform}/RF/'
# data_save_path = f'{datapath}{platform}/'
# training_type = 'all'
# regr_version_RF = '0.2.2'
# regr_version_NN = '1.0.1'


Overwriting ../../lendingclub/investing/investing_utils.py


In [36]:
%%writefile ../../lendingclub/investing/invest_script.py
'''
Script to run every time there is an investment round
'''
import os
import sys
import argparse
import requests
import math
import datetime
import pickle
import json
import numpy as np
import pandas as pd
# trying to embed matplotlib plots into emails
from email.message import EmailMessage
from email.utils import make_msgid
import mimetypes

# LC imports
import user_creds.account_info as acc_info
from lendingclub.investing import investing_utils as inv_util
from lendingclub.modeling import score_utils as scr_util
from lendingclub import config
from lendingclub.modeling.models import Model

parser = argparse.ArgumentParser()
parser.add_argument('--test', '-t', help='Boolean, if True will invest fast and not wait', action='store_true')
args = parser.parse_args()
test = args.test
    
# print(test)
# print(type(test))
    

# lendingclub account + API related constants
inv_amt = 25.00
cash_limit = 0.00

token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
my_gmail_account = acc_info.from_email_throwaway
my_gmail_password = acc_info.password_throwaway
my_recipients = acc_info.to_emails_throwaway
header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}
acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/summary'
order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/orders'

# check account money, how much money to deploy in loans
summary_dict = json.loads(requests.get(
    acc_summary_url, headers=header).content)
cash_to_invest = summary_dict['availableCash']
n_to_pick = int(math.floor(cash_to_invest / inv_amt))

# other constants
now = datetime.datetime.now()


# setup for model
with open(os.path.join(config.data_dir, 'base_loan_info_dtypes.pkl'), 'rb') as f:
    base_loan_dtypes = pickle.load(f)
cb_both = Model('catboost_both')
# clf_wt_16_scorer will combine the regr and clf scores, with clf wt of 16%
clf_wt_16_scorer = scr_util.combined_score(.16)

# WAIT UNTIL LOANS RELEASED. I'm rate limited to 1 call a second
inv_util.pause_until_time(test=test)    

# get loans from API, munge them to a form that matches training data
api_loans, api_ids = inv_util.get_loans_and_ids(
    header, exclude_already=True)
# match format of cr_line dates and emp_length, dti, dti_joint
api_loans['earliest_cr_line'] = pd.to_datetime(api_loans['earliest_cr_line'].str[:10])
api_loans['sec_app_earliest_cr_line'] = pd.to_datetime(api_loans['sec_app_earliest_cr_line'].str[:10])
bins = [12*k for k in range(1,11)]
bins = [-np.inf] + bins + [np.inf]
labels = ['< 1 year','1 year','2 years','3 years','4 years','5 years','6 years','7 years','8 years','9 years','10+ years',]
api_loans['emp_length'] = pd.cut(api_loans['emp_length'], bins=bins, labels=labels, right=False).astype(str).replace({'nan':'None'})
# I think 9999 is supposed to be their value for nan. Not entirely sure
api_loans['dti'] = api_loans['dti'].replace({9999:np.nan})
api_loans['dti_joint'] = api_loans['dti_joint'].replace({9999:np.nan})
api_loans = api_loans.astype(base_loan_dtypes)

# make raw scores and combined scores
_, api_loans['catboost_regr'], api_loans['catboost_clf'] = cb_both.score(api_loans, return_all=True)
api_loans['catboost_regr_scl'] = scr_util.scale_cb_regr_score(api_loans)
api_loans['catboost_comb'] = clf_wt_16_scorer('catboost_clf', 'catboost_regr_scl', api_loans)

# get loans that pass the investing criteria
investable_loans = api_loans.query("catboost_comb >= {0}".format(scr_util.min_comb_16_score))
# investable_loans = investable_loans.sort_values('catboost_comb', ascending=False)

# Set up order and submit order
to_order_loan_ids = investable_loans.nlargest(n_to_pick, "catboost_comb")['id']
orders_dict = {'aid': inv_acc_id}
orders_list = [{'loanId': int(loan_ids),
                        'requestedAmount': int(inv_amt),
                        'portfolioId': int(portfolio_id)} for loan_ids in to_order_loan_ids]
orders_dict['orders'] = orders_list
payload = json.dumps(orders_dict)
# place order
order_resp = inv_util.submit_lc_order(cash_to_invest, cash_limit, order_url, header, payload)

msg = EmailMessage()
# email headers
email_cols = ['id', 'int_rate', 'term', 'catboost_clf', 'catboost_regr', 'catboost_regr_scl', 'catboost_comb']
msg['Subject'] = now.strftime("%Y-%m-%d %H:%M:%S.%f") + ' Investment Round'
msg['From'] = 'justindlrig <{0}>'.format(my_gmail_account)
msg['To'] = 'self <{0}>'.format(my_recipients[0])
# set the plain text body
msg.set_content("test investment round \n LC API Response: {0} \n Response Contents: {1} \n {2} \n {3}".format(order_resp, order_resp.content, investable_loans[email_cols], api_loans[email_cols]))
# now create a Content-ID for the image
image_cid = make_msgid(domain='xyz.com')#
# if `domain` argument isn't provided, it will 
# use your computer's name

# set an alternative html body
# msg.add_alternative("""\
# <html>
#     <body>
#         <p>This is an HTML body.<br>
#            It also has an image.
#         </p>
#         <img src="cid:{image_cid}">
#     </body>
# </html>
# """.format(image_cid=image_cid[1:-1]), subtype='html')
# # image_cid looks like <long.random.number@xyz.com>
# # to use it as the img src, we don't need `<` or `>`
# # so we use [1:-1] to strip them off
# with open('/home/justin/projects/nst_star_app/images/ex1.png', 'rb') as img:
#     # know the Content-Type of the image
#     maintype, subtype = mimetypes.guess_type(img.name)[0].split('/')
#     # attach it
#     msg.get_payload()[1].add_related(img.read(), 
#                                          maintype=maintype, 
#                                          subtype=subtype, 
#                                          cid=image_cid)
inv_util.send_emails(now, my_gmail_account, my_gmail_password, msg)


# below for google account stuff

#google sheet keys
# invest_ss_key = acc_info.invest_ss_key
# investins_ss_key = acc_info.investins_ss_key


# creds = service_account.Credentials.from_service_account_file(os.path.join(config.prj_dir, 'user_creds', 'credentials.json'))
# scope = ['https://spreadsheets.google.com/feeds']
# creds = creds.with_scopes(scope)
# gc = gspread.Client(auth=creds)
# gc.session = AuthorizedSession(creds)
# sheet = gc.open_by_key(invest_ss_key).sheet1
# sheetins = gc.open_by_key(investins_ss_key).sheet1

Overwriting ../../lendingclub/investing/invest_script.py


[autoreload of lendingclub.modeling.models failed: Traceback (most recent call last):
  File "/home/justin/anaconda3/envs/lendingclub/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/justin/anaconda3/envs/lendingclub/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 450, in superreload
    update_generic(old_obj, new_obj)
  File "/home/justin/anaconda3/envs/lendingclub/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 387, in update_generic
    update(a, b)
  File "/home/justin/anaconda3/envs/lendingclub/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 357, in update_class
    update_instances(old, new)
  File "/home/justin/anaconda3/envs/lendingclub/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 312, in update_instances
    update_instances(old, new, obj.__dict__, visited)
  File "/home/justin/anaconda3/envs/lendingclub

In [26]:
bool("False")

True

In [18]:
scr_util.scale_cb_regr_score??

In [10]:
print(api_loans.shape)
api_loans.head()

(67, 123)


Unnamed: 0,id,member_id,loan_amount,funded_amount,term,int_rate,exp_default_rate,service_fee_rate,installment,grade,...,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,revol_bal_joint,disbursement_method,open_act_il,sec_app_open_act_il,catboost_regr,catboost_clf,catboost_regr_scl,catboost_comb_29
0,162477892,203905831,25000.0,14000.0,36,23.049999,12.86,1.52,968.4,D,...,,,,CASH,3.0,,-0.171216,0.697845,0.639344,0.656309
1,163097569,205205335,25200.0,20175.0,36,20.549999,12.86,1.52,943.6,D,...,0.0,38.0,76468.0,CASH,2.0,4.0,-0.136313,0.746974,0.664695,0.688556
2,162929632,204551769,27000.0,21275.0,36,25.65,12.86,1.52,1082.82,D,...,,,,CASH,4.0,,-0.206476,0.578615,0.613735,0.60355
3,163148111,205266152,27000.0,20400.0,36,25.65,12.86,1.52,1082.82,D,...,,,,CASH,5.0,,-0.178009,0.659843,0.63441,0.641786
4,162835838,204285431,40000.0,35075.0,36,16.950001,7.96,1.23,1425.12,C,...,,,,DIRECT_PAY,2.0,,-0.128401,0.729686,0.670441,0.687622


In [16]:
api_loans.query("catboost_comb_29 >= {0}".format(.78))

Unnamed: 0,id,member_id,loan_amount,funded_amount,term,int_rate,exp_default_rate,service_fee_rate,installment,grade,...,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,revol_bal_joint,disbursement_method,open_act_il,sec_app_open_act_il,catboost_regr,catboost_clf,catboost_regr_scl,catboost_comb_29
2,163450687,205900920,10000.0,9725.0,36,7.56,2.63,0.88,311.34,A,...,,,,DIRECT_PAY,1.0,,-0.045695,0.933956,0.73051,0.78951
20,163250164,205407603,39200.0,27200.0,36,11.02,5.36,1.05,1283.73,B,...,,,,CASH,0.0,,-0.046752,0.938308,0.729743,0.790227
26,163401556,205790595,17000.0,15000.0,36,11.71,5.36,1.05,562.3,B,...,,,,CASH,4.0,,-0.047149,0.910468,0.729455,0.781949
34,163621726,206118531,14725.0,13150.0,36,8.81,2.63,0.88,466.96,A,...,,,,CASH,4.0,,-0.030219,0.918186,0.741751,0.792917
35,163672086,206313082,7000.0,5200.0,36,16.950001,7.96,1.23,249.4,C,...,,,,CASH,1.0,,-0.020195,0.865015,0.749031,0.782667
37,163639377,206218205,23825.0,21450.0,36,8.81,2.63,0.88,755.53,A,...,,,,CASH,3.0,,-0.045988,0.932053,0.730298,0.788807
40,163644843,206233770,13275.0,9350.0,36,13.08,5.36,1.05,447.8,B,...,,,,CASH,2.0,,-0.038993,0.890451,0.735378,0.78035
42,163676903,206317959,10000.0,5600.0,36,11.02,5.36,1.05,327.49,B,...,,,,CASH,4.0,,-0.034329,0.903741,0.738766,0.786609
59,163680406,206310422,35000.0,20650.0,36,6.46,2.63,0.88,1072.08,A,...,,,,CASH,5.0,,-0.037594,0.971128,0.736394,0.804467


In [17]:
api_loans.query("catboost_comb_29 >= {0}".format(.78)).nlargest(5, 'catboost_comb_29')

Unnamed: 0,id,member_id,loan_amount,funded_amount,term,int_rate,exp_default_rate,service_fee_rate,installment,grade,...,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,revol_bal_joint,disbursement_method,open_act_il,sec_app_open_act_il,catboost_regr,catboost_clf,catboost_regr_scl,catboost_comb_29
59,163680406,206310422,35000.0,20650.0,36,6.46,2.63,0.88,1072.08,A,...,,,,CASH,5.0,,-0.037594,0.971128,0.736394,0.804467
34,163621726,206118531,14725.0,13150.0,36,8.81,2.63,0.88,466.96,A,...,,,,CASH,4.0,,-0.030219,0.918186,0.741751,0.792917
20,163250164,205407603,39200.0,27200.0,36,11.02,5.36,1.05,1283.73,B,...,,,,CASH,0.0,,-0.046752,0.938308,0.729743,0.790227
2,163450687,205900920,10000.0,9725.0,36,7.56,2.63,0.88,311.34,A,...,,,,DIRECT_PAY,1.0,,-0.045695,0.933956,0.73051,0.78951
37,163639377,206218205,23825.0,21450.0,36,8.81,2.63,0.88,755.53,A,...,,,,CASH,3.0,,-0.045988,0.932053,0.730298,0.788807


In [5]:
order_resp

<Response [200]>

In [6]:
order_resp.content

b'{"orderInstructId":null}'

In [7]:
api_loans.head().sort_values('catboost_comb_29', ascending=False)

Unnamed: 0,id,member_id,loan_amount,funded_amount,term,int_rate,exp_default_rate,service_fee_rate,installment,grade,...,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,revol_bal_joint,disbursement_method,open_act_il,sec_app_open_act_il,catboost_regr,catboost_clf,catboost_regr_scl,catboost_comb_29
2,163450687,205900920,10000.0,9725.0,36,7.56,2.63,0.88,311.34,A,...,,,,DIRECT_PAY,1.0,,-0.045695,0.933956,0.73051,0.78951
0,163223097,205380433,12000.0,11900.0,36,23.049999,12.86,1.52,464.83,D,...,0.0,76.0,11281.0,CASH,1.0,4.0,-0.025727,0.764609,0.745014,0.750696
3,163390637,205779629,18000.0,16550.0,36,16.120001,7.96,1.23,633.9,C,...,,,,CASH,1.0,,-0.08211,0.829564,0.704062,0.740458
4,163097569,205205335,25200.0,18950.0,36,20.549999,12.86,1.52,943.6,D,...,0.0,38.0,76468.0,CASH,2.0,4.0,-0.136313,0.746974,0.664695,0.688556
1,162477892,203905831,25000.0,13250.0,36,23.049999,12.86,1.52,968.4,D,...,,,,CASH,3.0,,-0.171216,0.697845,0.639344,0.656309


In [38]:
# dump api loans to sqllite db
from sqlalchemy import create_engine
disk_engine = create_engine('sqlite:///{0}'.format(config.lc_api_db))
api_loans.to_sql('api_loans', disk_engine, if_exists='append')

In [46]:
import dash
from dash.dependencies import Input, Output
import dash_core_components as dcc
import dash_html_components as html
# try dash
app = dash.Dash(__name__)
server = app.server
if __name__ == '__main__':
    app.run_server(debug=True)

Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Debugger PIN: 042-601-689
Debugger PIN: 042-601-689
Debugger PIN: 042-601-689
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


SystemExit: 1


To exit: use 'exit', 'quit', or Ctrl-D.



In [None]:
app.

In [45]:
%debug

> [0;32m/home/justin/anaconda3/envs/lendingclub/lib/python3.7/site-packages/werkzeug/serving.py[0m(988)[0;36mrun_simple[0;34m()[0m
[0;32m    986 [0;31m            [0ms[0m [0;34m=[0m [0msocket[0m[0;34m.[0m[0msocket[0m[0;34m([0m[0maddress_family[0m[0;34m,[0m [0msocket[0m[0;34m.[0m[0mSOCK_STREAM[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    987 [0;31m            [0ms[0m[0;34m.[0m[0msetsockopt[0m[0;34m([0m[0msocket[0m[0;34m.[0m[0mSOL_SOCKET[0m[0;34m,[0m [0msocket[0m[0;34m.[0m[0mSO_REUSEADDR[0m[0;34m,[0m [0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 988 [0;31m            [0ms[0m[0;34m.[0m[0mbind[0m[0;34m([0m[0mserver_address[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    989 [0;31m            [0;32mif[0m [0mhasattr[0m[0;34m([0m[0ms[0m[0;34m,[0m [0;34m"set_inheritable"[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    990 [0;31m                [0ms[0m[0;3

In [16]:


msg = EmailMessage()
# email headers
msg['Subject'] = now.strftime("%Y-%m-%d %H:%M:%S.%f") + ' Investment Round'
msg['From'] = 'justindlrig <{0}>'.format(my_gmail_account)
msg['To'] = 'self <{0}>'.format(my_recipients[0])
# set the plain text body
msg.set_content('This is a plain text body.')
# now create a Content-ID for the image
image_cid = make_msgid(domain='xyz.com')#
# if `domain` argument isn't provided, it will 
# use your computer's name

# set an alternative html body
msg.add_alternative("""\
<html>
    <body>
        <p>This is an HTML body.<br>
           It also has an image.
        </p>
        <img src="cid:{image_cid}">
    </body>
</html>
""".format(image_cid=image_cid[1:-1]), subtype='html')
# image_cid looks like <long.random.number@xyz.com>
# to use it as the img src, we don't need `<` or `>`
# so we use [1:-1] to strip them off
with open('/home/justin/projects/nst_star_app/images/ex1.png', 'rb') as img:
    # know the Content-Type of the image
    maintype, subtype = mimetypes.guess_type(img.name)[0].split('/')
    # attach it
    msg.get_payload()[1].add_related(img.read(), 
                                         maintype=maintype, 
                                         subtype=subtype, 
                                         cid=image_cid)

[autoreload of lendingclub.investing.investing_utils failed: Traceback (most recent call last):
  File "/home/justin/anaconda3/envs/lendingclub/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/justin/anaconda3/envs/lendingclub/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 450, in superreload
    update_generic(old_obj, new_obj)
  File "/home/justin/anaconda3/envs/lendingclub/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 387, in update_generic
    update(a, b)
  File "/home/justin/anaconda3/envs/lendingclub/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 357, in update_class
    update_instances(old, new)
  File "/home/justin/anaconda3/envs/lendingclub/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 312, in update_instances
    update_instances(old, new, obj.__dict__, visited)
  File "/home/justin/anaconda3/envs/l

In [17]:
inv_util.send_emails(now, my_gmail_account, my_gmail_password, msg)

In [11]:
msg

<email.message.EmailMessage at 0x7facaa416810>

# moving emailing into invest scripts themselves, based off ethermine scripts

In [None]:
# message = '''
# Ran investment round.
# Cash to invest: ${0}, meaning {1} possible notes to invest in at ${2} each.
# {3} loans seen through api in total.
# {4} loans seen through api excluding already invested. 
# {5} could be ordered due to score or cash available. Min score cutoff is {6}
# Response: {7}, {8}
# Scores from this batch:
# {9}
#     '''.format(cash_to_invest, n_to_pick, inv_amt, all_loan_count, len(api_loans), len(to_order_loan_ids), min_score, order_response, 'order_resp.content goes here', api_loans[['id', 'catboost_clf', 'catboost_regr', 'catboost_regr_scl', 'catboost_comb_29']]) #order_response.content

# send emails
# message = "test investment round \n LC API Response: {0} \n Response Contents: {1}".format(order_resp, order_resp.content)

In [2]:
# %%writefile invest_script.py
#invest_script_instant.py  
# print('From DL Server, wait invest')
import os
import requests
import json
import user_creds.account_info as acc_info
from lendingclub import config
from lendingclub.modeling.models import Model
from j_utils import munging as mg
import re
from sklearn.externals import joblib
# import lendingclub.dataprep_and_modeling.modeling_utils.data_prep_new as data_prep
import lendingclub.investing.investing_utils as investing_utils
from lendingclub.modeling import score_utils as scr_util
# from investing_utils import StandardScalerJustin
import pandas as pd
import numpy as np
import math as math
import torch
import pickle
import datetime
import smtplib
import gspread
import google.auth
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession



In [3]:
# constants and setup for various accounts and APIs
now = datetime.datetime.now()
token = acc_info.token
inv_acc_id = acc_info.investor_id
portfolio_id = acc_info.portfolio_id
my_gmail_account = acc_info.from_email_throwaway
my_gmail_password = acc_info.password_throwaway
my_recipients = acc_info.to_emails_throwaway
invest_ss_key = acc_info.invest_ss_key
investins_ss_key = acc_info.investins_ss_key

# lendingclub stuff
header = {
    'Authorization': token,
    'Content-Type': 'application/json',
    'X-LC-LISTING-VERSION': '1.3'
}

acc_summary_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/summary'
order_url = 'https://api.lendingclub.com/api/investor/v1/accounts/' + \
    str(inv_acc_id) + '/orders'
# min_score = -0.02  # -0.04599714276994965  # -0.035764345824470828
inv_amt = 25.00
cash_limit = 0.00
with open(os.path.join(config.data_dir, 'base_loan_info_dtypes.pkl'), 'rb') as f:
    base_loan_dtypes = pickle.load(f)

# below for google account stuff
creds = service_account.Credentials.from_service_account_file(os.path.join(config.prj_dir, 'user_creds', 'credentials.json'))
scope = ['https://spreadsheets.google.com/feeds']
creds = creds.with_scopes(scope)
gc = gspread.Client(auth=creds)
gc.session = AuthorizedSession(creds)
sheet = gc.open_by_key(invest_ss_key).sheet1
sheetins = gc.open_by_key(investins_ss_key).sheet1


In [4]:
# First check if I have enough money that I want to invest
summary_dict = json.loads(requests.get(
    acc_summary_url, headers=header).content)
cash_to_invest = summary_dict['availableCash']
n_to_pick = int(math.floor(cash_to_invest / inv_amt))

# load model, setup things for model
cb_both = Model('catboost_both')
# clf_wt_29_scorer will combine the regr and clf scores, with clf wt of 29%
clf_wt_29_scorer = scr_util.combined_score(.29)

# wait until it is time to do the api call. I'm rate limited to 1 call a second
investing_utils.pause_until_time(test=True)

In [5]:
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)
# match format of cr_line dates and emp_length, dti, dti_joint
api_loans['earliest_cr_line'] = pd.to_datetime(api_loans['earliest_cr_line'].str[:10])
api_loans['sec_app_earliest_cr_line'] = pd.to_datetime(api_loans['sec_app_earliest_cr_line'].str[:10])
bins = [12*k for k in range(1,11)]
bins = [-np.inf] + bins + [np.inf]
labels = ['< 1 year','1 year','2 years','3 years','4 years','5 years','6 years','7 years','8 years','9 years','10+ years',]
api_loans['emp_length'] = pd.cut(api_loans['emp_length'], bins=bins, labels=labels, right=False).astype(str).replace({'nan':'None'})
api_loans['dti'] = api_loans['dti'].replace({9999:np.nan})
api_loans['dti_joint'] = api_loans['dti_joint'].replace({9999:np.nan})
api_loans = api_loans.astype(base_loan_dtypes)

# Compare that api_loans get procced in same way as say, base_loan_info

In [7]:
# load model
cb_both = Model('catboost_both')
base_loan_info = pd.read_feather(os.path.join(config.data_dir, 'base_loan_info.fth'))
print(base_loan_info.shape)

(99997, 90)


In [10]:
tr_pr_blidf, cols, max_dict, min_dict, fill_dict, cats_dict, norm_dict = mg.train_proc(base_loan_info.copy())

dropping the following cols: 
['sec_app_earliest_cr_lineDay', 'sec_app_earliest_cr_lineIs_month_end', 'sec_app_earliest_cr_lineIs_quarter_end', 'sec_app_earliest_cr_lineIs_year_end', 'earliest_cr_lineDay', 'earliest_cr_lineIs_month_end', 'earliest_cr_lineIs_month_start', 'earliest_cr_lineIs_quarter_end', 'earliest_cr_lineIs_year_end']
only 2 values, consider dropping the following cols: 
['term', 'application_type', 'initial_list_status', 'sec_app_earliest_cr_lineIs_month_start', 'sec_app_earliest_cr_lineIs_quarter_start', 'sec_app_earliest_cr_lineIs_year_start', 'earliest_cr_lineIs_quarter_start', 'earliest_cr_lineIs_year_start']
made the following new null columns
['pct_tl_nvr_dlq_isnull', 'sec_app_revol_util_isnull', 'sec_app_num_rev_accts_isnull', 'revol_bal_joint_isnull', 'open_rv_12m_isnull', 'mo_sin_old_rev_tl_op_isnull', 'sec_app_open_act_il_isnull', 'mths_since_recent_revol_delinq_isnull', 'pub_rec_bankruptcies_isnull', 'tot_hi_cred_lim_isnull', 'num_bc_sats_isnull', 'mo_sin_o

In [11]:
val_pr_blidf = mg.val_test_proc(base_loan_info.copy(), cols, max_dict, min_dict, fill_dict, cats_dict, norm_dict)

In [12]:
val_pr_apidf = mg.val_test_proc(api_loans.copy(), cols, max_dict, min_dict, fill_dict, cats_dict, norm_dict)

In [13]:
# So the same dataframe used to train_proc, when passed to val_test_proc, turns out fine
print(tr_pr_blidf.shape, val_pr_blidf.shape)
assert not tr_pr_blidf.ne(val_pr_blidf).sum().sum(), 'two dataframes have values not equal'
assert not (tr_pr_blidf.dtypes != val_pr_blidf.dtypes).sum(), 'two dataframes have dtypes not equal'

(99997, 174) (99997, 174)


In [136]:
def map_col_val_procval(df, pdf):
    '''
    Get every mapping from value in original df to proc_df
    Need to decide how to evaluate the cols in pdf that are not in df.
    '''
    df_to_pdf = {}
    new_proc_cols = []
    for col in pdf.columns:
        if col in df:
            compare = pd.concat([df[col], pdf[col]], axis=1)
            df_to_pdf[col] = dict(zip(*compare.drop_duplicates().values.T))
        else:
            if 'isnull' in col:
                assert pdf[col].nunique() <= 2
                assert (pdf[col].isin([0,1])).all()
    return df_to_pdf

def check_proc_btwn_dfs(map1: dict, map2: dict):
    '''
    Given map1 and map2 from the map_col_val_procval function, iterate through
    the keys and verify that if same key is in both dicts, 
    map1/2 are dictionaries that are dict[colname]: dict[ori_val]: processed_val
    map2 should be from api loans
    '''
    mismatch_val = []
    new_api_val = set()
    for col, dic in map2.items():
        for k,v in dic.items():
            if k in map1[col] and v != map1[col][k]:
                mismatch_val.append((col,k, v))
                break
            elif k not in map1[col]:
                new_api_val.add(col)
    return mismatch_val, new_api_val

In [138]:
df1_to_pdf1 = map_col_val_procval(base_loan_info, tr_pr_blidf)
df2_to_pdf2 = map_col_val_procval(base_loan_info, val_pr_blidf)
df3_to_pdf3 = map_col_val_procval(api_loans, val_pr_apidf)

In [139]:
check_proc_btwn_dfs(df1_to_pdf1, df3_to_pdf3)

([],
 {'all_util',
  'annual_inc',
  'annual_inc_joint',
  'avg_cur_bal',
  'bc_open_to_buy',
  'dti',
  'dti_joint',
  'id',
  'int_rate',
  'max_bal_bc',
  'mo_sin_old_il_acct',
  'mths_since_last_delinq',
  'mths_since_last_major_derog',
  'mths_since_last_record',
  'mths_since_rcnt_il',
  'mths_since_recent_bc_dlq',
  'mths_since_recent_inq',
  'mths_since_recent_revol_delinq',
  'pct_tl_nvr_dlq',
  'revol_bal',
  'revol_bal_joint',
  'sec_app_chargeoff_within_12_mths',
  'sec_app_collections_12_mths_ex_med',
  'sec_app_fico_range_high',
  'sec_app_fico_range_low',
  'sec_app_mort_acc',
  'sec_app_mths_since_last_major_derog',
  'sec_app_num_rev_accts',
  'sec_app_open_acc',
  'sec_app_open_act_il',
  'sec_app_revol_util',
  'tot_coll_amt',
  'tot_cur_bal',
  'tot_hi_cred_lim',
  'total_bal_ex_mort',
  'total_bal_il',
  'total_il_high_credit_limit'})

In [140]:
base_loan_info.head()

Unnamed: 0,pct_tl_nvr_dlq,annual_inc,pub_rec,sec_app_revol_util,sec_app_num_rev_accts,revol_bal_joint,open_rv_12m,mo_sin_old_rev_tl_op,sec_app_open_act_il,mths_since_recent_revol_delinq,purpose,pub_rec_bankruptcies,tot_hi_cred_lim,num_bc_sats,mo_sin_old_il_acct,id,term,tot_coll_amt,num_actv_bc_tl,sec_app_mort_acc,bc_util,open_il_12m,sec_app_earliest_cr_line,application_type,revol_util,dti,annual_inc_joint,total_il_high_credit_limit,emp_length,inq_fi,sec_app_fico_range_low,collections_12_mths_ex_med,tax_liens,num_tl_90g_dpd_24m,sec_app_collections_12_mths_ex_med,mths_since_last_delinq,open_act_il,total_acc,sub_grade,delinq_amnt,mths_since_recent_bc_dlq,sec_app_chargeoff_within_12_mths,sec_app_mths_since_last_major_derog,dti_joint,earliest_cr_line,mths_since_last_record,num_tl_30dpd,open_acc,open_il_24m,num_op_rev_tl,mths_since_last_major_derog,mo_sin_rcnt_tl,avg_cur_bal,total_rev_hi_lim,total_bal_ex_mort,num_rev_tl_bal_gt_0,mths_since_rcnt_il,num_il_tl,home_ownership,total_cu_tl,inq_last_12m,num_bc_tl,mths_since_recent_inq,revol_bal,addr_state,fico_range_high,acc_now_delinq,num_tl_op_past_12m,grade,mths_since_recent_bc,open_acc_6m,int_rate,num_sats,sec_app_fico_range_high,all_util,open_rv_24m,chargeoff_within_12_mths,total_bal_il,percent_bc_gt_75,num_rev_accts,tot_cur_bal,sec_app_open_acc,max_bal_bc,mo_sin_rcnt_rev_tl_op,num_actv_rev_tl,mort_acc,fico_range_low,initial_list_status,total_bc_limit,bc_open_to_buy
0,,32000.0,0.0,,,,,,,,debt_consolidation,0.0,,,,57416,36,,,,,,NaT,INDIVIDUAL,25.6,11.63,,,6 years,,,0.0,0.0,,,58.0,,40.0,C3,0.0,,,,,1996-12-01,,,14.0,,,,,,,,,,,RENT,,,,,3511.0,CT,684.0,0.0,,C,,,13.57,,,,,0.0,,,,,,,,,,680.0,F,,
1,100.0,65000.0,0.0,,,,3.0,48.0,,,debt_consolidation,0.0,145285.0,5.0,163.0,65104,36,0.0,4.0,,81.800003,2.0,NaT,INDIVIDUAL,81.900002,18.24,,127285.0,2 years,2.0,,0.0,0.0,0.0,,,12.0,26.0,D4,0.0,,,,,2002-09-01,,0.0,19.0,6.0,7.0,,6.0,9108.0,18000.0,173057.0,6.0,6.0,19.0,RENT,2.0,4.0,5.0,0.0,14741.0,ME,664.0,0.0,5.0,D,6.0,2.0,18.99,19.0,,110.0,3.0,0.0,158316.0,60.0,7.0,173057.0,,4878.0,6.0,6.0,0.0,660.0,W,14600.0,2654.0
2,,50000.0,0.0,,,,,,,,vacation,,,,,121530,36,,,,,,NaT,INDIVIDUAL,0.7,5.35,,,9 years,,,0.0,0.0,,,0.0,,29.0,A3,0.0,,,,,1984-09-01,0.0,,17.0,,,,,,,,,,,OWN,,,,,21050.0,WA,764.0,0.0,,A,,,7.75,,,,,0.0,,,,,,,,,,760.0,F,,
3,,90000.0,0.0,,,,,,,,credit_card,,,,,122718,36,,,,,,NaT,INDIVIDUAL,62.200001,5.24,,,1 year,,,0.0,0.0,,,0.0,,15.0,F2,0.0,,,,,2002-10-01,0.0,,8.0,,,,,,,,,,,RENT,,,,,22379.0,NY,664.0,0.0,,F,,,15.33,,,,,0.0,,,,,,,,,,660.0,F,,
4,,1896.0,0.0,,,,,,,,debt_consolidation,,,,,123688,36,,,,,,NaT,INDIVIDUAL,87.699997,18.99,,,1 year,,,0.0,0.0,,,0.0,,3.0,G3,0.0,,,,,2006-05-01,0.0,,3.0,,,,,,,,,,,RENT,,,,,702.0,AZ,644.0,0.0,,G,,,17.219999,,,,,0.0,,,,,,,,,,640.0,F,,


In [148]:
# val_pr_blidf.iloc[base_loan_info.query('emp_length == "10+ years"').index].head()

In [141]:
api_loans.head()[base_loan_info.columns]

Unnamed: 0,pct_tl_nvr_dlq,annual_inc,pub_rec,sec_app_revol_util,sec_app_num_rev_accts,revol_bal_joint,open_rv_12m,mo_sin_old_rev_tl_op,sec_app_open_act_il,mths_since_recent_revol_delinq,purpose,pub_rec_bankruptcies,tot_hi_cred_lim,num_bc_sats,mo_sin_old_il_acct,id,term,tot_coll_amt,num_actv_bc_tl,sec_app_mort_acc,bc_util,open_il_12m,sec_app_earliest_cr_line,application_type,revol_util,dti,annual_inc_joint,total_il_high_credit_limit,emp_length,inq_fi,sec_app_fico_range_low,collections_12_mths_ex_med,tax_liens,num_tl_90g_dpd_24m,sec_app_collections_12_mths_ex_med,mths_since_last_delinq,open_act_il,total_acc,sub_grade,delinq_amnt,mths_since_recent_bc_dlq,sec_app_chargeoff_within_12_mths,sec_app_mths_since_last_major_derog,dti_joint,earliest_cr_line,mths_since_last_record,num_tl_30dpd,open_acc,open_il_24m,num_op_rev_tl,mths_since_last_major_derog,mo_sin_rcnt_tl,avg_cur_bal,total_rev_hi_lim,total_bal_ex_mort,num_rev_tl_bal_gt_0,mths_since_rcnt_il,num_il_tl,home_ownership,total_cu_tl,inq_last_12m,num_bc_tl,mths_since_recent_inq,revol_bal,addr_state,fico_range_high,acc_now_delinq,num_tl_op_past_12m,grade,mths_since_recent_bc,open_acc_6m,int_rate,num_sats,sec_app_fico_range_high,all_util,open_rv_24m,chargeoff_within_12_mths,total_bal_il,percent_bc_gt_75,num_rev_accts,tot_cur_bal,sec_app_open_acc,max_bal_bc,mo_sin_rcnt_rev_tl_op,num_actv_rev_tl,mort_acc,fico_range_low,initial_list_status,total_bc_limit,bc_open_to_buy
0,71.0,85000.0,0.0,,,,0.0,170.0,,10.0,debt_consolidation,0.0,66914.0,5.0,140.0,162440644,36,0.0,3.0,,59.099998,0.0,NaT,INDIVIDUAL,48.700001,19.139999,,44414.0,10+ years,2.0,,0.0,0.0,0.0,,10.0,2.0,21.0,D3,0.0,10.0,,,,2005-09-24,,0.0,10.0,0.0,8.0,,54.0,1512.0,22500.0,15118.0,4.0,54.0,11.0,RENT,1.0,2.0,5.0,3.0,10962.0,CA,674.0,0.0,0.0,D,67.0,0.0,23.049999,10.0,,22.6,0.0,0.0,4156.0,40.0,10.0,15118.0,,4457.0,61.0,4.0,0.0,670.0,F,13500.0,5526.0
1,85.0,60000.0,0.0,,,,0.0,95.0,,7.0,debt_consolidation,0.0,82782.0,4.0,77.0,162477892,36,0.0,1.0,,42.200001,1.0,NaT,INDIVIDUAL,42.200001,29.370001,,59182.0,1 year,2.0,,0.0,0.0,1.0,,7.0,3.0,20.0,D3,0.0,7.0,,,,2011-12-25,,0.0,11.0,3.0,8.0,7.0,1.0,5777.0,23600.0,57771.0,3.0,1.0,8.0,OWN,4.0,1.0,6.0,1.0,9966.0,TX,669.0,0.0,1.0,D,45.0,1.0,23.049999,11.0,,58.099998,1.0,0.0,47805.0,33.299999,12.0,57771.0,,5364.0,15.0,3.0,0.0,665.0,F,12700.0,7336.0
2,37.0,55000.0,0.0,,,,3.0,72.0,,37.0,credit_card,0.0,66373.0,6.0,135.0,163081291,36,0.0,6.0,,70.599998,0.0,NaT,INDIVIDUAL,62.0,12.04,,48373.0,< 1 year,0.0,,0.0,0.0,0.0,,37.0,2.0,38.0,B4,0.0,37.0,,,,2008-09-08,,0.0,11.0,1.0,9.0,40.0,3.0,4921.0,18000.0,54136.0,7.0,21.0,28.0,RENT,0.0,2.0,7.0,3.0,11162.0,FL,674.0,0.0,3.0,B,3.0,1.0,12.4,11.0,,62.700001,3.0,0.0,42974.0,50.0,10.0,54136.0,,4150.0,3.0,7.0,0.0,670.0,F,15700.0,4620.0
3,100.0,63000.0,0.0,,,,1.0,65.0,,,debt_consolidation,0.0,69643.0,7.0,76.0,162774715,36,3203.0,5.0,,30.200001,1.0,NaT,INDIVIDUAL,26.299999,24.209999,,40543.0,1 year,1.0,,0.0,0.0,0.0,,,2.0,14.0,C1,0.0,,,,,2013-08-02,,0.0,11.0,2.0,9.0,,7.0,3955.0,29100.0,43501.0,6.0,8.0,3.0,RENT,0.0,4.0,7.0,4.0,7653.0,MA,679.0,0.0,2.0,C,7.0,0.0,14.3,11.0,,62.5,2.0,0.0,35848.0,0.0,11.0,43501.0,,3192.0,7.0,6.0,0.0,675.0,F,23300.0,16260.0
4,100.0,57000.0,0.0,23.6,18.0,22574.0,2.0,168.0,2.0,,debt_consolidation,0.0,31696.0,2.0,42.0,162845686,36,623.0,2.0,1.0,32.5,0.0,2005-12-03,JOINT,42.799999,16.799999,114000.0,24396.0,4 years,3.0,655.0,0.0,0.0,0.0,0.0,,1.0,7.0,C4,0.0,,0.0,41.0,18.870001,2005-12-03,,0.0,5.0,0.0,4.0,,8.0,2779.0,7300.0,13896.0,3.0,42.0,1.0,OWN,0.0,6.0,2.0,6.0,3126.0,AL,674.0,0.0,2.0,C,26.0,0.0,16.950001,5.0,659.0,43.799999,2.0,0.0,10770.0,0.0,6.0,13896.0,9.0,1426.0,8.0,3.0,0.0,670.0,F,5100.0,3441.0


In [143]:
val_pr_blidf.head()

Unnamed: 0,pct_tl_nvr_dlq,annual_inc,pub_rec,sec_app_revol_util,sec_app_num_rev_accts,revol_bal_joint,open_rv_12m,mo_sin_old_rev_tl_op,sec_app_open_act_il,mths_since_recent_revol_delinq,purpose,pub_rec_bankruptcies,tot_hi_cred_lim,num_bc_sats,mo_sin_old_il_acct,id,term,tot_coll_amt,num_actv_bc_tl,sec_app_mort_acc,bc_util,open_il_12m,application_type,revol_util,dti,annual_inc_joint,total_il_high_credit_limit,emp_length,inq_fi,sec_app_fico_range_low,collections_12_mths_ex_med,tax_liens,num_tl_90g_dpd_24m,sec_app_collections_12_mths_ex_med,mths_since_last_delinq,open_act_il,total_acc,sub_grade,delinq_amnt,mths_since_recent_bc_dlq,sec_app_chargeoff_within_12_mths,sec_app_mths_since_last_major_derog,dti_joint,mths_since_last_record,num_tl_30dpd,open_acc,open_il_24m,num_op_rev_tl,mths_since_last_major_derog,mo_sin_rcnt_tl,avg_cur_bal,total_rev_hi_lim,total_bal_ex_mort,num_rev_tl_bal_gt_0,mths_since_rcnt_il,num_il_tl,home_ownership,total_cu_tl,inq_last_12m,num_bc_tl,mths_since_recent_inq,revol_bal,addr_state,fico_range_high,acc_now_delinq,num_tl_op_past_12m,grade,mths_since_recent_bc,open_acc_6m,int_rate,num_sats,sec_app_fico_range_high,all_util,open_rv_24m,chargeoff_within_12_mths,total_bal_il,percent_bc_gt_75,num_rev_accts,tot_cur_bal,sec_app_open_acc,max_bal_bc,mo_sin_rcnt_rev_tl_op,num_actv_rev_tl,mort_acc,fico_range_low,initial_list_status,total_bc_limit,bc_open_to_buy,sec_app_earliest_cr_lineYear,sec_app_earliest_cr_lineMonth,sec_app_earliest_cr_lineWeek,sec_app_earliest_cr_lineDayofweek,sec_app_earliest_cr_lineDayofyear,sec_app_earliest_cr_lineIs_month_start,sec_app_earliest_cr_lineIs_quarter_start,sec_app_earliest_cr_lineIs_year_start,earliest_cr_lineYear,earliest_cr_lineMonth,earliest_cr_lineWeek,earliest_cr_lineDayofweek,earliest_cr_lineDayofyear,earliest_cr_lineIs_quarter_start,earliest_cr_lineIs_year_start,pct_tl_nvr_dlq_isnull,sec_app_revol_util_isnull,sec_app_num_rev_accts_isnull,revol_bal_joint_isnull,open_rv_12m_isnull,mo_sin_old_rev_tl_op_isnull,sec_app_open_act_il_isnull,mths_since_recent_revol_delinq_isnull,pub_rec_bankruptcies_isnull,tot_hi_cred_lim_isnull,num_bc_sats_isnull,mo_sin_old_il_acct_isnull,tot_coll_amt_isnull,num_actv_bc_tl_isnull,sec_app_mort_acc_isnull,bc_util_isnull,open_il_12m_isnull,revol_util_isnull,dti_isnull,annual_inc_joint_isnull,total_il_high_credit_limit_isnull,inq_fi_isnull,sec_app_fico_range_low_isnull,num_tl_90g_dpd_24m_isnull,sec_app_collections_12_mths_ex_med_isnull,mths_since_last_delinq_isnull,open_act_il_isnull,mths_since_recent_bc_dlq_isnull,sec_app_chargeoff_within_12_mths_isnull,sec_app_mths_since_last_major_derog_isnull,dti_joint_isnull,mths_since_last_record_isnull,num_tl_30dpd_isnull,open_il_24m_isnull,num_op_rev_tl_isnull,mths_since_last_major_derog_isnull,mo_sin_rcnt_tl_isnull,avg_cur_bal_isnull,total_rev_hi_lim_isnull,total_bal_ex_mort_isnull,num_rev_tl_bal_gt_0_isnull,mths_since_rcnt_il_isnull,num_il_tl_isnull,total_cu_tl_isnull,inq_last_12m_isnull,num_bc_tl_isnull,mths_since_recent_inq_isnull,num_tl_op_past_12m_isnull,mths_since_recent_bc_isnull,open_acc_6m_isnull,num_sats_isnull,sec_app_fico_range_high_isnull,all_util_isnull,open_rv_24m_isnull,total_bal_il_isnull,percent_bc_gt_75_isnull,num_rev_accts_isnull,tot_cur_bal_isnull,sec_app_open_acc_isnull,max_bal_bc_isnull,mo_sin_rcnt_rev_tl_op_isnull,num_actv_rev_tl_isnull,mort_acc_isnull,total_bc_limit_isnull,bc_open_to_buy_isnull,sec_app_earliest_cr_lineYear_isnull,sec_app_earliest_cr_lineMonth_isnull,sec_app_earliest_cr_lineWeek_isnull,sec_app_earliest_cr_lineDay_isnull,sec_app_earliest_cr_lineDayofweek_isnull,sec_app_earliest_cr_lineDayofyear_isnull
0,0.64432,-0.571083,-0.313733,0.075816,-0.208157,-0.258458,-0.185517,-0.18966,-0.318921,-0.137057,3,-0.349041,-0.346261,-0.264538,0.080642,-1.833757,-0.646522,-0.12103,-0.290852,-0.305598,0.066162,-0.732915,1,-0.976441,-0.47636,-0.201896,-0.243049,7,-0.0198,-0.03491,-0.119121,-0.101635,-0.162512,-0.170328,1.050593,-0.257343,1.320038,13,-0.01584,-0.063049,-0.110058,-0.072406,-0.040747,0.073466,-0.047633,0.418699,-0.358778,-0.2699,0.021548,-0.253467,-0.370014,-0.264824,-0.261135,-0.173535,-0.313437,-0.325451,5,-0.550332,-0.429246,-0.150248,-0.179532,-0.56907,7,-0.586258,-0.057427,-0.038725,3,-0.33201,0.066868,0.107439,-0.113774,-0.03491,0.025582,-0.274891,-0.077377,-0.280753,-0.226183,-0.244546,-0.388255,-0.239665,-0.254035,-0.346315,-0.183356,-0.28545,-0.586262,1,-0.299965,-0.359203,2004.0,7.0,27.0,3.0,182.0,False,False,False,1996,12,48,6,336,False,False,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,0.64432,-0.167405,-0.313733,0.075816,-0.208157,-0.258458,1.148653,-1.369048,-0.318921,-0.137057,3,-0.349041,-0.187947,0.062013,0.695706,-1.833595,-0.646522,-0.12103,0.133745,-0.305598,0.861803,1.432463,1,1.295831,-0.047186,-0.201896,1.804201,3,0.65903,-0.03491,-0.119121,-0.101635,-0.162512,-0.170328,-0.129281,3.095193,0.157658,19,-0.01584,-0.063049,-0.110058,-0.072406,-0.040747,0.073466,-0.047633,1.301732,2.824469,-0.2699,0.021548,-0.253467,-0.266113,-0.494231,2.387878,0.128544,-0.581218,1.433444,5,0.187614,0.843627,-0.574165,-1.182875,-0.090006,21,-1.186068,-0.057427,1.593492,4,-0.57818,0.947872,1.229335,1.298577,-0.03491,2.565213,0.111445,-0.077377,2.740793,0.51315,-0.866411,0.180539,-0.239665,-0.186872,-0.459465,0.111036,-0.815379,-1.186084,2,-0.393513,-0.543942,2004.0,7.0,27.0,3.0,182.0,False,False,False,2002,9,35,6,244,False,False,0,1,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,1,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,1,1,1
2,0.64432,-0.350895,-0.313733,0.075816,-0.208157,-0.258458,-0.185517,-0.18966,-0.318921,-0.137057,13,-0.349041,-0.346261,-0.264538,0.080642,-1.832406,-0.646522,-0.12103,-0.290852,-0.305598,0.066162,-0.732915,1,-1.981407,-0.884108,-0.201896,-0.243049,10,-0.0198,-0.03491,-0.119121,-0.101635,-0.162512,-0.170328,-1.581434,-0.257343,0.406739,3,-0.01584,-0.063049,-0.110058,-0.072406,-0.040747,-2.749543,-0.047633,0.948519,-0.358778,-0.2699,0.021548,-0.253467,-0.370014,-0.264824,-0.261135,-0.173535,-0.313437,-0.325451,4,-0.550332,-0.429246,-0.150248,-0.179532,0.179132,47,1.812985,-0.057427,-0.038725,1,-0.33201,0.066868,-1.097254,-0.113774,-0.03491,0.025582,-0.274891,-0.077377,-0.280753,-0.226183,-0.244546,-0.388255,-0.239665,-0.254035,-0.346315,-0.183356,-0.28545,1.813028,1,-0.299965,-0.359203,2004.0,7.0,27.0,3.0,182.0,False,False,False,1984,9,35,5,245,False,False,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,0.64432,0.138412,-0.313733,0.075816,-0.208157,-0.258458,-0.185517,-0.18966,-0.318921,-0.137057,2,-0.349041,-0.346261,-0.264538,0.080642,-1.832381,-0.646522,-0.12103,-0.290852,-0.305598,0.066162,-0.732915,1,0.500737,-0.89125,-0.201896,-0.243049,1,-0.0198,-0.03491,-0.119121,-0.101635,-0.162512,-0.170328,-1.581434,-0.257343,-0.755641,27,-0.01584,-0.063049,-0.110058,-0.072406,-0.040747,-2.749543,-0.047633,-0.64094,-0.358778,-0.2699,0.021548,-0.253467,-0.370014,-0.264824,-0.261135,-0.173535,-0.313437,-0.325451,5,-0.550332,-0.429246,-0.150248,-0.179532,0.235826,34,-1.186068,-0.057427,-0.038725,6,-0.33201,0.066868,0.471745,-0.113774,-0.03491,0.025582,-0.274891,-0.077377,-0.280753,-0.226183,-0.244546,-0.388255,-0.239665,-0.254035,-0.346315,-0.183356,-0.28545,-1.186084,1,-0.299965,-0.359203,2004.0,7.0,27.0,3.0,182.0,False,False,False,2002,10,40,1,274,True,False,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,0.64432,-0.939336,-0.313733,0.075816,-0.208157,-0.258458,-0.185517,-0.18966,-0.318921,-0.137057,3,-0.349041,-0.346261,-0.264538,0.080642,-1.83236,-0.646522,-0.12103,-0.290852,-0.305598,0.066162,-0.732915,1,1.529919,0.001509,-0.201896,-0.243049,1,-0.0198,-0.03491,-0.119121,-0.101635,-0.162512,-0.170328,-1.581434,-0.257343,-1.751968,33,-0.01584,-0.063049,-0.110058,-0.072406,-0.040747,-2.749543,-0.047633,-1.523973,-0.358778,-0.2699,0.021548,-0.253467,-0.370014,-0.264824,-0.261135,-0.173535,-0.313437,-0.325451,5,-0.550332,-0.429246,-0.150248,-0.179532,-0.6889,4,-1.785879,-0.057427,-0.038725,7,-0.33201,0.066868,0.862959,-0.113774,-0.03491,0.025582,-0.274891,-0.077377,-0.280753,-0.226183,-0.244546,-0.388255,-0.239665,-0.254035,-0.346315,-0.183356,-0.28545,-1.785907,1,-0.299965,-0.359203,2004.0,7.0,27.0,3.0,182.0,False,False,False,2006,5,18,0,121,False,False,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [144]:
val_pr_apidf.head()

Unnamed: 0,pct_tl_nvr_dlq,annual_inc,pub_rec,sec_app_revol_util,sec_app_num_rev_accts,revol_bal_joint,open_rv_12m,mo_sin_old_rev_tl_op,sec_app_open_act_il,mths_since_recent_revol_delinq,purpose,pub_rec_bankruptcies,tot_hi_cred_lim,num_bc_sats,mo_sin_old_il_acct,id,term,tot_coll_amt,num_actv_bc_tl,sec_app_mort_acc,bc_util,open_il_12m,application_type,revol_util,dti,annual_inc_joint,total_il_high_credit_limit,emp_length,inq_fi,sec_app_fico_range_low,collections_12_mths_ex_med,tax_liens,num_tl_90g_dpd_24m,sec_app_collections_12_mths_ex_med,mths_since_last_delinq,open_act_il,total_acc,sub_grade,delinq_amnt,mths_since_recent_bc_dlq,sec_app_chargeoff_within_12_mths,sec_app_mths_since_last_major_derog,dti_joint,mths_since_last_record,num_tl_30dpd,open_acc,open_il_24m,num_op_rev_tl,mths_since_last_major_derog,mo_sin_rcnt_tl,avg_cur_bal,total_rev_hi_lim,total_bal_ex_mort,num_rev_tl_bal_gt_0,mths_since_rcnt_il,num_il_tl,home_ownership,total_cu_tl,inq_last_12m,num_bc_tl,mths_since_recent_inq,revol_bal,addr_state,fico_range_high,acc_now_delinq,num_tl_op_past_12m,grade,mths_since_recent_bc,open_acc_6m,int_rate,num_sats,sec_app_fico_range_high,all_util,open_rv_24m,chargeoff_within_12_mths,total_bal_il,percent_bc_gt_75,num_rev_accts,tot_cur_bal,sec_app_open_acc,max_bal_bc,mo_sin_rcnt_rev_tl_op,num_actv_rev_tl,mort_acc,fico_range_low,initial_list_status,total_bc_limit,bc_open_to_buy,sec_app_earliest_cr_lineYear,sec_app_earliest_cr_lineMonth,sec_app_earliest_cr_lineWeek,sec_app_earliest_cr_lineDayofweek,sec_app_earliest_cr_lineDayofyear,sec_app_earliest_cr_lineIs_month_start,sec_app_earliest_cr_lineIs_quarter_start,sec_app_earliest_cr_lineIs_year_start,earliest_cr_lineYear,earliest_cr_lineMonth,earliest_cr_lineWeek,earliest_cr_lineDayofweek,earliest_cr_lineDayofyear,earliest_cr_lineIs_quarter_start,earliest_cr_lineIs_year_start,pct_tl_nvr_dlq_isnull,sec_app_revol_util_isnull,sec_app_num_rev_accts_isnull,revol_bal_joint_isnull,open_rv_12m_isnull,mo_sin_old_rev_tl_op_isnull,sec_app_open_act_il_isnull,mths_since_recent_revol_delinq_isnull,pub_rec_bankruptcies_isnull,tot_hi_cred_lim_isnull,num_bc_sats_isnull,mo_sin_old_il_acct_isnull,tot_coll_amt_isnull,num_actv_bc_tl_isnull,sec_app_mort_acc_isnull,bc_util_isnull,open_il_12m_isnull,revol_util_isnull,dti_isnull,annual_inc_joint_isnull,total_il_high_credit_limit_isnull,inq_fi_isnull,sec_app_fico_range_low_isnull,num_tl_90g_dpd_24m_isnull,sec_app_collections_12_mths_ex_med_isnull,mths_since_last_delinq_isnull,open_act_il_isnull,mths_since_recent_bc_dlq_isnull,sec_app_chargeoff_within_12_mths_isnull,sec_app_mths_since_last_major_derog_isnull,dti_joint_isnull,mths_since_last_record_isnull,num_tl_30dpd_isnull,open_il_24m_isnull,num_op_rev_tl_isnull,mths_since_last_major_derog_isnull,mo_sin_rcnt_tl_isnull,avg_cur_bal_isnull,total_rev_hi_lim_isnull,total_bal_ex_mort_isnull,num_rev_tl_bal_gt_0_isnull,mths_since_rcnt_il_isnull,num_il_tl_isnull,total_cu_tl_isnull,inq_last_12m_isnull,num_bc_tl_isnull,mths_since_recent_inq_isnull,num_tl_op_past_12m_isnull,mths_since_recent_bc_isnull,open_acc_6m_isnull,num_sats_isnull,sec_app_fico_range_high_isnull,all_util_isnull,open_rv_24m_isnull,total_bal_il_isnull,percent_bc_gt_75_isnull,num_rev_accts_isnull,tot_cur_bal_isnull,sec_app_open_acc_isnull,max_bal_bc_isnull,mo_sin_rcnt_rev_tl_op_isnull,num_actv_rev_tl_isnull,mort_acc_isnull,total_bc_limit_isnull,bc_open_to_buy_isnull,sec_app_earliest_cr_lineYear_isnull,sec_app_earliest_cr_lineMonth_isnull,sec_app_earliest_cr_lineWeek_isnull,sec_app_earliest_cr_lineDay_isnull,sec_app_earliest_cr_lineDayofweek_isnull,sec_app_earliest_cr_lineDayofyear_isnull
0,-2.572488,0.077248,-0.313733,0.075816,-0.208157,-0.258458,-0.852603,-0.117871,-0.318921,-1.159673,3,-0.349041,-0.609495,0.062013,0.267025,1.58936,-0.646522,-0.12103,-0.290852,-0.305598,0.069652,-0.732915,1,-0.044123,0.011249,-0.201896,0.006999,2,0.65903,-0.03491,-0.119121,-0.101635,-0.162512,-0.170328,-1.127636,-0.257343,-0.257478,18,-0.01584,-1.302442,-0.110058,-0.072406,-0.040747,0.073466,-0.047633,-0.287727,-0.995427,-0.057296,0.021548,4.950369,-0.715816,-0.366782,-0.711389,-0.475614,1.254992,0.351047,5,-0.181359,-0.004955,-0.574165,-0.681203,-0.251216,5,-0.886163,-0.057427,-1.126869,4,1.298866,-0.814136,2.069722,-0.290317,-0.03491,-1.622782,-1.047561,-0.077377,-0.703644,-0.040657,-0.493292,-0.782908,-0.239665,-0.262678,2.652168,-0.477748,-0.815379,-0.886173,1,-0.440287,-0.37604,2004.0,7.0,27.0,3.0,182.0,False,False,False,2005,9,38,5,267,False,False,0.0,1,1,1,0.0,0.0,1,0,0.0,0.0,0.0,0,0.0,0.0,1,0.0,0.0,0.0,0,1,0.0,0.0,1,0.0,1,0,0.0,0,1,1,1,1,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,1,1,1
1,-1.019546,-0.228569,-0.313733,0.075816,-0.208157,-0.258458,-0.852603,-0.887037,-0.318921,-1.293057,3,-0.349041,-0.524143,-0.264538,-0.907188,1.590145,-0.646522,-0.12103,-1.140046,-0.305598,-0.520099,0.349774,1,-0.306464,0.675462,-0.201896,0.327269,1,0.65903,-0.03491,-0.119121,-0.101635,1.827671,-0.170328,-1.263775,0.077911,-0.340506,18,-0.01584,-1.435234,-0.110058,-0.072406,-0.040747,0.073466,-0.047633,-0.111121,0.914521,-0.057296,-1.72976,-0.795534,-0.463317,-0.335628,0.125598,-0.777693,-0.77249,-0.054852,4,0.92556,-0.429246,-0.362206,-1.015651,-0.293704,43,-1.036116,-0.057427,-0.582797,4,0.621899,0.066868,2.069722,-0.113774,-0.03491,0.078292,-0.661226,-0.077377,0.271617,-0.226183,-0.244546,-0.52272,-0.239665,-0.099363,0.049711,-0.772141,-0.815379,-1.036129,1,-0.474305,-0.270225,2004.0,7.0,27.0,3.0,182.0,False,False,False,2011,12,51,6,359,False,False,0.0,1,1,1,0.0,0.0,1,0,0.0,0.0,0.0,0,0.0,0.0,1,0.0,0.0,0.0,0,1,0.0,0.0,1,0.0,1,0,0.0,0,1,1,1,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,1,1,1
2,-6.343918,-0.289732,-0.313733,0.075816,-0.208157,-0.258458,1.148653,-1.122915,-0.318921,0.040789,2,-0.349041,-0.612405,0.388564,0.173834,1.602865,-0.646522,-0.12103,0.982939,-0.305598,0.470962,-0.732915,1,0.492665,-0.44974,-0.201896,0.092857,11,-0.69863,-0.03491,-0.119121,-0.101635,-0.162512,-0.170328,0.097618,-0.257343,1.153984,9,-0.01584,-0.107313,-0.110058,-0.072406,-0.040747,0.073466,-0.047633,-0.111121,-0.358778,0.155308,-0.208887,-0.578707,-0.513994,-0.494231,0.054268,0.430622,-0.007402,2.65114,5,-0.550332,-0.004955,-0.150248,-0.681203,-0.242684,10,-0.886163,-0.057427,0.505347,2,-0.670494,0.066868,-0.134741,-0.113774,-0.03491,0.298712,0.111445,-0.077377,0.163676,0.236247,-0.493292,-0.544893,-0.239665,-0.317956,-0.629191,0.405428,-0.815379,-0.886173,1,-0.346739,-0.429006,2004.0,7.0,27.0,3.0,182.0,False,False,False,2008,9,37,0,252,False,False,0.0,1,1,1,0.0,0.0,1,0,0.0,0.0,0.0,0,0.0,0.0,1,0.0,0.0,0.0,0,1,0.0,0.0,1,0.0,1,0,0.0,0,1,1,1,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,1,1,1
3,0.64432,-0.191871,-0.313733,0.075816,-0.208157,-0.258458,-0.185517,-1.194704,-0.318921,-0.137057,3,-0.349041,-0.594816,0.715115,-0.925826,1.596403,-0.646522,1.680938,0.558342,-0.305598,-0.938858,0.349774,1,-0.948189,0.340433,-0.201896,-0.07695,1,-0.0198,-0.03491,-0.119121,-0.101635,-0.162512,-0.170328,-0.129281,-0.257343,-0.838669,11,-0.01584,-0.063049,-0.110058,-0.072406,-0.040747,0.073466,-0.047633,-0.111121,0.277872,0.155308,0.021548,-0.145054,-0.571184,-0.179858,-0.154424,0.128544,-0.504709,-0.73135,5,-0.550332,0.843627,-0.150248,-0.513979,-0.392375,19,-0.73621,-0.057427,-0.038725,3,-0.547409,-0.814136,0.258543,-0.113774,-0.03491,0.289129,-0.274891,-0.077377,0.004458,-1.148272,-0.368919,-0.609768,-0.239665,-0.490454,-0.40289,0.111036,-0.815379,-0.736217,1,-0.023574,0.251486,2004.0,7.0,27.0,3.0,182.0,False,False,False,2013,8,31,4,214,False,False,0.0,1,1,1,0.0,0.0,1,1,0.0,0.0,0.0,0,0.0,0.0,1,0.0,0.0,0.0,0,1,0.0,0.0,1,0.0,1,1,0.0,1,1,1,1,1,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,1,1,1
4,0.64432,-0.265267,-0.313733,-1.361599,0.670125,-0.437434,0.481568,-0.138382,-0.318921,-0.137057,3,-0.349041,-0.798928,-0.91764,-1.559528,1.597899,-0.646522,0.229462,-0.715449,-0.305598,-0.858596,-0.732915,2,-0.282248,-0.140683,-0.188561,-0.427126,5,1.33786,-0.372272,-0.119121,-0.101635,-0.162512,-0.170328,-0.129281,-0.592596,-1.419859,14,-0.01584,-0.063049,-0.110058,0.179295,-0.063658,0.073466,-0.047633,-1.17076,-0.995427,-0.907713,0.021548,-0.036641,-0.640806,-0.797275,-0.735369,-0.777693,0.79594,-1.001949,4,-0.550332,1.692208,-1.210042,-0.179532,-0.585494,2,-0.886163,-0.057427,-0.038725,3,0.037245,-0.814136,0.807072,-1.173036,-0.372272,-0.606929,-0.274891,-0.077377,-0.555866,-1.148272,-0.990784,-0.790362,-0.393484,-0.808441,-0.346315,-0.772141,-0.815379,-0.886173,1,-0.79747,-0.497933,2005.0,12.0,48.0,5.0,337.0,False,False,False,2005,12,48,5,337,False,False,0.0,0,0,0,0.0,0.0,0,1,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0,0.0,0,1,0.0,1,0,0,0,1,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0


In [145]:
col = 'pct_tl_nvr_dlq'
print(df1_to_pdf1[col][100], df2_to_pdf2[col][100], df3_to_pdf3[col][100])

0.6443199 0.6443199 0.6443199


In [86]:
mapped_different_cols = []
value_dne_cols = []
for col, dicts in df1_to_pdf1.items():
    for k, val in dicts.items():
        if k not in df2_to_pdf2[col]:
            value_dne_cols.append(col)
            break
        elif k in df2_to_pdf2[col] and val != df2_to_pdf2[col][k]:
            if col == 'sub_grade':
                print(k, val, df2_to_pdf2[col][k])
            mapped_different_cols.append(col)
            break
        else:
            assert val == df2_to_pdf2[col][k]
            
#             all_other_cols.append(col)
#             break

In [87]:
value_dne_cols

['bc_open_to_buy',
 'sec_app_collections_12_mths_ex_med',
 'sec_app_chargeoff_within_12_mths',
 'total_bal_ex_mort',
 'tot_cur_bal',
 'all_util',
 'dti_joint',
 'sec_app_fico_range_high',
 'avg_cur_bal',
 'dti',
 'revol_util',
 'revol_bal_joint',
 'id',
 'bc_util',
 'sec_app_revol_util',
 'annual_inc_joint',
 'sec_app_open_act_il',
 'int_rate']

In [88]:
mapped_different_cols

['mort_acc',
 'total_bc_limit',
 'sec_app_num_rev_accts',
 'tax_liens',
 'mo_sin_rcnt_rev_tl_op',
 'total_il_high_credit_limit',
 'open_rv_12m',
 'mo_sin_rcnt_tl',
 'mths_since_recent_revol_delinq',
 'mo_sin_old_rev_tl_op',
 'tot_hi_cred_lim',
 'mths_since_recent_bc',
 'open_rv_24m',
 'pub_rec_bankruptcies',
 'sec_app_open_acc',
 'mths_since_rcnt_il',
 'acc_now_delinq',
 'annual_inc',
 'sec_app_mort_acc',
 'fico_range_high',
 'total_acc',
 'open_acc',
 'mths_since_last_record',
 'num_sats',
 'chargeoff_within_12_mths',
 'inq_fi',
 'num_bc_tl',
 'num_tl_30dpd',
 'sec_app_fico_range_low',
 'num_op_rev_tl',
 'open_act_il',
 'num_bc_sats',
 'mo_sin_old_il_acct',
 'max_bal_bc',
 'total_bal_il',
 'mths_since_recent_inq',
 'revol_bal',
 'percent_bc_gt_75',
 'mths_since_last_major_derog',
 'sec_app_mths_since_last_major_derog',
 'total_rev_hi_lim',
 'num_actv_bc_tl',
 'pct_tl_nvr_dlq',
 'total_cu_tl',
 'inq_last_12m',
 'pub_rec',
 'num_rev_accts',
 'collections_12_mths_ex_med',
 'num_rev_tl_ba

In [91]:
col = 'mort_acc'
api_loans[col].head()

0    4
1    1
2    4
3    1
4    0
Name: mort_acc, dtype: int64

In [92]:
proc_df_api_loans[col].head()

0    1.188104
1   -0.326254
2    1.188104
3   -0.326254
4   -0.831040
Name: mort_acc, dtype: float64

In [95]:
base_loan_info.query('{0} == 4'.format(col)).head()

Unnamed: 0,mort_acc,sub_grade,total_bc_limit,sec_app_num_rev_accts,tax_liens,initial_list_status,mo_sin_rcnt_rev_tl_op,total_il_high_credit_limit,open_rv_12m,bc_open_to_buy,mo_sin_rcnt_tl,mths_since_recent_revol_delinq,mo_sin_old_rev_tl_op,tot_hi_cred_lim,mths_since_recent_bc,open_rv_24m,pub_rec_bankruptcies,sec_app_open_acc,application_type,mths_since_rcnt_il,acc_now_delinq,sec_app_collections_12_mths_ex_med,sec_app_chargeoff_within_12_mths,annual_inc,sec_app_mort_acc,fico_range_high,total_acc,open_acc,total_bal_ex_mort,mths_since_last_record,tot_cur_bal,num_sats,chargeoff_within_12_mths,inq_fi,num_bc_tl,num_tl_30dpd,all_util,sec_app_earliest_cr_line,dti_joint,purpose,emp_length,sec_app_fico_range_low,num_op_rev_tl,open_act_il,sec_app_fico_range_high,avg_cur_bal,dti,num_bc_sats,mo_sin_old_il_acct,max_bal_bc,total_bal_il,mths_since_recent_inq,revol_bal,percent_bc_gt_75,revol_util,mths_since_last_major_derog,sec_app_mths_since_last_major_derog,total_rev_hi_lim,num_actv_bc_tl,grade,revol_bal_joint,id,pct_tl_nvr_dlq,total_cu_tl,inq_last_12m,earliest_cr_line,bc_util,pub_rec,addr_state,num_rev_accts,sec_app_revol_util,collections_12_mths_ex_med,num_rev_tl_bal_gt_0,fico_range_low,mths_since_recent_bc_dlq,tot_coll_amt,annual_inc_joint,num_tl_90g_dpd_24m,term,mths_since_last_delinq,sec_app_open_act_il,open_il_24m,open_il_12m,num_tl_op_past_12m,open_acc_6m,int_rate,num_actv_rev_tl,delinq_amnt,home_ownership,num_il_tl
110,4.0,B1,11000.0,,0.0,W,52.0,9567.0,0.0,832.0,3.0,77.0,558.0,315486.0,52.0,0.0,0.0,,INDIVIDUAL,44.0,0.0,,,115000.0,,679.0,25.0,6.0,10768.0,,297589.0,6.0,0.0,0.0,5.0,0.0,52.0,NaT,,debt_consolidation,3 years,,3.0,1.0,,49598.0,5.1,3.0,119.0,5634.0,600.0,3.0,10168.0,66.699997,92.400002,39.0,,11000.0,3.0,B,,361774,88.0,0.0,2.0,1969-06-01,92.400002,0.0,MO,8.0,,0.0,3.0,675.0,77.0,82.0,,0.0,36,39.0,,0.0,0.0,1.0,1.0,8.49,3.0,0.0,MORTGAGE,13.0
1327,4.0,A1,71700.0,,0.0,W,3.0,31161.0,2.0,57956.0,3.0,,138.0,415261.0,4.0,3.0,0.0,,INDIVIDUAL,19.0,0.0,,,59000.0,,794.0,38.0,19.0,21379.0,,313034.0,19.0,0.0,0.0,10.0,0.0,17.0,NaT,,debt_consolidation,8 years,,11.0,7.0,,17391.0,16.01,8.0,138.0,4497.0,7635.0,5.0,13744.0,0.0,15.0,,,91600.0,4.0,A,,801378,100.0,0.0,2.0,2004-09-01,19.200001,0.0,TX,19.0,,0.0,4.0,790.0,,0.0,,0.0,36,,,1.0,0.0,4.0,4.0,5.32,4.0,0.0,MORTGAGE,15.0
2134,4.0,E1,2300.0,,0.0,F,,,,887.0,,26.0,,,56.0,,0.0,,INDIVIDUAL,,0.0,,,95000.0,,664.0,30.0,10.0,32924.0,,,,0.0,,,,,NaT,,home_improvement,4 years,,,,,,13.29,,,,,3.0,5119.0,0.0,53.299999,,,,,E,,1199403,,,,1999-12-01,61.400002,0.0,UT,,,0.0,,660.0,,,,,36,26.0,,,,,,19.99,,0.0,MORTGAGE,
2136,4.0,B1,106600.0,,0.0,F,,,,103350.0,,42.0,,,3.0,,0.0,,INDIVIDUAL,,0.0,,,230000.0,,774.0,52.0,17.0,42940.0,,,,0.0,,,,,NaT,,wedding,8 years,,,,,,6.27,,,,,1.0,5773.0,0.0,5.0,,,,,B,,1200172,,,,1986-05-01,3.0,0.0,NC,,,0.0,,770.0,,,,,60,42.0,,,,,,9.76,,0.0,MORTGAGE,
2157,4.0,F2,8800.0,,0.0,F,,,,2574.0,,13.0,,,19.0,,0.0,,INDIVIDUAL,,0.0,,,92000.0,,674.0,24.0,4.0,10563.0,,,,0.0,,,,,NaT,,other,10+ years,,,,,,9.33,,,,,18.0,7276.0,50.0,64.400002,,,,,F,,1207852,,,,1994-12-01,70.699997,0.0,FL,,,0.0,,670.0,,,,,60,7.0,,,,,,22.780001,,0.0,MORTGAGE,


In [97]:
proc_df_base_loans.loc[base_loan_info.query('{0} == 4'.format(col)).index].head()

Unnamed: 0,mort_acc,sub_grade,total_bc_limit,sec_app_num_rev_accts,tax_liens,initial_list_status,mo_sin_rcnt_rev_tl_op,total_il_high_credit_limit,open_rv_12m,bc_open_to_buy,mo_sin_rcnt_tl,mths_since_recent_revol_delinq,mo_sin_old_rev_tl_op,tot_hi_cred_lim,mths_since_recent_bc,open_rv_24m,pub_rec_bankruptcies,sec_app_open_acc,application_type,mths_since_rcnt_il,acc_now_delinq,sec_app_collections_12_mths_ex_med,sec_app_chargeoff_within_12_mths,annual_inc,sec_app_mort_acc,fico_range_high,total_acc,open_acc,total_bal_ex_mort,mths_since_last_record,tot_cur_bal,num_sats,chargeoff_within_12_mths,inq_fi,num_bc_tl,num_tl_30dpd,all_util,dti_joint,purpose,emp_length,sec_app_fico_range_low,num_op_rev_tl,open_act_il,sec_app_fico_range_high,avg_cur_bal,dti,num_bc_sats,mo_sin_old_il_acct,max_bal_bc,total_bal_il,mths_since_recent_inq,revol_bal,percent_bc_gt_75,revol_util,mths_since_last_major_derog,sec_app_mths_since_last_major_derog,total_rev_hi_lim,num_actv_bc_tl,grade,revol_bal_joint,id,pct_tl_nvr_dlq,total_cu_tl,inq_last_12m,bc_util,pub_rec,addr_state,num_rev_accts,sec_app_revol_util,collections_12_mths_ex_med,num_rev_tl_bal_gt_0,fico_range_low,mths_since_recent_bc_dlq,tot_coll_amt,annual_inc_joint,num_tl_90g_dpd_24m,term,mths_since_last_delinq,sec_app_open_act_il,open_il_24m,open_il_12m,num_tl_op_past_12m,open_acc_6m,int_rate,num_actv_rev_tl,delinq_amnt,home_ownership,num_il_tl,sec_app_earliest_cr_lineYear,sec_app_earliest_cr_lineMonth,sec_app_earliest_cr_lineWeek,sec_app_earliest_cr_lineDayofweek,sec_app_earliest_cr_lineDayofyear,sec_app_earliest_cr_lineIs_month_start,sec_app_earliest_cr_lineIs_quarter_start,sec_app_earliest_cr_lineIs_year_start,earliest_cr_lineYear,earliest_cr_lineMonth,earliest_cr_lineWeek,earliest_cr_lineDayofweek,earliest_cr_lineDayofyear,earliest_cr_lineIs_quarter_start,earliest_cr_lineIs_year_start,mort_acc_isnull,total_bc_limit_isnull,sec_app_num_rev_accts_isnull,mo_sin_rcnt_rev_tl_op_isnull,total_il_high_credit_limit_isnull,open_rv_12m_isnull,bc_open_to_buy_isnull,mo_sin_rcnt_tl_isnull,mths_since_recent_revol_delinq_isnull,mo_sin_old_rev_tl_op_isnull,tot_hi_cred_lim_isnull,mths_since_recent_bc_isnull,open_rv_24m_isnull,pub_rec_bankruptcies_isnull,sec_app_open_acc_isnull,mths_since_rcnt_il_isnull,sec_app_collections_12_mths_ex_med_isnull,sec_app_chargeoff_within_12_mths_isnull,sec_app_mort_acc_isnull,total_bal_ex_mort_isnull,mths_since_last_record_isnull,tot_cur_bal_isnull,num_sats_isnull,inq_fi_isnull,num_bc_tl_isnull,num_tl_30dpd_isnull,all_util_isnull,dti_joint_isnull,sec_app_fico_range_low_isnull,num_op_rev_tl_isnull,open_act_il_isnull,sec_app_fico_range_high_isnull,avg_cur_bal_isnull,dti_isnull,num_bc_sats_isnull,mo_sin_old_il_acct_isnull,max_bal_bc_isnull,total_bal_il_isnull,mths_since_recent_inq_isnull,percent_bc_gt_75_isnull,revol_util_isnull,mths_since_last_major_derog_isnull,sec_app_mths_since_last_major_derog_isnull,total_rev_hi_lim_isnull,num_actv_bc_tl_isnull,revol_bal_joint_isnull,pct_tl_nvr_dlq_isnull,total_cu_tl_isnull,inq_last_12m_isnull,bc_util_isnull,num_rev_accts_isnull,sec_app_revol_util_isnull,num_rev_tl_bal_gt_0_isnull,mths_since_recent_bc_dlq_isnull,tot_coll_amt_isnull,annual_inc_joint_isnull,num_tl_90g_dpd_24m_isnull,mths_since_last_delinq_isnull,sec_app_open_act_il_isnull,open_il_24m_isnull,open_il_12m_isnull,num_tl_op_past_12m_isnull,open_acc_6m_isnull,num_actv_rev_tl_isnull,num_il_tl_isnull,sec_app_earliest_cr_lineYear_isnull,sec_app_earliest_cr_lineMonth_isnull,sec_app_earliest_cr_lineWeek_isnull,sec_app_earliest_cr_lineDay_isnull,sec_app_earliest_cr_lineDayofweek_isnull,sec_app_earliest_cr_lineDayofyear_isnull
110,1.188104,6,-0.493568,-0.216498,-0.107197,2,2.359127,-0.739467,-0.894006,-0.608283,-0.555762,1.832907,3.962171,0.75545,0.906869,-1.092062,-0.348078,-0.212555,1,0.907338,-0.068165,-0.238151,-0.149375,0.531833,-0.257895,-0.667337,0.010755,-1.019455,-0.808613,0.053819,0.962403,-1.026207,-0.076628,-0.686751,-0.640457,-0.05616,-0.300834,-0.04867,3,4,-0.061468,-1.15509,-0.593088,-0.061468,2.142091,-1.119711,-0.590317,-0.133949,0.007422,-0.801163,-0.637262,-0.285061,0.601209,1.655015,-0.231027,-0.035571,-0.660179,-0.289311,2,-0.294048,-1.5346,-0.690385,-0.553073,-0.100503,1.150518,-0.312403,24,-0.811695,0.072907,-0.113383,-0.80868,-0.667344,1.650811,-0.076487,-0.16069,-0.172894,-0.574253,0.215692,-0.318947,-1.013867,-0.763228,-0.627081,-0.010002,-0.991495,-0.801339,-0.019865,1,0.612426,2004.0,7.0,27.0,3.0,182.0,False,False,False,1969,6,22,6,152,False,False,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,1,1,1,1
1327,1.188104,1,2.301322,-0.216498,-0.107197,2,-0.620428,-0.247409,0.384971,3.084704,-0.555762,-0.163994,-0.466294,1.291399,-0.639789,0.018413,-0.348078,-0.212555,1,-0.058471,-0.068165,-0.238151,-0.149375,-0.239539,-0.257895,2.931913,1.092718,1.341612,-0.588171,0.053819,1.05743,1.332678,-0.076628,-0.686751,0.39931,-0.05616,-1.989599,-0.04867,3,9,-0.061468,0.586968,1.449239,-0.061468,0.233122,-0.197435,1.099434,0.23006,-0.208047,-0.637554,-0.296912,-0.124733,-1.255772,-1.510169,0.001879,-0.035571,1.746597,0.150159,1,-0.294048,-1.523021,0.668879,-0.553073,-0.100503,-1.452042,-0.312403,43,0.543234,0.072907,-0.113383,-0.500391,2.931975,-0.062117,-0.117544,-0.16069,-0.172894,-0.574253,-0.148517,-0.318947,-0.399924,-0.763228,1.002744,2.501508,-1.659315,-0.501605,-0.019865,1,0.883589,2004.0,7.0,27.0,3.0,182.0,False,False,False,2004,9,36,2,245,False,False,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,1,1,1,1,1,1
2134,1.188104,21,-0.894153,-0.216498,-0.107197,1,-0.316392,-0.24039,-0.254518,-0.604727,-0.219053,-0.430247,-0.181607,-0.34055,1.035757,-0.351745,-0.348078,-0.212555,1,-0.328897,-0.068165,-0.238151,-0.149375,0.256343,-0.257895,-1.136804,0.426895,-0.292973,-0.348325,0.053819,-0.379216,-0.118944,-0.076628,-0.028169,-0.22455,-0.05616,0.08517,-0.04867,5,5,-0.061468,-0.284061,-0.2527,-0.061468,-0.364665,-0.42737,-0.252367,0.057634,-0.258645,-0.276405,-0.637262,-0.511429,-1.255772,0.056065,0.001879,-0.035571,-0.263031,-0.289311,5,-0.294048,-1.512537,0.419681,-0.553073,-0.100503,0.048341,-0.312403,44,-0.195818,0.072907,-0.113383,-0.192103,-1.13682,-0.062117,-0.117544,-0.16069,-0.172894,-0.574253,-0.376147,-0.318947,-0.399924,-0.763228,-0.083806,-0.010002,1.431195,-0.201871,-0.019865,1,-0.201063,2004.0,7.0,27.0,3.0,182.0,False,False,False,1999,12,48,2,335,False,False,0,0,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1
2136,1.188104,6,3.908269,-0.216498,-0.107197,1,-0.316392,-0.24039,-0.254518,6.019362,-0.219053,0.279762,-0.181607,-0.34055,-0.672011,-0.351745,-0.348078,-0.212555,1,-0.328897,-0.068165,-0.238151,-0.149375,2.115901,-0.257895,2.305957,2.257909,0.978371,-0.140244,0.053819,-0.379216,-0.118944,-0.076628,-0.028169,-0.22455,-0.05616,0.08517,-0.04867,14,9,-0.061468,-0.284061,-0.2527,-0.061468,-0.364665,-1.020805,-0.252367,0.057634,-0.258645,-0.276405,-0.977612,-0.482108,-1.255772,-1.919108,0.001879,-0.035571,-0.263031,-0.289311,2,-0.294048,-1.512517,0.419681,-0.553073,-0.100503,-2.028019,-0.312403,27,-0.195818,0.072907,-0.113383,-0.192103,2.306007,-0.062117,-0.117544,-0.16069,-0.172894,1.741362,0.352271,-0.318947,-0.399924,-0.763228,-0.083806,-0.010002,-0.723946,-0.201871,-0.019865,1,-0.201063,2004.0,7.0,27.0,3.0,182.0,False,False,False,1986,5,18,3,121,False,False,0,0,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1
2157,1.188104,27,-0.594865,-0.216498,-0.107197,1,-0.316392,-0.24039,-0.254518,-0.495665,-0.219053,-1.00713,-0.181607,-0.34055,-0.156458,-0.351745,-0.348078,-0.212555,1,-0.328897,-0.068165,-0.238151,-0.149375,0.21502,-0.257895,-0.823826,-0.072473,-1.382697,-0.812872,0.053819,-0.379216,-0.118944,-0.076628,-0.028169,-0.22455,-0.05616,0.08517,-0.04867,10,2,-0.061468,-0.284061,-0.2527,-0.061468,-0.364665,-0.762128,-0.252367,0.057634,-0.258645,-0.276405,1.915363,-0.414722,0.136267,0.509987,0.001879,-0.035571,-0.263031,-0.289311,6,-0.294048,-1.512314,0.419681,-0.553073,-0.100503,0.378994,-0.312403,10,-0.195818,0.072907,-0.113383,-0.192103,-0.823836,-0.062117,-0.117544,-0.16069,-0.172894,1.741362,-1.241144,-0.318947,-0.399924,-0.763228,-0.083806,-0.010002,2.018961,-0.201871,-0.019865,1,-0.201063,2004.0,7.0,27.0,3.0,182.0,False,False,False,1994,12,48,3,335,False,False,0,0,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1


In [46]:
print(value_dne_cols)
col_iter = iter(value_dne_cols)

['sec_app_num_rev_accts', 'mths_since_recent_revol_delinq', 'tot_hi_cred_lim', 'sec_app_open_acc', 'sec_app_collections_12_mths_ex_med', 'sec_app_chargeoff_within_12_mths', 'sec_app_mort_acc', 'total_bal_ex_mort', 'mths_since_last_record', 'tot_cur_bal', 'all_util', 'dti_joint', 'sec_app_fico_range_low', 'sec_app_fico_range_high', 'dti', 'revol_bal', 'revol_util', 'mths_since_last_major_derog', 'sec_app_mths_since_last_major_derog', 'revol_bal_joint', 'id', 'sec_app_revol_util', 'mths_since_recent_bc_dlq', 'annual_inc_joint', 'mths_since_last_delinq', 'sec_app_open_act_il', 'int_rate']


In [47]:
print(mapped_different_cols)

['mort_acc', 'sub_grade', 'total_bc_limit', 'tax_liens', 'mo_sin_rcnt_rev_tl_op', 'total_il_high_credit_limit', 'open_rv_12m', 'bc_open_to_buy', 'mo_sin_rcnt_tl', 'mo_sin_old_rev_tl_op', 'mths_since_recent_bc', 'open_rv_24m', 'pub_rec_bankruptcies', 'mths_since_rcnt_il', 'acc_now_delinq', 'annual_inc', 'fico_range_high', 'total_acc', 'open_acc', 'num_sats', 'chargeoff_within_12_mths', 'inq_fi', 'num_bc_tl', 'num_tl_30dpd', 'emp_length', 'num_op_rev_tl', 'open_act_il', 'avg_cur_bal', 'num_bc_sats', 'mo_sin_old_il_acct', 'max_bal_bc', 'total_bal_il', 'mths_since_recent_inq', 'percent_bc_gt_75', 'total_rev_hi_lim', 'num_actv_bc_tl', 'grade', 'pct_tl_nvr_dlq', 'total_cu_tl', 'inq_last_12m', 'bc_util', 'pub_rec', 'addr_state', 'num_rev_accts', 'collections_12_mths_ex_med', 'num_rev_tl_bal_gt_0', 'fico_range_low', 'tot_coll_amt', 'num_tl_90g_dpd_24m', 'open_il_24m', 'open_il_12m', 'num_tl_op_past_12m', 'open_acc_6m', 'num_actv_rev_tl', 'delinq_amnt', 'num_il_tl']


In [30]:
ex_col = next(col_iter)
print(ex_col)
df1_to_pdf1[ex_col]

mths_since_recent_revol_delinq


{nan: -0.16399380564689636,
 10.0: 0.19101079088342507,
 40.0: -1.1402564311776056,
 37.0: 0.057884068677322015,
 55.0: -0.8296274126966984,
 7.0: 0.6347665315704353,
 9.0: 0.4572642352956312,
 29.0: -0.11961822759748207,
 50.0: -0.07524265352878105,
 46.0: 0.27976193902082713,
 31.0: -1.3621343015211107,
 24.0: 1.1228978463261465,
 61.0: -1.2733831533837086,
 28.0: -0.25274494980358514,
 14.0: -1.1846320052463066,
 20.0: 1.2560245685322495,
 64.0: -0.9627541349028015,
 34.0: 1.4779024388757547,
 5.0: -0.20836937573488412,
 22.0: 0.8566444019139404,
 33.0: -0.6965006904905954,
 69.0: -0.3414960979409872,
 30.0: -0.5189983942157913,
 42.0: -0.29712052387228616,
 17.0: -0.6077495423531933}

In [31]:
df2_to_pdf2[ex_col]

{nan: -0.16399380564689636,
 13.0: -1.0071296691894531,
 12.0: -1.0515053272247314,
 18.0: -0.7852518558502197,
 21.0: -0.6521251201629639,
 15.0: -0.6077495217323303,
 22.0: -0.9183785319328308,
 17.0: -0.8296273946762085,
 7.0: -1.2733831405639648,
 14.0: -0.9627541303634644,
 16.0: -0.874002993106842,
 25.0: -0.47462281584739685,
 19.0: -0.7408762574195862,
 20.0: -0.6965007185935974,
 11.0: -1.0958808660507202,
 28.0: -0.3414961099624634,
 10.0: -1.140256404876709,
 24.0: -0.518998384475708,
 31.0: -0.20836937427520752,
 6.0: -1.3177586793899536,
 9.0: -0.38587167859077454,
 27.0: -1.1846320629119873,
 26.0: -0.4302472472190857,
 30.0: -0.2527449429035187,
 35.0: -0.03086707927286625,
 39.0: 0.14663521945476532,
 38.0: 0.10225964337587357,
 43.0: 0.32413750886917114,
 8.0: -1.229007601737976,
 44.0: 0.3685130774974823,
 23.0: -0.5633739829063416,
 32.0: -0.29712051153182983,
 29.0: -0.11961822956800461,
 34.0: -0.07524265348911285,
 33.0: 0.19101078808307648,
 40.0: 0.2353863716125

In [None]:
api_loans.head()[base_loan_info.columns]
base_loan_info.head()
col = 'dti'
api_loans[col].value_counts(dropna=False), base_loan_info[col].value_counts(dropna=False)

In [270]:
api_loans[ex_col].unique()

array(['F'], dtype=object)

In [255]:
base_loan_info[ex_col].unique()

array(['< 1 year', '10+ years', '1 year', '6 years', '9 years', '3 years',
       '5 years', '7 years', '2 years', '8 years', '4 years', 'None'],
      dtype=object)

In [169]:
proc_df_api_loans['application_type'].value_counts(dropna=False)

0    84
Name: application_type, dtype: int64

In [164]:
df1_to_pdf1

{'mort_acc': {0: -0.8329272859049683,
  1: -0.32728533836254287,
  2: 0.17835660917988264,
  4: 1.1896405042647338,
  3: 0.6839985567223081,
  6: 2.2009243993495846},
 'collections_12_mths_ex_med': {0: -0.11741085301187737,
  1: 6.5643743678158515},
 'application_type': {'INDIVIDUAL': 0},
 'pub_rec': {0: -0.35289116968700746, 1: 1.267717115197907},
 'num_tl_op_past_12m': {1: -0.6277557953717531,
  2: -1.1723743374710003,
  0: -0.08313725327250598,
  3: 0.46148128882674117,
  4: 1.0060998309259883,
  5: 1.5507183730252354,
  7: 2.0953369151244825,
  6: 2.6399554572237296,
  8: 3.1845739993229767},
 'inq_last_12m': {0: -0.8978315167571935,
  2: -0.09364421720226375,
  1: -0.4957378669797286,
  3: 0.30844943257520113,
  4: 0.710543082352666,
  5: 1.112636732130131,
  8: 2.3189176814625254,
  6: 1.5147303819075957,
  19: 6.741947829014639,
  11: 3.1231049810174554,
  10: 3.52519863079492},
 'annual_inc': {40000.0: -0.30997498255215206,
  65000.0: -0.0971963489314257,
  60000.0: -0.13975207

In [165]:
df2_to_pdf2

{'mort_acc': {0.0: -0.8329272866249084,
  1.0: -0.3272853493690491,
  2.0: 0.1783566027879715,
  3.0: 0.6839985847473145,
  4.0: 1.1896405220031738,
  5.0: 1.6952824592590332,
  6.0: 2.2009243965148926,
  nan: 2.706566333770752,
  7.0: 3.2122082710266113,
  8.0: 3.7178502082824707,
  9.0: 4.22349214553833,
  10.0: 4.7291340827941895,
  11.0: 5.234776020050049,
  12.0: 5.740417957305908,
  13.0: 6.246059894561768,
  14.0: 6.751701831817627,
  15.0: 7.257343769073486,
  16.0: 7.762985706329346,
  17.0: 8.268628120422363,
  18.0: 8.774270057678223,
  19.0: 9.279911994934082,
  20.0: 10.2911958694458,
  23.0: 10.79683780670166,
  22.0: 9.785553932189941,
  21.0: 11.30247974395752,
  24.0: 11.808121681213379,
  25.0: 12.819405555725098,
  27.0: 13.325047492980957,
  28.0: 12.313763618469238,
  26.0: 13.830689430236816,
  29.0: 14.841973304748535,
  31.0: 14.336331367492676,
  34.0: 15.347615242004395,
  30.0: 16.35890007019043,
  32.0: 16.86454200744629,
  35.0: 17.875825881958008,
  37.0: 

In [96]:
cols = cb_both.proc_arti[0]
proc_df_api_loans.head(2)[cols]

Unnamed: 0,mort_acc,collections_12_mths_ex_med,application_type,pub_rec,num_tl_op_past_12m,inq_last_12m,annual_inc,mths_since_last_delinq,dti_joint,delinq_amnt,mths_since_recent_inq,num_actv_rev_tl,fico_range_high,mths_since_recent_bc_dlq,sec_app_collections_12_mths_ex_med,num_actv_bc_tl,bc_open_to_buy,mo_sin_old_rev_tl_op,revol_bal,mo_sin_rcnt_tl,mths_since_last_record,total_cu_tl,sec_app_open_act_il,fico_range_low,sec_app_fico_range_high,total_acc,open_il_24m,emp_length,total_bal_il,pct_tl_nvr_dlq,bc_util,max_bal_bc,term,revol_bal_joint,pub_rec_bankruptcies,addr_state,num_op_rev_tl,inq_fi,sec_app_fico_range_low,open_rv_12m,num_bc_tl,open_il_12m,tot_cur_bal,num_rev_tl_bal_gt_0,initial_list_status,total_bc_limit,num_tl_30dpd,sec_app_revol_util,open_act_il,tot_hi_cred_lim,mths_since_recent_bc,int_rate,dti,sec_app_chargeoff_within_12_mths,num_il_tl,mths_since_recent_revol_delinq,num_tl_90g_dpd_24m,total_rev_hi_lim,sec_app_num_rev_accts,mo_sin_old_il_acct,percent_bc_gt_75,annual_inc_joint,grade,sec_app_open_acc,num_bc_sats,num_sats,id,tot_coll_amt,mths_since_last_major_derog,sec_app_mths_since_last_major_derog,open_acc,num_rev_accts,mo_sin_rcnt_rev_tl_op,open_rv_24m,tax_liens,total_bal_ex_mort,home_ownership,acc_now_delinq,sec_app_mort_acc,sub_grade,avg_cur_bal,revol_util,total_il_high_credit_limit,purpose,open_acc_6m,all_util,mths_since_rcnt_il,chargeoff_within_12_mths,earliest_cr_lineYear,earliest_cr_lineMonth,earliest_cr_lineWeek,earliest_cr_lineDayofweek,earliest_cr_lineDayofyear,earliest_cr_lineIs_quarter_start,earliest_cr_lineIs_year_start,sec_app_earliest_cr_lineYear,sec_app_earliest_cr_lineMonth,sec_app_earliest_cr_lineWeek,sec_app_earliest_cr_lineDayofweek,sec_app_earliest_cr_lineDayofyear,sec_app_earliest_cr_lineIs_month_start,sec_app_earliest_cr_lineIs_quarter_start,sec_app_earliest_cr_lineIs_year_start,mort_acc_isnull,num_tl_op_past_12m_isnull,inq_last_12m_isnull,mths_since_last_delinq_isnull,dti_joint_isnull,mths_since_recent_inq_isnull,num_actv_rev_tl_isnull,mths_since_recent_bc_dlq_isnull,sec_app_collections_12_mths_ex_med_isnull,num_actv_bc_tl_isnull,bc_open_to_buy_isnull,mo_sin_old_rev_tl_op_isnull,mo_sin_rcnt_tl_isnull,mths_since_last_record_isnull,total_cu_tl_isnull,sec_app_open_act_il_isnull,sec_app_fico_range_high_isnull,open_il_24m_isnull,total_bal_il_isnull,pct_tl_nvr_dlq_isnull,bc_util_isnull,max_bal_bc_isnull,revol_bal_joint_isnull,pub_rec_bankruptcies_isnull,num_op_rev_tl_isnull,inq_fi_isnull,sec_app_fico_range_low_isnull,open_rv_12m_isnull,num_bc_tl_isnull,open_il_12m_isnull,tot_cur_bal_isnull,num_rev_tl_bal_gt_0_isnull,total_bc_limit_isnull,num_tl_30dpd_isnull,sec_app_revol_util_isnull,open_act_il_isnull,tot_hi_cred_lim_isnull,mths_since_recent_bc_isnull,dti_isnull,sec_app_chargeoff_within_12_mths_isnull,num_il_tl_isnull,mths_since_recent_revol_delinq_isnull,num_tl_90g_dpd_24m_isnull,total_rev_hi_lim_isnull,sec_app_num_rev_accts_isnull,mo_sin_old_il_acct_isnull,percent_bc_gt_75_isnull,annual_inc_joint_isnull,sec_app_open_acc_isnull,num_bc_sats_isnull,num_sats_isnull,tot_coll_amt_isnull,mths_since_last_major_derog_isnull,sec_app_mths_since_last_major_derog_isnull,num_rev_accts_isnull,mo_sin_rcnt_rev_tl_op_isnull,open_rv_24m_isnull,total_bal_ex_mort_isnull,sec_app_mort_acc_isnull,avg_cur_bal_isnull,revol_util_isnull,total_il_high_credit_limit_isnull,open_acc_6m_isnull,all_util_isnull,mths_since_rcnt_il_isnull,sec_app_earliest_cr_lineYear_isnull,sec_app_earliest_cr_lineMonth_isnull,sec_app_earliest_cr_lineWeek_isnull,sec_app_earliest_cr_lineDay_isnull,sec_app_earliest_cr_lineDayofweek_isnull,sec_app_earliest_cr_lineDayofyear_isnull
0,0.178357,-0.117411,0,-0.352891,-0.627756,0.308449,-0.207841,-0.144157,336.089356,-0.01877,-0.641637,-0.803433,-0.524396,-0.062979,-0.211297,-0.291372,-0.632812,-1.324853,-0.608959,-0.221559,0.033398,-0.558696,0.319569,-0.495737,-0.428693,-0.653742,0.804948,2,0.586559,1118.836942,297.328916,-0.784131,-0.569071,-0.355448,-0.3551,25,-1.153162,1.264131,-0.428693,-0.894633,-1.051922,0.273535,-0.009332,-0.807948,0,-0.860322,-0.056117,193.358131,0.068246,-0.105523,0.514239,430.628494,268.893864,-0.124037,0.19916,-0.161319,-0.175972,-0.810968,-0.814809,-0.380446,184.265214,-0.258992,4,-0.509198,-0.585934,-0.844542,2.738513,-0.022952,0.014224,-0.053184,-0.838041,-1.170515,1.374885,-1.10296,-0.133408,0.277661,0,-0.066639,-0.891502,17,0.398572,343.158234,0.658926,3,-0.012534,386.341489,-0.561506,-0.082886,2000.0,7.0,27.0,3.0,182.0,0.0,0.0,2004.0,7.0,27.0,3.0,182.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0,0.0,1,0,0.0,0,0.0,0.0,1,1.0,0,0,1.0,1.0,0.0,0,1.0,0,0.0,0.0,1.0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0,0.0,0,0.0,1,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,1,1,0.0,0.0,1.0,0.0,0,0.0,0,0.0,1.0,1.0,0,1,1,1,1.0,1,1
1,-0.327285,-0.117411,0,1.267717,-0.083137,-0.093644,-0.650421,-0.144157,361.962208,-0.01877,-0.981909,0.100985,0.893476,-0.062979,-0.211297,-0.291372,0.458924,0.137516,-0.489674,-0.221559,1.00654,-0.196376,0.319569,0.92144,0.232451,-0.235948,0.804948,2,1.385091,1118.836942,2.127302,-0.994935,1.757249,1.017733,2.301525,5,-0.278614,-0.675696,0.232451,-0.894633,-0.842726,1.310656,1.469447,0.119861,0,-0.198298,-0.056117,349.681994,0.068246,1.398542,-0.097438,273.081109,88394.047566,-0.124037,-0.07171,-0.161319,-0.175972,-0.134943,0.155916,0.708052,-1.256292,0.402829,2,-0.200192,-0.585934,-0.117272,2.735069,-0.022952,0.014224,-0.053184,-0.11007,-0.185135,0.467833,0.018728,-0.133408,1.046795,0,-0.066639,-0.331904,10,1.256557,75.605506,1.681452,3,-0.012534,87.512363,-0.561506,-0.082886,2000.0,7.0,27.0,3.0,182.0,0.0,0.0,2004.0,7.0,27.0,3.0,182.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0,0.0,1,0,0.0,0,0.0,0.0,0,1.0,0,0,1.0,1.0,0.0,0,1.0,0,0.0,0.0,1.0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0,0.0,0,0.0,1,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,1,1,0.0,0.0,1.0,0.0,0,0.0,0,0.0,1.0,1.0,0,1,1,1,1.0,1,1


In [127]:
col = 'collections_12_mths_ex_med'
api_loans.head(2)[col]

0    0
1    0
Name: collections_12_mths_ex_med, dtype: int64

In [130]:
val = 0
ex = base_loan_info.query('{0} == {1}'.format(col, val)).head(1)
ex

Unnamed: 0,mort_acc,collections_12_mths_ex_med,application_type,pub_rec,num_tl_op_past_12m,inq_last_12m,annual_inc,mths_since_last_delinq,dti_joint,delinq_amnt,mths_since_recent_inq,num_actv_rev_tl,fico_range_high,mths_since_recent_bc_dlq,sec_app_collections_12_mths_ex_med,num_actv_bc_tl,earliest_cr_line,bc_open_to_buy,mo_sin_old_rev_tl_op,revol_bal,mo_sin_rcnt_tl,mths_since_last_record,total_cu_tl,sec_app_open_act_il,fico_range_low,sec_app_fico_range_high,total_acc,open_il_24m,emp_length,total_bal_il,pct_tl_nvr_dlq,bc_util,max_bal_bc,term,revol_bal_joint,pub_rec_bankruptcies,addr_state,num_op_rev_tl,inq_fi,sec_app_fico_range_low,open_rv_12m,num_bc_tl,sec_app_earliest_cr_line,open_il_12m,tot_cur_bal,num_rev_tl_bal_gt_0,initial_list_status,total_bc_limit,num_tl_30dpd,sec_app_revol_util,open_act_il,tot_hi_cred_lim,mths_since_recent_bc,int_rate,dti,sec_app_chargeoff_within_12_mths,num_il_tl,mths_since_recent_revol_delinq,num_tl_90g_dpd_24m,total_rev_hi_lim,sec_app_num_rev_accts,mo_sin_old_il_acct,percent_bc_gt_75,annual_inc_joint,grade,sec_app_open_acc,num_bc_sats,num_sats,id,tot_coll_amt,mths_since_last_major_derog,sec_app_mths_since_last_major_derog,open_acc,num_rev_accts,mo_sin_rcnt_rev_tl_op,open_rv_24m,tax_liens,total_bal_ex_mort,home_ownership,acc_now_delinq,sec_app_mort_acc,sub_grade,avg_cur_bal,revol_util,total_il_high_credit_limit,purpose,open_acc_6m,all_util,mths_since_rcnt_il,chargeoff_within_12_mths
0,,0.0,individual,0.0,,,85000.0,,,0.0,,,739.0,,,,1994-02-01,,,28854.0,,,,,735.0,,42.0,,< 1 year,,,,,36,,0.0,CA,,,,,,NaT,,,,f,,,,,,,0.1189,0.1948,,,,,,,,,,B,,,,54734,,,,10.0,,,,0.0,,rent,0.0,,B4,,0.521,,debt_consolidation,,,,0.0


In [131]:
proc_df_base_loans.iloc[ex.index]

Unnamed: 0,mort_acc,collections_12_mths_ex_med,application_type,pub_rec,num_tl_op_past_12m,inq_last_12m,annual_inc,mths_since_last_delinq,dti_joint,delinq_amnt,mths_since_recent_inq,num_actv_rev_tl,fico_range_high,mths_since_recent_bc_dlq,sec_app_collections_12_mths_ex_med,num_actv_bc_tl,bc_open_to_buy,mo_sin_old_rev_tl_op,revol_bal,mo_sin_rcnt_tl,mths_since_last_record,total_cu_tl,sec_app_open_act_il,fico_range_low,sec_app_fico_range_high,total_acc,open_il_24m,emp_length,total_bal_il,pct_tl_nvr_dlq,bc_util,max_bal_bc,term,revol_bal_joint,pub_rec_bankruptcies,addr_state,num_op_rev_tl,inq_fi,sec_app_fico_range_low,open_rv_12m,num_bc_tl,open_il_12m,tot_cur_bal,num_rev_tl_bal_gt_0,initial_list_status,total_bc_limit,num_tl_30dpd,sec_app_revol_util,open_act_il,tot_hi_cred_lim,mths_since_recent_bc,int_rate,dti,sec_app_chargeoff_within_12_mths,num_il_tl,mths_since_recent_revol_delinq,num_tl_90g_dpd_24m,total_rev_hi_lim,sec_app_num_rev_accts,mo_sin_old_il_acct,percent_bc_gt_75,annual_inc_joint,grade,sec_app_open_acc,num_bc_sats,num_sats,id,tot_coll_amt,mths_since_last_major_derog,sec_app_mths_since_last_major_derog,open_acc,num_rev_accts,mo_sin_rcnt_rev_tl_op,open_rv_24m,tax_liens,total_bal_ex_mort,home_ownership,acc_now_delinq,sec_app_mort_acc,sub_grade,avg_cur_bal,revol_util,total_il_high_credit_limit,purpose,open_acc_6m,all_util,mths_since_rcnt_il,chargeoff_within_12_mths,earliest_cr_lineYear,earliest_cr_lineMonth,earliest_cr_lineWeek,earliest_cr_lineDayofweek,earliest_cr_lineDayofyear,earliest_cr_lineIs_quarter_start,earliest_cr_lineIs_year_start,sec_app_earliest_cr_lineYear,sec_app_earliest_cr_lineMonth,sec_app_earliest_cr_lineWeek,sec_app_earliest_cr_lineDayofweek,sec_app_earliest_cr_lineDayofyear,sec_app_earliest_cr_lineIs_month_start,sec_app_earliest_cr_lineIs_quarter_start,sec_app_earliest_cr_lineIs_year_start,mort_acc_isnull,num_tl_op_past_12m_isnull,inq_last_12m_isnull,mths_since_last_delinq_isnull,dti_joint_isnull,mths_since_recent_inq_isnull,num_actv_rev_tl_isnull,mths_since_recent_bc_dlq_isnull,sec_app_collections_12_mths_ex_med_isnull,num_actv_bc_tl_isnull,bc_open_to_buy_isnull,mo_sin_old_rev_tl_op_isnull,mo_sin_rcnt_tl_isnull,mths_since_last_record_isnull,total_cu_tl_isnull,sec_app_open_act_il_isnull,sec_app_fico_range_high_isnull,open_il_24m_isnull,total_bal_il_isnull,pct_tl_nvr_dlq_isnull,bc_util_isnull,max_bal_bc_isnull,revol_bal_joint_isnull,pub_rec_bankruptcies_isnull,num_op_rev_tl_isnull,inq_fi_isnull,sec_app_fico_range_low_isnull,open_rv_12m_isnull,num_bc_tl_isnull,open_il_12m_isnull,tot_cur_bal_isnull,num_rev_tl_bal_gt_0_isnull,total_bc_limit_isnull,num_tl_30dpd_isnull,sec_app_revol_util_isnull,open_act_il_isnull,tot_hi_cred_lim_isnull,mths_since_recent_bc_isnull,dti_isnull,sec_app_chargeoff_within_12_mths_isnull,num_il_tl_isnull,mths_since_recent_revol_delinq_isnull,num_tl_90g_dpd_24m_isnull,total_rev_hi_lim_isnull,sec_app_num_rev_accts_isnull,mo_sin_old_il_acct_isnull,percent_bc_gt_75_isnull,annual_inc_joint_isnull,sec_app_open_acc_isnull,num_bc_sats_isnull,num_sats_isnull,tot_coll_amt_isnull,mths_since_last_major_derog_isnull,sec_app_mths_since_last_major_derog_isnull,num_rev_accts_isnull,mo_sin_rcnt_rev_tl_op_isnull,open_rv_24m_isnull,total_bal_ex_mort_isnull,sec_app_mort_acc_isnull,avg_cur_bal_isnull,revol_util_isnull,total_il_high_credit_limit_isnull,open_acc_6m_isnull,all_util_isnull,mths_since_rcnt_il_isnull,sec_app_earliest_cr_lineYear_isnull,sec_app_earliest_cr_lineMonth_isnull,sec_app_earliest_cr_lineWeek_isnull,sec_app_earliest_cr_lineDay_isnull,sec_app_earliest_cr_lineDayofweek_isnull,sec_app_earliest_cr_lineDayofyear_isnull
0,-0.327285,-0.117411,1,-0.352891,-0.083137,-0.093644,0.073027,-0.144157,-0.051517,-0.01877,-0.301366,-0.200487,1.208559,-0.062979,-0.211297,-0.291372,-0.356447,-0.188624,0.553671,-0.221559,0.033398,-0.558696,-0.296749,1.236369,0.01207,1.435227,-0.408075,11,-0.276832,0.417338,0.111984,-0.245023,-0.569071,-0.256644,-0.3551,5,-0.278614,-0.675696,0.01207,-0.251984,-0.215141,-0.763586,-0.389286,-0.189408,1,-0.304778,-0.056117,0.086392,-0.264108,-0.350326,-0.354986,-0.277312,0.098177,-0.124037,-0.207145,-0.161319,-0.175972,-0.238528,-0.208106,0.058772,-0.063057,-0.189043,2,-0.200192,-0.248204,-0.117272,-1.551557,-0.022952,0.014224,-0.053184,-0.292063,-0.185135,-0.318279,-0.355168,-0.133408,-0.260331,4,-0.066639,-0.331904,9,-0.374688,0.007542,-0.241798,3,-0.012534,0.073615,-0.326829,-0.082886,1994,2,5,1,32,False,False,2004.0,7.0,27.0,3.0,182.0,False,False,False,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1


mort_acc                                    0.178357
collections_12_mths_ex_med                 -0.117411
application_type                                   1
pub_rec                                    -0.352891
num_tl_op_past_12m                          -1.17237
                                              ...   
sec_app_earliest_cr_lineMonth_isnull               1
sec_app_earliest_cr_lineWeek_isnull                1
sec_app_earliest_cr_lineDay_isnull                 1
sec_app_earliest_cr_lineDayofweek_isnull           1
sec_app_earliest_cr_lineDayofyear_isnull           1
Name: 51, Length: 174, dtype: object

In [99]:
# turn both dfs into dictionaries, map their unproc'd value to proc'd value, then check that every key in 
# the dicitonary for api_loans is in the key for base_loan_info.

In [100]:
proc_df_api_loan_vals = {}
for col in proc_df_api_loans:
    proc_df_api_loan_vals[col] = proc_df_api_loans[col].unique()
    
proc_df_base_loan_vals = {}
for col in proc_df_base_loans:
    proc_df_base_loan_vals[col] = proc_df_base_loans[col].unique()    

In [107]:
check = []
for col, vals in proc_df_api_loan_vals.items():
    if not all(v_api in proc_df_base_loan_vals[col] for v_api in vals):
        check.append(col)
len(check)

83

In [None]:
col = 'purpose'
api_loans[col].value_counts(dropna=False), proc_df_api_loans[col].value_counts(dropna=False)
base_loan_info[col].value_counts(dropna=False), proc_df_base_loans[col].value_counts(dropna=False)

In [88]:
mg.val_test_proc??

In [52]:
cb_both.score(api_loans, return_all=True)

(array([-999., -999., -999., -999., -999., -999., -999., -999., -999.,
        -999., -999., -999., -999., -999., -999., -999., -999., -999.,
        -999., -999., -999., -999., -999., -999., -999., -999., -999.,
        -999., -999., -999., -999., -999., -999., -999., -999., -999.,
        -999., -999., -999., -999., -999., -999., -999., -999., -999.,
        -999., -999., -999., -999., -999., -999., -999., -999., -999.,
        -999., -999., -999., -999., -999., -999., -999., -999., -999.,
        -999., -999., -999., -999., -999., -999., -999., -999., -999.,
        -999., -999., -999., -999., -999., -999., -999., -999., -999.,
        -999., -999., -999.]),
 array([-0.22214561, -0.22629796, -0.21701293, -0.24349122, -0.236954  ,
        -0.24378913, -0.25979034, -0.21356392, -0.29430755, -0.09189767,
        -0.24173285, -0.2863121 , -0.45356755, -0.36973553, -0.53295766,
        -0.20801924, -0.27798162, -0.15960499, -0.20774224, -0.3682837 ,
        -0.34958246, -0.31443847, -0.2

In [39]:
# last = set([(0,0), (1,1)])
last = set([(0,0)])

In [40]:
last

{(0, 0)}

In [41]:
for i,j in last:
    print(i,j)

0 0


# continuing invest script

In [7]:
_, api_loans['catboost_regr'], api_loans['catboost_clf'] = cb_both.score(api_loans, return_all=True)
api_loans['catboost_regr_scl'] = scr_util.scale_cb_regr_score(api_loans)
api_loans['catboost_comb_29'] = clf_wt_29_scorer('catboost_clf', 'catboost_regr_scl', api_loans)

In [8]:
api_loans.head()

Unnamed: 0,id,member_id,loan_amount,funded_amount,term,int_rate,exp_default_rate,service_fee_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,is_inc_v,accept_d,exp_d,list_d,credit_pull_d,review_status_d,review_status,desc,purpose,addr_zip,addr_state,investor_count,ils_exp_d,initial_list_status,emp_title,acc_now_delinq,acc_open_past_24_mths,bc_open_to_buy,percent_bc_gt_75,bc_util,dti,delinq_2_yrs,delinq_amnt,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6_mths,mths_since_last_delinq,mths_since_last_record,mths_since_recent_inq,mths_since_recent_revol_delinq,mths_since_recent_bc,mort_acc,open_acc,pub_rec,total_bal_ex_mort,revol_bal,revol_util,total_bc_limit,total_acc,total_il_high_credit_limit,num_rev_accts,mths_since_recent_bc_dlq,pub_rec_bankruptcies,num_accts_ever_12_0_ppd,chargeoff_within_12_mths,collections_12_mths_ex_med,tax_liens,mths_since_last_major_derog,num_sats,num_tl_op_past_12m,mo_sin_rcnt_tl,tot_hi_cred_lim,tot_cur_bal,avg_cur_bal,num_bc_tl,num_actv_bc_tl,num_bc_sats,pct_tl_nvr_dlq,num_tl_90g_dpd_24m,num_tl_30dpd,num_tl_12_0dpd_2m,num_il_tl,mo_sin_old_il_acct,num_actv_rev_tl,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,total_rev_hi_lim,num_rev_tl_bal_gt_0,num_op_rev_tl,tot_coll_amt,application_type,annual_inc_joint,dti_joint,is_inc_v_joint,open_acc_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,i_l_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,inq_fi,total_cu_tl,inq_last_12m,mtg_payment,housing_payment,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6_mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,revol_bal_joint,disbursement_method,open_act_il,sec_app_open_act_il,catboost_regr,catboost_clf,catboost_regr_scl,catboost_comb_29
0,162477892,203905831,25000.0,11150.0,36,23.049999,12.86,1.52,968.4,D,D3,1 year,OWN,60000.0,VERIFIED,2019-11-25T20:52:17.000-08:00,2020-01-09T21:00:28.000-08:00,2019-12-14T14:00:00.000-08:00,2019-11-25T20:48:59.000-08:00,2019-12-14T09:52:08.000-08:00,APPROVED,,debt_consolidation,782xx,TX,,2019-12-14T14:00:00.000-08:00,F,Office Manager,0.0,4,7336.0,33.299999,42.200001,29.370001,1,0.0,2011-12-25,665.0,669.0,1,7.0,,1.0,7.0,45.0,0.0,11.0,0.0,57771.0,9966.0,42.200001,12700.0,20.0,59182.0,12.0,7.0,0.0,1,0.0,0.0,0.0,7.0,11.0,1.0,1.0,82782.0,57771.0,5777.0,6.0,1.0,4.0,85.0,1.0,0.0,0,8.0,77.0,3.0,95.0,15.0,23600.0,3.0,8.0,0.0,INDIVIDUAL,,,,1.0,1.0,3.0,1.0,47805.0,76.8,0.0,1.0,5364.0,58.099998,2.0,4.0,1.0,0.0,,,,NaT,,,,,,,,,,CASH,3.0,,-0.187849,0.6745,0.627264,0.640962
1,162951079,204736303,16000.0,12000.0,36,20.549999,12.86,1.52,599.12,D,D2,< 1 year,OWN,52000.0,VERIFIED,2019-12-06T08:38:29.000-08:00,2020-01-12T14:00:00.000-08:00,2019-12-13T14:00:00.000-08:00,2019-12-06T06:22:49.000-08:00,2019-12-13T10:05:22.000-08:00,APPROVED,,credit_card,301xx,GA,,2019-12-13T14:00:00.000-08:00,F,server/sommelier,0.0,13,11808.0,12.5,52.200001,39.580002,0,0.0,2007-08-05,705.0,709.0,2,,94.0,1.0,,5.0,1.0,19.0,1.0,83609.0,51364.0,49.799999,24700.0,26.0,40900.0,12.0,,1.0,0,0.0,0.0,0.0,,19.0,4.0,4.0,268842.0,228510.0,12027.0,8.0,8.0,8.0,100.0,0.0,0.0,0,13.0,148.0,12.0,34.0,5.0,66700.0,12.0,12.0,0.0,INDIVIDUAL,,,,2.0,1.0,5.0,4.0,32245.0,78.8,3.0,8.0,5302.0,66.400002,4.0,0.0,4.0,1318.2,,,,NaT,,,,,,,,,,DIRECT_PAY,6.0,,-0.232829,0.637966,0.594595,0.607172
2,163223097,205380433,12000.0,9625.0,36,23.049999,12.86,1.52,464.83,D,D3,,MORTGAGE,0.0,NOT_VERIFIED,2019-12-11T10:26:53.000-08:00,2020-01-10T18:00:00.000-08:00,2019-12-11T18:00:00.000-08:00,2019-12-11T10:18:13.000-08:00,2019-12-11T14:09:33.000-08:00,APPROVED,,debt_consolidation,707xx,LA,,2019-12-11T18:00:00.000-08:00,F,,0.0,11,2204.0,50.0,79.199997,,0,0.0,2012-08-10,665.0,669.0,2,31.0,,1.0,31.0,1.0,1.0,12.0,0.0,40587.0,11281.0,66.400002,10600.0,18.0,31679.0,13.0,31.0,0.0,0,0.0,0.0,0.0,,12.0,7.0,1.0,220199.0,200369.0,16697.0,5.0,4.0,4.0,83.0,0.0,0.0,0,4.0,88.0,9.0,78.0,1.0,17000.0,9.0,10.0,107.0,JOINT,120000.0,12.46,NOT_VERIFIED,3.0,1.0,2.0,9.0,29306.0,92.5,6.0,9.0,7348.0,83.400002,4.0,0.0,2.0,1033.0,,615.0,619.0,2006-02-10,0.0,1.0,9.0,83.400002,4.0,0.0,0.0,76.0,11281.0,CASH,1.0,4.0,-0.146245,0.757973,0.657481,0.686624
3,163107485,205213343,20000.0,14150.0,36,25.65,12.86,1.52,802.09,D,D4,,RENT,40000.0,NOT_VERIFIED,2019-12-09T10:12:29.000-08:00,2020-01-09T10:00:00.000-08:00,2019-12-10T10:00:00.000-08:00,2019-12-09T10:10:55.000-08:00,2019-12-10T06:16:20.000-08:00,APPROVED,,credit_card,021xx,MA,,2019-12-10T10:00:00.000-08:00,F,,0.0,1,4690.0,75.0,70.900002,28.450001,0,0.0,1978-10-08,685.0,689.0,0,37.0,,,37.0,15.0,0.0,8.0,0.0,29497.0,14653.0,64.599998,16100.0,10.0,0.0,9.0,,0.0,0,0.0,0.0,0.0,,8.0,0.0,15.0,39749.0,29497.0,3687.0,4.0,4.0,4.0,90.0,0.0,0.0,0,0.0,,7.0,494.0,15.0,22700.0,7.0,7.0,0.0,INDIVIDUAL,,,,0.0,0.0,0.0,,0.0,,0.0,1.0,4459.0,74.199997,0.0,0.0,0.0,0.0,,,,NaT,,,,,,,,,,CASH,0.0,,-0.196215,0.638309,0.621187,0.626153
4,163158677,205265727,25000.0,16625.0,36,17.74,7.96,1.23,900.56,C,C5,2 years,MORTGAGE,115000.0,SOURCE_VERIFIED,2019-12-10T08:38:53.000-08:00,2020-01-09T14:00:00.000-08:00,2019-12-10T14:00:00.000-08:00,2019-12-10T08:33:52.000-08:00,2019-12-10T10:03:35.000-08:00,APPROVED,,credit_card,853xx,AZ,,2019-12-10T14:00:00.000-08:00,F,recreation supervisor,0.0,1,876.0,66.699997,93.699997,11.66,2,0.0,2010-05-09,665.0,669.0,1,7.0,,2.0,24.0,11.0,2.0,5.0,0.0,30413.0,12924.0,93.699997,13800.0,27.0,53069.0,8.0,24.0,0.0,2,0.0,1.0,0.0,7.0,5.0,1.0,11.0,66869.0,30413.0,6083.0,5.0,3.0,3.0,73.0,2.0,0.0,0,15.0,115.0,3.0,82.0,11.0,13800.0,3.0,3.0,0.0,INDIVIDUAL,,,,0.0,0.0,0.0,37.0,17489.0,16.2,1.0,1.0,5966.0,53.0,1.0,3.0,3.0,0.0,,,,NaT,,,,,,,,,,CASH,2.0,,-0.107237,0.797711,0.685813,0.718263


In [9]:
investable_loans = api_loans.query("catboost_comb_29 >= {0}".format(scr_util.min_comb_29_score))
investable_loans = investable_loans.sort_values('catboost_comb_29', ascending=False)

In [10]:
to_order_loan_ids = investable_loans['id'].values[:n_to_pick]
# See how many loans to pick from, set up order
orders_dict = {'aid': inv_acc_id}
orders_list = []
for loan_ids in to_order_loan_ids:
    orders_list.append({'loanId': int(loan_ids),
                        'requestedAmount': int(inv_amt),
                        'portfolioId': int(portfolio_id)})
orders_dict['orders'] = orders_list
payload = json.dumps(orders_dict)

In [11]:
def submit_lc_order():
    if cash_to_invest >= cash_limit:
        order_response = requests.post(order_url, headers=header, data=payload)
    else:
        pass

In [18]:
# temporary, should be api_loans called with allowing all loans, even if we
# invested in it already
all_loan_count = len(api_loans)
min_score = scr_util.min_comb_29_score
order_response = "testing from Justin"

In [22]:




#     print('Cash to invest is ${0}. Waiting for at least ${1} cash before investing'.format(
#         cash_to_invest, cash_limit))

# ids_and_scores.index.name = 'loan_id'    

def send_emails():
    subject = now.strftime("%Y-%m-%d %H:%M:%S.%f") + ' Investment Round'
    smtpserver = smtplib.SMTP('smtp.gmail.com',587)
    smtpserver.ehlo()
    smtpserver.starttls()
    smtpserver.login(my_gmail_account, my_gmail_password)
    message = '''
Ran investment round.
Cash to invest: ${0}, meaning {1} possible notes to invest in at ${2} each.
{3} loans seen through api in total.
{4} loans seen through api excluding already invested. 
{5} could be ordered due to score or cash available. Min score cutoff is {6}
Response: {7}, {8}
Scores from this batch:
{9}
    '''.format(cash_to_invest, n_to_pick, inv_amt, all_loan_count, len(api_loans), len(to_order_loan_ids), min_score, order_response, 'order_resp.content goes here', api_loans[['id', 'catboost_clf', 'catboost_regr', 'catboost_regr_scl', 'catboost_comb_29']]) #order_response.content
    msg = """From: %s\nTo: %s\nSubject: %s\n\n%s""" % (my_gmail_account, my_recipients, subject, message)
    smtpserver.sendmail(my_gmail_account, my_recipients, msg)
    smtpserver.close()
    
    
# send out the e-mails
send_emails()

# write some stats to a google spreadsheet
# TODO from https://www.twilio.com/blog/2017/02/an-easy-way-to-read-and-write-to-a-google-spreadsheet-in-python.html
sheet.append_row([now.strftime("%Y-%m-%d %H:%M:%S.%f"), all_loan_count])

# decide where I want to do these
# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)

In [23]:
my_recipients

['ruwenus@gmail.com']

In [14]:
%debug

> [0;32m<ipython-input-13-d00c2340b0f6>[0m(21)[0;36msend_emails[0;34m()[0m
[0;32m     19 [0;31m[0mScores[0m [0;32mfrom[0m [0mthis[0m [0mbatch[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     20 [0;31m[0;34m{[0m[0;36m9[0m[0;34m}[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 21 [0;31m    '''.format(cash_to_invest, n_to_pick, inv_amt, len(all_loan_count), len(api_loans), len(to_order_loan_ids), min_score, order_response, order_response.content, ids_and_scores)
[0m[0;32m     22 [0;31m    [0mmsg[0m [0;34m=[0m [0;34m"""From: %s\nTo: %s\nSubject: %s\n\n%s"""[0m [0;34m%[0m [0;34m([0m[0mmy_gmail_account[0m[0;34m,[0m [0mmy_recipients[0m[0;34m,[0m [0msubject[0m[0;34m,[0m [0mmessage[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     23 [0;31m    [0msmtpserver[0m[0;34m.[0m[0msendmail[0m[0;34m([0m[0mmy_gmail_account[0m[0;34m,[0m [0mmy_recipients[0m[0;34m,[0m [0mmsg[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> di

In [56]:
my_gmail_account

'justindlrig@gmail.com'

In [None]:
api_loans[['exp_default_rate', 'catboost_clf']]

## OLD

In [None]:




# Load models and things for models
# RF
rf = investing_utils.load_RF()
with open(f'{investing_utils.data_save_path}/for_proc_df_model_loading.pkl', 'rb') as handle:
    nas_all_train, embeddings_all_train, train_cols_meds_all_train, use_cols, cols_all_train, col_cat_dict, mean_stdev_mapper_all_train, dl_df_train, dl_ys_train, cat_vars, emb_szs = pickle.load(handle)
    
# process the dataframe before I'm able to set up the neural net _____________
# wait until it is time to do the api call. I'm rate limited to 1 call a second
investing_utils.pause_until_time(test=False)

# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)
api_loans, api_ids = investing_utils.get_loans_and_ids(
    header, exclude_already=True)

# cut api loans to cols that are cols I'll need
api_ori_use_cols = [col for col in api_loans.columns if col in use_cols]
api_loans = api_loans[api_ori_use_cols]
api_loans['fake_ys'] = -999
date_cols = ['earliest_cr_line', 'sec_app_earliest_cr_line']
for col in date_cols:
    api_loans[col] = pd.to_datetime(api_loans[col]).apply(lambda dt: dt.replace(day=1))
investing_utils.add_dateparts(api_loans)    
investing_utils.train_cats(api_loans)
ordered_cat_cols = ['grade', 'sub_grade']
for col in col_cat_dict.keys():
    if col in ordered_cat_cols:
        ordered = True
    else:
        ordered = False
    api_loans[col] = pd.Categorical(api_loans[col], categories = col_cat_dict[col], ordered = ordered)
X_test, y_test, nas, _, mean_stdev_mapper = investing_utils.proc_df_justin(api_loans, 'fake_ys', valid_test = True, do_scale=True, na_dict=nas_all_train, mapper = mean_stdev_mapper_all_train, train_cols_meds=train_cols_meds_all_train, cols=cols_all_train)
# fake a last row for val_idxs for X_test and y_test
fake_row = pd.DataFrame(X_test.shape[1]*[-999]).T
fake_row.columns=X_test.columns
X_test = X_test.append(fake_row)
y_test = np.append(y_test, np.array([-999]))

# setup NN and load saved weights
md = investing_utils.ColumnarModelData.from_data_frame(investing_utils.PATH_NN, val_idxs=[len(X_test)-1], df=X_test, y=y_test, cat_flds=cat_vars, bs=1000, test_df=X_test.iloc[:-1,:])
n_cont = len(dl_df_train.columns)-len(cat_vars)
nn = md.get_learner(emb_szs, n_cont, 0.05, 1, [1000,500,500,250,250], [0.2,0.2,.2,.15,.05])
nn.load(f'{investing_utils.PATH_NN}{investing_utils.regr_version_NN}_{investing_utils.training_type}.pth')

# score the api_loans, filter to min score
# net score
nn_api_yhat = nn.predict(is_test=True)
nn_api_yhat = nn_api_yhat.reshape(-1)
# rf score
rf_api_yhat = rf.predict(X_test.iloc[:-1,:])
#combined score
api_yhat = (nn_api_yhat + rf_api_yhat)/2


# matching scores and loans
ids_and_scores = pd.DataFrame(pd.Series(dict(zip(api_ids, api_yhat))))
def get_preds(RF): return RF.predict(X_test.iloc[:-1,:])
preds = np.stack(investing_utils.parallel_trees(rf, get_preds))
# CIs = investing_utils.make_CIs(preds)
ids_and_scores = pd.DataFrame(ids_and_scores)
ids_and_scores.rename(columns={0:'3.0.0_score'}, inplace=True)
# ids_and_scores['rf_mean'] = CIs['mean'].values
# ids_and_scores['rf_std_dev'] = CIs['std_dev'].values
ids_and_scores = ids_and_scores.sort_values('3.0.0_score',ascending=False)
loans_to_pick_from = ids_and_scores[ids_and_scores['3.0.0_score'] >= min_score]
loans_to_pick_from = loans_to_pick_from.sort_values('3.0.0_score', ascending=False)





# See how many loans to pick from, set up order
orders_dict = {'aid': inv_acc_id}
orders_list = []
for loan_ids in to_order_loan_ids:
    orders_list.append({'loanId': int(loan_ids),
                        'requestedAmount': int(inv_amt),
                        'portfolioId': int(portfolio_id)})
orders_dict['orders'] = orders_list
payload = json.dumps(orders_dict)
if cash_to_invest >= cash_limit:
    order_response = requests.post(order_url, headers=header, data=payload)
else:
    pass
#     print('Cash to invest is ${0}. Waiting for at least ${1} cash before investing'.format(
#         cash_to_invest, cash_limit))

ids_and_scores.index.name = 'loan_id'    

def send_emails():
    subject = now.strftime("%Y-%m-%d %H:%M:%S.%f") + ' Investment Round'
    smtpserver = smtplib.SMTP('smtp.gmail.com',587)
    smtpserver.ehlo()
    smtpserver.starttls()
    smtpserver.login(my_gmail_account, my_gmail_password)
    message = '''
Ran investment round.
Cash to invest: ${0}, meaning {1} possible notes to invest in at ${2} each.
{3} loans seen through api in total.
{4} loans seen through api excluding already invested. 
{5} could be ordered due to score or cash available. Min score cutoff is {6}
Response: {7}, {8}
Scores from this batch:
{9}
    '''.format(cash_to_invest, n_to_pick, inv_amt, len(all_loan_count), len(api_loans), len(to_order_loan_ids), min_score, order_response, order_response.content, ids_and_scores)
    msg = """From: %s\nTo: %s\nSubject: %s\n\n%s""" % (my_gmail_account, my_recipients, subject, message)
    smtpserver.sendmail(my_gmail_account, my_recipients, msg)
    smtpserver.close()
    
    
# send out the e-mails
send_emails()

# write some stats to a google spreadsheet
# TODO from https://www.twilio.com/blog/2017/02/an-easy-way-to-read-and-write-to-a-google-spreadsheet-in-python.html
sheet.append_row([now.strftime("%Y-%m-%d %H:%M:%S.%f"), len(all_loan_count)])

# decide where I want to do these
# get the loans and process the dataframe
_, all_loan_count = investing_utils.get_loans_and_ids(
    header, exclude_already=False)

# Writing out to investing Utils

In [22]:
%%writefile investing_utils.py
import requests
import json
import re
import pandas as pd
import numpy as np
import datetime as dt
import lendingclub.account_info as acc_info
import pause
from fastai.imports import *
from fastai.structured import *
from fastai.column_data import *
from sklearn.base import TransformerMixin, BaseEstimator
from pandas_summary import DataFrameSummary
from sklearn.externals import joblib

class StandardScalerJustin(TransformerMixin, BaseEstimator):
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy
    
    def fit(self, X, y=None):
        if type(X) == np.ndarray:
            X = pd.Series(X.reshape(-1))
        self.mean_ = X.dropna().mean()
        self.var_ = X.dropna().var()
        return self

    def transform(self, X):
        mean = self.mean_
        std_dev = np.sqrt(self.var_)
        if std_dev == 0:
            return X
        return (X-mean)/std_dev
    
def fit_scalers(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n],StandardScalerJustin()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    return mapper    

def proc_df_justin(df, y_fld, valid_test, skip_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None, train_cols_meds=None, cols=None):

    """ proc_df takes a data frame df and splits off the response variable, and
    changes the df into an entirely numeric dataframe.

    Parameters:
    -----------
    df: The data frame you wish to process.

    y_fld: The name of the response variable
    
    valid_test: boolean indicating if this is a df to match to train columns.

    skip_flds: A list of fields that dropped from df.

    do_scale: Standardizes each column in df,Takes Boolean Values(True,False)

    na_dict: a dictionary of na columns to add. Na columns are also added if there
        are any missing values.

    preproc_fn: A function that gets applied to df.

    max_n_cat: The maximum number of categories to break into dummy values, instead
        of integer codes.

    subset: Takes a random subset of size subset from df.

    mapper: If do_scale is set as True, the mapper variable
        calculates the values used for scaling of variables during training time(mean and standard deviation).
        
    train_cols_meds: dict where keys are columns from training and values are medians, use for values to fill an entire missing column (shouldn't be needed when used to actually pick loans, was needed for train/valid/test due to new fields being added over the timeframe and missing in certain datasets while existing in others)
    
    cols: Just to compare column order and ensure the variables are in the right order.

    Returns:
    --------
    [x, y, nas, mapper(optional)]:

        x: x is the transformed version of df. x will not have the response variable
            and is entirely numeric.

        y: y is the response variable

        nas: returns a dictionary of which nas it created, and the associated median.

        mapper: A DataFrameMapper which stores the mean and standard deviation of the corresponding continous
        variables which is then used for scaling of during test-time."""        
    assert type(valid_test) == bool, print('must indiciate if this is test/valid set to match columns with train')
    
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    y = df[y_fld].values
    df.drop(skip_flds+[y_fld], axis=1, inplace=True)

    # fit the scalers
    if do_scale: mapper = fit_scalers(df, mapper)
    if na_dict is None: na_dict = {}      
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    df[mapper.transformed_names_] = mapper.transform(df)
    embeddings=[]
    for n,c in df.items():
        numericalize(df, c, n, max_n_cat)
        if not is_numeric_dtype(c):
            embeddings.append(prep_embeddings(c, n))
    df = pd.get_dummies(df, dummy_na=True)
    # fix the nas
    if valid_test:
        for col, med in train_cols_meds.items():
            try:
                df[col].fillna(med, inplace=True)
            except KeyError:
                print(col)
                df[col] = med
        df = df[cols]
        
    res = [df, y, na_dict, embeddings]
    if not valid_test: res += [res[0].median(), res[0].columns]
    if do_scale: res = res + [mapper]
    return res

def prep_embeddings(c, n):
    # allocate in embeddings for a null
    return (n, len(c.cat.categories)+1)

def eval_models(trials, port_size, available_loans, regr_version, X_test, y_test,
                default_series, yhat_test): #regr, 
    results = {}
    pct_default = {}
    test_copy = X_test.copy()
    
    default_series = default_series.loc[X_test.index]
    yhats_ys_defs = pd.DataFrame([yhat_test, y_test, default_series.values]).T
    yhats_ys_defs.rename(columns={0:'yhat', 1:'y', 2:'defaults'}, inplace=True)
    for trial in tqdm_notebook(np.arange(trials)):
        # of all test loans, grab a batch of n=available_loans
        available_idx = np.random.choice(
            np.arange(len(test_copy)), available_loans, replace=False)
        available_loans_df = yhats_ys_defs.ix[available_idx,:]
        available_loans_df.sort_values('yhat', inplace=True, ascending=False)
        picks = available_loans_df[:port_size]
        results[trial] = picks['y'].mean()
        pct_default[trial] = picks['defaults'].sum()/port_size
    pct_default_series = pd.Series(pct_default)
    results_df = pd.DataFrame(pd.Series(results))
    results_df['pct_def'] = pct_default_series
    results_df.columns = pd.MultiIndex(levels=[[regr_version], [0.07, 'pct_def']],
           labels=[[0, 0,], [0, 1,]],
           names=['discount_rate', 'model'])
    return results_df

def load_RF():
    return joblib.load(f'{PATH_RF}{regr_version_RF}_{training_type}.pkl')
    
def add_dateparts(df):
    '''Uses the fastai add_datepart to turn datetimes into numbers to process
       does not do it for issue_d'''
    date_cols = df.select_dtypes(['datetime64']).columns
    for date_col in date_cols:
        if date_col not in special_cols:
            add_datepart(df, date_col, drop=True)
    return [col for col in date_cols if col not in special_cols]    

def pause_until_time(test=False):
    # pause 3 seconds, then print hello world
    now = dt.datetime.now()
    current_hour = now.hour
    if not test:
        pause_until = dt.datetime(
            now.year, now.month, now.day, now.hour + 1, 0, 0)
    if test:
        # if testing, wait 2 seconds and print('will pause 2 seconds')
        pause_until = dt.datetime(
            now.year, now.month, now.day, now.hour, now.minute, now.second + 2)
#         print('will pause 2 seconds')
#     print('right now it is {0}, pausing until {1}'.format(
#         now.strftime('%H:%M:%S'), pause_until.strftime('%H:%M:%S')))
    pause.until(pause_until)


def convert_to_underscore(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([0-9A-Z])', r'\1_\2', s1).lower()


def get_already_invested_filter_id(header):
    filters_list = json.loads(requests.get(
        'https://api.lendingclub.com/api/investor/v1/accounts/' + str(inv_acc_id) + '/filters', headers=header).content)
    filters_df = pd.DataFrame(filters_list['filters'])
    # I manually made a single filter that excludes loans already invested in.
    # Not sure if there is a way to do this entirely through the api.
    return filters_df[filters_df['name'] == 'exclude_already_invested'].iloc[0, 0]


def get_loans_and_ids(header, exclude_already=True):
    '''Gets loans from lendingclub with the single filter of exclude loans already invested in.'''
    if exclude_already:
        filter_id = get_already_invested_filter_id(header)
        payload = {'showAll': 'true', 'filterId': filter_id}
        resp = requests.get(
            'https://api.lendingclub.com/api/investor/v1/loans/listing', headers=header, params=payload) #'https://api.lendingclub.com/api/investor/v1/loans/listing'
        loans_list = json.loads(resp.content)['loans']
    if not exclude_already:
        payload = {'showAll': 'true'}
        resp = requests.get(
            'https://api.lendingclub.com/api/investor/v1/loans/listing', headers=header, params=payload)
        loans_list = json.loads(resp.content)['loans']

    api_loans = pd.DataFrame(loans_list)
    api_loans.columns = np.array(
        [convert_to_underscore(col) for col in api_loans.columns.values])
    # save the loan ids
    loan_ids = api_loans['id']
    return api_loans, loan_ids


def match_col_names(api_loans):
    # cols to add
    # make a col of nans so cols match up exactly
    api_loans['issue_d'] = 0
    api_loans['line_history_m'] = 0
    api_loans['maturity_paid'] = 0
    api_loans['maturity_time'] = 0
    api_loans['npv_roi_10'] = 0
    api_loans['orig_amt_due'] = 0
    api_loans['target_loose'] = 0
    api_loans['target_strict'] = 0
    api_loans['fico'] = 0

    cols_to_drop_immediately = [
        'accept_d',
        'credit_pull_d',
        'desc',
        'emp_title',
        'exp_d',
        'exp_default_rate',
        'funded_amount',
        'housing_payment',
        'id',
        'ils_exp_d',
        'initial_list_status',
        'investor_count',
        'list_d',
        'member_id',
        'mtg_payment',
        'review_status',
        'review_status_d',
        'sec_app_earliest_cr_line',
        'sec_app_fico_range_high',
        'sec_app_fico_range_low',
        'service_fee_rate',
    ]
    api_loans.drop(cols_to_drop_immediately, axis=1, inplace=True)
    rename_dict = {
        'acc_open_past_24_mths': 'acc_open_past_24mths',
        'addr_zip': 'zip_code',
        'delinq_2_yrs': 'delinq_2yrs',
        'i_l_util': 'il_util',
        'inq_last_6_mths': 'inq_last_6mths',
        'installment': 'installment_amount',
        'is_inc_v': 'verification_status',
        'is_inc_v_joint': 'verification_status_joint',
        'loan_amount': 'loan_amnt',
        'num_accts_ever_12_0_ppd': 'num_accts_ever_120_pd',
        'num_tl_12_0dpd_2m': 'num_tl_120dpd_2m',
        'sec_app_inq_last_6_mths': 'sec_app_inq_last_6mths',
    }
    api_loans.rename(columns=rename_dict, inplace=True)
    return api_loans


def match_existing_cols_to_csv(api_loans):
    api_loans.fillna(value=np.nan, inplace=True)
    api_loans['all_util'] = api_loans['all_util'] / 100.0
    api_loans['application_type'] = api_loans['application_type'].str.lower()

    # turn employment length into categorical
    emp_len_dict = {np.nan: 'n/a',
                    0.0: '< 1 year',
                    12.0: '1 year',
                    24.0: '2 years',
                    36.0: '3 years',
                    48.0: '4 years',
                    60.0: '5 years',
                    72.0: '6 years',
                    84.0: '7 years',
                    96.0: '8 years',
                    108.0: '9 years',
                    120.0: '10+ years', }
    api_loans['emp_length'] = api_loans['emp_length'].replace(emp_len_dict)
    api_loans['home_ownership'] = api_loans['home_ownership'].str.lower()
    api_loans['int_rate'] = api_loans['int_rate'] / 100.0

    # verification status
    dic_veri_status = {'NOT_VERIFIED': 'none',
                       'SOURCE_VERIFIED': 'source',
                       'VERIFIED': 'platform'}
    api_loans['verification_status'] = api_loans[
        'verification_status'].replace(dic_veri_status)
    api_loans['verification_status_joint'] = api_loans[
        'verification_status_joint'].replace(dic_veri_status)
    api_loans['pct_tl_nvr_dlq'] = api_loans['pct_tl_nvr_dlq'] / 100.0
    api_loans['percent_bc_gt_75'] = api_loans['percent_bc_gt_75'] / 100.0
    api_loans['revol_util'] = api_loans['revol_util'] / 100.0
    return api_loans


def make_missing_cols_and_del_dates(api_loans):
    # probably something with earliest credit line, fico range high/low
    # need to add line_history_m, orig_amt_due, fico
    api_loans['fico'] = (api_loans['fico_range_high'] +
                         api_loans['fico_range_low']) / 2
    # line_history_m depends on issue_d, which doesn't exist for listed loans.
    # Assume it takes one month to issue so increase the number compared to
    # the csvs by 1
    today = pd.to_datetime(dt.date.today())
    api_loans['earliest_cr_line'] = pd.to_datetime(
        api_loans['earliest_cr_line'])
    line_hist_d = (today - api_loans['earliest_cr_line']) / np.timedelta64(
        1, 'D')
    api_loans['line_history_m'] = (line_hist_d * (12 / 365.25)).round() + 1
    api_loans['orig_amt_due'] = api_loans[
        'term'] * api_loans['installment_amount']

    api_loans.drop(['earliest_cr_line', 'fico_range_high',
                    'fico_range_low'], axis=1, inplace=True)
    return api_loans


def verify_df_base_cols(api_loans, test_loans):
    api_cols = api_loans.columns.values.copy()
    api_cols.sort()
    csv_cols = test_loans.columns.values.copy()
    csv_cols.sort()
    assert len(api_cols) == len(csv_cols)
    examine = dict(zip(api_cols, csv_cols))
    for key, val in examine.iteritems():
        if key != val:
            print(key, val)
            return None
    return True

def make_CIs(preds):
    means = np.mean(preds, axis=0)
    std_devs = np.std(preds, axis=0)
    df = pd.DataFrame(np.zeros((preds.shape[1],2)), columns=['mean', 'std_dev'])
    df['mean'] = means
    df['std_dev'] = std_devs
    return df

# constants
inv_acc_id = acc_info.investor_id
special_cols = []
platform = 'lendingclub'
datapath = '/home/justin/all_data/'
PATH_NN = f'{datapath}{platform}/NN/'
PATH_RF = f'{datapath}{platform}/RF/'
data_save_path = f'{datapath}{platform}/'
training_type = 'all'
regr_version_RF = '0.2.2'
regr_version_NN = '1.0.1'

Overwriting investing_utils.py
