# Prerequisite: build `npv_rois` (stored as `loan_npv_rois`) first; this notebook loads it.

In [1]:
%matplotlib inline
pd.set_option('display.max_columns',500)

In [2]:
# NOTE(review): `dc` is not referenced in any visible cell, while `datapath`
# (used to build `store_path` in the load cell) is never defined — presumably it
# is meant to come from this module; confirm.
import dir_constants as dc
# Random seed used for the shuffled train/validate/test split further down.
seed = 42

In [3]:
# Load the three input tables from the project's HDF5 store.
# NOTE(review): `datapath` is not defined in any visible cell — confirm where
# it is supposed to come from.
store_path = f'{datapath}lendingclub/lendingclub.h5'
# The context manager closes the file even when a key lookup raises. `store`
# stays bound afterwards (closed), so the save cell can reopen it.
with pd.HDFStore(store_path) as store:
    loan_info = store['clean_loan_info']
    npv_rois = store['loan_npv_rois']
    issue_d_id_mats = store['issue_d_id_mats']

In [4]:
# Show every id that repeats an earlier row's id; an empty Series means ids are unique.
loan_info.loc[loan_info.duplicated(subset='id'), 'id']

Series([], Name: id, dtype: int64)

# Split the data the same way it is used in practice: train on older loans
# and test on newer ones.

In [6]:
# Keep only rows whose `maturity_paid` and `maturity_time` both exceed 0.9.
paid_mask = issue_d_id_mats['maturity_paid'] > .9
time_mask = issue_d_id_mats['maturity_time'] > .9
nearly_done = issue_d_id_mats[paid_mask & time_mask]

In [7]:
# Temporal split: the most recently issued nearly-done loans form the
# test/validation pool; training uses loans that had already ended before that
# pool's first issue date.
# NOTE(review): if `issue_d` is month-granular, `>= max - 3 months` spans four
# issue months rather than a single quarter — confirm this is intended.
latest_issue_d = nearly_done['issue_d'].max()
cutoff_issue_d = latest_issue_d - pd.DateOffset(months=3)
is_recent_issue = nearly_done['issue_d'] >= cutoff_issue_d
ended_before_cutoff = nearly_done['end_d'] < cutoff_issue_d
test_valid = nearly_done.loc[is_recent_issue]
train = nearly_done.loc[ended_before_cutoff]

In [8]:
# Halve the test/validation pool: a seeded random 50% of ids become the test
# ids and the remaining ids become the validation ids. The shared `seed` is
# used instead of repeating the literal 42.
ctest_id = test_valid['id'].sample(frac=.5, replace=False, random_state=seed).values
# Membership is checked against a set; `in` on the numpy array rescanned the
# whole array for every id (quadratic in the pool size).
ctest_id_lookup = set(ctest_id)
cvalid_id = [loan_id for loan_id in test_valid['id'] if loan_id not in ctest_id_lookup]
ctrain_id = train['id']
# The three id groups must not share any id.
assert ctest_id_lookup.isdisjoint(cvalid_id)
assert ctest_id_lookup.isdisjoint(ctrain_id)
assert set(ctrain_id).isdisjoint(cvalid_id)

In [9]:
# Pull the loan_info rows belonging to each id group.
loan_ids = loan_info['id']
ctest = loan_info[loan_ids.isin(ctest_id)]
cvalid = loan_info[loan_ids.isin(cvalid_id)]
ctrain = loan_info[loan_ids.isin(ctrain_id)]
# Every id must have matched exactly one loan_info row.
for group_ids, group_frame in ((ctest_id, ctest), (cvalid_id, cvalid), (ctrain_id, ctrain)):
    assert len(group_ids) == len(group_frame)

In [None]:
# Columns excluded when `keep_cols` is built, and therefore absent from the
# `*_filtered_columns` datasets written at the end of the notebook.
# Entries that are commented out are NOT excluded, i.e. they stay available.
# NOTE(review): many of these look like identifiers or post-origination outcome
# fields (payments, recoveries, last_* values) that would leak the target —
# confirm the rationale for each group.
dont_keep_cols = [
    'id',
    'member_id',
    'funded_amnt',
    'installment_funded',
    'emp_title',
    #'issue_d',
    'loan_status',
    'pymnt_plan',
    'fico_range_low',
    'fico_range_high',
    'initial_list_status',
    'out_prncp',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_d',
    'last_pymnt_amnt',
    'next_pymnt_d',
    'last_credit_pull_d',
    'last_fico_range_high',
    'last_fico_range_low',
    'policy_code',
    'end_d',
    #'line_history_m',
    'line_history_y',
    'months_paid',
    'earliest_cr_line',
    'sec_app_earliest_cr_line',
#     'maturity_time',
    'rem_to_be_paid',
#     'maturity_paid',
    'roi_simple',
#     'target_strict',
#     'target_loose',
]

In [None]:
# Start with loans whose status is final, and keep the matching npv_rois rows.
done_statuses = ['paid', 'defaulted', 'charged_off']
is_done = loan_info['loan_status'].isin(done_statuses)
loan_info = loan_info[is_done]
npv_rois = npv_rois.loc[loan_info.index, :]

In [None]:
# Attach the 0.1 column of npv_rois to the loans as `npv_roi_10`.
# `assign` returns a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on the filtered slice produced by the previous cell.
loan_info = loan_info.assign(npv_roi_10=npv_rois[0.1])
# Values above 0.9 look unrealistic, so drop them; nulls are dropped too
# (none are expected, and `<=` already evaluates to False for NaN).
npv_roi_10 = loan_info['npv_roi_10']
loan_info = loan_info[npv_roi_10.notnull() & (npv_roi_10 <= 0.9)]

In [None]:
# Partition the loans by status: `paid` versus everything else.
is_paid = loan_info['loan_status'].isin(['paid'])
paid = loan_info[is_paid]
defaulted = loan_info[~is_paid]

In [None]:
# Histogram of npv_roi_10 over the remaining loans, labelled so the figure
# stands on its own; the trailing `;` hides the repr of the returned value.
ax = loan_info['npv_roi_10'].hist(bins=50)
ax.set(title='Distribution of npv_roi_10', xlabel='npv_roi_10', ylabel='Number of loans');

In [None]:
# Feature columns to keep: everything not explicitly excluded, minus columns
# that hold at most one distinct value (NaN counted as a value).
# nunique() is used instead of len(value_counts()): on a categorical column
# value_counts() also lists categories that never occur, so a constant
# categorical column would not be detected.
excluded_cols = set(dont_keep_cols)
candidate_cols = [col for col in loan_info.columns if col not in excluded_cols]
useless_cols = [
    col for col in candidate_cols
    if loan_info[col].nunique(dropna=False) <= 1
]
keep_cols = [col for col in candidate_cols if col not in useless_cols]

In [None]:
# Seeded shuffle, then a 70% / 10% / 20% train / validate / test split by position.
# Plain iloc slices replace np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes.
# Note: this rebinds `train`, which previously held the temporal split.
shuffled = loan_info.sample(frac=1, random_state=seed)
n_rows = len(shuffled)
train_end = int(.7 * n_rows)
validate_end = int(.8 * n_rows)
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]
# The three pieces must cover every row exactly once.
assert len(train) + len(validate) + len(test) == n_rows

In [None]:
# Persist the filtered loan set and its three splits, each both with all
# columns and restricted to `keep_cols`.
datasets_to_save = {
    'base_dataset': loan_info,
    'train': train,
    'validate': validate,
    'test': test,
}
store.open()
try:
    for dataset_name, frame in datasets_to_save.items():
        store[f'{dataset_name}_all_columns'] = frame
        store[f'{dataset_name}_filtered_columns'] = frame[keep_cols]
finally:
    # Close the file even if one of the writes fails.
    store.close()