# I chose a min_score in the other Jupyter notebook, but when I look at the max scores of investment rounds, the highest scores are always 1-1.5% below the score cutoff threshold. My theory is that, at the highest score percentile, the model is picking B-grade loans that had a higher interest rate back then. In this notebook I will look at the distribution of issuance dates within each percentile.

In [None]:
import modeling_utils.data_prep as data_prep
from sklearn.externals import joblib
import time

In [None]:
# Name of the lending platform; it is substituted into the store filename.
platform = 'lendingclub'

# Build the path of the per-platform HDF5 store, then create the handle.
store_path = '/Users/justinhsi/justin_tinkering/data_science/lendingclub/{0}_store.h5'.format(platform)
store = pd.HDFStore(store_path, append=True)

# Pull in loans and run the Monte Carlo simulation over the batches again

In [None]:
# Read the test loans and the per-loan NPV ROIs from the HDF5 store.
# try/finally guarantees the file handle is released even when a key is
# missing or a read fails part-way through.
store.open()
try:
    test = store['test_filtered_columns']
    loan_npv_rois = store['loan_npv_rois']
finally:
    store.close()

# The 'target_strict' column of the test loans, kept as its own series.
# It only needs the in-memory frame, so it is taken after the store is closed.
default_series = test['target_strict']

# Add scores and npv_roi_5 to test set

In [None]:
# Load the persisted regressor and record the version it was saved under.
regr_version = '0.2.1'
regr = joblib.load('model_dump/model_0.2.1.pkl')

# process_data_test returns a pair, unpacked here as (test_X, test_y);
# only the 'npv_roi_10' column of test_y is kept, as a plain array.
test_X, test_y = data_prep.process_data_test(test)
test_y = test_y['npv_roi_10'].values

# Predict on the processed test inputs and attach the predictions to the
# test frame as the score column.
test_yhat = regr.predict(test_X)
test['0.2.1_scores'] = test_yhat

# Attach the loan_npv_rois column keyed by 0.05 (presumably the 5%
# discount rate -- confirm); assignment aligns on the frame index.
test['npv_roi_5'] = loan_npv_rois[.05]

# find what is a good percentile to cutoff at, and what the distribution for scores is at that percentile

In [None]:
# Percentile labels 100, 99, ..., 71 in descending order.
good_percentiles = np.arange(100, 70, -1)

In [None]:
def _progress(iterable):
    """Wrap *iterable* in a notebook progress bar, if one can be created.

    Falls back to returning *iterable* unchanged when tqdm (or the widget
    backend it needs) cannot be imported, so the simulation still runs.
    """
    try:
        from tqdm import tqdm_notebook
        return tqdm_notebook(iterable)
    except ImportError:
        return iterable


def find_min_score_models(trials, available_loans, test, percentiles,
                          return_percentile_ids=False):
    """Simulate random loan batches and summarise their top score chunks.

    Each trial draws ``available_loans`` loans from ``test`` without
    replacement, ranks them by ``'0.2.1_scores'`` (best first) and cuts the
    ranking into chunks of 1% of the batch.  The k-th entry of
    ``percentiles`` labels the k-th best chunk, so ``percentiles`` is
    expected to be in descending order starting at the top percentile
    (e.g. 100, 99, ..., 71).

    Args:
        trials: number of random batches to simulate.
        available_loans: number of loans drawn per batch; must be at least
            100 so that a 1% chunk holds at least one loan.
        test: DataFrame with ``'0.2.1_scores'`` and ``'npv_roi_5'`` columns.
            It is not modified.
        percentiles: labels for the successive best-first chunks.
        return_percentile_ids: when True, also return the ids of the loans
            that landed in each chunk.

    Returns:
        ``(npv_rois, scores)``: two DataFrames with one row per trial and
        one column per percentile label, holding the chunk's mean
        ``'npv_roi_5'`` and mean ``'0.2.1_scores'`` respectively.  With
        ``return_percentile_ids=True`` a third element is appended: a dict
        mapping trial -> Series (indexed by percentile label) of the index
        values of the loans in that chunk.

    Raises:
        ValueError: if ``available_loans`` is below 100; the 1% chunks
            would be empty and every mean would silently be NaN.
    """
    if available_loans < 100:
        raise ValueError(
            'available_loans must be at least 100 to form 1% chunks, '
            'got {0}'.format(available_loans))

    all_ids = test.index.values
    # Only these two columns are read; selecting them up front keeps the
    # per-trial frames small.
    scored = test[['0.2.1_scores', 'npv_roi_5']]

    results = {}
    results_scores = {}
    results_percentile_id = {}
    for trial in _progress(np.arange(trials)):
        loan_ids = np.random.choice(all_ids, available_loans, replace=False)
        # .loc with a list of labels builds a new frame, and sort_values
        # returns another one, so ``test`` is never touched.
        batch = scored.loc[loan_ids].sort_values(
            '0.2.1_scores', ascending=False)
        chunksize = len(batch) // 100

        npv_by_perc = {}
        score_by_perc = {}
        ids_by_perc = {}
        for k, perc in enumerate(percentiles):
            chunk = batch.iloc[k * chunksize:(k + 1) * chunksize]
            npv_by_perc[perc] = chunk['npv_roi_5'].mean()
            score_by_perc[perc] = chunk['0.2.1_scores'].mean()
            ids_by_perc[perc] = chunk.index.values

        results[trial] = pd.Series(npv_by_perc)
        results_scores[trial] = pd.Series(score_by_perc)
        results_percentile_id[trial] = pd.Series(ids_by_perc)

    # Transpose so that rows are trials and columns are percentile labels.
    npv_rois = pd.DataFrame.from_dict(results).T
    scores = pd.DataFrame.from_dict(results_scores).T
    if return_percentile_ids:
        return npv_rois, scores, results_percentile_id
    return npv_rois, scores

In [None]:
# Simulation settings: 20000 random batches, assuming 200 loans per batch.
trials = 20000
available_loans = 200

results, results_scores = find_min_score_models(
    trials=trials,
    available_loans=available_loans,
    test=test,
    percentiles=good_percentiles,
)

In [None]:
# describe() summaries of both result frames; the 'mean', 'std', '25%' and
# '75%' rows are read further down.
summaries, summaries_scores = results.describe(), results_scores.describe()

In [None]:
# Plot the mean and the quartile band of the result summary per percentile.
plt.figure(figsize=(12, 9))
percentile_axis = summaries.columns.values
# Each entry is (row of `summaries`, matplotlib format string); the legend
# label is the row name itself. The '50%' (median) row is not plotted.
for stat, marker in (('mean', 'o'), ('25%', 'ro'), ('75%', 'ko')):
    plt.plot(percentile_axis, summaries.loc[stat, :], marker, label=stat)
plt.title('return per percentile over batches')
plt.legend(loc='best')
plt.xlabel('percentile of 0.2.1_score')
plt.ylabel('npv_roi_5')
plt.show()

In [None]:
# Display the describe() summary of results_scores, one column per percentile.
summaries_scores

In [None]:
# Let's take a one-sided ~99% confidence bound: the cutoff is the mean score minus 3 standard deviations at the 90th percentile

In [None]:
# Score cutoff: three standard deviations below the mean of the
# 90th-percentile column of the score summary.
score_mean_90 = summaries_scores.loc['mean', 90]
score_std_90 = summaries_scores.loc['std', 90]
cutoff = score_mean_90 - 3 * score_std_90

# Say I wanted the 75th percentile of the 80th-percentile scores (-0.36289): what is the grade distribution of those loans?

In [None]:
# Keep only the test loans whose score is at or above the cutoff.
above_cutoff = test['0.2.1_scores'] >= cutoff
picks = test[above_cutoff]

In [None]:
# Fraction of the picked loans in each grade (missing grades are counted).
pick_grade_counts = picks['grade'].value_counts(dropna=False)
pick_grade_counts / len(picks)

In [None]:
# Baseline for comparison: fraction of all test loans in each grade
# (missing grades are counted).
test_grade_counts = test['grade'].value_counts(dropna=False)
test_grade_counts / len(test)

In [None]:
# Display the score cutoff that was used to select `picks`.
cutoff