# Seeing the performance of 1.0.1 NN and 0.2.1 RF used together

In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

import data_science.lendingclub.dataprep_and_modeling.modeling_utils.data_prep_new as data_prep
import dir_constants as dc
from  lendingclub.dataprep_and_modeling.model_dump.nn_1_0_1 import net_class
%matplotlib inline

In [None]:
def figure_this(summaries, score_label='1.0.1_score'):
    """Plot the mean and the quartile returns for every score percentile.

    Args:
        summaries: DataFrame whose columns are the percentile labels and
            whose index contains the rows 'mean', '25%' and '75%' (e.g. the
            output of ``DataFrame.describe()``).
        score_label: Name of the score shown in the x-axis label. Defaults
            to '1.0.1_score' so existing calls render exactly as before;
            pass another name when plotting a different model's summaries.
    """
    x_values = summaries.columns.values
    # (row of summaries, matplotlib format string, legend label).
    # The '50%' row is not plotted.
    series_specs = [
        ('mean', 'o', 'mean'),
        ('25%', 'ro', '25%'),
        ('75%', 'ko', '75%'),
    ]
    plt.figure(figsize=(9, 6))
    for row, fmt, label in series_specs:
        plt.plot(x_values, summaries.loc[row, :], fmt, label=label)
    plt.title('return per percentile over batches')
    plt.legend(loc='best')
    plt.xlabel('percentile of ' + score_label)
    plt.ylabel('npv_roi_10')
    plt.show()

In [None]:
# Version label used for the combined (NN + RF) scores in this notebook.
regr_version = '2.0.0'
platform = 'lendingclub'

# Locations of the saved random-forest model and the neural-net weights.
rf_path = '/home/justin/justin_tinkering/data_science/lendingclub/dataprep_and_modeling/model_dump/model_0.2.1.pkl'
nn_path = '/home/justin/justin_tinkering/data_science/lendingclub/dataprep_and_modeling/model_dump/nn_1_0_1/1.0.1_e600'

# HDF5 store for the platform; the train/test frames and results are read
# from it further down.
store = pd.HDFStore(
    dc.home_path
    + '/justin_tinkering/data_science/lendingclub/{0}_store.h5'.format(platform),
    append=True)

# Make sure no loan in test set was in train set

In [None]:
# Read everything needed from the HDF5 store. The try/finally guarantees the
# store is closed again even if one of the keys is missing.
store.open()
try:
    train = store['train_filtered_columns']
    test = store['test_filtered_columns']
    loan_npv_rois = store['loan_npv_rois']
    default_series = test['target_strict']
    results = store['results']
finally:
    store.close()

# The frame indexes identify the loans; train and test must not share any.
train_ids = set(train.index.values)
test_ids = set(test.index.values)
assert train_ids.isdisjoint(test_ids), \
    'test set contains loans that are also in the train set'

# Examine performance on test set

In [None]:
# Build the model inputs and the realised npv_roi_10 target for the test set.
test_X, test_y = data_prep.process_data_test(test)
test_y = test_y['npv_roi_10'].values
# NN score: rebuild the network and load the saved weights from nn_path.
net = net_class.Net()
net.load_state_dict(torch.load(nn_path))
test_yhat_NN = net_class.torch_version(test_X, net)
# RF score: load from rf_path (defined with nn_path above) instead of a
# working-directory-relative literal, so the cell does not depend on where
# the notebook is started from.
regr = joblib.load(rf_path)
test_yhat_RF = regr.predict(test_X)

In [None]:
# Sanity check on data_prep.process_data_test: print the name of every
# test_X column that still contains at least one null value.
for col in test_X.columns:
    if test_X[col].isnull().any():
        print(col)

In [None]:
# Combined score: unweighted average of the RF and NN predictions.
# NOTE(review): assumes both prediction arrays have the same shape; a
# (n, 1) vs (n,) mismatch would broadcast instead of averaging per loan --
# TODO confirm the output shape of net_class.torch_version.
test_yhat = (test_yhat_RF + test_yhat_NN)/2

# Short digression on examining feature importances (N/A)

In [None]:
# Attach the combined predictions to the test frame as '<regr_version>_scores'.
test[regr_version+'_scores'] = test_yhat

In [None]:
# Distribution of the combined scores. The column name is derived from
# regr_version, matching the cell that created the column, rather than
# repeating the version string.
test[regr_version+'_scores'].hist(bins=50)

In [None]:
# Number of test loans with a combined score above zero (column name derived
# from regr_version instead of a hard-coded version string).
len(test[test[regr_version+'_scores'] > 0])

In [None]:
# Total number of rows in the test set.
test.shape[0]

In [None]:
import matplotlib as mpl
# Set the Agg path chunk size before drawing -- presumably to keep the Agg
# renderer from failing on a path with very many points; confirm.
mpl.rcParams['agg.path.chunksize'] = 10000
plt.figure(figsize=(12,9))
# Combined score (x) against realised npv_roi_10 (y), drawn as markers only.
plt.plot(test[regr_version+'_scores'], test['npv_roi_10'], 'o')
plt.show()

# For loans scoring at or above the cutoff, check the percentage whose returns exceed it

In [None]:
# Keep the loans whose combined score reaches the cutoff, then, among those,
# the loans whose realised npv_roi_10 exceeds that same cutoff.
# NOTE(review): returns are compared against score_cutoff (-.02), not 0, so
# these are not strictly "positive" returns -- confirm this is intended.
score_cutoff = -.02
pos_scores = test.loc[test[regr_version+'_scores'] >= score_cutoff]
ps_pos_returns = pos_scores.loc[pos_scores['npv_roi_10'] > score_cutoff]

In [None]:
# Share of the loans at or above the score cutoff whose return also exceeds it.
ps_pos_returns.shape[0]/pos_scores.shape[0]

# checking return per percentile below

In [None]:
# Percentile labels 0..99, one per 1%-wide score bin.
percentiles = np.arange(100)

In [None]:
def eval_models(trials, port_size, available_loans, test, percentiles,
                score_col=None, show_progress=True):
    """Simulate random loan batches and average the return per score bin.

    For every trial, ``available_loans`` loans are drawn without replacement
    from ``test``, ranked by ascending score and cut into
    ``len(percentiles)`` equal-sized consecutive bins. The mean
    ``npv_roi_10`` of each bin is stored under the matching entry of
    ``percentiles`` (lowest-scoring bin first). Rows left over when the
    batch size is not a multiple of the bin count are ignored.

    Args:
        trials: Number of random batches to simulate.
        port_size: Unused by this function; kept so existing calls keep
            working.
        available_loans: Number of loans drawn for each batch.
        test: DataFrame holding an ``npv_roi_10`` column and the score
            column; it is not modified.
        percentiles: Bin labels, one per bin. With the notebook's
            ``np.arange(0, 100, 1)`` this yields 100 bins of 1% each.
        score_col: Column to rank by. Defaults to
            ``regr_version + '_scores'`` using the module-level
            ``regr_version``.
        show_progress: Wrap the trial loop in ``tqdm_notebook`` when true.

    Returns:
        DataFrame with one row per trial and one column per percentile
        label, holding the mean ``npv_roi_10`` of that bin.

    Raises:
        ValueError: If ``percentiles`` is empty or there are fewer
            available loans than bins (every bin would be empty).
    """
    if score_col is None:
        score_col = regr_version + '_scores'

    bin_labels = list(percentiles)
    n_bins = len(bin_labels)
    if n_bins == 0:
        raise ValueError('percentiles must contain at least one label')
    if available_loans < n_bins:
        raise ValueError(
            'available_loans ({0}) must be at least the number of '
            'percentile bins ({1})'.format(available_loans, n_bins))

    trial_ids = np.arange(trials)
    if show_progress:
        trial_ids = tqdm_notebook(trial_ids)

    results = {}
    for trial in trial_ids:
        loan_ids = np.random.choice(
            test.index.values, available_loans, replace=False)
        # .loc with a list of labels returns a new frame, so sorting it
        # never touches the caller's data and no up-front copy is needed.
        batch = test.loc[loan_ids, :].sort_values(score_col, ascending=True)
        chunksize = len(batch) // n_bins
        returns = batch['npv_roi_10']
        results[trial] = pd.Series({
            label: returns.iloc[k * chunksize:(k + 1) * chunksize].mean()
            for k, label in enumerate(bin_labels)
        })

    return pd.DataFrame.from_dict(results).T


#         picks = scores_series[:900].index.values
#         results[trial] = loan_npv_rois.loc[picks, :].mean().to_dict()
#         pct_default[trial] = (default_series.loc[picks].sum()) / port_size
#     pct_default_series = pd.Series(pct_default)
#     results_df = pd.DataFrame(results).T
#     results_df['pct_def'] = pct_default_series
#     return results_df

In [None]:
# Same setup as for the baseline models: every simulated batch offers 3000
# loans, of which 900 would be picked for the portfolio.
available_loans = 3000
port_size = 900
trials = 20000
results = eval_models(
    trials=trials,
    port_size=port_size,
    available_loans=available_loans,
    test=test,
    percentiles=percentiles)

In [None]:
# Per-percentile summary statistics across all trials.
summaries = results.describe()

In [None]:
# Persist the per-percentile results under this model version's key (derived
# from regr_version instead of a repeated literal). The store is left open
# here because the following cells read from it; it is closed at the end of
# the notebook.
store.open()
store['percentiles_for_' + regr_version] = results
# store.close()

In [None]:
# Read the percentile results stored under the 0.2.1 (RF) and 1.0.1 (NN)
# keys; the store must still be open at this point.
rf_results = store['percentiles_for_0.2.1']
nn_results = store['percentiles_for_1.0.1']

In [None]:
# Summary statistics for the RF and NN results, computed the same way as
# `summaries` for the combined model.
rf_summaries = rf_results.describe()
nn_summaries = nn_results.describe()

In [None]:
# Re-label every percentile column as a (percentile, model) pair tagged
# '2.0.0' so the summaries of the different models can share one frame.
multi_index = [(str(col), '2.0.0') for col in summaries.columns.values]
append_results = summaries.copy()
append_results.columns = pd.MultiIndex.from_tuples(
    multi_index, names=['percentile', 'model'])

In [None]:
# Re-label every percentile column of the RF summaries as a
# (percentile, model) pair tagged '0.2.1'.
rf_multi_index = [(str(col), '0.2.1') for col in rf_summaries.columns.values]
rf_append_results = rf_summaries.copy()
rf_append_results.columns = pd.MultiIndex.from_tuples(
    rf_multi_index, names=['percentile', 'model'])

In [None]:
# Re-label every percentile column of the NN summaries as a
# (percentile, model) pair tagged '1.0.1'.
nn_multi_index = [(str(col), '1.0.1') for col in nn_summaries.columns.values]
nn_append_results = nn_summaries.copy()
nn_append_results.columns = pd.MultiIndex.from_tuples(
    nn_multi_index, names=['percentile', 'model'])

In [None]:
# Join the three models' summaries column-wise into a single frame.
all_results = pd.concat([append_results,rf_append_results,nn_append_results], axis=1)

In [None]:
# Show the combined summaries with the columns sorted; the sorted frame is
# only displayed, not assigned back to all_results.
all_results.sort_index(axis=1)

In [None]:
# Close the HDF5 store that was left open for the cells above.
store.close()