In [None]:
import time

import numpy as np
import pandas as pd
# NOTE(review): newer scikit-learn releases no longer ship sklearn.externals.joblib;
# confirm the pinned version before upgrading.
from sklearn.externals import joblib
from sklearn.linear_model import ElasticNet

import modeling_utils.data_prep as data_prep

# DO NOT FORGET TO DROP ISSUE_D AFTER PREPPING

In [None]:
# Lending platform whose HDF5 store is opened below.
platform = 'lendingclub'

# The handle is left open here; later cells re-open and close it as needed.
store = pd.HDFStore(
    '/Users/justinhsi/justin_tinkering/data_science/lendingclub/{0}_store.h5'.format(platform),
    append=True,
)

# Training frame that is handed to data_prep.process_data_train.
loan_info = store['train_filtered_columns']

# Until I figure out a good imputation method (e.g. Bayesian PCA), just drop the columns that still contain nulls

In [None]:
# Prepare the training frame; the helper returns four objects, unpacked here.
# Presumably: standardized features, target column(s), and the per-column
# mean / std-dev used for standardizing -- confirm against data_prep.
(standardized,
 eval_cols,
 mean_series,
 std_dev_series) = data_prep.process_data_train(loan_info)

# Out-of-the-box elastic net with a slightly tweaked alpha

In [None]:
# Elastic net with a fixed seed; only alpha and the iteration cap are set
# explicitly, everything else is left at the estimator's defaults.
regr = ElasticNet(
    alpha=.004,
    max_iter=1500,
    random_state=0,
)
regr.fit(X=standardized, y=eval_cols)

In [None]:
# Persist the fitted model and the (mean, std-dev) pair returned by
# data_prep.process_data_train. The model file is read back with joblib.load
# by the evaluation cell further down; its name carries the same version
# string ('0.1.0') that is recorded in model_info.
joblib.dump(regr, 'model_dump/model_0.1.0.pkl')
joblib.dump((mean_series, std_dev_series), 'model_dump/mean_stddev.pkl')

In [None]:
# Map coefficient position -> value, then count how often each value occurs
# (NaN included), e.g. to see how many coefficients are exactly zero.
coef_dict = dict(enumerate(regr.coef_))
pd.Series(coef_dict).value_counts(dropna=False)

In [None]:
# Score on the very data the model was fitted on, i.e. an in-sample figure
# (recorded as the R2 score in model_info['useful_notes']).
regr.score(standardized, eval_cols)

In [None]:
# Timestamp recorded alongside the model description.
now = time.strftime("%Y_%m_%d_%Hh_%Mm_%Ss")

# Info to stick in the detailed dataframe describing each model.
model_info = {'model_version': '0.1.0',
              'target': 'npv_roi_10',
              'weights': 'None',
              'algo_model': 'elastic_net',
              'hyperparams': "alpha:.004, random_state: 0, max_iter: 1500",
              'cost_func': 'sklearn default, which I think is mse',
              'useful_notes': 'R2 score of .0604167 (regr.score())',
              'date': now}

# One-row frame, indexed by the model version.
model_info_df = pd.DataFrame(model_info, index=['0.1.0'])

# min_itemsize reserves a width for each string column so that later appends
# with longer text still fit the table.
store.open()
try:
    store.append(
        'model_info',
        model_info_df,
        data_columns=True,
        index=True,
        append=True,
        min_itemsize={'model_version': 20,
                      'target': 20,
                      'weights': 200,
                      'algo_model': 20,
                      'hyperparams': 500,
                      'cost_func': 300,
                      'useful_notes': 1000,
                      'date': 30}
    )
finally:
    # Always release the HDF5 handle, even if the append fails.
    store.close()

# Examine performance on test set

In [None]:
# Pull everything needed for evaluation out of the store in one go.
store.open()
try:
    test = store['test_filtered_columns']
    train = store['train_filtered_columns']
    loan_npv_rois = store['loan_npv_rois']
    results = store['results']
finally:
    # Always release the HDF5 handle, even if one of the reads fails.
    store.close()

# Taken from the in-memory test frame, so it does not need the open store.
default_series = test['target_strict']

In [None]:
# Run both splits through the same preparation helper and keep only the
# 'npv_roi_10' column of the returned targets, as a plain array.
train_X, train_y = data_prep.process_data_test(train)
test_X, test_y = data_prep.process_data_test(test)
train_y = train_y['npv_roi_10'].values
test_y = test_y['npv_roi_10'].values

# Evaluate the persisted model rather than whatever is still in memory.
regr_version = '0.1.0'
regr = joblib.load('model_dump/model_0.1.0.pkl')

test_yhat = regr.predict(test_X)
train_yhat = regr.predict(train_X)

In [None]:
# Mean squared error per split: sum of squared residuals over the number of targets.
test_mse = np.square(test_yhat - test_y).sum() / len(test_y)
train_mse = np.square(train_yhat - train_y).sum() / len(train_y)

In [None]:
def eval_models(trials, port_size, available_loans, regr, regr_version, test, loan_npv_rois,
                default_series, seed=None):
    """Simulate repeatedly picking a portfolio with a model and summarise each pick.

    Every trial draws ``available_loans`` rows of ``test`` at random (without
    replacement), scores them with ``regr.predict``, keeps the ``port_size``
    highest-scoring loans and records, for those picks, the mean of every
    column of ``loan_npv_rois`` and the sum of ``default_series`` divided by
    ``port_size``.

    Parameters
    ----------
    trials : int
        Number of simulated portfolios.
    port_size : int
        Number of loans kept per trial (the top scorers).
    available_loans : int
        Number of loans sampled from ``test`` per trial.
    regr : object
        Anything with a ``predict(frame)`` method returning one score per row.
    regr_version : object
        Not used by this function; kept so existing callers keep working.
    test : pandas.DataFrame
        Rows to sample from; its index labels are used to look up
        ``loan_npv_rois`` and ``default_series``.
    loan_npv_rois : pandas.DataFrame
        Per-loan values, averaged over the picks of each trial.
    default_series : pandas.Series
        Per-loan values, summed over the picks of each trial.
    seed : int or None, optional
        When given, sampling uses a private ``numpy.random.RandomState(seed)``
        so the run is reproducible; ``None`` keeps using numpy's global state.

    Returns
    -------
    pandas.DataFrame
        One row per trial: the columns of ``loan_npv_rois`` plus ``'pct_def'``.

    Raises
    ------
    ValueError
        If ``port_size`` is not between 1 and ``available_loans``.
    """
    if not 0 < port_size <= available_loans:
        raise ValueError(
            'port_size must be between 1 and available_loans, got '
            'port_size={0}, available_loans={1}'.format(port_size, available_loans))

    # np.random (module) and RandomState both expose .choice with the same signature.
    rng = np.random if seed is None else np.random.RandomState(seed)
    loan_index = test.index.values

    results = {}
    pct_default = {}
    for trial in tqdm_notebook(np.arange(trials)):
        loan_ids = rng.choice(loan_index, available_loans, replace=False)
        loans_to_pick_from = test.loc[loan_ids, :]
        scores = regr.predict(loans_to_pick_from)
        ranked = pd.Series(scores, index=loan_ids).sort_values(ascending=False)
        # Positional slice: keep the port_size best-scoring loans
        # (previously hard-coded to 900 regardless of port_size).
        picks = ranked.iloc[:port_size].index.values
        results[trial] = loan_npv_rois.loc[picks, :].mean().to_dict()
        # float() guards against integer division on integer-valued sums.
        pct_default[trial] = default_series.loc[picks].sum() / float(port_size)

    results_df = pd.DataFrame(results).T
    results_df['pct_def'] = pd.Series(pct_default)
    return results_df

In [None]:
# Same setup as the baseline models: each trial offers 3000 loans,
# of which 900 are picked.
trials = 20000
available_loans = 3000
port_size = 900
model_results = eval_models(
    trials=trials,
    port_size=port_size,
    available_loans=available_loans,
    regr=regr,
    regr_version=regr_version,
    test=test_X,
    loan_npv_rois=loan_npv_rois,
    default_series=default_series,
)

In [None]:
# Pair every result column with the model version, giving (column, version)
# tuples for a two-level column index.
multi_index = [(col, regr_version) for col in model_results.columns.values]

In [None]:
# Work on a copy: relabelling the columns of a plain alias would also relabel
# model_results, so re-running the cell that builds multi_index would then
# wrap the already-converted tuples a second time.
append_results = model_results.copy()
append_results.columns = pd.MultiIndex.from_tuples(
    multi_index, names=['discount_rate', 'model'])

In [None]:
# First run for this model version: join simply adds its columns. On a re-run
# the columns already exist and join raises ValueError (overlapping columns
# without a suffix), so overwrite that version's columns instead.
try:
    results = results.join(append_results)
except ValueError:
    # Select by regr_version rather than a hard-coded literal so the cell
    # keeps working when the version is bumped.
    results.loc[:, (slice(None), slice(regr_version, regr_version))] = append_results
results = results.sort_index(axis=1)

In [None]:
# Write the combined results table back to the store.
store.open()
try:
    store['results'] = results
finally:
    # Always release the HDF5 handle, even if the write fails.
    store.close()

In [None]:
# Summary statistics (describe) of the combined results table.
results.describe()