In [1]:
# numpy is used throughout the notebook but was never imported; import it here
# so the notebook survives Restart Kernel -> Run All.
import numpy as np

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [74]:
# %load ../lc_utils.py
import sys
import os
import pandas as pd
sys.path.append(os.path.join(os.path.expanduser('~'),'projects','j_utils',))
import munging as mg
import hyperlearn.hyperlearn.impute.SVDImpute as hpl_imp

def gen_expt_datasets(today, oldest, valid_start, base_loan_info, eval_loan_info, target, valid_end=None, verbose=False):
    '''
    Build SVD-imputed train and validation datasets split by issue date.

    Loans in eval_loan_info issued from `oldest` through `today` (inclusive)
    are taken as train. Loans issued on or after `valid_start` (and on or
    before `valid_end`, when it is given) are used for validation. The matching
    rows of base_loan_info are preprocessed with the munging helpers and their
    missing values imputed with hyperlearn svd_impute, which is fit on the
    training rows only.

    Parameters
    ----------
    today, oldest : upper / lower bound (inclusive) on 'issue_d' for train.
    valid_start : lower bound (inclusive) on 'issue_d' for validation.
    base_loan_info : DataFrame with an 'id' column; source of the features.
    eval_loan_info : DataFrame with 'id', 'issue_d' and the target column(s).
    target : single colname or list of colnames selected from eval_loan_info.
    valid_end : optional upper bound (inclusive) on 'issue_d' for validation.
    verbose : forwarded to the munging helpers.

    Returns
    -------
    train_svdimp, train_y, valid_svdimp, valid_y, train_ids, valid_ids
        The imputed feature frames, the targets (in eval_loan_info row order),
        and the arrays of unique ids selected from eval_loan_info.
    '''
    train_ids = eval_loan_info[(eval_loan_info['issue_d'] <= today) & (eval_loan_info['issue_d'] >= oldest)]['id'].unique()
    if valid_end:
        valid_ids = eval_loan_info[(eval_loan_info['issue_d'] >= valid_start) & (eval_loan_info['issue_d'] <= valid_end)]['id'].unique()
    else:
        valid_ids = eval_loan_info[(eval_loan_info['issue_d'] >= valid_start)]['id'].unique()
    train = base_loan_info[base_loan_info['id'].isin(train_ids)]
    valid = base_loan_info[base_loan_info['id'].isin(valid_ids)]

    # work on copies so the slices of base_loan_info are not modified
    train_copy = train.copy()
    valid_copy = valid.copy()

    # get ready for hyperlearn svdimpute
    train_copy, max_dict, min_dict, cats_dict, norm_dict = mg.train_hpl_proc(train_copy, verbose=verbose)
    valid_copy = mg.val_test_hpl_proc(valid_copy, train_copy, max_dict, min_dict, cats_dict, verbose=verbose)

    # fit to train
    S, VT, mean, std, mins, standardise = hpl_imp.fit(train_copy.values)

    # impute on train; transform returns a bare array, so restore labels
    train_svdimp = hpl_imp.transform(train_copy.values, S, VT, mean, std, mins, standardise)
    train_svdimp = pd.DataFrame(train_svdimp, index=train_copy.index, columns=train_copy.columns)

    # impute on validation with the parameters fitted on train
    valid_svdimp = hpl_imp.transform(valid_copy.values, S, VT, mean, std, mins, standardise)
    valid_svdimp = pd.DataFrame(valid_svdimp, index=valid_copy.index, columns=valid_copy.columns)

    # imputing changes some ids. Make the ids the originals again.
    # The imputed rows come from `train` / `valid` (base_loan_info row order),
    # whereas train_ids / valid_ids are in eval_loan_info order with duplicates
    # removed, so assigning those arrays positionally could label rows with the
    # wrong id. Take the ids from the rows that were actually imputed instead.
    # NOTE(review): assumes the mg.*_hpl_proc helpers keep the row count and
    # row order of their input -- confirm against munging.
    train_svdimp['id'] = train['id'].values
    valid_svdimp['id'] = valid['id'].values

    train_y = eval_loan_info[eval_loan_info['id'].isin(train_ids)][target]
    valid_y = eval_loan_info[eval_loan_info['id'].isin(valid_ids)][target]

    return train_svdimp, train_y, valid_svdimp, valid_y, train_ids, valid_ids

# make a crude test set for now
def get_split_date(df, date_column, quantile):
    """
    Return the date sitting at the given quantile of df[date_column], for use
    as the cut-off when splitting a dataframe for timeseries splitting.

    Based on
    https://stackoverflow.com/questions/31018622/pandas-quantile-function-for-dates
    (the old `coerce` argument is replaced by `errors`).
    """
    # Parse the column as datetimes (it may hold strings); bad values raise.
    as_datetimes = pd.to_datetime(df[date_column], errors='raise')
    # Quantiles are taken on the integer representation of the datetimes.
    as_integers = as_datetimes.astype('int64')
    cutoff_value = as_integers.quantile(q=quantile)
    # Turn the numeric quantile back into a timestamp.
    return pd.to_datetime(cutoff_value)

def split_out_traintestable_loans(df, eval_df, oldness_thrsh=.9):
    '''
    Keep only loans that can be used for training/testing.

    A loan qualifies when either maturity measure in eval_df reaches
    oldness_thrsh, or its loan_status is paid/defaulted/charged_off.
    Returns (df, eval_df), both restricted to the qualifying ids.
    '''
    finished_statuses = ['paid', 'defaulted', 'charged_off']
    time_mature = eval_df['maturity_time_stat_adj'] >= oldness_thrsh
    paid_mature = eval_df['maturity_paid_stat_adj'] >= oldness_thrsh
    is_finished = eval_df['loan_status'].isin(finished_statuses)

    usable_ids = eval_df[time_mature | paid_mature | is_finished]['id'].unique()

    kept_df = df[df['id'].isin(usable_ids)]
    kept_eval_df = eval_df[eval_df['id'].isin(usable_ids)]
    return kept_df, kept_eval_df


def add_custom_lc_features(df):
    '''
    Add income-relative feature columns to df in place (nothing is returned).

    New columns, in insertion order: monthly_inc, dti_w_loan,
    delinq_to_monthly_inc, tot_cur_bal_to_monthly_inc, loan_to_inc.
    '''
    monthly_income = df['annual_inc'] / 12
    df['monthly_inc'] = monthly_income

    # dti multiplied by monthly income, plus the installment, over monthly income
    df['dti_w_loan'] = (df['dti'] * monthly_income + df['installment']) / monthly_income

    # the remaining features are plain ratios of a column to monthly income
    ratio_sources = (
        ('delinq_to_monthly_inc', 'delinq_amnt'),
        ('tot_cur_bal_to_monthly_inc', 'tot_cur_bal'),
        ('loan_to_inc', 'loan_amount'),
    )
    for new_col, source_col in ratio_sources:
        df[new_col] = df[source_col] / monthly_income

 # test hyperlearn impute

In [2]:
%autoreload 2

In [3]:
import hyperlearn.hyperlearn.impute.SVDImpute as hpl_imp

Note that first time import of HyperLearn will be slow, since NUMBA code has to be compiled to machine code for optimization purposes.


In [4]:
# Seed row: the first 11 Fibonacci numbers (two starting ones plus nine sums).
base_row = [1, 1]
for _ in range(9):
    base_row.append(base_row[-1] + base_row[-2])

In [5]:
# Display the seed row to check its contents.
base_row

[1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89]

In [6]:
# Stack 21 rows: each new row is the previous one shifted left by one entry,
# with the sum of the previous row's last two entries appended on the right.
rows = [base_row]
for _ in range(20):
    previous = rows[-1]
    rows.append(previous[1:] + [previous[-1] + previous[-2]])
mat = np.array(rows, dtype=float)

In [7]:
# Draw the 0/1 mask from a seeded generator so the same entries are selected
# on every run; the unseeded global generator made the experiment
# irreproducible. A private RandomState leaves numpy's global state untouched.
MASK_SEED = 0
mask = np.random.RandomState(MASK_SEED).randint(2, size=mat.shape)

In [8]:
# Copy of mat with every entry flagged 1 in mask replaced by NaN.
to_imp = np.where(mask == 1, np.nan, mat)

In [12]:
# Fit the SVD imputer on the complete matrix `mat` (not on the masked copy).
# NOTE(review): per the recorded output, this call currently fails with an
# UnboundLocalError raised inside hyperlearn; the debugger transcript in this
# notebook shows the failure while looking up the routine 'syevd'.
S, VT, mean, std, mins, standardise = hpl_imp.fit(mat)

UnboundLocalError: local variable 'f' referenced before assignment

In [11]:
%debug

> [0;32m/home/justin/hyperlearn/hyperlearn/utils.py[0m(35)[0;36m__init__[0;34m()[0m
[0;32m     33 [0;31m                        [0;32mtry[0m[0;34m:[0m [0mf[0m [0;34m=[0m [0meval[0m[0;34m([0m[0;34mf'numba.{function}'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     34 [0;31m                        [0;32mexcept[0m[0;34m:[0m [0;32mpass[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 35 [0;31m                        [0mf[0m [0;34m=[0m [0meval[0m[0;34m([0m[0mf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     36 [0;31m                        [0mself[0m[0;34m.[0m[0mf[0m [0;34m=[0m [0mf[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     37 [0;31m[0;34m[0m[0m
[0m
ipdb> dir
<built-in function dir>
ipdb> dir()
['fast', 'function', 'numba', 'self']
ipdb> function
'syevd'
ipdb> u
> [0;32m/home/justin/hyperlearn/hyperlearn/linalg.py[0m(396)[0;36meigh[0;34m()[0m
[0;32m    394 [0;31m        [0mold_alpha[0m [0;34m=[0m [0;36m0[0m

In [None]:
# impute the masked copy (to_imp) using the parameters returned by hpl_imp.fit
imputed = hpl_imp.transform(to_imp, S, VT, mean, std, mins, standardise)

In [10]:
%debug

> [0;32m/home/justin/hyperlearn/hyperlearn/linalg.py[0m(398)[0;36meigh[0;34m()[0m
[0;32m    396 [0;31m[0;34m[0m[0m
[0m[0;32m    397 [0;31m        [0;32mif[0m [0;32mnot[0m [0mqr[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 398 [0;31m                [0meig[0m [0;34m=[0m [0mlapack[0m[0;34m([0m[0mX[0m[0;34m.[0m[0mdtype[0m[0;34m,[0m [0;34m"syevd"[0m[0;34m,[0m [0mfast[0m[0;34m,[0m [0;34m"eigh"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    399 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    400 [0;31m                [0meig[0m [0;34m=[0m [0mlapack[0m[0;34m([0m[0mX[0m[0;34m.[0m[0mdtype[0m[0;34m,[0m [0;34m"syevr"[0m[0;34m,[0m [0mfast[0m[0;34m,[0m [0;34m"eigh"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> q
