# Author: Justin Hsi
## Part 2 of cleaning lending club payment history

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

import dir_constants as dc

project = 'lendingclub'

In [2]:
def find_dupe_dates(group):
    """Return the distinct dates that occur on more than one row of ``group``.

    Args:
        group: DataFrame with a ``date`` column (one loan's payment rows).

    Returns:
        A ``DatetimeIndex`` holding each repeated date exactly once, even
        when that date shows up three or more times in ``group``.
    """
    repeated = group.loc[group.duplicated('date'), 'date']
    # `duplicated` flags every occurrence after the first, so a date seen
    # three times would otherwise be reported twice.
    return pd.to_datetime(repeated.values).unique()


# Columns that are added together when several rows share the same date.
_SUMMED_COLS = [
    'princp_paid',
    'int_paid',
    'fee_paid',
    'amt_due',
    'amt_paid',
    'charged_off_amt',
    'recovs',
    'recov_fees',
]


def merge_dupe_dates(group):
    """Collapse rows of ``group`` that share a date into one row per date.

    For every repeated date the merged row:
      * starts from the last row with that date (all other columns),
      * takes ``outs_princp_beg`` from the first row with that date,
      * holds the sum over all rows with that date for the payment/amount
        columns listed in ``_SUMMED_COLS``,
      * keeps the index label of the last row with that date.

    Args:
        group: DataFrame of one loan's payment rows; must contain ``date``,
            ``outs_princp_beg`` and every column in ``_SUMMED_COLS``.

    Returns:
        A DataFrame with the untouched (non-repeated-date) rows first,
        followed by one merged row per repeated date. The result is not
        re-sorted by date. Merged rows are built from an object-dtype
        Series, so concatenation may upcast columns; callers restore dtypes
        afterwards.
    """
    dupe_dates = find_dupe_dates(group)
    df_chunks = [group[~group['date'].isin(dupe_dates)]]

    for date in dupe_dates:
        problem_rows = group[group['date'] == date]

        keep_row = problem_rows.iloc[-1].to_dict()
        # Beginning principal is the balance before the first payment of the
        # month, so it comes from the first row rather than the last.
        keep_row['outs_princp_beg'] = problem_rows['outs_princp_beg'].iloc[0]

        # Sum only the columns that are needed: summing the whole frame
        # would also try to add up date and string columns.
        summed = problem_rows[_SUMMED_COLS].sum()
        for col in _SUMMED_COLS:
            keep_row[col] = summed[col]

        merged_row = pd.Series(keep_row, name=problem_rows.index[-1])
        df_chunks.append(merged_row.to_frame().T)

    return pd.concat(df_chunks)

In [13]:
!ls {dc.data_path+project}

dl_df_train.fth		       oldstuff		    X_train.pkl
dl_ys_train.fth		       pmt_hist_c1.fth	    X_valid.pkl
for_proc_df_model_loading.pkl  RF		    y_all_train.pkl
lendingclub.h5		       to_keep_fi_cols.pkl  y_test.pkl
loan_info.fth		       X_all_train.pkl	    y_train.pkl
NN			       X_test.pkl	    y_valid.pkl


In [15]:
# Legacy HDF5 store handle, kept for reference: the chunked cleaning cells
# further down still read from and write to `store`, so it has to be
# re-enabled before those cells are run.
# store =  pd.HDFStore(
#     dc.home_path+'/justin_tinkering/data_science/lendingclub/{0}_store.h5'.
#     format(project),
#     append=True)

# Directory holding this project's data files.
path = dc.data_path + project

# Payment history produced by part 1 of the cleaning.
pmt_hist = pd.read_feather('{0}/pmt_hist_c1.fth'.format(path))

# Some loans have several rows for the same month (i.e. multiple payments made in that month), while other loans have no row at all for some months.

In [17]:
# Show the first rows of the loaded payment history as the cell output.
pmt_hist.head()

Unnamed: 0,pmt_date,status_period_end,date,issue_d,addr_state,home_ownership,first_credit_line,public_recs,emp_len,grade,...,outs_princp_end,princp_paid,recov_fees,recovs,revol_credit_bal,revol_line_util,term,total_credit_lines,all_cash_to_inv,loan_id_num
0,2009-09-01,current,2009-09-01,2009-08-01,CA,rent,1994-02-01,0.0,< 1 year,B,...,24418.701172,581.297974,0.0,0.0,28854.0,0.521,36,42.0,829.099976,54734
1,2009-10-01,current,2009-10-01,2009-08-01,CA,rent,1994-02-01,0.0,< 1 year,B,...,23831.642578,587.059998,0.0,0.0,28854.0,0.521,36,42.0,829.099976,54734
2,2009-11-01,current,2009-11-01,2009-08-01,CA,rent,1994-02-01,0.0,< 1 year,B,...,23238.763672,592.879028,0.0,0.0,28854.0,0.521,36,42.0,829.100037,54734
3,2009-12-01,current,2009-12-01,2009-08-01,CA,rent,1994-02-01,0.0,< 1 year,B,...,22640.007812,598.755005,0.0,0.0,28854.0,0.521,36,42.0,829.099976,54734
4,2010-01-01,current,2010-01-01,2009-08-01,CA,rent,1994-02-01,0.0,< 1 year,B,...,22035.318359,604.690002,0.0,0.0,28854.0,0.521,36,42.0,829.099976,54734


In [None]:
# Fix loans with double month entries.
#
# The payment history is read from the HDF store in chunks of loan ids. Within
# each chunk, loans that have more than one row for the same date are passed
# through merge_dupe_dates; all other loans are kept as they are.
#
# NOTE(review): `store` must be an open pandas HDFStore. The only visible code
# that creates it is commented out in an earlier cell -- confirm it is opened
# before running this cell.
pmt_hist_ids = store['pmt_hist_ids'].astype(int)
max_id = pmt_hist_ids.max()
chunksize = 800
# Ceiling division. The previous `len // chunksize + 1` produced one chunk too
# many whenever the number of ids was an exact multiple of `chunksize`, and
# the bound lookups below then indexed past the end of `pmt_hist_ids`
# (assuming it carries the default 0..N-1 index -- TODO confirm).
n_chunks = -(-len(pmt_hist_ids) // chunksize)

already_good_dfs = []
fixed_dfs = []
# Column name -> integer position, built once from the first chunk read.
column_iloc_map = None
for n in tqdm_notebook(range(n_chunks)):
    # `left_bound` / `right_bound` are looked up by name inside the `where`
    # expression below, so these variable names must stay as they are.
    left_bound = 0 if n == 0 else pmt_hist_ids[n * chunksize]
    if n == n_chunks - 1:
        right_bound = max_id
    else:
        right_bound = pmt_hist_ids[(n + 1) * chunksize]

    chunk = pd.read_hdf(
        store,
        'pmt_hist_intermediary_1',
        where='(loan_id_num > left_bound) & (loan_id_num <= right_bound)')

    if column_iloc_map is None:
        column_iloc_map = {
            col_name: chunk.columns.get_loc(col_name)
            for col_name in chunk.columns
        }

    # NOTE(review): chunks are selected on `loan_id_num` but grouped on
    # `loan_id`; presumably both columns exist in the stored table -- verify.
    loans_with_two_entries_in_same_month = chunk[chunk.duplicated(
        ['loan_id', 'date'])]
    dup_date_ids = loans_with_two_entries_in_same_month['loan_id'].unique()
    # Set membership keeps the per-group check below constant-time.
    dup_id_lookup = set(dup_date_ids)

    already_good = chunk[~chunk['loan_id'].isin(dup_date_ids)]
    for ids, group in chunk.groupby('loan_id'):
        if ids in dup_id_lookup:
            fixed_dfs.append(merge_dupe_dates(group))

    already_good_dfs.append(already_good)

# store before next cleaning step

In [None]:
# Write the cleaned payment history to 'pmt_hist_intermediary_2' in the store.

# Create min_itemsize_dict for allocating string-column size when storing.
# Every object-dtype column gets a width of 15, except the short
# categorical-like columns listed below, which are left to pandas' default.
min_itemsize_dict = {}
for col in already_good.columns:
    # `np.object` was removed in NumPy 1.24; the builtin `object` performs the
    # same dtype comparison.
    if already_good[col].dtype == object:
        print(col, already_good[col].str.len().max())
        if col not in ('State', 'VINTAGE', 'grade'):
            min_itemsize_dict[col] = 15

# Merged rows can come back with upcast dtypes, so cast the fixed frames back
# to the dtypes of an untouched chunk before storing.
col_dtype_map = already_good_dfs[0].dtypes.to_dict()
if fixed_dfs:
    all_fixed_dfs = pd.concat(fixed_dfs)
    for col, dtype in col_dtype_map.items():
        all_fixed_dfs[col] = all_fixed_dfs[col].astype(dtype)
    frames_to_store = [all_fixed_dfs] + already_good_dfs
else:
    # Nothing needed fixing: pd.concat([]) would raise, so store only the
    # untouched chunks.
    frames_to_store = list(already_good_dfs)

for position, chunk in enumerate(tqdm_notebook(frames_to_store)):
    if position == 0:
        # First write replaces any existing table and fixes the string widths.
        store.append(
            'pmt_hist_intermediary_2',
            chunk,
            data_columns=True,
            index=True,
            append=False,
            min_itemsize=min_itemsize_dict)
    else:
        store.append(
            'pmt_hist_intermediary_2',
            chunk,
            data_columns=True,
            index=True,
            append=True)

store.close()