# Author: Justin Hsi
## Part 3 of cleaning lending club payment history

In [None]:
import dir_constants as dc
from tqdm import tqdm_notebook

In [None]:
def find_closest_previous_record(ids, issue_d, first_date, actual_months, month):
    '''Return the closest month before `month` that has a record in the group.

    Walks backwards one calendar month at a time, so it also handles runs of
    several consecutive missing months (the record before a missing month is
    itself missing).

    Parameters
    ----------
    ids : loan identifier; only printed when the walk reaches a month that is
        earlier than `issue_d`.
    issue_d : issue date of the loan; lower limit for the backwards walk.
    first_date : value returned when the month being examined is earlier
        than `issue_d`.
    actual_months : collection of months that do have a record.
    month : the missing month whose closest previous record is wanted.

    Returns
    -------
    The closest earlier month contained in `actual_months`, or `first_date`
    if the walk reaches a month earlier than `issue_d` first.
    '''
    one_month_back = pd.DateOffset(months=-1)
    # Iterative walk: the recursive version dropped its result because the
    # recursive call was missing a `return`, yielding None for any gap longer
    # than one month.
    while True:
        if month < issue_d:
            print(ids)
            return first_date
        prev_month = month + one_month_back
        if prev_month in actual_months:
            return prev_month
        month = prev_month

In [None]:
# Platform name; it selects which '<platform>_store.h5' file is opened.
platform = 'lendingclub'

# Build the store location first, then open the HDF5 store used by the
# remaining cells of this notebook.
store_path = (
    dc.home_path
    + '/justin_tinkering/data_science/lendingclub/{0}_store.h5'.format(platform)
)
store = pd.HDFStore(store_path, append=True)

# Some loans have multiple rows for the same month (i.e. several payments in one month), while other loans have no row at all for some months.

In [None]:
# fix loans with no record at all for a specific month ________________________
# Reads 'pmt_hist_intermediary_2' from the store in chunks of loan ids. For
# every loan, each month between its first and last record that has no row
# gets a synthetic row copied from the closest previous record with
# amt_paid set to 0. Results are collected in:
#   fixed_dfs                     - one frame per loan that needed new rows
#   better_sized_already_good_dfs - untouched loans, batched `chunksize` loans
#                                   per frame
pmt_hist_ids = store['pmt_hist_ids'].astype(int)
max_id = pmt_hist_ids.max()
chunksize = 800
n_ids = len(pmt_hist_ids)
n_chunks = n_ids//chunksize + 1

already_good_dfs = []
better_sized_already_good_dfs = []
fixed_dfs = []
for n in tqdm_notebook(np.arange(n_chunks)):
    lower_pos = n*chunksize
    upper_pos = lower_pos + chunksize
    if n > 0 and lower_pos >= n_ids:
        # Only happens when the number of ids is an exact multiple of
        # chunksize: the previous chunk already ran up to max_id, and
        # looking up pmt_hist_ids[lower_pos] would be out of range.
        continue

    # NOTE: the names `left_bound` and `right_bound` are referenced by name
    # inside the `where` expression below, so they must not be renamed.
    left_bound = 0 if n == 0 else pmt_hist_ids[lower_pos]
    right_bound = pmt_hist_ids[upper_pos] if upper_pos < n_ids else max_id

    chunk = pd.read_hdf(
        store,
        'pmt_hist_intermediary_2',
        where='(loan_id_num > left_bound) & (loan_id_num <= right_bound)')

    for ids, group in chunk.groupby('loan_id'):
        issue_d = group['issue_d'].min()
        first_date = group['date'].min()
        last_date = group['date'].max()
        # pd.date_range replaces the removed
        # pd.DatetimeIndex(start=..., end=..., freq=...) constructor form.
        expected_months = set(
            pd.date_range(start=first_date, end=last_date, freq='MS'))
        actual_months = set(group['date'])
        # Only months that are expected but absent need a new row.
        to_make_months = sorted(expected_months - actual_months)

        if to_make_months:  # even a single missing month must be filled
            months_to_copy = [
                find_closest_previous_record(
                    ids, issue_d, first_date, actual_months, month)
                for month in to_make_months
            ]
            # Position of the first row for every date in the group. Picking
            # rows by position yields exactly one source row per missing
            # month, even when several consecutive missing months share the
            # same source record (an `isin` filter would return fewer rows
            # than months and break the `date` assignment below).
            first_pos_by_date = {}
            for pos, date in enumerate(group['date']):
                first_pos_by_date.setdefault(date, pos)
            source_positions = [first_pos_by_date[m] for m in months_to_copy]

            copied = group.iloc[source_positions].copy()
            copied['amt_paid'] = 0.0
            copied['date'] = to_make_months
            copied['amt_due'] = np.where(
                copied['date'] < first_date, 0, copied['amt_due'])
            fixed_dfs.append(pd.concat([group, copied]))
        else:
            already_good_dfs.append(group)
            if len(already_good_dfs) == chunksize:
                better_sized_already_good_dfs.append(
                    pd.concat(already_good_dfs))
                already_good_dfs = []

# Flush the loans still waiting in the batch. Doing this once after the loop
# (instead of on every good loan of the last chunk) avoids pd.concat([]) on an
# empty batch and never drops leftovers when the last chunk has no good loan.
if already_good_dfs:
    better_sized_already_good_dfs.append(pd.concat(already_good_dfs))
    already_good_dfs = []

# Store it

In [None]:
# Create min_itemsize_dict for allocating size when storing ___________________
min_itemsize_dict = {}
# Use the last 20 batched "already good" frames as a sample. The longest
# string per object column is printed for inspection only; the reserved size
# itself is a fixed 15 for every object column except the ones skipped below.
example = pd.concat(better_sized_already_good_dfs[-20:])
for col in example.columns:
    # Builtin `object` is what the removed NumPy alias `np.object` referred to.
    if example[col].dtype == object:
        print(col, example[col].str.len().max())
        if col not in ['State', 'VINTAGE', 'grade']:
            min_itemsize_dict[col] = 15

# Cast every column of the repaired frames to the dtype it has in the first
# untouched frame, so all frames share the same column dtypes.
col_dtype_map = better_sized_already_good_dfs[0].dtypes.to_dict()
all_fixed_dfs = pd.concat(fixed_dfs)
for col, dtype in col_dtype_map.items():
    all_fixed_dfs[col] = all_fixed_dfs[col].astype(dtype)

In [None]:
# Write all frames to the 'pmt_hist_clean' table, each sorted by loan and date.
# `k` stays 0 until the first frame has been written; that first write
# replaces any existing table (append=False) and fixes the string column
# sizes, every later frame is appended to it.
k = 0
for chunk in tqdm_notebook([all_fixed_dfs] + better_sized_already_good_dfs):
    sorted_chunk = chunk.sort_values(['loan_id', 'date'])
    if k != 0:
        store.append(
            'pmt_hist_clean',
            sorted_chunk,
            data_columns=True,
            index=True,
            append=True)
        continue
    store.append(
        'pmt_hist_clean',
        sorted_chunk,
        data_columns=True,
        index=True,
        append=False,
        min_itemsize=min_itemsize_dict)
    k += 1

store.close()