# Author: Justin Hsi
## Part 3 of cleaning lending club payment history

In [1]:
import dir_constants as dc
from tqdm import tqdm_notebook

In [2]:
def find_closest_previous_record(ids, issue_d, first_date, actual_months, month):
    '''This function finds the closest previous month that is in the group. 
    It is here to handle cases where a record of one month is missing, but the
    record before that missing month is also missing.'''
    offset = pd.DateOffset(months=-1)
    prev_month = month + offset
    if month < issue_d:
        print(ids)
        return first_date
    elif prev_month in actual_months:
        return prev_month
    else:
        find_closest_previous_record(ids, issue_d, first_date, actual_months, prev_month)

In [3]:
platform = 'lendingclub'

store =  pd.HDFStore(
    dc.home_path+'/justin_tinkering/data_science/lendingclub/{0}_store.h5'.
    format(platform),
    append=True)

# There are loans that have multiple row entries per month (as in multiple pmts in same month) and there are also loans that don't have any entry for a month

In [4]:
# fix loans with no record at all for a specific month ________________________
pmt_hist_ids = store['pmt_hist_ids'].astype(int)
max_id = pmt_hist_ids.max()
chunksize = 600
n_chunks = len(pmt_hist_ids)//chunksize + 1

already_good_dfs = []
fixed_dfs = []
# k = 0
for n in tqdm_notebook(np.arange(n_chunks)):
    if n == 0:
        left_bound = 0
    else:
        left_bound = pmt_hist_ids[n*chunksize]
    if n == (n_chunks - 1):
        right_bound = max_id
    else:
        right_bound = pmt_hist_ids[(n+1)*chunksize]
    
    chunk = pd.read_hdf(
        store,
        'pmt_hist_intermediary_2',
        where='(loan_id_num > left_bound) & (loan_id_num <= right_bound)')
    
    id_grouped = chunk.groupby('loan_id')
    for ids, group in id_grouped:
        # Copy Paste finished below
        issue_d = group['issue_d'].min()
        first_date = group['date'].min()
        last_date = group['date'].max()
        expected_months = set(pd.DatetimeIndex(start=first_date, end=last_date, freq='MS'))
        actual_months = set(group['date'])
        to_make_months = list(expected_months.symmetric_difference(actual_months))
        to_make_months.sort()
        if len(to_make_months) > 1:
            months_to_copy = []
            for month in to_make_months:
                months_to_copy.append(find_closest_previous_record(ids, issue_d, first_date, actual_months, month))
            copied = group[group['date'].isin(months_to_copy)].copy()
            copied['amt_paid'] = 0.0
            copied['date'] = to_make_months
            copied['amt_due'] = np.where(copied['date'] < first_date, 0, copied['amt_due'])
            fixed_dfs.append(pd.concat([group, copied]))
        else:
            already_good_dfs.append(group)

A Jupyter Widget




# Store it

In [6]:
# # Create min_itemsize_dict for allocating size when storing ___________________
min_itemsize_dict = {}
#arbitrarily take last 10000 to hopefully be long enough for min item size
already_good = pd.concat(already_good_dfs[-10000:]) 
for col in already_good.columns:
    if already_good[col].dtype == np.object:
        print(col, already_good[col].str.len().max())
        if col in ['State', 'VINTAGE', 'grade']:
            pass
        else:
            min_itemsize_dict[col] = 15

col_dtype_map = already_good_dfs[0].dtypes.to_dict()
all_fixed_dfs = pd.concat(fixed_dfs)
for col, dtype in col_dtype_map.items():
    all_fixed_dfs[col] = all_fixed_dfs[col].astype(dtype)

addr_state 2
emp_len 9
grade 1
home_ownership 8
loan_id 9
status_period_end 11
vintage 4


In [None]:
# so apprently I keep failing when I make better_sized_already_good_dfs... examine what is going on

In [8]:
len(already_good_dfs)

1523380

In [11]:
len(all_fixed_dfs)

21589

In [None]:
type(all_fixed_)

In [None]:
n_dfs = len(already_good_dfs)
n_chunks = len(already_good_dfs)//chunksize + 1

better_sized_already_good_dfs = []
for n in tqdm_notebook(np.arange(n_chunks)):
    bigger_dfs = pd.concat(already_good_dfs[n*chunksize:(n+1)*chunksize])
    better_sized_already_good_dfs.append(bigger_dfs)
    
k = 0
for chunk in tqdm_notebook([all_fixed_dfs] + better_sized_already_good_dfs):
    sorted_chunk = chunk.sort_values(['loan_id', 'date'])
    if k == 0:
        store.append(
            'pmt_hist_clean',
            sorted_chunk,
            data_columns=True,
            index=True,
            append=False,
            min_itemsize=min_itemsize_dict)
        k += 1
    else:
        store.append(
            'pmt_hist_clean',
            sorted_chunk,
            data_columns=True,
            index=True,
            append=True)       
        
store.close()        

addr_state 2
emp_len 9
grade 1
home_ownership 8
loan_id 9
status_period_end 11
vintage 4


A Jupyter Widget

In [8]:
chunk

Unnamed: 0,addr_state,all_cash_to_inv,amt_due,amt_paid,charged_off_amt,charged_off_this_month,current_policy,date,dq_24m,dti,...,princp_paid,public_recs,recov_fees,recovs,revol_credit_bal,revol_line_util,status_period_end,term,total_credit_lines,vintage
27857116,PA,188.580,188.58,188.580,0.0,0,1,2017-07-01,0.0,12.21,...,140.171,0.0,0.0,0.0,5860.0,0.416,current,36,17.0,17Q2
27857117,PA,188.580,188.58,188.580,0.0,0,1,2017-08-01,0.0,12.21,...,141.331,0.0,0.0,0.0,5860.0,0.416,current,36,17.0,17Q2
27857118,PA,188.580,188.58,188.580,0.0,0,1,2017-09-01,0.0,12.21,...,142.501,0.0,0.0,0.0,5860.0,0.416,current,36,17.0,17Q2
27857119,MN,879.180,879.18,879.180,0.0,0,1,2017-07-01,0.0,13.84,...,545.430,0.0,0.0,0.0,10093.0,0.332,current,36,22.0,17Q2
27857120,MN,879.180,879.18,879.180,0.0,0,1,2017-08-01,0.0,13.84,...,552.711,0.0,0.0,0.0,10093.0,0.332,current,36,22.0,17Q2
27857121,MN,0.000,879.18,0.000,0.0,0,1,2017-09-01,0.0,13.84,...,0.000,0.0,0.0,0.0,10093.0,0.332,late_30,36,22.0,17Q2
27857122,NJ,794.820,794.82,794.820,0.0,0,1,2017-07-01,0.0,23.27,...,294.820,0.0,0.0,0.0,61899.0,0.759,current,60,23.0,17Q2
27857123,NJ,794.820,794.82,794.820,0.0,0,1,2017-08-01,0.0,23.27,...,299.734,0.0,0.0,0.0,61899.0,0.759,current,60,23.0,17Q2
27857124,NJ,794.820,794.82,794.820,0.0,0,1,2017-09-01,0.0,23.27,...,304.729,0.0,0.0,0.0,61899.0,0.759,current,60,23.0,17Q2
27857125,CT,805.860,805.86,805.860,0.0,0,1,2017-07-01,0.0,16.89,...,598.985,0.0,0.0,0.0,34688.0,0.744,current,36,28.0,17Q2


In [6]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('pmt_hist_ids', 24385256),
 ('already_good_dfs', 12383856),
 ('chunk', 160613),
 ('fixed_dfs', 6232),
 ('group', 2196),
 ('copied', 1466),
 ('NamespaceMagics', 888),
 ('actual_months', 224),
 ('expected_months', 224),
 ('find_closest_previous_record', 136),
 ('tqdm_notebook', 136),
 ('var_dic_list', 136),
 ('first_date', 120),
 ('issue_d', 120),
 ('last_date', 120),
 ('month', 120),
 ('months_to_copy', 96),
 ('dc', 80),
 ('np', 80),
 ('pd', 80),
 ('plt', 80),
 ('getsizeof', 72),
 ('to_make_months', 64),
 ('ids', 58),
 ('id_grouped', 56),
 ('store', 56),
 ('left_bound', 32),
 ('max_id', 32),
 ('n', 32),
 ('right_bound', 32),
 ('chunksize', 28),
 ('n_chunks', 28)]

In [7]:
sys.getsizeof?