# Imports and Constants

In [None]:
import json
import math
import pickle

import numpy as np
import pandas as pd
from pandas.io import sql
from sqlalchemy import create_engine

# Render floats with six fixed decimals when frames are displayed.
pd.set_option('display.float_format', lambda value: '%.6f' % value)

# SQLite database holding the Lending Club tables.
engine = create_engine('sqlite:////Users/justinhsi/LRData/lc_database.db')
batch_size = 200000

# Payment-history columns of interest.
pmt_hist_cols = [
    'acc_int',
    'charged_off_recovs_collection_fees',
    'gross_charged_off_recovs',
    'id',
    'm_on_books',
    'pmt_amt_received',
    'gross_pmt_to_int',
    'gross_pmt_to_princp',
    'outs_princp_end',
    'outs_princp_beg',
    'date',
]

# Functions

In [None]:
def filter_to_picks(ids, dataframe):
    """Return the rows of ``dataframe`` whose ``id`` value is in ``ids``."""
    is_picked = dataframe['id'].isin(ids)
    return dataframe[is_picked]


def create_original_amounts_funded(dataframe):
    """Total the ``funded`` amount originated per ``issue_d``.

    Each (``id``, ``platform``) pair is counted once, using its first row
    after sorting by ``platform``, ``id`` and ``date``.

    Returns a dict mapping each ``issue_d`` value to the summed ``funded``.
    """
    ordered = dataframe.sort_values(['platform', 'id', 'date'])
    first_rows = ordered.drop_duplicates(subset=['id', 'platform'])
    return {
        issue_month: loans['funded'].sum()
        for issue_month, loans in first_rows.groupby('issue_d')
    }

def select_some_loans_h5():
    """Read the ``clean_pmt_history`` table of the old HDF5 store in chunks.

    Returns the result of ``pd.read_hdf`` called with ``chunksize=200000``.
    """
    datapath_store = '/Users/justinhsi/LRData/lendingclub/old/lendingclub_store.h5'
    # NOTE(review): the ``where`` expression names ``ids_list``; presumably
    # pandas resolves it from a variable in scope at call time -- confirm one
    # is defined before calling this function.
    return pd.read_hdf(datapath_store, 'clean_pmt_history', where='LOAN_ID=ids_list', chunksize = 200000)

def select_some_loans():
    """Load the whole ``jclean_pmt_history_month_merged`` table from SQLite.

    A fresh engine is created for the call; the ``date`` column is parsed
    as datetimes.
    """
    db_engine = create_engine(
        'sqlite:////Users/justinhsi/LRData/lc_database.db')
    query = 'SELECT * from jclean_pmt_history_month_merged'
    return pd.read_sql_query(query, db_engine, parse_dates=['date'])


def speed_test(group):
    """Timing helper: add ``acc_int`` and ``status`` columns to ``group``.

    ``group`` is modified in place; nothing is returned.
    """
    group['acc_int'] = make_acc_int(group)
    group['status'] = make_status(group)
    # The list is local and discarded when the function returns.
    to_concat = [group]
    
def lc_rename_cols(df):
    """Return a copy of ``df`` with raw column names mapped to internal ones.

    Columns not listed in the mapping are left unchanged.
    """
    raw_to_internal = (
        ('LOAN_ID', 'id'),
        ('PBAL_BEG_PERIOD', 'outs_princp_beg'),
        ('PRNCP_PAID', 'gross_pmt_to_princp'),
        ('INT_PAID', 'gross_pmt_to_int'),
        ('FEE_PAID', 'gross_pmt_to_late_fee'),
        ('DUE_AMT', 'm_amt_due'),
        ('RECEIVED_AMT', 'pmt_amt_received'),
        ('RECEIVED_D', 'pmt_date_received'),
        ('PERIOD_END_LSTAT', 'status'),
        ('MONTH', 'date'),
        ('PBAL_END_PERIOD', 'outs_princp_end'),
        ('MOB', 'm_on_books'),
        ('CO', 'charged_off'),
        ('COAMT', 'charged_off_amt'),
        ('InterestRate', 'rate'),
        ('IssuedDate', 'issue_d'),
        ('MONTHLYCONTRACTAMT', 'installment_funded'),
        ('dti', 'dti'),
        ('State', 'addr_state'),
        ('HomeOwnership', 'home_ownership'),
        ('MonthlyIncome', 'monthly_income'),
        ('EarliestCREDITLine', 'line_earliest'),
        ('OpenCREDITLines', 'lines_open'),
        ('TotalCREDITLines', 'lines_total'),
        ('RevolvingCREDITBalance', 'revol_bal'),
        ('RevolvingLineUtilization', 'revol_util'),
        ('Inquiries6M', 'inquiries_6m'),
        ('DQ2yrs', 'delinq_24m'),
        ('MonthsSinceDQ', 'm_since_delinq'),
        ('PublicRec', 'records'),
        ('MonthsSinceLastRec', 'm_since_record'),
        ('EmploymentLength', 'emp_length'),
        ('currentpolicy', 'current_policy'),
        ('grade', 'grade'),
        ('term', 'term'),
        ('APPL_FICO_BAND', 'fico_apply'),
        ('Last_FICO_BAND', 'fico_last'),
        ('VINTAGE', 'yr_qtr'),
        ('PCO_RECOVERY', 'gross_charged_off_recovs'),
        ('PCO_COLLECTION_FEE', 'charged_off_recovs_collection_fees'),
        ('policy_code', 'policy_code'),
    )
    return df.rename(columns=dict(raw_to_internal))

def make_acc_int(group):
    """Return each row's beginning principal times one twelfth of the rate.

    The rate is taken from the first row of ``group`` only.
    """
    monthly_rate = group['rate'].values[0] / 12
    return group['outs_princp_beg'] * monthly_rate


def make_status(group):
    """Derive one status label per row from payments received.

    A row is a miss when ``pmt_amt_received`` is below 99% of ``m_amt_due``.
    Consecutive misses escalate late -> late_60 -> late_90 -> late_120 ->
    late_max; a miss following anything else (including ``late_max``) is
    labelled ``late``. Otherwise the row is ``paid`` when ``outs_princp_end``
    is <= 0 and ``current`` if not.

    Returns a list of labels in row order.
    """
    escalation = {
        'late': 'late_60',
        'late_60': 'late_90',
        'late_90': 'late_120',
        'late_120': 'late_max',
    }
    received = group['pmt_amt_received'].values
    balance_after = group['outs_princp_end'].values
    threshold = group['m_amt_due'].values * .99

    status = []
    for paid, due, balance in zip(received, threshold, balance_after):
        if paid < due:
            previous = status[-1] if status else None
            status.append(escalation.get(previous, 'late'))
        elif balance <= 0:
            status.append('paid')
        else:
            status.append('current')
    return status

def make_service_fees(group):
    """Compute the 1% service fee for every row of one loan.

    For rows with ``m_on_books`` <= 12 the fee is 1% of the smaller of
    ``pmt_amt_received`` and ``m_amt_due``; for later rows it is 1% of
    ``pmt_amt_received``.

    The result is aligned with the rows of ``group`` in their given order.
    (The earlier version emitted all first-year fees followed by the rest,
    which was only correct for month-sorted input, and used the removed
    ``.ix`` indexer.)

    Returns a numpy array with one fee per row.
    """
    months = group['m_on_books'].values
    received = group['pmt_amt_received'].values
    due = group['m_amt_due'].values

    # Same tie/NaN behaviour as the builtin min(received, due).
    capped = np.where(due < received, due, received)
    return np.where(months <= 12, capped, received) * .01

def make_cummulative_service_fees(service_fee_list):
    """Return the running total of ``service_fee_list`` via its ``cumsum``."""
    return service_fee_list.cumsum()

def add_cols_lc(df):
    """Add ``acc_int``, ``j_status`` and ``service_fees`` columns to ``df``.

    Values are computed per loan (grouped by ``id``) and written back in the
    order in which each loan id first appears in ``df``. Rows whose
    ``issue_d`` equals ``date`` get the status ``'issued'``.

    NOTE: the per-loan values are concatenated loan by loan, so the result
    lines up with ``df`` only when every loan's rows are contiguous.

    ``df`` is modified in place and also returned.
    """
    per_loan = {}
    for loan_id, loan_rows in tqdm_notebook(df.groupby('id')):
        per_loan[loan_id] = {
            'status': make_status(loan_rows),
            'acc_int': make_acc_int(loan_rows),
            'service_fees': make_service_fees(loan_rows),
        }

    emitted = set()
    statuses = []
    accrued = []
    fees = []
    for loan_id in tqdm_notebook(df['id'].values):
        if loan_id in emitted:
            continue
        emitted.add(loan_id)
        computed = per_loan[loan_id]
        statuses.extend(computed['status'])
        accrued.extend(computed['acc_int'])
        fees.extend(computed['service_fees'])

    df['acc_int'] = accrued
    df['j_status'] = statuses
    df['service_fees'] = fees

    is_issue_row = df['issue_d'] == df['date']
    df['j_status'] = np.where(is_issue_row, 'issued', df['j_status'])
    return df



# Running as script

In [None]:
# get in data
# The context manager closes the HDF5 file even when a key lookup raises,
# so a failed cell no longer leaves the store open.
with pd.HDFStore('/Users/justinhsi/LRData/lendingclub/lendingclub_store.h5') as store:
    loan_info = store['clean_loan_info']
    pmt_hist = store['clean_pmt_history']

In [None]:
# Translate the raw payment-history column names to the internal names.
pmt_history = lc_rename_cols(pmt_hist)

# Fix 600 loans with double entries in same date

In [None]:
all_loans = pmt_history
# Every row after the first for a given (id, date) pair.
loans_with_two_entries_in_same_month = all_loans[all_loans.duplicated(['id', 'date'])]
dup_date_ids = loans_with_two_entries_in_same_month['id'].unique()
# Column name -> integer position. Read straight from the column index so no
# row has to be materialised and an empty frame does not raise.
column_iloc_map = {col_name: position
                   for position, col_name in enumerate(all_loans.columns.values)}

In [None]:
# group = all_loans[all_loans['id'].isin([dup_date_ids[0]])] #a test group

In [None]:
def find_dupe_dates(group):
    """Return the distinct ``date`` values that occur more than once.

    Each repeated date is reported once, however many times it repeats.
    """
    repeated = group.loc[group.duplicated('date'), 'date']
    return pd.to_datetime(repeated.values).unique()


def merge_dupe_dates(group):
    """Collapse rows of one loan that share a ``date`` into a single row.

    For every repeated date the last row is kept as the template,
    ``outs_princp_beg`` is taken from the first row of that date, and the
    payment / charge-off amount columns are summed over all rows of the date.

    Returns a list of DataFrames: the rows with unique dates first, then one
    single-row frame per merged date, indexed by the label of the last
    original row for that date.
    """
    summed_cols = [
        'gross_pmt_to_princp',
        'gross_pmt_to_int',
        'gross_pmt_to_late_fee',
        'm_amt_due',
        'pmt_amt_received',
        'charged_off_amt',
        'gross_charged_off_recovs',
        'charged_off_recovs_collection_fees',
    ]

    dupe_dates = find_dupe_dates(group)
    df_chunks = [group[~group['date'].isin(dupe_dates)]]

    for date in dupe_dates:
        problem_rows = group[group['date'] == date]
        keep_row = problem_rows.iloc[-1].to_dict()
        keep_row['outs_princp_beg'] = problem_rows['outs_princp_beg'].iloc[0]

        # Sum only the amount columns; ids, dates and labels are not summed.
        totals = problem_rows[summed_cols].sum()
        for col in summed_cols:
            keep_row[col] = totals[col]

        df_chunks.append(pd.DataFrame([keep_row],
                                      index=[problem_rows.index[-1]],
                                      columns=problem_rows.columns))

    return df_chunks

In [None]:
# Only loans with a repeated month need merging, so group just those rather
# than walking every loan and testing membership in an array each time.
has_dup_dates = all_loans['id'].isin(dup_date_ids)
already_good = all_loans[~has_dup_dates]
id_grouped = all_loans[has_dup_dates].groupby('id')

to_concat = []
for ids, group in tqdm_notebook(id_grouped):
    to_concat.extend(merge_dupe_dates(group))

to_concat.append(already_good)
examine = pd.concat(to_concat)

In [None]:
# Parse the three date columns, then order and index the rows by (id, date),
# keeping both as regular columns as well.
for datetime_col in ('date', 'issue_d', 'pmt_date_received'):
    examine[datetime_col] = pd.to_datetime(examine[datetime_col])
all_loans = (examine.sort_values(['id', 'date'])
             .set_index(['id', 'date'], drop=False))

# Do I need to fix stuff that pays immediately in issuance month?

In [None]:
# Rows recorded in month 0 on books.
all_loans[all_loans['m_on_books'] == 0]
# ``.ix`` has been removed from pandas; ``.loc`` does the same label lookup
# on the first index level.
group = all_loans.loc['10645201', :]
group[['issue_d', 'date', 'm_amt_due', 'pmt_amt_received', 'outs_princp_beg', 'outs_princp_end']]

In [None]:
# all_lc_loans = add_status_acc_int_lc(all_lc_loans)

# Attach the derived acc_int, j_status and service_fees columns.
all_loans = add_cols_lc(all_loans)

In [None]:
# Issue-month rows: the fee is 1% of (payment - beginning principal) and no
# interest accrues. Rows with no interest paid carry no fee at all.
is_issue_month = all_loans['date'] == all_loans['issue_d']
no_interest_paid = all_loans['gross_pmt_to_int'] == 0
issue_month_fee = (all_loans['pmt_amt_received'] -
                   all_loans['outs_princp_beg']) * .01

fees = np.where(is_issue_month, issue_month_fee, all_loans['service_fees'])
all_loans['service_fees'] = np.where(no_interest_paid, 0, fees)
all_loans['acc_int'] = np.where(is_issue_month, 0, all_loans['acc_int'])

In [None]:
# ``.ix`` has been removed from pandas; ``.loc`` performs the same lookup.
group = all_loans.loc['10645201', :]
group

# now adjust each pmt_history about each loan by the appropriate amount

In [None]:
all_loans = all_loans.fillna(0)
id_grouped = all_loans.groupby('id')

# Dollar columns rescaled from the loan's funded amount to the invested amount.
lc_scaled_cols = [
    'charged_off_recovs_collection_fees', 'gross_charged_off_recovs',
    'pmt_amt_received', 'outs_princp_end', 'outs_princp_beg',
    'gross_pmt_to_princp', 'acc_int', 'service_fees', 'gross_pmt_to_int',
    'charged_off_amt', 'm_amt_due',
]
# (output column, source column) pairs copied through unchanged.
lc_passthrough_cols = [
    ('date', 'date'), ('status', 'j_status'), ('m_on_books', 'm_on_books'),
    ('issue_d', 'issue_d'), ('rate', 'rate'), ('charged_off', 'charged_off'),
    ('grade', 'grade'),
]
lc_output_order = [
    'charged_off_recovs_collection_fees', 'gross_charged_off_recovs',
    'pmt_amt_received', 'outs_princp_end', 'outs_princp_beg',
    'gross_pmt_to_princp', 'acc_int', 'service_fees', 'gross_pmt_to_int',
    'id', 'date', 'status', 'm_on_books', 'issue_d', 'rate', 'charged_off',
    'charged_off_amt', 'm_amt_due', 'grade',
]

# Built up front, so an empty input still yields a frame with every column.
result = {name: [] for name in lc_output_order}

for ids, group in tqdm_notebook(id_grouped):
    # This section handles the 'LC' platform (the same key the later cells
    # use); the previous code read an undefined/stale ``plat`` variable.
    invest_amt = plat_dict['LC'][ids]
    # Funded amount = beginning principal of the loan's first row.
    funded = group['outs_princp_beg'].values[0]

    for col in lc_scaled_cols:
        result[col].extend(group[col].values / funded * invest_amt)
    result['id'].extend([ids] * len(group))
    for out_col, src_col in lc_passthrough_cols:
        result[out_col].extend(group[src_col].values)

df = pd.DataFrame.from_dict(result)
df['date'] = pd.to_datetime(df['date'])
df['issue_d'] = pd.to_datetime(df['issue_d'])
# for status adjustment. If it paid and is not late, acc_int - pmt_to_int should equal 0
# and thus you're only left with outs_princp_end. If they didn't pay or paid less,
# then acc_int - pmt_to_int is > 0 and you add that value to outs_princp_end, and then
# adjust it by the lateness
# NOTE(review): only 'late' (halved) and 'late_60' (zeroed) are adjusted;
# 'late_90', 'late_120' and 'late_max' keep their full value -- confirm
# that this is intended.
df['stat_adj_acc_int'] = np.where(
    df['status'] == 'late', df['acc_int'] / 2,
    np.where(df['status'] == 'late_60', 0, df['acc_int']))
df['stat_adj_outs_princp_end'] = np.where(
    df['status'] == 'late', df['outs_princp_end'] / 2,
    np.where(df['status'] == 'late_60', 0, df['outs_princp_end']))

sorted_df = df.sort_values(['id', 'date'])

In [None]:
sorted_df = sorted_df.set_index(['id', 'date'], drop=False)

In [None]:
sorted_df.ix['10645201',:]

# Starting munge work for emmanuel

In [None]:
scaled = sorted_df

In [None]:
scaled['stat_adj_value'] = np.where(
    scaled['pmt_amt_received'] >= scaled['m_amt_due'],
    scaled['stat_adj_outs_princp_end'],
    scaled['stat_adj_outs_princp_end'] +
    scaled['stat_adj_acc_int'])

In [None]:
# Number the calendar months from the earliest issue date (month 0) through
# the latest record date.
zero_date = scaled['issue_d'].min()
last_date = scaled['date'].max()
date_mapper = {}
k = 0
while True:
    month_start = zero_date + pd.DateOffset(months=k)
    if not (month_start <= last_date):
        break
    date_mapper[month_start] = k
    k += 1

In [None]:
id_grouped = plat_selected_dfs['LC'].groupby('id')

In [None]:
last_status_mapper = {}
for ids, group in tqdm_notebook(id_grouped):
    last_status_mapper[ids] = group['status'].values[-1]
    
    

In [None]:
term_mapper = plat_selected_dfs['LC'].drop_duplicates('id').set_index('id')['term'].to_dict()
rate_mapper = plat_selected_dfs['LC'].drop_duplicates('id').set_index('id')['rate'].to_dict()

In [None]:
scaled['issue_d_mapped'] = scaled['issue_d'].map(date_mapper)
scaled['date_mapped'] = scaled['date'].map(date_mapper)
# scaled['last_pmt_d_mapped']

In [None]:
# Display the scaled frame for inspection.
scaled

In [None]:
# Group once by loan id; the per-loan loops in later cells iterate this.
id_scaled_grouped = scaled.groupby('id')

In [None]:
# mismatch_issue_date_and_date = []
# for ids, group in tqdm_notebook(id_scaled_grouped):
#     if group['issue_d'].min() != group['date'].min():
#         mismatch_issue_date_and_date.append(ids)

In [None]:
# Days-late figure assigned to each late status; every other status gets 0.
days_late_by_status = [
    ('late', 15),
    ('late_60', 45),
    ('late_90', 75),
    ('late_120', 105),
    ('late_max', 135),
]
scaled['days_late'] = np.select(
    [scaled['status'] == label for label, _ in days_late_by_status],
    [days for _, days in days_late_by_status],
    default=0)

In [None]:
# add the year-quarter label (e.g. '15Q3') of each record's date

In [None]:
# Label such as '15Q3': last two digits of the year, 'Q', quarter number.
record_dates = scaled['date'].dt
two_digit_year = record_dates.year.astype(str).str[-2:]
quarter_number = record_dates.quarter.astype(str)
scaled['yr_quarter'] = two_digit_year + 'Q' + quarter_number

In [None]:
# Build one summary record per LC loan: scalar attributes plus per-month
# arrays (outstandings, days late, gross / net payments).
combined_dict = {}
for ids, group in tqdm_notebook(id_scaled_grouped):
    sub_dict = {}
    sub_dict['Grade'] = group['grade'].values[0]
    sub_dict['Month of Issuance'] = group['issue_d_mapped'].min()
    sub_dict['Month of Last Record'] = group['date_mapped'].max()
    sub_dict['Month of First Record'] = group['date_mapped'].min()

    # Latest month with a non-zero payment; NaN when nothing was ever paid or
    # the date lies outside the mapped range. This replaces a bare ``except``
    # that would also have hidden unrelated errors.
    paid_dates = group['date'].values[group['pmt_amt_received'].values != 0]
    if len(paid_dates):
        sub_dict['Month of Last Payment'] = date_mapper.get(
            pd.to_datetime(paid_dates.max()), np.nan)
    else:
        sub_dict['Month of Last Payment'] = np.nan

    sub_dict['Amount'] = plat_dict['LC'][ids]
    sub_dict['Term'] = term_mapper[ids]
    sub_dict['Rate'] = rate_mapper[ids]
    sub_dict['Status'] = last_status_mapper[ids]

    # In months without a payment the accrued interest is still owed.
    outstandings = np.where(group['pmt_amt_received'] == 0,
                            group['outs_princp_end'] + group['acc_int'],
                            group['outs_princp_end'])
    sub_dict['Outstandings'] = outstandings
    sub_dict['Days Late'] = group['days_late'].values
#     sub_dict['Status Adjusted Value'] = group['stat_adj_value'].values

    sub_dict['Gross Payments'] = group['pmt_amt_received'].values
    sub_dict['Net Payments'] = group['pmt_amt_received'].values - group['service_fees'].values

    # Net recovery = gross recovery minus collection fees, taken from the
    # first month that shows a recovery.
    recov_rows = group[group['gross_charged_off_recovs'] > 0]
    if len(recov_rows):
        first_recov = recov_rows.iloc[0]
        sub_dict['Month of Recov_timestamp'] = pd.to_datetime(first_recov['date'])
        sub_dict['Month of Recov'] = date_mapper[sub_dict[
            'Month of Recov_timestamp']]
        sub_dict['Net Recov'] = (
            first_recov['gross_charged_off_recovs'] -
            first_recov['charged_off_recovs_collection_fees'])
    else:
        # NOTE(review): placeholders only -- the "timestamp" slot holds a
        # month number here, not a timestamp.
        sub_dict['Month of Recov_timestamp'] = sub_dict['Month of Issuance']
        sub_dict['Month of Recov'] = sub_dict['Month of Last Record']
        sub_dict['Net Recov'] = 0

    sub_dict['CO_amt'] = group['charged_off_amt'].max()
    # First month flagged charged_off == 1; NaN when there is none (this used
    # to raise IndexError if the flag was non-zero but never exactly 1).
    charged_off_rows = group[group['charged_off'] == 1]
    if len(charged_off_rows):
        sub_dict['Month of CO'] = charged_off_rows['date_mapped'].values[0]
    else:
        sub_dict['Month of CO'] = np.nan
    sub_dict['yr_qrtr'] = group['yr_quarter'].values[0]

    combined_dict[ids] = sub_dict

In [None]:
#turn all the arrays into lists
# ``.items()`` works on Python 2 and 3 (``dict.iteritems`` is Python 2 only).
# Each record is copied and converted through ``np.asarray`` so that
# ``combined_dict`` is left untouched and the cell can be re-run safely.
lc_series_keys = ['Outstandings', 'Days Late', 'Net Payments', 'Gross Payments']
fixed_dict = {}
for ids, sub_dict in combined_dict.items():
    fixed_record = dict(sub_dict)
    for series_key in lc_series_keys:
        fixed_record[series_key] = np.asarray(sub_dict[series_key]).tolist()
    fixed_dict[ids] = fixed_record

In [None]:
# fix some sequence of dates that are missing some dates by filling with 0s
# Gaps are filled in ascending month order: each insert shifts later entries,
# so walking an unordered set could put zeros in the wrong slots when a loan
# has more than one gap.
fix_counter = 0
lc_padded_keys = ['Outstandings', 'Days Late', 'Net Payments', 'Gross Payments']
for ids, group in tqdm_notebook(id_scaled_grouped):
    recorded_months = set(group['date_mapped'].values)
    first_month = group['issue_d_mapped'].min()
    expected_months = np.arange(first_month, group['date_mapped'].max() + 1)

    # Only expected-but-absent months are filled. Recorded months outside the
    # expected range are left alone; the length checks in the export step
    # still catch such loans.
    missing_months = sorted(set(expected_months) - recorded_months)
    if not missing_months:
        continue

    fix_counter += 1
    for month in missing_months:
        position = int(month - first_month)
        for series_key in lc_padded_keys:
            fixed_dict[ids][series_key].insert(position, 0)

In [None]:
# Pad every loan's monthly series with zeros so that all of them span the
# full calendar (month 0 .. last mapped month).
all_months = np.arange(date_mapper[last_date] + 1)
lc_export_series = ['Outstandings', 'Days Late', 'Gross Payments', 'Net Payments']
to_df = {}
# ``.items()`` works on Python 2 and 3 (``dict.iteritems`` is Python 2 only).
for ids, sub_dict in tqdm_notebook(fixed_dict.items()):
    ids_dict = dict(sub_dict)

    n_records = len(sub_dict['Outstandings'])
    # The net recovery is booked in the loan's last recorded month.
    new_net_recovs = [0] * n_records
    new_net_recovs[-1] = sub_dict['Net Recov']

    monthly_series = {key: sub_dict[key] for key in lc_export_series}
    monthly_series['Net Recov'] = new_net_recovs

    # adding padding before and after the lists as necessary
    pad_before = [0] * int(sub_dict['Month of Issuance'])
    pad_after = [0] * int(all_months.max() - sub_dict['Month of Last Record'])

    for series_key, values in monthly_series.items():
        assert len(values) == n_records, (series_key, len(values), n_records, ids)
        padded = pad_before + values + pad_after
        assert len(padded) == len(all_months), (
            series_key, len(padded), len(all_months), ids)
        ids_dict[series_key] = padded

    to_df[ids] = ids_dict

In [None]:
# One row per loan id, one column per record key.
to_export = pd.DataFrame.from_dict(to_df).T

In [None]:
# The recovery-month helper columns are not exported.
to_export = to_export.drop(['Month of Recov', 'Month of Recov_timestamp'], axis = 1)

In [None]:
# Inspect loans that have a charge-off month.
to_export[to_export['Month of CO'].notnull()]

In [None]:
to_export['Status'].value_counts(dropna=False)

In [None]:
to_export.shape

In [None]:
to_export.head()

In [None]:
to_export['Amount'].sum()

In [None]:
to_export.tail()

In [None]:
# Persist the export frame (pickle and CSV).
to_export.to_pickle('/Users/justinhsi/LRData/lendingclub/emmanuel_pmts_format.pkl')

In [None]:
to_export.to_csv('/Users/justinhsi/LRData/lendingclub/pmts_csv_form.csv')

In [None]:
# NOTE: 'Status' and 'Rate' are dropped only after the files above were
# written, so the saved files still contain them.
to_export.drop('Status', axis=1,inplace=True)

In [None]:
to_export.drop('Rate', axis=1, inplace=True)

In [None]:
to_export

In [None]:
# Keep a handle on the LC frame; ``scaled`` is reassigned in the Prosper section.
scaled_LC = scaled

In [None]:
scaled_LC

# Now do it for Prosper

In [None]:
# Work on a deep copy of the Prosper frame, restricted to the columns used
# below and ordered by loan and observation month.
original_prosper_loans = plat_selected_dfs['P'].copy(deep=True)
cols_P = [
    'LoanID', 'ListingNumber', 'LoanAmount', 'Term', 'OriginationDate',
    'Observation_Month', 'BorrowerRate', 'ScheduledMonthlyPaymentAmount',
    'Completed_Month', 'Monthof_Last_Payment', 'Debt_Sale_Month',
    'NetCashToInvestorsFromDebtSale', 'CycleCounter', 'DaysPastDue',
    'DaysPastDue_EOM', 'PaymentsReceived', 'CollectionFees', 'PrincipalPaid',
    'InterestPaid', 'LateFees', 'ServicingFees', 'RecoveryPayments',
    'RecoveryPrin', 'BOMPrin', 'EOMPrin', 'ProsperRating',
]
prosper_subset = original_prosper_loans[cols_P]
p_loans = prosper_subset.sort_values(['LoanID', 'Observation_Month'])

In [None]:
# check that there's a row corresponding to origination month, that there's one and only one entry per obs
# month

id_grouped = p_loans.groupby('LoanID')
for ids, group in tqdm_notebook(id_grouped):
    origination_month = group['OriginationDate'].values[0]
    n_origination_rows = (group['Observation_Month'] == origination_month).sum()
    # The old check had a misplaced parenthesis (``len(frame == 1)``), so it
    # only tested that at least one origination row existed, never that
    # there was exactly one.
    assert n_origination_rows == 1, (ids, n_origination_rows)
    assert len(group) == len(group['Observation_Month'].unique()), ids

In [None]:
# add acc_int
# One month of interest on the beginning-of-month principal; none in the
# origination month. Rows without accrued interest have no scheduled payment.
is_origination_month = p_loans['OriginationDate'] == p_loans['Observation_Month']
monthly_interest = p_loans['BOMPrin'] * p_loans['BorrowerRate']/12
p_loans['acc_int'] = np.where(is_origination_month, 0, monthly_interest)

has_no_interest = p_loans['acc_int'] == 0
p_loans['ScheduledMonthlyPaymentAmount'] = np.where(
    has_no_interest, 0, p_loans['ScheduledMonthlyPaymentAmount'])

# Now scale amounts

In [None]:
p_loans = p_loans.fillna(0)
id_grouped = p_loans.groupby('LoanID')

# (output column, source column) pairs rescaled by invest_amt / funded.
p_scaled_cols = [
    ('m_amt_due', 'ScheduledMonthlyPaymentAmount'),
    ('acc_int', 'acc_int'),
    ('recovs_sale', 'NetCashToInvestorsFromDebtSale'),
    ('net_norm_payments', 'PaymentsReceived'),
    ('collection_fees', 'CollectionFees'),
    ('princp_paid', 'PrincipalPaid'),
    ('int_paid', 'InterestPaid'),
    ('late_fees', 'LateFees'),
    ('servicing_fees', 'ServicingFees'),
    ('recov_payments', 'RecoveryPayments'),
    ('recov_princp', 'RecoveryPrin'),
    ('outs_princp_beg', 'BOMPrin'),
    ('outs_princp_end', 'EOMPrin'),
]
# (output column, source column) pairs copied through unchanged.
p_passthrough_cols = [
    ('id', 'LoanID'),
    ('term', 'Term'),
    ('issue_d', 'OriginationDate'),
    ('date', 'Observation_Month'),
    ('rate', 'BorrowerRate'),
    ('days_late', 'DaysPastDue'),
]
p_output_order = [
    'id', 'listing', 'amt', 'term', 'issue_d', 'date', 'rate', 'm_amt_due',
    'acc_int', 'recovs_sale', 'days_late', 'net_norm_payments',
    'collection_fees', 'princp_paid', 'int_paid', 'late_fees',
    'servicing_fees', 'recov_payments', 'recov_princp', 'outs_princp_beg',
    'outs_princp_end',
]

result = {name: [] for name in p_output_order}

plat = 'P'
for ids, group in id_grouped:
    # Invested amounts are keyed by the listing number (as a string).
    list_number = str(group['ListingNumber'].values[0])
    invest_amt = plat_dict[plat][list_number]
    # Funded amount = beginning-of-month principal of the loan's first row.
    funded = group['BOMPrin'].values[0]
    # Computed once per loan instead of once per column.
    scale = invest_amt / funded
    n_rows = len(group)

    result['listing'].extend([list_number] * n_rows)
    result['amt'].extend([invest_amt] * n_rows)
    for out_col, src_col in p_passthrough_cols:
        result[out_col].extend(group[src_col].values)
    for out_col, src_col in p_scaled_cols:
        result[out_col].extend(group[src_col].values * scale)

df = pd.DataFrame.from_dict(result)
df['date'] = pd.to_datetime(df['date'])
df['issue_d'] = pd.to_datetime(df['issue_d'])

sorted_df = df.sort_values(['id', 'date'])

In [None]:
# Display the scaled Prosper frame.
sorted_df

In [None]:
# From here on ``scaled`` refers to the Prosper frame.
scaled = sorted_df

In [None]:
# LoanID -> Term / BorrowerRate, read from each loan's first row.
term_mapper = plat_selected_dfs['P'].drop_duplicates('LoanID').set_index('LoanID')['Term'].to_dict()
rate_mapper = plat_selected_dfs['P'].drop_duplicates('LoanID').set_index('LoanID')['BorrowerRate'].to_dict()

In [None]:
# NOTE(review): ``date_mapper`` is not rebuilt in this section; dates that
# are not keys of the existing mapper become NaN here.
scaled['issue_d_mapped'] = scaled['issue_d'].map(date_mapper)
scaled['date_mapped'] = scaled['date'].map(date_mapper)

In [None]:
# Group once by loan id; the per-loan loops in later cells iterate this.
id_scaled_grouped = scaled.groupby('id')

In [None]:
scaled.head()

In [None]:
# Build one summary record per Prosper loan: scalar attributes plus per-month
# arrays (outstandings, days late, gross / net payments, net recoveries).
combined_dict = {}
for ids, group in tqdm_notebook(id_scaled_grouped):
    sub_dict = {}
#     sub_dict['Grade'] = group['grade'].values[0]
    sub_dict['Month of Issuance'] = group['issue_d_mapped'].min()
    sub_dict['Month of Last Record'] = group['date_mapped'].max()
    sub_dict['Month of First Record'] = group['date_mapped'].min()

    # Latest month with a non-zero payment. The Prosper frame has no
    # 'pmt_amt_received' column (payments are in 'net_norm_payments'); the
    # old lookup raised KeyError inside a bare ``except`` and so always
    # produced NaN.
    paid_dates = group['date'].values[group['net_norm_payments'].values != 0]
    if len(paid_dates):
        sub_dict['Month of Last Payment'] = date_mapper.get(
            pd.to_datetime(paid_dates.max()), np.nan)
    else:
        # if no payment was ever made
        sub_dict['Month of Last Payment'] = np.nan

    sub_dict['Amount'] = group['amt'].values[0]
    sub_dict['Term'] = term_mapper[ids]
    sub_dict['Rate'] = rate_mapper[ids]
#     sub_dict['Status'] = last_status_mapper[ids]

    # In months with neither a payment nor a recovery the accrued interest is
    # still owed.
    nothing_received = (group['net_norm_payments'] == 0) & (group['recov_payments'] == 0)
    outstandings = np.where(nothing_received,
                            group['outs_princp_end'] + group['acc_int'],
                            group['outs_princp_end'])
    sub_dict['Outstandings'] = outstandings
    sub_dict['Days Late'] = group['days_late'].values
#     sub_dict['Status Adjusted Value'] = group['stat_adj_value'].values

    sub_dict['Gross Payments'] = group['late_fees'].values + group['princp_paid'].values + group['int_paid'].values
    sub_dict['Net Payments'] = group['net_norm_payments'].values

    # Placeholders kept so the record has the same keys as the LC one. The
    # key was previously misspelled 'Month of Recov_teimstamp'.
    sub_dict['Month of Recov_timestamp'] = 0
    sub_dict['Month of Recov'] = 0
    # Per-month net recoveries: recovery payments plus debt-sale proceeds.
    sub_dict['Net Recov'] = group['recov_payments'].values + group['recovs_sale'].values

    combined_dict[ids] = sub_dict

In [None]:
# Spot-check the record built for one loan.
combined_dict['185490']

In [None]:
# Turn all the per-month NumPy arrays into plain Python lists.
# Each record is shallow-copied first, so combined_dict keeps its arrays and
# this cell can be re-run (a list has no .tolist()).
array_fields = [
    'Outstandings',
    'Days Late',
    # 'Status Adjusted Value',  # currently disabled
    'Net Payments',
    'Gross Payments',
    'Net Recov',
]
fixed_dict = {}
for ids, sub_dict in combined_dict.items():  # .items() exists on Python 2 and 3
    record = dict(sub_dict)
    for field in array_fields:
        record[field] = record[field].tolist()
    fixed_dict[ids] = record

In [None]:
# Fix sequences of dates that are missing some months by filling with 0s:
# every month between issuance and the last recorded month that has no row
# gets a 0 inserted into each per-month list of that loan.
padded_fields = ['Outstandings', 'Days Late', 'Net Payments', 'Gross Payments', 'Net Recov']
fix_counter = 0  # number of loans that needed at least one month inserted
for ids, group in tqdm_notebook(id_scaled_grouped):
    recorded_months = group['date_mapped'].values
    expected_months = np.arange(group['issue_d_mapped'].min(), group['date_mapped'].max()+1)

    # position each expected month must occupy in the per-month lists
    month_to_index_dict = {month: position
                           for position, month in enumerate(expected_months)}

    # Sorted ascending on purpose: list.insert() shifts everything to its
    # right, so gaps have to be filled left-to-right for the later positions
    # to remain valid. Iterating the raw set gives no ordering guarantee.
    # The symmetric difference is kept so that a recorded month outside the
    # expected range still fails loudly (KeyError below) rather than being
    # silently ignored.
    indicies_to_insert = sorted(set(expected_months) ^ set(recorded_months))
    if indicies_to_insert:
        fix_counter += 1
        for month in indicies_to_insert:
            index = month_to_index_dict[month]
            for field in padded_fields:
                fixed_dict[ids][field].insert(index, 0)

In [None]:
# Notebook inspection: show the list-based record stored under loan id '185490'.
fixed_dict['185490']

In [None]:
# Align every loan's per-month lists to one common timeline (month 0 through
# date_mapper[last_date]) by zero-padding before the issuance month and after
# the last recorded month, then collect the padded records in to_df.
all_months = np.arange(date_mapper[last_date] + 1)
n_all_months = len(all_months)
final_month = all_months.max()
monthly_fields = ['Outstandings', 'Days Late', 'Gross Payments', 'Net Payments', 'Net Recov']
to_df = {}
for ids, sub_dict in tqdm_notebook(fixed_dict.items()):  # .items(): Python 2 and 3
    # shallow copy: scalar fields pass through, per-month lists are replaced below
    ids_dict = dict(sub_dict)

    # every per-month list must cover the same months before padding
    n_recorded = len(sub_dict['Outstandings'])
    for field in monthly_fields:
        assert len(sub_dict[field]) == n_recorded, (
            field, len(sub_dict[field]), n_recorded, ids)

    # adding padding before and after the lists as necessary
    pad_before = [0] * int(sub_dict['Month of Issuance'])
    pad_after = [0] * int(final_month - sub_dict['Month of Last Record'])

    for field in monthly_fields:
        padded = pad_before + sub_dict[field] + pad_after
        # after padding, each list must span the whole common timeline
        assert len(padded) == n_all_months, (
            field, len(padded), n_all_months, ids)
        ids_dict[field] = padded
    to_df[ids] = ids_dict

In [None]:
# Build the export frame from to_df; after the transpose each row corresponds
# to one key of to_df (a loan id) and each column to one record field.
to_export_P = pd.DataFrame.from_dict(to_df).T

In [None]:
# Tag every row of to_export with the platform code 'LC'.
# NOTE(review): to_export is not assigned in this part of the notebook -
# confirm it is still in memory from an earlier run.
to_export['plat'] = 'LC'

In [None]:
# Tag every row of to_export_P with the platform code 'P'.
to_export_P['plat'] = 'P'

In [None]:
# Stack the two platform frames row-wise into a single export frame.
combined_export = pd.concat([to_export, to_export_P])

# This next bit is just for Stephen because he wanted the payment history for this guy Charles Guest

In [None]:
# Load the two loan lists (one CSV per platform) from hard-coded local paths.
guest_lc = pd.read_csv('/Users/justinhsi/Downloads/charles_guest_loans_lc.csv')
guest_p = pd.read_csv('/Users/justinhsi/Downloads/charles_guest_loans_p.csv')

In [None]:
# Keep the 'LC' rows whose 'id' appears in guest_lc['loan_id'] (compared as strings).
lc_loans = plat_selected_dfs['LC'][plat_selected_dfs['LC']['id'].isin(guest_lc['loan_id'].astype(str))]

In [None]:
# Write the selected LC rows to a CSV at a hard-coded local path.
lc_loans.to_csv('/Users/justinhsi/Downloads/charles_lc_pmts.csv')

In [None]:
# Keep the 'P' rows whose 'LoanID' appears in guest_p['loan_id'] (compared as strings).
p_loans = plat_selected_dfs['P'][plat_selected_dfs['P']['LoanID'].isin(guest_p['loan_id'].astype(str))]

In [None]:
# Notebook inspection: show the selected 'P' rows.
p_loans

In [None]:
# Persist the combined export frame as a pickle at a hard-coded local path.
combined_export.to_pickle('/Users/justinhsi/LRData/lendingclub/emmanuel_pmts_format.pkl')

In [None]:
#185490 , 185266
# Length of the padded 'Net Recov' list for one loan.
# .loc replaces DataFrame.ix (removed in pandas 1.0); both select by row label
# and column label here.
len(to_export_P.loc['185490','Net Recov'])

In [None]:
# Payments Received is net of everything but recoveries. It includes principal, interest, and late fees (good for investor)
# less collection fees and service fees (both bad for investor).
# Recovery Payments are payments; assume no fees are associated with them.
# NetCashToInvestorsFromDebtSale is different from Recovery Payments.


In [None]:
# Pull every row of a single 'P' loan (LoanID '607186') for manual inspection.
examine = p_loans[p_loans['LoanID'] == '607186']
# p_loans[p_loans['NetCashToInvestorsFromDebtSale']==1175.89]

In [None]:
# Notebook inspection: rows where 'RecoveryPayments' differs from 'RecoveryPrin'.
p_loans[p_loans['RecoveryPayments'] != p_loans['RecoveryPrin']]

In [None]:
# Notebook inspection: show the rows currently held in `examine`.
examine

In [None]:
# Scratch arithmetic on hand-typed amounts.
# NOTE(review): presumably a reconciliation of figures from the loan shown
# above - the source of each number is not recorded here.
644.03 + 410.83 + 52.74 - 25.66 - 188.29

In [None]:
# Level payment for 36 periods on a present value of 6000 at a per-period
# rate of .2248/12.
# NOTE(review): np.pmt was deprecated in NumPy 1.18 and removed in 1.20; on a
# current NumPy this line raises AttributeError.
np.pmt(.224800/12, 36, -6000)

# Back to justin stuff

In [None]:
# Notebook inspection: show the `scaled` frame.
scaled

In [None]:
# # adding the actual fully combined stat_adjusted_value column, which is stat_adj_outs_princp if the loan
# # made a payment, or stat_adj_outs_princp + stat_adj_acc_int if no payment was received.

# scaled['stat_adj_value'] = np.where(
#     scaled['pmt_amt_received'] >= scaled['m_amt_due'],
#     scaled['stat_adj_outs_princp_end'],
#     scaled['stat_adj_outs_princp_end'] +
#     scaled['stat_adj_acc_int'])

# Investigate fake loans

In [None]:
def fresh_prng(int):
    """Return a brand-new ``np.random.RandomState`` seeded with ``int``.

    The parameter name shadows the builtin ``int`` inside this function; it is
    left as-is so that existing callers are unaffected.
    """
    return np.random.RandomState(seed=int)


def pmt_amt_received(int_rate, term, loan_amt):
    """Return the scheduled payment stream of a fully amortizing loan.

    Parameters
    ----------
    int_rate : annual interest rate as a decimal (divided by 12 below).
    term : number of monthly installments (an integer).
    loan_amt : amount borrowed.

    Returns
    -------
    pd.Series of length ``term + 1``: a 0 in position 0 (no payment in the
    first row) followed by ``term`` copies of the level monthly installment.

    The installment is computed with the closed-form annuity formula instead
    of ``np.pmt``, which was removed from NumPy in version 1.20.
    """
    if term <= 0:
        # no installments to schedule; avoids a 0/0 in the formula below
        return pd.Series([0])

    m_int_rate = int_rate / 12
    if m_int_rate == 0:
        # without interest the principal is simply split evenly
        m_installment = float(loan_amt) / term
    else:
        # same arrangement of terms as np.pmt(rate, nper, -pv) with fv=0
        growth = (1 + m_int_rate) ** term
        m_installment = loan_amt * growth / ((growth - 1) / m_int_rate)
    return pd.Series([0] + [m_installment] * term)


def rem_outs_princp(int_rate, term, loan_amt):
    """Return the remaining principal after each scheduled payment.

    The first entry is ``loan_amt`` itself; every following entry is the
    previous balance grown by one month of interest (``int_rate / 12``) minus
    that month's installment from ``pmt_amt_received``. The result is a
    pd.Series with one entry per entry of the payment schedule.
    """
    monthly_rate = int_rate / 12
    scheduled = pmt_amt_received(int_rate, term, loan_amt)

    balance = loan_amt
    balances = [balance]
    # skip position 0 of the schedule: it is the no-payment first row
    for installment in scheduled.iloc[1:]:
        balance = balance * (1 + monthly_rate) - installment
        balances.append(balance)
    return pd.Series(balances)


def gross_charged_off_recovs(term):
    """Return a pd.Series of ``term + 1`` zeros (no recoveries in any row)."""
    n_rows = term + 1
    return pd.Series([0 for _ in range(n_rows)])


def charged_off_recovs_collection_fees(term):
    """Return a pd.Series of ``term + 1`` zeros (no collection fees in any row)."""
    n_rows = term + 1
    return pd.Series([0 for _ in range(n_rows)])


def m_on_books(term):
    """Return a pd.Series counting 0, 1, ..., ``term`` (one entry per row)."""
    month_numbers = np.arange(0, term + 1)
    return pd.Series(month_numbers)


def set_loan_counter():
    """Reset the module-level ``loan_counter`` to 0, creating it if needed."""
    globals()['loan_counter'] = 0


def increment_loan_counter():
    """Add 1 to the module-level ``loan_counter``.

    Raises NameError if ``loan_counter`` has not been set yet.
    """
    global loan_counter
    loan_counter = loan_counter + 1

def make_late_loan_history(int_rate, term, loan_amt):
    """Placeholder that is not implemented yet; ignores its arguments and returns None."""
    return None
    
def make_loan_history(int_rate, term, loan_amt):
    """Build the row-per-month history of one loan that pays exactly on schedule.

    The returned DataFrame has ``term + 1`` rows and the columns
    pmt_amt_received, rem_outs_princp_end, gross_charged_off_recovs,
    charged_off_recovs_collection_fees, m_on_books, id, pmt_to_int, acc_int
    and pmt_to_princp. ``id`` is the current value of the module-level
    ``loan_counter``, which is incremented before returning.
    """
    monthly_rate = int_rate / 12
    payments = pmt_amt_received(int_rate, term, loan_amt)
    end_balances = rem_outs_princp(int_rate, term, loan_amt)
    no_payment = payments == 0

    # interest for a row is charged on the previous row's ending balance;
    # rows without a payment get 0
    interest = np.where(no_payment, 0, end_balances.shift(1) * monthly_rate)

    history = {}
    history['pmt_amt_received'] = payments
    history['rem_outs_princp_end'] = end_balances
    history['gross_charged_off_recovs'] = gross_charged_off_recovs(term)
    history['charged_off_recovs_collection_fees'] = (
        charged_off_recovs_collection_fees(term))
    history['m_on_books'] = m_on_books(term)
    history['id'] = loan_counter
    history['pmt_to_int'] = interest
    # accrued interest equals the interest paid for an on-schedule loan
    history['acc_int'] = interest
    history['pmt_to_princp'] = np.where(no_payment, 0, payments - interest)

    increment_loan_counter()
    return pd.DataFrame.from_dict(history)


def add_rem_outs_princp_beg(loan_history):
    """Add 'rem_outs_princp_beg' as the previous row's 'rem_outs_princp_end'.

    The frame is modified in place and also returned; the first row of the
    new column is NaN because it has no previous row.
    """
    previous_end = loan_history['rem_outs_princp_end'].shift(1)
    loan_history['rem_outs_princp_beg'] = previous_end
    return loan_history


def make_fake_pmt_history(loan,
                          start_date,
                          m_worth_of_pmts,
                          seed_number,
                          random_loans_per_month=True,
                          growing_loan_amounts=False):
    """Generate a synthetic payment history made of identical on-schedule loans.

    Parameters
    ----------
    loan : mapping with the keys 'int_rate', 'term' and 'loan_amt'.
    start_date : first origination month (anything pd.Timestamp accepts).
    m_worth_of_pmts : number of consecutive months in which loans originate.
    seed_number : seed for the RandomState used to draw loan counts.
    random_loans_per_month : when equal to True the base count per month is
        drawn with ``randint(2, 20)``; otherwise it is 20 every month.
    growing_loan_amounts : when truthy the base count of month k is multiplied
        by ``(1 + .3/12) ** k``. Despite the name, it scales the number of
        loans, not their size.

    Returns
    -------
    DataFrame with the concatenated histories of all loans plus a 'date'
    column (month-start dates beginning at each loan's origination month).
    The module-level ``loan_counter`` is reset to 0 at the start.
    """
    set_loan_counter()
    first_month = pd.Timestamp(start_date)
    int_rate, term, loan_amt = loan['int_rate'], loan['term'], loan['loan_amt']
    prng = fresh_prng(seed_number)

    if growing_loan_amounts:
        multiplier = np.power([1+.3/12] * m_worth_of_pmts,
                              np.arange(m_worth_of_pmts))
    else:
        multiplier = np.array([1] * m_worth_of_pmts)

    if random_loans_per_month == True:
        base_counts = prng.randint(2, 20, m_worth_of_pmts)
    else:
        base_counts = [20] * (m_worth_of_pmts)
    loans_per_month = np.multiply(np.array(base_counts), multiplier)

    monthly_segments = []
    for month_offset, n_loans in enumerate(loans_per_month):
        s_date = first_month + pd.DateOffset(months=+month_offset)
        # every loan originated this month shares the same date sequence
        loan_dates = pd.date_range(s_date, periods=term + 1, freq='MS').tolist()

        histories = []
        date_col = []
        # n_loans may be fractional when growth is on; np.arange then yields
        # ceil(n_loans) iterations
        for _ in np.arange(n_loans):
            histories.append(
                add_rem_outs_princp_beg(
                    make_loan_history(int_rate, term, loan_amt)))
            date_col.extend(loan_dates)

        segment = pd.concat(histories)
        segment['date'] = date_col
        monthly_segments.append(segment)
    return pd.concat(monthly_segments)


In [None]:
# Prototype loan used to generate the synthetic payment history.
start_date = pd.Timestamp('2010-01-01')
loan={}
loan['int_rate'] = .10
loan['term'] = 36
loan['loan_amt'] = 100
# Monthly rate = annual rate / 12. The cells below pass m_int_rate as a
# per-period rate and recover the annual rate as m_int_rate*12, so it must
# not hold the annual rate itself.
m_int_rate = loan['int_rate'] / 12
term = loan['term']
loan_amt = loan['loan_amt']

# 80 months of data, 20 loans per month with growth rate.

In [None]:
# Synthetic history: 80 origination months, seed 42, fixed (non-random) base
# count per month, with the monthly growth multiplier switched on.
test_history = make_fake_pmt_history(
    loan,
    start_date,
    m_worth_of_pmts=80,
    seed_number=42,
    random_loans_per_month=False,
    growing_loan_amounts=True)
# quick rename
test_history.rename(
    columns={
        'pmt_to_int': 'gross_pmt_to_int',
        'pmt_to_princp': 'gross_pmt_to_princp',
        'rem_outs_princp_end': 'outs_princp_end',
        'rem_outs_princp_beg': 'outs_princp_beg',
    },
    inplace=True)

In [None]:
# Add the remaining columns with constant / copied values describing loans
# that are always current and never charged off.
test_history['status'] = 'current'
test_history['stat_adj_outs_princp_end'] = test_history['outs_princp_end']
test_history['charged_off'] = 0
# Level installment from the closed-form annuity formula; replaces np.pmt,
# which was removed from NumPy in version 1.20. A zero rate splits the
# principal evenly.
test_history['m_amt_due'] = (
    loan_amt * m_int_rate / (1 - (1 + m_int_rate) ** -term)
    if m_int_rate != 0 else float(loan_amt) / term)
test_history['stat_adj_value'] = test_history['stat_adj_outs_princp_end']
test_history['rate'] = m_int_rate*12
test_history['service_fees'] = test_history['pmt_amt_received']*.01
test_history['stat_adj_acc_int'] = test_history['acc_int']
test_history['charged_off_amt'] = 0
# NOTE(review): every loan gets the same issue date even though the generator
# originates loans in successive months - confirm this is intended.
test_history['issue_d'] = start_date

In [None]:
def summing_std_pmt_history(std_pmt_hist, test = False):
    """Aggregate a loan-level payment history into one row per ``date``.

    Parameters
    ----------
    std_pmt_hist : DataFrame with one row per loan per date. Columns read:
        id, date, issue_d, pmt_amt_received, outs_princp_end, outs_princp_beg,
        gross_pmt_to_princp, gross_pmt_to_int, acc_int, stat_adj_acc_int,
        stat_adj_outs_princp_end, stat_adj_value, gross_charged_off_recovs,
        charged_off_recovs_collection_fees, service_fees.
    test : selects the funded amount charged for a loan the first time its id
        is seen: ``False`` -> ``plat_dict[plat][loan_id]`` and ``'all'`` ->
        ``lc_funded_dict[loan_id]`` (both module-level globals); any other
        value -> a flat 100 per loan.

    Returns
    -------
    DataFrame indexed by date holding the per-date sums, the number of newly
    seen loans, ``new_funds_needed``, ``new_investor_cash`` (funds needed in
    excess of payments received, floored at 0) and the two
    ``*_no_newly_funded`` columns (value minus ``new_funds_needed``).

    Raises
    ------
    ValueError if ``std_pmt_hist`` yields no date groups.
    """
    # columns whose per-date total is simply the sum over all loans
    plain_sum_cols = [
        'outs_princp_end', 'gross_pmt_to_princp', 'gross_pmt_to_int',
        'acc_int', 'stat_adj_acc_int', 'stat_adj_outs_princp_end',
        'stat_adj_value', 'pmt_amt_received', 'gross_charged_off_recovs',
        'charged_off_recovs_collection_fees', 'service_fees',
    ]
    result = {}
    seen_id_set = set()

    for date, group in std_pmt_hist.groupby('date'):
        newly_funded_ids = group[~group['id'].isin(seen_id_set)]['id'].unique()
        seen_id_set.update(group['id'].unique())

        new_funds_needed = 0
        for new_ids in newly_funded_ids:
            if test == False:
                new_funds_needed += plat_dict[plat][new_ids]
            elif test == 'all':
                new_funds_needed += lc_funded_dict[new_ids]
            else:
                new_funds_needed += 100

        # split group into 2 groups, those that don't pay in origination month
        # and those that do, then split up work and combine in sums
        pay_at_ori_group = group[(group['date'] == group['issue_d']) &
                                 (group['pmt_amt_received'] > 0)]
        other_group = group[~group['id'].isin(pay_at_ori_group['id'].values)]
        assert len(group) == (len(pay_at_ori_group) + len(other_group))

        totals = {col: other_group[col].sum() + pay_at_ori_group[col].sum()
                  for col in plain_sum_cols}
        # For loans paying in their origination month the beginning principal
        # is rebuilt as ending principal + principal paid this month instead
        # of being read from their outs_princp_beg column.
        outs_princp_beg = other_group['outs_princp_beg'].sum() + (
            pay_at_ori_group['outs_princp_end'].sum() +
            pay_at_ori_group['gross_pmt_to_princp'].sum())

        sub_dict = {}
        sub_dict['outs_princp_end'] = totals['outs_princp_end']
        sub_dict['outs_princp_beg'] = outs_princp_beg
        sub_dict['pmt_amt_received'] = totals['pmt_amt_received']
        sub_dict['gross_pmt_to_princp'] = totals['gross_pmt_to_princp']
        sub_dict['gross_pmt_to_int'] = totals['gross_pmt_to_int']
        sub_dict['acc_int'] = totals['acc_int']
        sub_dict['gross_charged_off_recovs'] = totals['gross_charged_off_recovs']
        sub_dict['charged_off_recovs_collection_fees'] = totals[
            'charged_off_recovs_collection_fees']
        sub_dict['service_fees'] = totals['service_fees']
        sub_dict['n_newly_funded_loans'] = len(newly_funded_ids)
        sub_dict['new_funds_needed'] = new_funds_needed
        sub_dict['new_investor_cash'] = max(
            0, new_funds_needed - totals['pmt_amt_received'])
        sub_dict['stat_adj_acc_int'] = totals['stat_adj_acc_int']
        sub_dict['stat_adj_outs_princp_end'] = totals['stat_adj_outs_princp_end']
        sub_dict['stat_adj_value'] = totals['stat_adj_value']

        result[date] = sub_dict

    if not result:
        raise ValueError('std_pmt_hist has no rows to aggregate')

    # Build the frame once, after all dates are collected (it used to be
    # rebuilt on every loop iteration).
    df = pd.DataFrame.from_dict(result).T
    df['stat_adj_outs_princp_end_no_newly_funded'] = df['stat_adj_outs_princp_end'] - df['new_funds_needed']
    df['stat_adj_value_no_newly_funded'] = df['stat_adj_value'] - df['new_funds_needed']
    return df

def compute_unit_values(beg_unit_value, summed_pmt_hist, LR_fee = True):
    """Run the period-by-period unit accounting over a summed payment history.

    Parameters
    ----------
    beg_unit_value : unit value assigned to the first row.
    summed_pmt_hist : DataFrame with one row per period, in chronological
        order. Columns read: new_investor_cash (first row only),
        pmt_amt_received, service_fees, gross_charged_off_recovs,
        charged_off_recovs_collection_fees, new_funds_needed,
        stat_adj_value_no_newly_funded and outs_princp_end. The input is not
        modified; NaNs are treated as 0.
    LR_fee : when equal to True a fee of ``.0045 / 12`` of each period's
        beginning value is charged from the second row on; otherwise no fee.

    Returns
    -------
    A copy of the input with the accounting columns added: beg_unit_value,
    new_units_created, units_beg, units_end, beg_value, total_cash,
    total_note_value, LR_fee, beg_value_netfee, end_unit_value,
    dragging_cash_netfee_from_this_period, prev_undeployed_cash, return,
    ann_return, outs_princp_end_no_newly_funded, dollar_discount and
    dollar_discount_pct. ``new_investor_cash`` is recomputed for every row
    after the first.

    Rows are always addressed by position. The removed ``DataFrame.ix``
    indexer used previously was label-based on integer indexes and positional
    otherwise. The computed columns are numeric (they used to be object dtype
    because of '?' placeholders).
    """
    summed_pmt_hist = summed_pmt_hist.fillna(0)
    # only values equal to True switch the fee on, as before
    monthly_fee_rate = (.0045 / 12) if LR_fee == True else 0

    records = []
    for i in range(len(summed_pmt_hist)):
        row = summed_pmt_hist.iloc[i]
        rec = {}
        if i == 0:
            # first period: units are created at the given starting unit value
            rec['beg_unit_value'] = beg_unit_value
            rec['units_beg'] = 0
            rec['new_investor_cash'] = row['new_investor_cash']
            rec['new_units_created'] = max(
                0, rec['new_investor_cash'] / rec['beg_unit_value'])
            rec['units_end'] = rec['units_beg'] + rec['new_units_created']
            rec['end_unit_value'] = beg_unit_value
            rec['dragging_cash_netfee_from_this_period'] = 0
            rec['prev_undeployed_cash'] = 0
            rec['beg_value'] = rec['units_end'] * rec['beg_unit_value']
            rec['beg_value_netfee'] = rec['units_end'] * rec['beg_unit_value']
            rec['total_cash'] = 0
            rec['total_note_value'] = rec['new_investor_cash']
            rec['LR_fee'] = 0
        else:
            prev = records[-1]
            rec['prev_undeployed_cash'] = prev[
                'dragging_cash_netfee_from_this_period']
            rec['units_beg'] = prev['units_end']

            # All cash: this period's net receipts plus idle cash carried over
            cash = (
                row['pmt_amt_received'] -
                row['service_fees'] +
                row['gross_charged_off_recovs'] -
                row['charged_off_recovs_collection_fees'] +
                rec['prev_undeployed_cash']
            )
            rec['total_cash'] = cash

            # Note value uses the status-adjusted (discounted) value and
            # excludes loans funded in this period.
            note_value = row['stat_adj_value_no_newly_funded']
            rec['total_note_value'] = note_value

            rec['beg_value'] = cash + note_value
            rec['LR_fee'] = rec['beg_value'] * monthly_fee_rate
            rec['beg_value_netfee'] = rec['beg_value'] - rec['LR_fee']
            rec['beg_unit_value'] = rec['beg_value_netfee'] / rec['units_beg']

            # new money is only needed for what cash on hand cannot fund
            cash_after_fee = cash - rec['LR_fee']
            rec['new_investor_cash'] = max(
                0, row['new_funds_needed'] - cash_after_fee)
            rec['new_units_created'] = (
                rec['new_investor_cash'] / rec['beg_unit_value'])
            rec['units_end'] = rec['units_beg'] + rec['new_units_created']
            # cash left over after funding this period's loans
            rec['dragging_cash_netfee_from_this_period'] = max(
                0, cash_after_fee - row['new_funds_needed'])
            # This SHOULD match beg_unit_value but it is off by a few cents????
            rec['end_unit_value'] = (
                rec['beg_value_netfee'] + rec['new_investor_cash']
            ) / rec['units_end']
        records.append(rec)

    # same column layout as before: placeholder columns first, then the rest
    column_order = [
        'beg_unit_value', 'new_units_created', 'units_beg', 'units_end',
        'beg_value', 'total_cash', 'total_note_value', 'LR_fee',
        'beg_value_netfee', 'new_investor_cash', 'end_unit_value',
        'dragging_cash_netfee_from_this_period', 'prev_undeployed_cash',
    ]
    for col in column_order:
        summed_pmt_hist[col] = [rec[col] for rec in records]

    # adding return
    unit_value_ratio = (summed_pmt_hist['end_unit_value'] /
                        summed_pmt_hist['end_unit_value'].shift(1))
    summed_pmt_hist['return'] = unit_value_ratio - 1
    summed_pmt_hist['ann_return'] = unit_value_ratio**12 - 1

    # adding pct dollar discount
    summed_pmt_hist['outs_princp_end_no_newly_funded'] = summed_pmt_hist['outs_princp_end'] - summed_pmt_hist['new_funds_needed']
    summed_pmt_hist['dollar_discount'] = summed_pmt_hist['outs_princp_end_no_newly_funded'] - summed_pmt_hist['total_note_value']
    summed_pmt_hist['dollar_discount_pct'] = summed_pmt_hist['dollar_discount']/summed_pmt_hist['outs_princp_end_no_newly_funded']

    return summed_pmt_hist

In [None]:
# Sum the synthetic history per date, then run the unit accounting twice:
# once with the fee and once without, so the two can be compared.
fake_summed_pmt_hist = summing_std_pmt_history(test_history, test=True)
fake_results_with_fee = compute_unit_values(100, fake_summed_pmt_hist)
# LR_fee=False is what makes this the no-fee run; without it both results
# were identical.
fake_results_no_fee = compute_unit_values(100, fake_summed_pmt_hist, LR_fee=False)

In [None]:
# Plot the 'ann_return' column of both runs on one chart for comparison.
plt.figure(figsize=(8,6))
plt.plot(fake_results_no_fee.index, fake_results_no_fee['ann_return'], label='no_fee')
plt.plot(fake_results_with_fee.index, fake_results_with_fee['ann_return'], label='with_fee')
plt.legend(loc='best')
plt.show()

In [None]:
# Plot fee, cash, note value and carried-over cash of the with-fee run.
fake_results_with_fee[['LR_fee', 'total_cash', 'total_note_value', 'dragging_cash_netfee_from_this_period']].plot()

In [None]:
# Notebook inspection: show the synthetic payment history.
test_history

In [None]:
# Notebook inspection: list the column names of the synthetic history.
test_history.columns.values

In [None]:
# Notebook inspection: first rows of the `scaled` frame.
scaled.head()

In [None]:
def combined_compute_unit_values(beg_unit_value, std_pmt_hist, test=False):
    """Sum a loan-level history per period and run the unit accounting on it.

    Parameters
    ----------
    beg_unit_value : unit value assigned to the first period.
    std_pmt_hist : DataFrame with one row per loan per period. Columns read:
        id, pmt_amt_received, stat_adj_value, gross_charged_off_recovs,
        charged_off_recovs_collection_fees, service_fees, plus the period and
        issue columns: 'date_mapped' / 'issue_d_mapped' when ``test == False``,
        'date' / 'issue_d' otherwise.
    test : ``False`` -> funded amounts come from ``plat_dict[plat][loan_id]``
        and a value_counts of 'issue_d_mapped' is printed for every period;
        ``'all'`` -> funded amounts come from ``lc_funded_dict[loan_id]``;
        any other value -> a flat 100 per newly seen loan. ``plat_dict``,
        ``plat`` and ``lc_funded_dict`` are module-level globals.

    Returns
    -------
    DataFrame with one row per period: the summed inputs,
    stat_adj_value_no_newly_funded, the accounting columns
    (prev_dragging_cash, beg_value, LR_fee, beg_value_netfee, units_beg,
    beg_unit_value, new_investor_cash, new_units_created, units_end,
    end_unit_value, dragging_cash) and return / ann_return. A fee of
    ``.0045/12`` of each period's beginning value is always charged.

    Raises
    ------
    ValueError if ``std_pmt_hist`` yields no period groups.

    Rows are always addressed by position. The removed ``DataFrame.ix``
    indexer used previously looked rows up by label on the integer
    'date_mapped' index, which only matched positions when the period numbers
    were exactly 0..n-1.
    """
    if test == False:
        date_col, issue_col = 'date_mapped', 'issue_d_mapped'
    else:
        date_col, issue_col = 'date', 'issue_d'

    summed_cols = [
        'stat_adj_value', 'pmt_amt_received', 'gross_charged_off_recovs',
        'charged_off_recovs_collection_fees', 'service_fees',
    ]
    result = {}
    seen_id_set = set()
    for date, group in std_pmt_hist.groupby(date_col):
        if test == False:
            # diagnostic output kept from the original implementation
            print(group['issue_d_mapped'].value_counts(dropna=False))
        newly_funded_ids = group[~group['id'].isin(seen_id_set)]['id'].unique()
        seen_id_set.update(group['id'].unique())

        new_funds_needed = 0
        for new_ids in newly_funded_ids:
            if test == False:
                new_funds_needed += plat_dict[plat][new_ids]
            elif test == 'all':
                new_funds_needed += lc_funded_dict[new_ids]
            else:
                new_funds_needed += 100

        # split group into 2 groups, those that don't pay in origination month
        # and those that do, then split up work and combine in sums
        pay_at_ori_group = group[(group[date_col] == group[issue_col]) &
                                 (group['pmt_amt_received'] > 0)]
        other_group = group[~group['id'].isin(pay_at_ori_group['id'].values)]
        assert len(group) == (len(pay_at_ori_group) + len(other_group))

        sub_dict = {col: other_group[col].sum() + pay_at_ori_group[col].sum()
                    for col in summed_cols}
        sub_dict['n_newly_funded_loans'] = len(newly_funded_ids)
        sub_dict['new_funds_needed'] = new_funds_needed
        result[date] = sub_dict

    if not result:
        raise ValueError('std_pmt_hist has no rows to aggregate')

    df = pd.DataFrame.from_dict(result).T
    df['stat_adj_value_no_newly_funded'] = df['stat_adj_value'] - df[
        'new_funds_needed']

    # iterate row after row to fill in the accounting columns
    records = []
    for i in range(len(df)):
        row = df.iloc[i]
        rec = {}
        if i == 0:
            rec['prev_dragging_cash'] = 0
            rec['beg_value'] = (row['stat_adj_value_no_newly_funded'] +
                                rec['prev_dragging_cash'])
            rec['LR_fee'] = rec['beg_value'] * .0045/12
            rec['beg_value_netfee'] = (rec['beg_value'] -
                                       row['service_fees'] - rec['LR_fee'])
            rec['units_beg'] = 0
            rec['beg_unit_value'] = beg_unit_value
            # everything funded in the first period is new investor money
            rec['new_investor_cash'] = row['new_funds_needed']
            rec['new_units_created'] = (
                rec['new_investor_cash'] / rec['beg_unit_value'])
            rec['units_end'] = rec['new_units_created'] + rec['units_beg']
            rec['end_unit_value'] = (
                rec['beg_value_netfee'] + rec['new_investor_cash']
            ) / rec['units_end']
            rec['dragging_cash'] = 0
        else:
            prev = records[-1]
            rec['prev_dragging_cash'] = prev['dragging_cash']
            rec['beg_value'] = (row['stat_adj_value_no_newly_funded'] +
                                row['pmt_amt_received'] -
                                row['service_fees'] +
                                row['gross_charged_off_recovs'] -
                                row['charged_off_recovs_collection_fees'] +
                                rec['prev_dragging_cash'])
            rec['LR_fee'] = rec['beg_value'] * .0045/12
            rec['beg_value_netfee'] = rec['beg_value'] - rec['LR_fee']
            rec['units_beg'] = prev['units_end']
            rec['beg_unit_value'] = rec['beg_value_netfee']/rec['units_beg']

            # cash on hand this period: net receipts + carried-over cash - fee
            cash_after_fee = (row['pmt_amt_received'] -
                              row['service_fees'] +
                              row['gross_charged_off_recovs'] -
                              row['charged_off_recovs_collection_fees'] +
                              rec['prev_dragging_cash'] -
                              rec['LR_fee'])
            rec['new_investor_cash'] = max(
                0, row['new_funds_needed'] - cash_after_fee)
            rec['new_units_created'] = (
                rec['new_investor_cash'] / rec['beg_unit_value'])
            rec['units_end'] = rec['new_units_created'] + rec['units_beg']
            rec['end_unit_value'] = (
                rec['beg_value_netfee'] + rec['new_investor_cash']
            ) / rec['units_end']
            # cash left over after funding this period's new loans
            rec['dragging_cash'] = max(
                0, cash_after_fee - row['new_funds_needed'])
        records.append(rec)

    # same column layout as before
    column_order = [
        'prev_dragging_cash', 'beg_value', 'LR_fee', 'beg_value_netfee',
        'units_beg', 'beg_unit_value', 'new_investor_cash',
        'new_units_created', 'units_end', 'end_unit_value', 'dragging_cash',
    ]
    for col in column_order:
        df[col] = [rec[col] for rec in records]

    df['return'] = df['end_unit_value']/df['end_unit_value'].shift(1)-1
    df['ann_return'] = (1+df['return'])**12-1

    return df

In [None]:
def other_approach(beg_unit_value, pmts_hist):
    """Placeholder for an alternative calculation (looks unfinished).

    At present the function only reads the ``'date_mapped'`` entry of
    ``pmts_hist`` and then falls off the end, so it returns ``None``.
    ``beg_unit_value`` is accepted but not used yet.
    """
    mapped_dates = pmts_hist['date_mapped']

In [None]:
# Run combined_compute_unit_values on `scaled` with test=False and keep the result.
# NOTE(review): the leading 100 is presumably the starting unit value — confirm
# against the function's signature.
examine = combined_compute_unit_values(100, scaled, test=False)

In [None]:
# Show the current contents of `examine` as the cell output.
examine

In [None]:
# Keep the first row seen for each id, then count how many ids carry each
# issue_d_mapped value (missing values are counted too).
one_row_per_id = scaled.drop_duplicates(subset='id')
one_row_per_id['issue_d_mapped'].value_counts(dropna=False)

In [None]:
# Group the rows of `scaled` by date_mapped so that the rows for a single
# date_mapped value can be fetched with get_group().
date_mapped_grouped = scaled.groupby('date_mapped')

In [None]:
# Rows whose date_mapped is 1; of those, display only the ones whose
# issue_d_mapped is also 1.
group1 = date_mapped_grouped.get_group(1)
also_issued_at_1 = group1['issue_d_mapped'] == 1
group1[also_issued_at_1]

In [None]:
# Look up the row(s) labelled '17814923' in to_export's index.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0); with a
# string label the lookup is label-based either way.
to_export.loc['17814923']

In [None]:
# Among rows with 'Month of Issuance' == 1, tally the values of
# 'Month of First Record' (missing values included).
issued_in_month_1 = to_export['Month of Issuance'] == 1
to_export.loc[issued_in_month_1, 'Month of First Record'].value_counts(dropna=False)

In [None]:
# Among rows with 'Month of Issuance' == 0, tally the values of
# 'Month of First Record' (missing values included).
issued_in_month_0 = to_export['Month of Issuance'] == 0
to_export.loc[issued_in_month_0, 'Month of First Record'].value_counts(dropna=False)

In [None]:
# Display the rows where 'Month of Issuance' is 0 and 'Month of First Record' is 2.
issuance_is_0 = to_export['Month of Issuance'] == 0
first_record_is_2 = to_export['Month of First Record'] == 2
to_export[issuance_is_0 & first_record_is_2]

In [None]:
# Preview the first rows of original_df (head() defaults to five rows).
original_df.head()

In [None]:
# Display every row of original_df whose LOAN_ID equals the string '15471076'.
original_df[original_df['LOAN_ID'] == '15471076']

In [None]:
# Look up the row(s) labelled '15471076' in scaled's index.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0); with a
# string label the lookup is label-based either way.
scaled.loc['15471076']

# Investigating small loans

In [None]:
# ten_loan_ids = np.random.choice(scaled_to_invest_amt_dfs['LC']['id'].unique(), 10)
# examine_ids = ['11696485', '74492661', '20006612', '17744006']

# Hand-picked loan ids, one per list; the variable names carry the scenario labels.
prepaid_id_1 = ['10024615']
prepaid_id_2 = ['738934']
gone_late_and_paid_id_1 = ['16382558']
gone_late_and_paid_id_2 = ['16240827']
defaulted_id_1 = ['11696485']
defaulted_id_2 = ['75173629']

# Random samples (without replacement) drawn from the distinct LC loan ids.
# np.random.choice raises ValueError if fewer ids exist than are requested.
lc_unique_ids = plat_selected_dfs['LC']['id'].unique()
random_3_ids_1 = np.random.choice(lc_unique_ids, 3, replace=False)
random_10_ids_1 = np.random.choice(lc_unique_ids, 10, replace=False)
random_ids = np.random.choice(lc_unique_ids, 50000, replace=False)

# Restrict the scaled LC payment history to the chosen ids, then pass it
# through summing_std_pmt_history and compute_unit_values.
test_id = random_ids
lc_scaled = scaled_to_invest_amt_dfs['LC']
examine = lc_scaled[lc_scaled['id'].isin(test_id)]
summed_examine = summing_std_pmt_history(examine)
unit_vals = compute_unit_values(100, summed_examine)

In [None]:
# scaled_to_invest_amt_dfs['LC'][scaled_to_invest_amt_dfs['LC']['id'].isin(test_id)][['date', 'm_amt_due', 'pmt_amt_received', 'status']]

In [None]:
# Show the current contents of `summed_examine` as the cell output.
summed_examine

In [None]:
# Columns of unit_vals to display side by side.
# 'dollar_discount' and 'dollar_discount_pct' are deliberately left out here.
unit_val_display_cols = [
    'new_investor_cash',
    'pmt_amt_received',
    'service_fees',
    'beg_unit_value',
    'new_units_created',
    'units_beg',
    'units_end',
    'total_cash',
    'total_note_value',
    'beg_value_netfee',
    'end_unit_value',
    'prev_undeployed_cash',
    'ann_return',
]
unit_vals[unit_val_display_cols]

In [None]:
# Plot the ann_return column of unit_vals against its index.
unit_vals['ann_return'].plot()

# Unit values for all loans

In [None]:
# Unit values for the whole scaled LC payment history.
lc_summed_history = summing_std_pmt_history(scaled_to_invest_amt_dfs['LC'])
all_loans_unit_vals = compute_unit_values(100, lc_summed_history)

auv = all_loans_unit_vals  # short alias for the same DataFrame object

# End-of-period outstanding principal with new_funds_needed subtracted out.
auv['outs_princp_end_no_newly_funded'] = auv['outs_princp_end'] - auv['new_funds_needed']

# Gap between that principal and total_note_value, in dollars and as a
# fraction of the principal.
auv['dollar_discount'] = auv['outs_princp_end_no_newly_funded'] - auv['total_note_value']
auv['dollar_discount_pct'] = auv['dollar_discount'] / auv['outs_princp_end_no_newly_funded']

# Row-over-row growth of end_unit_value; the first row has no predecessor
# and therefore comes out as NaN.
unit_value_growth = auv['end_unit_value'] / auv['end_unit_value'].shift(1)
auv['return'] = unit_value_growth - 1
auv['ann_return'] = unit_value_growth ** 12 - 1  # compounded over 12 rows
all_loans_unit_vals['ann_return'].plot()

In [None]:
# Show the current contents of `all_loans_unit_vals` as the cell output.
all_loans_unit_vals

In [None]:
# Plot LR_fee, total_cash and total_note_value together on one set of axes.
all_loans_unit_vals[['LR_fee', 'total_cash', 'total_note_value']].plot()

In [None]:
# all_loans_unit_vals['gross_charged_off_recovs']/all_loans_unit_vals['dollar_discount'].shift(6)

# Unit value comparison for all LendingClub loans in general

In [None]:
# Read the 'clean_pmt_history' table from the HDF5 store in chunks of
# 200,000 rows (with a notebook progress bar), then stitch the chunks
# back together into one DataFrame.
datapath_store = '/Users/justinhsi/LRData/lendingclub/lendingclub_store.h5'
lc_iterator = pd.read_hdf(datapath_store, 'clean_pmt_history', chunksize=200000)

merge_lc = list(tqdm_notebook(lc_iterator))
all_lc_pmt_hist = pd.concat(merge_lc)

In [None]:
# Pass the concatenated payment history through lc_rename_cols and rebind the
# name to whatever that function returns.
all_lc_pmt_hist = lc_rename_cols(all_lc_pmt_hist)

# Drop loans with multiple entries per month, just for ease of computation

In [None]:
# see how many loans do have a double entry in same month
print(all_lc_pmt_hist.shape)
loans_with_two_entries_in_same_month = all_lc_pmt_hist[all_lc_pmt_hist.duplicated(['id', 'date'])]
all_lc_pmt_hist = all_lc_pmt_hist[~all_lc_pmt_hist['id'].isin(loans_with_two_entries_in_same_month['id'].values)]
print(all_lc_pmt_hist.shape)

# TODO: find a way to process the dataframe in chunks, because the kernel crashes otherwise.

In [None]:
# Call add_cols_lc on the filtered payment history. The return value is not
# assigned to anything, so any effect relied on later must happen in place —
# TODO confirm against add_cols_lc.
add_cols_lc(all_lc_pmt_hist)

In [None]:
# Number of distinct ids in the payment history; dropna=False keeps a missing
# id in the count, matching len(...unique()).
all_lc_pmt_hist['id'].nunique(dropna=False)

In [None]:
# Show the (rows, columns) shape of the payment history.
all_lc_pmt_hist.shape

In [None]:
# Keep only the first row encountered for each id (drop_duplicates keeps the
# first occurrence by default).
# NOTE(review): no sort is applied here, so "first" depends on the current row
# order — confirm the frame is already ordered by date within each id.
one_record_each = all_lc_pmt_hist.drop_duplicates(['id'])

In [None]:
# Map each id to the outs_princp_beg value of its retained row.
# NOTE(review): the name suggests this is meant to be the funded amount, which
# presumably holds only if the retained row is the loan's first record — confirm.
lc_funded_dict = dict(zip(one_record_each['id'], one_record_each['outs_princp_beg']))

In [None]:
# Run summing_std_pmt_history over the full payment history with test='all'.
# NOTE(review): the meaning of the 'all' test mode is defined in
# summing_std_pmt_history, not here — check that function.
all_lc_summed_pmt_hist = summing_std_pmt_history(all_lc_pmt_hist, test='all')

In [None]:
# Show the current contents of `all_lc_summed_pmt_hist` as the cell output.
all_lc_summed_pmt_hist