# Imports and Setup

In [492]:
import pandas as pd
from pathlib import Path
import sqlalchemy as sa
from dotenv import load_dotenv
import os
import psutil
import regex as re
import numpy as np
from tqdm import tqdm
import datetime
import pickle
import preprocess_pmts
load_dotenv()

# set options for pandas and numpy formatting (no errors should occur if these are ommitted)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.options.mode.chained_assignment = None  # default='warn'
np.set_printoptions(formatter={'float':"{:6.5g}".format})

engine = sa.create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}"
			.format(host=os.getenv('host'), db=os.getenv('db'), user=os.getenv('uname'), pw=os.getenv('password')))
class Loan:
    ObservationNmb: int
    maturity_dt: datetime.date
    origination_dt: datetime.date
    lender_status: int
    all_pmts: np.ndarray
    
    def __init__(self, nmb, m_dt, o_dt, l_st, pmts):
        self.ObservationNmb = nmb
        self.maturity_dt = m_dt
        self.origination_dt = o_dt
        self.all_pmts = pmts
        self.lender_status = l_st
        
    def find_payoff_date(self):
        pass

## Read in Pickle File

2000-2010

In [2]:
with open('pickle_files/cohort_map_df.pickle','rb') as in_f:
    cohort_map = pickle.load(in_f)

2015-2021

In [546]:
with open('pickle_files/build-outs/2015-2021-df.pickle', 'rb') as in_f:
    cohort_map = pickle.load(in_f)
    

In [547]:
x = 0
for k in cohort_map.keys():    
    x+=len(cohort_map[k])
    print(f"{k}:{len(cohort_map[k])}")
    
print(f"{x:,}")

2015:40793
2016:40391
2017:42527
2018:39748
2019:35290
2020:29497
2021:34270
262,516


### Check structures before preprocessing

In [548]:
def set_top_bot(test: pd.DataFrame)-> pd.DataFrame:
    """Re-window a dataframe to the last occurring max TransactionBalanceAmt 
    and the first occurring min TransactionBalanceAmt"""
    ix_top = test[test['TransactionBalanceAmt'].values == test['TransactionBalanceAmt'].values.max()].index
    ix_bot = test[test['TransactionBalanceAmt'].values == test['TransactionBalanceAmt'].values.min()].index
    return test.iloc[ix_top[-1]:ix_bot[0]+1]

def remove_end_codes(test: pd.DataFrame) -> pd.DataFrame:
    """Remove increasing balances that occur with an end code"""
    e_codes = set(['195','218','305','410','416'])
    idx = test[(test['TransactionCd'].isin(e_codes))&(test['TransactionAmt']>0)].index.to_list()
    return test.drop(idx,axis=0)

def get_yrs_till_mat(test:pd.DataFrame)->float:
    try:
        return round((test['LoanFundedDt'][0] - test['MaturityDt'][0])/np.timedelta64(1, 'Y'),0)
    except:
        return 0
    
def find_spikes(test: pd.DataFrame):
    """Find remaining balance spikes"""
    test = set_top_bot(test)
    test = remove_end_codes(test)
    test = test.reset_index().drop(columns='index')
    arr = test['TransactionBalanceAmt'].values
    mins = (np.diff(np.sign(np.diff(arr))) > 0).nonzero()[0] + 1
    maxs = (np.diff(np.sign(np.diff(arr))) < 0).nonzero()[0] + 1
    return test, mins, maxs



Basic errorhandling -- keep loan data seperate

In [549]:
from preprocess_pmts import stratify_df_types, peak_handler
c = 0
empty = 0
stp = False
handler_tracker = {}
error_msg = ""
for yr in (cohort_map.keys()):
    handler_errors = 0
    if stp:
        print(error_msg)
        break            
    for i, df in enumerate(cohort_map[yr]):
        if df.shape[0] == 0:
            empty += 1
            cohort_map[yr].pop(i)
            continue
        df = stratify_df_types(df.copy())
        test = df[['ObservationNmb','LoanFundedDt','EffectiveDt','MaturityDt']].isna()
        test = test[test.values==True]
        if (test.shape[0])>0:
            c += 1
            cohort_map[yr].pop(i)
            continue
        else:
            df, mins, maxs = find_spikes(df)
            yrs = get_yrs_till_mat(df)
            if yrs is not None:
                df, revolver_cd = peak_handler(df, mins, maxs, yrs)
                df['RLC'] = revolver_cd
                if 'resolved_Balance' not in df.columns:
                    handler_errors += 1                    
                cohort_map[yr][i] = df
            else:
                cohort_map[yr].pop(i)
                print(f"mins: {mins} , maxs: {maxs}, yrs: {yrs}")
                display(test)
                error_msg = "Found yrs == None"
                stp = True
                break
    handler_tracker[yr] = handler_errors


In [533]:
with open('pickle_files/post-processing/2015-2021-df.pickle','wb') as out_f:
    pickle.dump(cohort_map, out_f)

Concatenate all loans in vintage bins

In [485]:
correct_cols = ['ObservationNmb',
  'LoanFundedDt',
  'MaturityDt',
  'EffectiveDt',
  'TransactionCd',
  'GeneralLedgerCd',
  'TransactionAmt',
  'TransactionBalanceAmt',
  'resolved_Amt',
  'resolved_Balance',
  'RLC']
concat_map = {}
for yr in cohort_map.keys():
    incorrect_dfs = []
    correct_dfs = []
    for df in cohort_map[yr]:
        if df.columns.to_list() == correct_cols:
            correct_dfs.append(df)
        else:
            incorrect_dfs.append(df)
    concat_map[yr] = correct_dfs


Amortization

In [None]:
from prepay_utilities import create_amortization_lookup, amortize
print('creating amortization lookup')
amortization_lookup = create_amortization_lookup(engine_obj=engine)
amortization_lookup = amortization_lookup.set_index('ObservationNmb')
amortization_lookup= amortization_lookup.fillna(0)
print('done.\n')

In [490]:
def merge_to_final(df_actual: pd.DataFrame, df_sched: pd.DataFrame)->pd.DataFrame:
    df_a = df_actual.copy()
    df_b = df_sched.copy()
    df_a.set_index(pd.RangeIndex(df_a.shape[0]), inplace=True)
    df_b.set_index(pd.RangeIndex(df_b.shape[0]), inplace=True)
    df_a = df_a[['ObservationNmb', 'EffectiveDt', 'resolved_Amt', 'resolved_Balance']]
    df_b = df_b[['Scheduled Principal']]
    df_b['Scheduled Principal'] = df_b['Scheduled Principal'].shift()
    df_b['Scheduled Principal'] = df_b['Scheduled Principal']*-1
    merged = df_a.merge(df_b, left_index=True, right_index=True, how='outer')
    merged['Prepay Amount'] = merged['resolved_Amt'] - merged['Scheduled Principal']
    chop_here = merged[merged['ObservationNmb'].isnull()].iloc[0].name
    # Set PaidOff to first instance of payoff -- should have been done already but make double sure there aren't multiple payoffs
    try:
        idx = merged.loc[merged['resolved_Balance']==0].iloc[0].name
    except IndexError:
        idx = 0
    if chop_here > idx:
        return merged.iloc[0:idx+1]
    else:
        return merged.iloc[0:chop_here]

In [None]:
c = 0
revs = 0
empty = 0
for year in (concat_map.keys()):
    final_version = []
    for df in concat_map[year]:
        if df.shape[0]>0:
            if df['RLC'][0] == 'Y':
                revs += 1
                continue
            num= df['ObservationNmb'][0]
            lf_dt = df['LoanFundedDt'][0]
            m_dt = df['MaturityDt'][0]
            interest_rate = amortization_lookup.loc[num]['BankInterestPct']
            if amortization_lookup.loc[num]['GrossGtyDisbursementAmt']==0:
                loan_amt = df['TransactionBalanceAmt'].max()
            else: 
                loan_amt = amortization_lookup.loc[num]['GrossGtyDisbursementAmt']
            try:
                months = int( pd.Timedelta(m_dt - lf_dt)/np.timedelta64(1, 'M'))
            except TypeError:
                months = amortization_lookup.loc[num]['MaturityMthsQty']
        
            scheduled_df = pd.DataFrame(amortize(interest_rate, loan_amt, months))
            
            if 'Scheduled Principal' in scheduled_df.columns:
                try:
                    final_version.append(merge_to_final(df, scheduled_df))
                    
                except IndexError:
                    c += 1
            else:
                c += 1

        else:
            empty += 1
            continue
        
    print(f"Finishing {year}, total loans: {len(final_version)}")
    concat_map[year] = pd.concat(final_version)
    print(f"Loans labeled revolvers in {year}: {revs}")
    print(f"errors for {year}: {c}")
    print(f"Empty DFs for {year}: {empty}\n")

In [464]:
"""This code reorganizes the vintages based on what year they make their first payment"""
push = False    
for year in concat_map.keys():
    d = (concat_map[year].copy())
    if push: 
       d = pd.concat([d, to_push])
    subset = d[['EffectiveDt','ObservationNmb']].groupby('ObservationNmb').first()
    subset = subset.loc[(subset['EffectiveDt'].dt.year > int(year))]
    push_list = subset.index.to_list()
    to_push = d[(d['ObservationNmb'].isin(push_list))]
    if to_push.shape[0] != 0:
        push = True
    else:
        push = False
    d = d[~(d['ObservationNmb'].isin(push_list))]
    concat_map[year] = d

In [466]:
test = concat_map['2021']
len(test['ObservationNmb'].unique())

15887

In [467]:
import openpyxl

def pool_balances(in_df:pd.DataFrame)->pd.DataFrame:
    x_Balances = in_df[['ObservationNmb', 'Year', 'Month', 'resolved_Amt', 'resolved_Balance']].groupby(['ObservationNmb', 'Year', 'Month']).mean()[['resolved_Balance']].reset_index()
    x_Balances = x_Balances.groupby(['Year', 'Month']).sum()
    x_Balances['smoothed'] = x_Balances['resolved_Balance'].rolling(window=3).mean()
    return x_Balances

def pool_amounts(in_df:pd.DataFrame)->pd.DataFrame:
    x_Amts = in_df[['ObservationNmb','Year','Month','resolved_Amt', 'Prepay Amount']].groupby(['ObservationNmb', 'Year', 'Month']).mean()[['resolved_Amt','Prepay Amount']].reset_index()
    x_Amts = x_Amts.groupby(['Year','Month']).sum()
    return x_Amts

def pool_counts(in_df:pd.DataFrame)->pd.DataFrame:
    x_Counts = in_df[['ObservationNmb','Year','Month','resolved_Balance', 'Prepay Amount']]
    PrincipalPmtCnt_col = x_Counts[['ObservationNmb', 'Year', 'Month', 'Prepay Amount']].groupby(['ObservationNmb', 'Year', 'Month']).count().reset_index()
    PrincipalPmtCnt_col = PrincipalPmtCnt_col.groupby(['Year', 'Month']).sum()['Prepay Amount'].values
    x_Counts['Num PIF'] = 0
    x_Counts['Num Outstanding'] = 0
    x_Counts.loc[x_Counts['resolved_Balance']==0, 'Num PIF' ] = 1
    x_Counts.loc[x_Counts['resolved_Balance']!=0, 'Num Outstanding' ] = 1
    x_Counts = x_Counts.groupby(['ObservationNmb','Year','Month']).last()[['Num PIF','Num Outstanding']].reset_index().groupby(['Year','Month']).sum()
    x_Counts['Number Principal Pmts'] = PrincipalPmtCnt_col
    return x_Counts

def merge_all(ordered_merge_list:list)->pd.DataFrame:
    if len(ordered_merge_list)==1:
        return ordered_merge_list[0]
    else:
        left = ordered_merge_list[0]
        right = ordered_merge_list[1:]
        return merge_all([left.merge(right[0], left_index=True, right_index=True)]+right[1:])

def build_vintage_analysis(c_map:dict)->pd.DataFrame:
    analysis_map = {}
    for yr in c_map:
        test = c_map[yr].copy()
        test['Year']=test['EffectiveDt'].dt.year
        test['Month']=test['EffectiveDt'].dt.month
        # ensure no errant years are part of the analysis
        test = test[test['Year']>=int(yr)]
        # build Pool Balance and Smoothed balance
        df_balances = pool_balances(test)
        # build Pool Amounts
        df_amounts = pool_amounts(test)
        # build Pool Counts
        df_counts = pool_counts(test)
        # Constants
        max_pool_balance = df_balances['resolved_Balance'].max()
        total_prepaid_amount = df_amounts['Prepay Amount'].sum()
        merged = merge_all([df_balances,df_amounts,df_counts])
        merged['Prepaid Pct'] = round((merged['Prepay Amount']/max_pool_balance),5)
        merged['Aggregate Prepaid Pct'] = merged['Prepaid Pct'].cumsum().abs()
        merged.to_csv(f"csvs/analysis-outs/{yr}-analysis.csv")
        analysis_map[yr] = merged.reset_index()
    return analysis_map

def build_excel_wb(a_map:dict)->pd.DataFrame:
    with pd.ExcelWriter("excel_files/analysis-outs/user_totals.xlsx", engine="openpyxl", mode="w", if_sheet_exists="replace") as writer:
        for yr in a_map:
            sheet_name = str(yr)
            df = a_map[yr].copy()
            df.to_excel(writer, sheet_name)
        return(print("excel file built"))
        

In [468]:
merged = build_vintage_analysis(concat_map)


In [471]:
build_excel_wb(merged)

excel file built


In [472]:
for year in concat_map.keys():
    print(f"number of loans in vintage {year}: {len(concat_map[year]['ObservationNmb'].unique())}")

number of loans in vintage 2015: 22747
number of loans in vintage 2016: 29362
number of loans in vintage 2017: 30642
number of loans in vintage 2018: 28433
number of loans in vintage 2019: 23386
number of loans in vintage 2020: 18037
number of loans in vintage 2021: 15887


## Building CPR tables

In [550]:
def format_to_concat(chrt_map:dict)->dict:
    correct_cols = ['ObservationNmb',
    'LoanFundedDt',
    'MaturityDt',
    'EffectiveDt',
    'TransactionCd',
    'GeneralLedgerCd',
    'TransactionAmt',
    'TransactionBalanceAmt',
    'resolved_Amt',
    'resolved_Balance',
    'RLC']
    concat_map = {}
    e = 0
    for yr in chrt_map.keys():
        incorrect_dfs = []
        correct_dfs = []
        for i, df in enumerate(chrt_map[yr]):
            if df.columns.to_list() == correct_cols or 'PIF' in df.columns.to_list():
                # Check that the first row doesn't contain a 0 for resolved Balance
                try:
                    if df.iloc[0]['resolved_Balance']!=0:

                    # Remove multiple payoffs and create payoff column
                        try:    
                            df['PIF'] = 0
                            idx = df[df['resolved_Balance']==0].iloc[0].name
                            
                            df.loc[df.index == idx, 'PIF'] = 1

                            correct_dfs.append(df)
                        except IndexError:
                            e+=1
                            correct_dfs.append(df)
                            continue
                except:
                    continue
            else:
                incorrect_dfs.append(df)
        print(f"{yr} loans without payoff found: {e}")
        concat_map[yr] = pd.concat(correct_dfs)
    return concat_map

def format_cols(df:pd.DataFrame)->pd.DataFrame:
    cols = df.columns
    for c in cols:
        ends_with = c.lower()[-3:]
        if 'dt' in ends_with:
            df[c] = pd.to_datetime(df[c])
        elif 'amt' in ends_with:
            df[c] = pd.to_numeric(df[c])       
        elif 'balance' in c.lower():
            df[c] = pd.to_numeric(df[c])        
    return df

"""This code reorganizes the vintages based on what year they make their first payment"""
def stabilize_pool(c_map:dict)->dict:
    push = False
    yrs = list(c_map.keys())
    to_push = pd.DataFrame(columns=c_map[yrs[0]].columns)
    for year in c_map.keys():
        d = (c_map[year].copy())
        if to_push.shape[0]!=0:
            d = pd.concat([d, to_push])
        subset = d[['EffectiveDt','ObservationNmb']].groupby('ObservationNmb').first()
        subset = subset.loc[(subset['EffectiveDt'].dt.year > int(year))]
        push_list = subset.index.to_list()
        to_push = d[(d['ObservationNmb'].isin(push_list))]
        d = d[~(d['ObservationNmb'].isin(push_list))]
        c_map[year] = d
    return c_map

In [551]:

def addtl_post_processing(c_map:dict)->dict:
    copy = {}
    for yr in c_map:    
        test = c_map[yr]
        # remove revolvers and loans without maturitydts
        # test = test[test['RLC']=='N']
        test = test[test['EffectiveDt'].dt.year>=int(yr)]
        test = test[~test['MaturityDt'].isna()]
        test['Year']=test['EffectiveDt'].dt.year
        test['Month']=test['EffectiveDt'].dt.month
        # Remove loans that have a resolved Balance of zero and a TransactionBAmt greater than zero in the funding window
        # These loans have janky payment histories
        test_1 = test.groupby(['Year','Month','ObservationNmb']).first()[['resolved_Balance','TransactionBalanceAmt']]
        test_1 = test_1.loc[(test_1['resolved_Balance']==0)&(test_1['TransactionBalanceAmt']!=0)].reset_index()
        to_drop = test_1[test_1['Year']==int(yr)]['ObservationNmb'].to_list()
        test = test[~test['ObservationNmb'].isin(to_drop)]
        copy[yr] = test
    return copy

In [552]:
def get_counts(c_map:dict)->None:
    """Get total number of loans and line count in a dictionary that has pointers to lists of """
    tots = 0
    for k in c_map:
        tots+=((c_map[k].shape[0]))
        print(len(c_map[k]['ObservationNmb'].unique()))
    print(f"total prinipal transactions: {tots:,}")

In [553]:
# format and pull from cohort map
cpr_map = format_to_concat(cohort_map)
for year in cpr_map.keys():
    cpr_map[year] = format_cols(cpr_map[year])
cpr_map = stabilize_pool(cpr_map)

2015 loans without payoff found: 9773
2016 loans without payoff found: 22777
2017 loans without payoff found: 40803
2018 loans without payoff found: 60495
2019 loans without payoff found: 79622
2020 loans without payoff found: 97092
2021 loans without payoff found: 109957


In [554]:
cpr_map = addtl_post_processing(cpr_map)

In [555]:
mnths = list(range(1,13))
obs_dict = dict()
for yr in cpr_map.keys():
    test = cpr_map[yr]
    # Differentiate among prepayment and default
    default_codes = ['218','305','410','416']
    test['Default'] = 0
    test['PP'] = 0
    test.loc[(test['PIF']==1)&(test['TransactionCd'].isin(default_codes)),'Default']=1
    test.loc[(test['PIF']==1)&(~test['TransactionCd'].isin(default_codes)),'PP']=1

    # create term table by calculating loan term in years for each obs
    # THIS CODE COULD BE EXPANDED FOR NEW BUCKETS
    # overall, I think I need to push out revised tables with principal pmt histories so that I can build these buckets much faster.
    term_table = test.groupby('ObservationNmb').first()[['Year','Month','LoanFundedDt','MaturityDt']].sort_values(['Year','Month'])
    term_table['Term'] = ((term_table['MaturityDt'] - term_table['LoanFundedDt'])/np.timedelta64(1,'Y')).round(0).astype(int)
    a = term_table.Term.values
    _8_11_idx = np.where(np.logical_and(a>8, a<=11))
    _21_idx = np.where(a>=21)
    _ci = term_table.iloc[_8_11_idx].index.to_list()
    _cre = term_table.iloc[_21_idx].index.to_list()

    # create window on dataframe that groups by the Note Date for each observationNmb
    test_1 = test.groupby('ObservationNmb').first()[['Year','Month']].sort_values(['Year','Month']).reset_index().groupby(['Year','Month','ObservationNmb']).count().reset_index()

    # Split into ci and cre
    ci = test_1[test_1.ObservationNmb.isin(_ci)]
    cre = test_1[test_1.ObservationNmb.isin(_cre)]

    def form_cpr_arr( df:pd.DataFrame, in_arr:np.ndarray )->np.ndarray:
        temp = df[df['ObservationNmb'].isin(in_arr)][['ObservationNmb','Year','Month','PIF','PP','Default']].groupby(['Year','Month']).sum().reset_index()
        temp['Outstanding'] = 0
        temp_x= temp['PIF'].cumsum()
        total = len(in_arr)
        temp['Outstanding'] = total - temp_x
        return temp.drop(columns=['Year','Month'])[['Outstanding','PP','Default']].transpose().values

    for m in mnths:
        obs_dict[yr+str(m)]= {}
        obs_dict[yr+str(m)]['ci'] = test_1[(test_1['Month']==m)&(test_1['ObservationNmb'].isin(ci.ObservationNmb.values))]['ObservationNmb'].to_list()
        obs_dict[yr+str(m)]['cre'] = test_1[(test_1['Month']==m)&(test_1['ObservationNmb'].isin(cre.ObservationNmb.values))]['ObservationNmb'].to_list()
        temp = test[test['ObservationNmb'].isin(obs_dict[yr+str(m)]['ci'])][['ObservationNmb','Year','Month','PIF','PP','Default']].groupby(['Year','Month']).sum().reset_index()
        obs_dict[yr+str(m)]['ci'] = form_cpr_arr(test, obs_dict[yr+str(m)]['ci'])
        obs_dict[yr+str(m)]['cre'] = form_cpr_arr(test, obs_dict[yr+str(m)]['cre'])
    print(f'done {yr}')

done 2015
done 2016
done 2017
done 2018
done 2019
done 2020
done 2021


In [556]:
from csv import writer

ins=0
for t in obs_dict.keys():
    with open('csvs/cprs/_8_11.csv', 'a', newline='') as f_object:
        row = [t]
        row += list(obs_dict[t]['ci'][0])
        writer_obj = writer(f_object)
        writer_obj.writerow(row)
    with open('csvs/cprs/_21.csv', 'a', newline='') as f_object:
        row = [t]
        row += list(obs_dict[t]['cre'][0])
        writer_obj = writer(f_object)
        writer_obj.writerow(row)
    ins+=1    
with open('csvs/cprs/_8_11.csv', 'a', newline='') as f:
    writer_obj = writer(f)
    writer_obj.writerow('\n')
with open('csvs/cprs/_21.csv', 'a', newline='') as f:
    writer_obj = writer(f)
    writer_obj.writerow('\n')

for t in obs_dict.keys():
    with open('csvs/cprs/_8_11_PP.csv', 'a', newline='') as f_object:
        row = [t]
        row += list(obs_dict[t]['ci'][1])
        writer_obj = writer(f_object)
        writer_obj.writerow(row)
    with open('csvs/cprs/_21_PP.csv', 'a', newline='') as f_object:
        row = [t]
        row += list(obs_dict[t]['cre'][1])
        writer_obj = writer(f_object)
        writer_obj.writerow(row)
        
with open('csvs/cprs/_8_11_PP.csv', 'a', newline='') as f:
    writer_obj = writer(f)
    writer_obj.writerow('\n')
with open('csvs/cprs/_21_PP.csv', 'a', newline='') as f:
    writer_obj = writer(f)
    writer_obj.writerow('\n')
    
for t in obs_dict.keys():
    with open('csvs/cprs/_8_11_Default.csv', 'a', newline='') as f_object:
        row = [t]
        row += list(obs_dict[t]['ci'][2])
        writer_obj = writer(f_object)
        writer_obj.writerow(row)
    with open('csvs/cprs/_21_Default.csv', 'a', newline='') as f_object:
        row = [t]
        row += list(obs_dict[t]['cre'][2])
        writer_obj = writer(f_object)
        writer_obj.writerow(row)
        
with open('csvs/cprs/_8_11_Default.csv', 'a', newline='') as f:
    writer_obj = writer(f)
    writer_obj.writerow('\n')
with open('csvs/cprs/_21_Default.csv', 'a', newline='') as f:
    writer_obj = writer(f)
    writer_obj.writerow('\n')


In [106]:
test = cpr_map['2000'].reset_index().drop(columns='index').reset_index()

Unnamed: 0,index,ObservationNmb,LoanFundedDt,MaturityDt,EffectiveDt,TransactionCd,GeneralLedgerCd,TransactionAmt,TransactionBalanceAmt,resolved_Amt,resolved_Balance,RLC
0,0,1469636,2000-01-01,2005-01-31,2000-01-31,455,6031,20000.00,20000.00,20000.00,20000.00,N
1,1,1469636,2000-01-01,2005-01-31,2000-02-29,455,6031,-234.78,19765.20,-234.78,19765.20,N
2,2,1469636,2000-01-01,2005-01-31,2000-03-31,455,6031,-246.86,19518.40,-246.86,19518.40,N
3,3,1469636,2000-01-01,2005-01-31,2000-04-30,455,6031,-234.24,19284.10,-234.24,19284.10,N
4,4,1469636,2000-01-01,2005-01-31,2000-05-31,455,6031,-241.62,19042.50,-241.62,19042.50,N
...,...,...,...,...,...,...,...,...,...,...,...,...
356364,356364,2016641,2000-12-29,2011-01-31,2004-11-30,455,6031,-2219.04,128075.00,-2219.04,128075.00,N
356365,356365,2016641,2000-12-29,2011-01-31,2004-12-31,455,6031,-2247.34,125828.00,-2247.34,125828.00,N
356366,356366,2016641,2000-12-29,2011-01-31,2005-01-31,455,6031,-2205.34,123623.00,-2205.34,123623.00,N
356367,356367,2016641,2000-12-29,2011-01-31,2005-02-28,455,6031,-2095.52,121527.00,-2095.52,121527.00,N


In [None]:
# remove revolvers and loans without maturitydts
test = test[test['RLC']=='N']
test = test[test['EffectiveDt'].dt.year>=2000]
test = test[~test['MaturityDt'].isna()]
test['Year']=test['EffectiveDt'].dt.year
test['Month']=test['EffectiveDt'].dt.month
test_1 = test.groupby(['ObservationNmb','Year','Month','EffectiveDt']).count()[['resolved_Balance']]
test_1 = test_1.loc[test_1.resolved_Balance > 1].reset_index().merge(test, how='left', on=['ObservationNmb','Year','Month','EffectiveDt'])
test_1[['ObservationNmb','Year','Month','TransactionAmt'	,'TransactionBalanceAmt'	,'resolved_Amt',	'resolved_Balance_y']]

In [None]:

test['Year']=test['EffectiveDt'].dt.year
test['Month']=test['EffectiveDt'].dt.month
test = test.groupby(['ObservationNmb','Year','Month']).first().reset_index().groupby(['Year','Month']).first()
test

In [15]:
import plotly.express as px

data = {}
for year in merged.keys():
    data[year]=(merged[year]['Aggregate Prepaid Pct'].values)


speeds = pd.DataFrame.from_dict(data, orient='index').transpose()

fig = px.line(speeds, labels={
            'index':'Months From Origination',
            'value': 'Aggregate Prepaid Pct'
                }, 
              title='Aggregate Prepaid Pct: 2000-2010')

fig.show()

In [380]:
merged.to_csv('csvs/analysis-outs/test_1.csv')

## To csvs below

In [337]:
test = concat_map['2000']
test['Year']=test['EffectiveDt'].dt.year
test['Month']=test['EffectiveDt'].dt.month
test = test.reset_index().drop(columns='index')
test = test[test['Year']>1960]
test = test[['ObservationNmb', 'Year', 'Month', 'resolved_Amt', 'resolved_Balance']].groupby(['ObservationNmb', 'Year', 'Month']).mean()[['resolved_Balance']].reset_index()
test = test.groupby(['Year', 'Month']).sum()
test['smoothed'] = test['resolved_Balance'].rolling(window=3).mean()
# test.to_csv('csvs/12-27/balances.csv')

test['smoothed'].max()

840416425.13

In [None]:
test = concat_map['2000']
test['Year']=test['EffectiveDt'].dt.year
test['Month']=test['EffectiveDt'].dt.month
test = test.reset_index().drop(columns='index')
test = test[test['Year']>1960]

test = test[['ObservationNmb', 'Year', 'Month', 'Prepay Amount']].groupby(['ObservationNmb', 'Year', 'Month']).count().reset_index()
test.groupby(['Year', 'Month']).sum()[['Prepay Amount']]





In [281]:
test = concat_map['2000']
test['Year']=test['EffectiveDt'].dt.year
test['Month']=test['EffectiveDt'].dt.month
test = test[test['Year']>1960]
test['Paid Off'] = 0
test['Not Paid Off'] = 0
test.loc[test['resolved_Balance']==0, 'Paid Off' ] = 1
test.loc[test['resolved_Balance']!=0, 'Not Paid Off' ] = 1
test.groupby(['ObservationNmb','Year','Month']).last()[['Paid Off','Not Paid Off']].reset_index().groupby(['Year','Month']).sum()

In [None]:
test = concat_map['2000']
test['Year']=test['EffectiveDt'].dt.year
test['Month']=test['EffectiveDt'].dt.month
x_Balances = test[['ObservationNmb','resolved_Amt','resolved_Balance']].groupby(['ObservationNmb', test.index]).last()[['resolved_Balance']].reset_index()

x_Balances = x_Balances[x_Balances['Year']>1960]

x_Balances.groupby(['Year','Month']).sum().to_csv('csvs/12-27/balances_t.csv')

In [177]:
test = concat_map['2000']
test = test[test['RLC']=='N']
test['Year']=test['EffectiveDt'].dt.year
test['Month']=test['EffectiveDt'].dt.month
x_Amts = test[['ObservationNmb','Year','Month','','resolved_Balance']].groupby(['ObservationNmb', 'Year', 'Month']).sum()[['resolved_Amt']].reset_index()
x_Amts.groupby(['Year','Month']).sum().to_csv('csvs/12-27/amts.csv')

## Preprocessing Functions

In [None]:
def to_datetime(in_df: pd.DataFrame, c_list: list)->pd.DataFrame:
    for c in c_list:
        in_df[c] = pd.to_datetime(in_df[c])
    return in_df

def to_numeric(in_df: pd.DataFrame, c_list: list)->pd.DataFrame:
    for c in c_list:
        in_df[c] = pd.to_numeric(in_df[c])
    return in_df

def to_str(in_df: pd.DataFrame, c_list: list)->pd.DataFrame:
    for c in c_list:
        in_df[c] = in_df[c].astype(str)
    return in_df

def stratify_df_types(test:pd.DataFrame)->pd.DataFrame:
    """Set dtypes and column names like they will be in the database"""
    test = test.reset_index().drop(columns='index')
    test.columns = ['ObservationNmb', 
                    'LoanFundedDt', 'MaturityDt', 'EffectiveDt',
                    'TransactionCd', 'GeneralLedgerCd',
                    'TransactionAmt', 'TransactionBalanceAmt']
    dates_list = ['LoanFundedDt', 'MaturityDt', 'EffectiveDt']
    nums_list = ['TransactionAmt', 'TransactionBalanceAmt']
    str_list = ['TransactionCd', 'GeneralLedgerCd', 'ObservationNmb']
    test = to_datetime(test, dates_list)
    test = to_numeric(test, nums_list)
    test = to_str(test, str_list)
    return test
    
def set_top_bot(test: pd.DataFrame)-> pd.DataFrame:
    """Re-window a dataframe to the last occurring max TransactionBalanceAmt 
    and the first occurring min TransactionBalanceAmt"""
    ix_top = test[test.TransactionBalanceAmt.values == test.TransactionBalanceAmt.values.max()].index
    ix_bot = test[test.TransactionBalanceAmt.values == test.TransactionBalanceAmt.values.min()].index
    return test.iloc[ix_top[-1]:ix_bot[0]+1]

def remove_end_codes(test: pd.DataFrame) -> pd.DataFrame:
    """Remove increasing balances that occur with an end code"""
    e_codes = set(['195','218','305','410','416'])
    idx = test[(test.TransactionCd.isin(e_codes))&(test.TransactionAmt>0)].index.to_list()
    return test.drop(idx,axis=0)

def find_spikes(test: pd.DataFrame):
    """Find remaining balance spikes"""
    sample = set_top_bot(test)
    sample = remove_end_codes(sample)
    sample = sample.reset_index().drop(columns='index')
    arr = sample['TransactionBalanceAmt'].values
    mins = (np.diff(np.sign(np.diff(arr))) > 0).nonzero()[0] + 1
    maxs = (np.diff(np.sign(np.diff(arr))) < 0).nonzero()[0] + 1
    return sample, mins, maxs


def get_yrs_till_mat(test:pd.DataFrame)->float:
    try:
        return round((test['LoanFundedDt'][0] - test['MaturityDt'][0])/np.timedelta64(1, 'Y'),0)
    except:
        return 0

def hammer(test:pd.DataFrame)-> pd.DataFrame:
    """Force monotonic decreasing on balance and transactions"""
    test['resolved_Balance'] =   np.minimum.accumulate(test['resolved_Balance'])
    test['resolved_Amt'] = test['resolved_Balance'].diff()
    return test

def scalpel(test: pd.DataFrame, idx_min: list, idx_max: list)->pd.DataFrame:
    """Fix simplest form of accounting error"""
    if len(idx_min) == 0:
        return test
    x_max = test.loc[test.index.isin(idx_max)]
    x_min = test.loc[test.index.isin(idx_min)]
    
    min_row = x_min['resolved_Balance'].idxmin()
    max_row = x_max['resolved_Balance'].idxmax()
    test.at[min_row, 'resolved_Balance'] = -1*(test.at[min_row, 'resolved_Amt'])
    test.at[max_row, 'resolved_Amt'] = -1*(test.at[min_row, 'resolved_Balance'] - test.at[max_row, 'resolved_Balance'])
    test.at[min_row, 'resolved_Amt'] = -1*(test.at[min_row-1, 'resolved_Balance'] - test.at[min_row, 'resolved_Balance'])
    test['resolved_Amt'] = test['resolved_Balance'].diff()
    return test


def peak_handler(test: pd.DataFrame, idx_min: list, idx_max: list, yrs_till_mat: float)-> str:
    test['resolved_Amt'] = test['TransactionAmt']
    test['resolved_Balance'] = test['TransactionBalanceAmt']
    
    if test['resolved_Balance'].is_monotonic_decreasing:
        return test, 'N'
    elif len(idx_max)==0:
        return test, 'N'
    elif len(idx_max) > 3 and yrs_till_mat<=10:
        return test, 'Y'
    elif len(idx_max) <= 3:
        test = scalpel(test, idx_min, idx_max)
        if test['resolved_Balance'].is_monotonic_decreasing:
            return test, 'N'
        else:
            return hammer(test), 'N'
      
def preprocess_cohorts(c_map: dict):
    """Reassign GPs based on when their principal payments begin"""
    yr_list = list(c_map.keys())
    

def verify_vintage(test:pd.DataFrame, c_map: dict, curr_k: str, curr_idx: int) -> None:
    """Verify that a loan's payment history starts in the vintage it was originally assigned to before preprocessing"""
    v = str(test['EffectiveDt'].dt.year[0])
    if v != curr_k:
        to_change = c_map[curr_k].pop(curr_idx)
    if v in [str(k) for k in c_map.keys()]:
        c_map[v].append(test)
    return None

def remove_revolver(test: pd.DataFrame, c_map: dict, curr_k: str, curr_idx: int)-> False:
    """Remove loans that have now been labeled revolvers post-preprocessing"""
    if test.name == 'Y':
        c_map[curr_k].pop(curr_idx)
        return True
    else:
        return False

def preprocess_history(test: pd.DataFrame):
        test = stratify_df_types(test)
        test, mins, maxs = find_spikes(test)
        yrs = get_yrs_till_mat(test)

        test, is_revolver = peak_handler(test, mins, maxs, yrs)
        test.name = is_revolver
        
        return test



### Calling the preprocessing program's functions 

In [None]:
print(f"first index of 2000: {list(cohort_map['2000'].keys())[0]}")

def testing():
    count = 0
    for k in cohort_map.keys():
        ls = cohort_map[k].keys()
        for i in tqdm(ls):
            if type(cohort_map[k][i]) == None:
                print(f"found NoneType at{k} {i}")
            elif not cohort_map[k][i].empty:

                test = preprocess_pmts.stratify_df_types(cohort_map[k][i])
                test, mins, maxs = preprocess_pmts.find_spikes(test)
                yrs = preprocess_pmts.get_yrs_till_mat(test)
                try:
                    test, is_Revolver = preprocess_pmts.peak_handler(test, mins, maxs, yrs)
                except ValueError:
                    display(mins)
                    display(test)
                    break
    return None   

testing()