### Basic imports

In [2]:
import pandas as pd
import sqlalchemy as sa
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os
from dotenv import load_dotenv
load_dotenv()

# Assemble the MySQL (pymysql driver) connection URL from environment variables;
# load_dotenv() above is what makes values from a local .env file visible to os.getenv.
# NOTE(review): a variable that is not set is formatted into the URL as the text "None".
engine = sa.create_engine(
    "mysql+pymysql://{user}:{pw}@{host}/{db}".format(
        user=os.getenv('uname'),
        pw=os.getenv('password'),
        host=os.getenv('host'),
        db=os.getenv('db'),
    )
)


### 1. Read in data from CSV (alternatively, use `pd.read_sql` with credentials loaded from `.env`)

In [3]:
# Load the loan-level tape from disk
loan_data = pd.read_csv("raw_data/master_loan_tape.csv")

# Columns whose name ends in 'Dt' are treated as dates and parsed to datetime in one pass
date_cols = [name for name in loan_data.columns if str(name).endswith('Dt')]
loan_data = loan_data.assign(
    **{name: pd.to_datetime(loan_data[name]) for name in date_cols}
)

# Preview the first rows of the parsed frame
loan_data.head()

Unnamed: 0,GP,Note_Yr,Note_Mth,NoteDt,LoanAmt,MaturityDt,MaturityMthsQty,PrepayDt,PrepayMthsQty,DefaultDt,DefaultMthsQty,PP_qty,MatBucket,Code,CodeText,Margin,MarginBucket,state_abbreviation,state_name
0,1502649,2000,7,2000-07-31,20600.0,2007-07-31,84,2008-10-08,,2006-03-06,67.0,67.0,0-8,453110.0,Florists,0.04,0.0275+,CT,Connecticut
1,1503079,2001,1,2001-01-31,14288.55,2008-01-31,84,2010-11-30,,2004-03-02,37.0,37.0,0-8,-1.0,,0.0275,0.025 - 0.0275,OH,Ohio
2,2019667,2001,2,2001-02-28,3600.0,2003-02-28,24,2002-01-30,11.0,NaT,,11.0,0-8,441310.0,Automotive Parts and Accessories Stores,,,MS,Mississippi
3,2022516,2001,2,2001-02-28,60000.0,2011-02-27,119,2011-09-13,,2010-01-12,106.0,106.0,8-11,722212.0,"Cafeterias, Grill Buffets, and Buffets",0.015,0.0125 - 0.015,NY,New York
4,2039790,2001,7,2001-07-31,20000.02,2006-07-31,60,2005-05-12,45.0,2003-12-31,29.0,29.0,0-8,561499.0,All Other Business Support Services,,,NC,North Carolina


In [4]:
# Restrict the analysis to loans whose maturity bucket is '21+' (21+ year maturities)
is_long_maturity = loan_data['MatBucket'] == '21+'
data_slice = loan_data[is_long_maturity]

In [5]:

# Optional toggle: when switched on, the slice is narrowed further to loans of at least
# 2,500,000. NOTE(review): which "problem" this reproduces is not stated in the notebook.
show_problem = False

if show_problem:
    is_large_loan = data_slice['LoanAmt'] >= 2_500_000
    data_slice = data_slice[is_large_loan]

### 2. Use pool.py to organize cohorts --> loans are split into yyyy.mm cohorts

In [6]:
from pooler import pool

# Build a Pooler of Loan objects (static pools) from a loan-level DataFrame
def create_pooler(in_df:pd.DataFrame)-> pool.Pooler:
    """Wrap every row of ``in_df`` in a ``pool.Loan`` and hand them all to a ``pool.Pooler``.

    Parameters
    ----------
    in_df : pd.DataFrame
        Loan-level data. Must contain the columns 'GP' (used as the loan id),
        'NoteDt', 'MaturityDt', 'MaturityMthsQty', 'DefaultDt', 'DefaultMthsQty',
        'PrepayDt' and 'PrepayMthsQty'.

    Returns
    -------
    pool.Pooler
        Constructed from a dict that maps ``str(GP)`` to the corresponding Loan.
    """
    # Column-oriented lookup: by_column[column_name][gp] -> value for that loan
    by_column = in_df.set_index('GP').to_dict()

    loans_dict = {}
    for gp, note_dt in by_column['NoteDt'].items():
        # The Loan is created with the raw GP value; the dict key is its string form
        loan = pool.Loan(gp, pd.to_datetime(note_dt))
        loan.maturity_dt = by_column['MaturityDt'][gp]
        loan.maturity_mths_qty = by_column['MaturityMthsQty'][gp]
        loan.default_dt = by_column['DefaultDt'][gp]
        loan.default_mths_qty = by_column['DefaultMthsQty'][gp]
        loan.prepay_dt = by_column['PrepayDt'][gp]
        loan.prepay_mths_qty = by_column['PrepayMthsQty'][gp]
        loans_dict[str(gp)] = loan

    return pool.Pooler(loans_dict)

# Pool the sliced loans and have the pooler build its count triangles
my_pooler = create_pooler(data_slice)
my_pooler.build_triangles_counts()

# Re-label each cohort's triangle entries (positions 0, 1, 2) with descriptive keys
pool_dict = {
    cohort: dict(outstanding=triangle[0], prepayments=triangle[1], defaults=triangle[2])
    for cohort, triangle in my_pooler.triangles.items()
}


### 3. Convert Pool dictionaries into Dataframe

In [7]:
# One row per cohort, with columns 'outstanding', 'prepayments' and 'defaults'
df_pool = pd.DataFrame.from_dict(pool_dict, orient='index')
# Cohort labels become floats so that the rows can be ordered numerically
df_pool.index = df_pool.index.map(float)
df_pool = df_pool.sort_index()
# Preview the pooled frame
df_pool.head()

Unnamed: 0,outstanding,prepayments,defaults
2000.01,"[443, 443, 443, 443, 442, 441, 441, 441, 438, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 6, 0, 2, 7, 0, 4, ...","[0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 2, 0, 2, 2, 1, ..."
2000.02,"[329, 329, 329, 329, 329, 328, 328, 327, 326, ...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 3, 0, 0, 2, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 1, 0, 1, ..."
2000.03,"[454, 454, 454, 454, 454, 452, 450, 450, 448, ...","[0, 0, 0, 0, 2, 1, 0, 2, 0, 3, 1, 1, 5, 0, 7, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 3, 0, 1, ..."
2000.04,"[418, 418, 418, 418, 418, 418, 415, 414, 411, ...","[0, 0, 0, 0, 0, 3, 1, 2, 1, 3, 0, 1, 4, 2, 3, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 1, ..."
2000.05,"[476, 476, 476, 476, 475, 475, 475, 474, 473, ...","[0, 0, 0, 1, 0, 0, 0, 1, 2, 0, 6, 0, 6, 0, 2, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, ..."


In [8]:
# Format array lengths: the cohort at row position p keeps its first (n_rows - p)
# entries and is then padded back out with p trailing NaNs.
# (Alternative that was considered: truncate only and skip the NaN padding.)
max_row_length = df_pool.shape[0]
for position, (cohort, cohort_row) in enumerate(df_pool.iterrows()):
    keep = max_row_length - position
    for field in df_pool.columns:
        # Cast to float so that the array can hold NaN
        trimmed = cohort_row[field][:keep].astype(float)
        df_pool.at[cohort, field] = np.pad(
            trimmed, (0, position), mode='constant', constant_values=np.nan
        )


### 4. Group into Annual cohorts and calculate SMM and CPR

In [9]:
# This is where we start abstracting away from the data --> the monthly cohorts are
# rolled up into annual cohorts.
# The cohort index is converted to int, which drops the fractional (month) part.
df_pool['Year'] = df_pool.index.astype(int)

# Add up the arrays of all cohorts that share a year.
# 'sum' is requested by name on purpose: pandas used to swap a np.nansum callable for its
# own group sum behind the scenes, and that substitution is deprecated (FutureWarning
# since pandas 2.1, hidden here by the global warnings filter). Asking for 'sum' directly
# keeps the result the notebook has always produced.
# NOTE(review): the cells are arrays, so they are presumably added element-wise and a NaN
# in any cohort carries into the total for that month -- confirm this is intended.
year_grouped = df_pool.groupby('Year').agg('sum')

# Single monthly mortality: loans leaving the pool (prepaid or defaulted) over loans outstanding
year_grouped['smm'] = (year_grouped['prepayments']+year_grouped['defaults'])/year_grouped['outstanding']
# Annualise the monthly rate: CPR = 1 - (1 - SMM)^12
year_grouped['cpr'] = (1-(1-year_grouped['smm'])**12)

### Reshape the yearly cohort data from Months on Book (MoB) to Years on Book (YoB)  
`[i,...........,n=MoB] ---> applymap(aggregate_method)`  
 
`[[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]...n_years=YoB]`

In [10]:
def aggregate_annual_sums(in_arr):
    """Sum a monthly series in consecutive blocks of 12, skipping NaNs.

    Parameters
    ----------
    in_arr : array-like
        One-dimensional monthly values (ndarray, list, tuple, Series, ...).

    Returns
    -------
    np.ndarray
        One total per complete 12-month block. Trailing months that do not fill a
        whole block are dropped, and a block made up entirely of NaN sums to 0.
    """
    # Coerce first so that inputs without a .reshape method (lists, Series) also work
    monthly = np.asarray(in_arr)
    n_years = len(monthly) // 12
    arr_2d = monthly[:n_years*12].reshape(n_years,12)
    return np.nansum(arr_2d,axis=1)

def aggregate_annual_averages(in_arr):
    """Average a monthly series in consecutive blocks of 12, skipping NaNs.

    Parameters
    ----------
    in_arr : array-like
        One-dimensional monthly values (ndarray, list, tuple, Series, ...).

    Returns
    -------
    np.ndarray
        One mean per complete 12-month block. Trailing months that do not fill a
        whole block are dropped, and a block made up entirely of NaN yields NaN.
    """
    # Coerce first so that inputs without a .reshape method (lists, Series) also work
    monthly = np.asarray(in_arr)
    n_years = len(monthly) // 12
    arr_2d = monthly[:n_years*12].reshape(n_years,12)
    # All-NaN blocks are expected for NaN-padded cohorts; silence numpy's
    # "Mean of empty slice" warning locally instead of relying on a global filter.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=RuntimeWarning)
        return np.nanmean(arr_2d,axis=1)

# Collapse each annual cohort's monthly CPR array into one average per year on book.
# Series.map on the single 'cpr' column replaces DataFrame.applymap, which pandas
# deprecated in 2.1; both apply the function to every cell.
annual_cpr = year_grouped['cpr'].map(aggregate_annual_averages)

# Spread the per-cohort arrays into a table: one row per cohort year, one column per year on book
cpr_heat = pd.DataFrame.from_dict(annual_cpr.to_dict(), orient='index')

### 5. Get Line Plots (or just the data for line plots)

In [11]:
# First attempt --> running sum (NaN counted as 0) divided by the column position.
# The divisor keeps growing across NaN cells, so those cells still receive a value.
cumsum = np.nancumsum(cpr_heat, axis=1)
years_elapsed = np.arange(1, cpr_heat.shape[1] + 1)
lifetime_avg = pd.DataFrame(
    cumsum / years_elapsed,
    index=cpr_heat.index,
    columns=cpr_heat.columns,
)

In [12]:
def create_lifetime_averages(cpr_df:pd.DataFrame)-> pd.DataFrame:
    """Row-wise running average of ``cpr_df`` that leaves NaN cells as NaN.

    For every non-NaN cell the result is the mean of all non-NaN values in that row
    up to and including the cell. Cells that are NaN in the input stay NaN.

    Parameters
    ----------
    cpr_df : pd.DataFrame
        Numeric table with one row per cohort and one column per period.

    Returns
    -------
    pd.DataFrame
        Same index and columns as ``cpr_df``, holding the running averages.
    """
    # Work on a float copy so that np.isnan is valid whatever the column dtypes are
    values = cpr_df.to_numpy(dtype=float)
    observed = ~np.isnan(values)
    # Running total with NaN counted as 0, and running count of real observations
    running_total = np.nancumsum(values, axis=1)
    running_count = observed.cumsum(axis=1)
    # Divide only where a value was observed: the count is at least 1 there, so rows
    # that start with NaN no longer hit 0/0, and skipped cells keep their NaN filler.
    row_avg = np.full(values.shape, np.nan)
    np.divide(running_total, running_count, out=row_avg, where=observed)
    return pd.DataFrame(row_avg, columns=cpr_df.columns, index=cpr_df.index)

# Running (lifetime-to-date) average CPR for every cohort row of cpr_heat
lifetime_cprs = create_lifetime_averages(cpr_heat)

# Display only: NaN cells are shown as blanks; the result is not assigned, so
# lifetime_cprs itself keeps its NaNs
lifetime_cprs.fillna('')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
2000,0.034905,0.072955,0.095927,0.110735,0.124243,0.144174,0.161668,0.166428,0.161164,0.152434,...,0.136539,0.134998,0.132013,0.130625,0.128849,0.129421,0.128487,0.129756,0.130627,0.133767
2001,0.017427,0.04251,0.068683,0.100937,0.134554,0.156467,0.162477,0.158905,0.149772,0.142686,...,0.132019,0.128966,0.12798,0.12669,0.126271,0.127358,0.126252,0.126723,0.129046,
2002,0.016361,0.037624,0.069365,0.135951,0.17511,0.189455,0.180977,0.166923,0.156232,0.147895,...,0.139107,0.136788,0.136694,0.135983,0.136011,0.138485,0.136654,0.138903,,
2003,0.015101,0.044474,0.105771,0.170605,0.186682,0.177787,0.162866,0.153022,0.145525,0.142487,...,0.134394,0.133989,0.133725,0.133722,0.13259,0.131152,0.132127,,,
2004,0.020027,0.064827,0.13097,0.171566,0.164545,0.153026,0.1449,0.138642,0.134615,0.131418,...,0.129835,0.130404,0.130948,0.13214,0.131898,0.13326,,,,
2005,0.019747,0.067812,0.107076,0.119516,0.114168,0.108553,0.1046,0.104327,0.104116,0.10496,...,0.110453,0.111828,0.113236,0.114453,0.118152,,,,,
2006,0.026801,0.063664,0.087736,0.093044,0.094034,0.095111,0.095981,0.096069,0.096079,0.095796,...,0.106976,0.1098,0.110216,0.112415,,,,,,
2007,0.020781,0.05455,0.069776,0.077087,0.083174,0.086977,0.086327,0.088144,0.089517,0.092242,...,0.101438,0.101563,0.106924,,,,,,,
2008,0.020033,0.043336,0.051446,0.058439,0.06627,0.071847,0.078458,0.082477,0.086333,0.091101,...,0.100628,0.105791,,,,,,,,
2009,0.007184,0.018549,0.028984,0.052821,0.067235,0.076442,0.085919,0.094691,0.10394,0.11233,...,0.121527,,,,,,,,,


In [13]:
import plotly.express as px

# Transposed so that each cohort row becomes its own line and the columns run along the x-axis
fig = px.line(
    lifetime_cprs.T,
    line_shape='spline',
    title="Lifetime Average CPR by Year from Origination",
    markers=True,
)
# Label the axes and apply the tick format string to the y-axis
fig.update_layout(
    yaxis_title='CPR',
    xaxis_title='Year from Origination',
    yaxis=dict(tickformat='0.0%'),
)