### Basic imports

In [36]:
import pandas as pd
import sqlalchemy as sa
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os
from dotenv import load_dotenv
load_dotenv()

engine = sa.create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}"
    .format(host=os.getenv('host'), db=os.getenv('db'), user=os.getenv('uname'), pw=os.getenv('password')))


### 1. Read in Data from csv --> could also use pd.read_sql and use .env to load auth info

In [104]:
loan_data = pd.read_csv("raw_data/loans_v2.csv")
# format date columns to datetime data types
date_cols = [c for c in loan_data.columns if str(c)[-2:]=='Dt']
for col in date_cols:
    loan_data[col] = pd.to_datetime(loan_data[col])


In [115]:
test_slice = loan_data[loan_data['MatBucket']=='21+']
test_slice = test_slice[test_slice['Note_Yr']<=2021]

In [144]:
test_slice[(test_slice['Note_Yr']==2010)&((test_slice['LoanAmt']>2000000))]

Unnamed: 0,GP,Note_Yr,Note_Mth,NoteDt,LoanAmt,MaturityDt,MaturityMthsQty,PrepayDt,PrepayMthsQty,DefaultDt,DefaultMthsQty,PP_qty,MatBucket,Code,CodeText,Margin,MarginBucket,state_abbreviation,state_name,yr
31286,4605642,2010,12,2010-12-03,4065000.0,2035-12-03,300,2013-07-31,31.0,NaT,,31.0,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0275,0.025 - 0.0275,IL,Illinois,2010.0
49747,4601691,2010,11,2010-11-08,4300000.0,2035-11-08,300,2014-02-28,39.0,NaT,,39.0,21+,337121.0,Upholstered Household Furniture Manufacturing,0.0275,0.025 - 0.0275,NC,North Carolina,2010.0
49748,4601938,2010,12,2010-12-01,3000000.0,2035-12-01,300,NaT,,NaT,,,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0000,,MO,Missouri,2010.0
66011,4606559,2010,12,2010-12-09,4000000.0,2035-12-09,300,2013-12-31,36.0,NaT,,36.0,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0175,0.015 - 0.0175,CA,California,2010.0
67025,4603949,2010,11,2010-11-24,4000000.0,2033-11-24,276,2014-11-30,48.0,NaT,,48.0,21+,721211.0,RV (Recreational Vehicle) Parks and Campgrounds,0.0175,0.015 - 0.0175,MS,Mississippi,2010.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
719090,4607449,2010,12,2010-12-10,2840000.0,2035-12-10,300,NaT,,NaT,,,21+,424990.0,Other Miscellaneous Nondurable Goods Merchant ...,0.0275,0.025 - 0.0275,AZ,Arizona,2010.0
719092,4607532,2010,12,2010-12-13,4200000.0,2035-12-13,300,2015-09-30,57.0,NaT,,57.0,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0225,0.02 - 0.0225,AZ,Arizona,2010.0
724971,4607138,2010,12,2010-12-07,3000000.0,2035-12-07,300,2016-11-30,71.0,NaT,,71.0,21+,441120.0,Used Car Dealers,0.0200,0.0175 - 0.02,CA,California,2010.0
736439,4604702,2010,11,2010-11-15,2300000.0,2035-11-15,300,2014-04-30,41.0,NaT,,41.0,21+,315233.0,Women's and Girls' Cut and Sew Dress Manufactu...,0.0200,0.0175 - 0.02,CA,California,2010.0


In [146]:
test_slice[(test_slice['LoanAmt']<=2_000_000)]

Unnamed: 0,GP,Note_Yr,Note_Mth,NoteDt,LoanAmt,MaturityDt,MaturityMthsQty,PrepayDt,PrepayMthsQty,DefaultDt,DefaultMthsQty,PP_qty,MatBucket,Code,CodeText,Margin,MarginBucket,state_abbreviation,state_name,yr
190,4435462,2008,4,2008-04-30,378000.00,2031-03-30,275,2019-12-11,139.0,2019-08-23,135.0,135.0,21+,811198.0,All Other Automotive Repair and Maintenance,0.02000,0.0175 - 0.02,GA,Georgia,2008.0
221,1483193,2000,2,2000-02-29,1000000.00,2025-01-27,298,2002-10-22,31.0,2000-12-20,9.0,9.0,21+,722511.0,Full-Service Restaurants,0.01750,0.015 - 0.0175,IN,Indiana,2000.0
225,887167,2000,8,2000-08-30,1108000.00,2024-11-30,291,2002-07-28,22.0,NaT,,22.0,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.00175,0.0 - 0.01,NM,New Mexico,2000.0
228,1477340,2000,1,2000-01-31,1500000.00,2022-01-31,264,2009-05-27,111.0,2000-09-22,7.0,7.0,21+,441222.0,Boat Dealers,0.01250,0.01 - 0.0125,NY,New York,2000.0
235,1478838,2000,3,2000-03-31,1000000.00,2025-03-31,300,2007-07-16,87.0,2001-01-18,9.0,9.0,21+,457120.0,Other Gasoline Stations,0.02750,0.025 - 0.0275,TX,Texas,2000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820395,17980407,2021,9,2021-09-30,1700000.00,2046-09-17,299,NaT,,NaT,,,21+,,,0.01500,0.0125 - 0.015,NM,New Mexico,
820396,17982124,2021,9,2021-09-30,215000.00,2046-09-21,299,NaT,,NaT,,,21+,,,0.02750,0.025 - 0.0275,AZ,Arizona,
820404,17985370,2021,9,2021-09-30,499948.12,2046-09-27,299,NaT,,NaT,,,21+,,,0.02500,0.0225 - 0.025,TX,Texas,
820422,17992033,2021,12,2021-12-16,1007700.00,2046-12-09,299,NaT,,NaT,,,21+,,,0.01600,0.015 - 0.0175,OH,Ohio,


In [140]:
test_slice[(test_slice['LoanAmt']>2_000_000)&(test_slice['Note_Yr']>2010)]

Unnamed: 0,GP,Note_Yr,Note_Mth,NoteDt,LoanAmt,MaturityDt,MaturityMthsQty,PrepayDt,PrepayMthsQty,DefaultDt,DefaultMthsQty,PP_qty,MatBucket,Code,CodeText,Margin,MarginBucket,state_abbreviation,state_name,yr
2422,4648007,2012,5,2012-05-01,3494000.0,2036-09-29,292,2018-02-28,69.0,2014-11-30,30.0,30.0,21+,445110.0,Supermarkets and Other Grocery Retailers (exce...,0.0275,0.025 - 0.0275,TN,Tennessee,2012.0
3461,4634362,2011,8,2011-08-01,2634600.0,2036-08-01,300,2021-12-31,124.0,NaT,,124.0,21+,424480.0,Fresh Fruit and Vegetable Merchant Wholesalers,0.0000,,AZ,Arizona,2011.0
3609,4676072,2012,6,2012-06-01,2726895.0,2037-06-01,300,2018-02-28,68.0,NaT,,68.0,21+,531130.0,Lessors of Miniwarehouses and Self-Storage Units,0.0175,0.015 - 0.0175,OK,Oklahoma,2012.0
3670,4691573,2012,10,2012-10-01,2324271.0,2037-10-01,300,2018-06-30,68.0,NaT,,68.0,21+,524210.0,Insurance Agencies and Brokerages,0.0000,,MN,Minnesota,2012.0
3676,4692126,2013,9,2013-09-01,2534000.0,2039-09-01,312,2015-06-30,21.0,NaT,,21.0,21+,621210.0,Offices of Dentists,0.0175,0.015 - 0.0175,CA,California,2013.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820228,4749044,2014,1,2014-01-02,2867000.0,2039-01-05,300,NaT,,NaT,,,21+,,,0.0000,,KY,Kentucky,
820253,4885214,2016,1,2016-01-01,3050000.0,2042-01-01,312,2021-07-31,66.0,NaT,,66.0,21+,,,0.0100,0.0 - 0.01,CO,Colorado,
820260,4900916,2016,5,2016-05-01,2095000.0,2042-05-31,312,NaT,,NaT,,,21+,,,0.0275,0.025 - 0.0275,OH,Ohio,
820386,17973344,2021,8,2021-08-31,3570000.0,2046-08-23,299,NaT,,NaT,,,21+,,,0.0275,0.025 - 0.0275,NV,Nevada,


In [132]:
test_slice[(~test_slice['LoanAmt'].isna())&(test_slice['LoanAmt']>2_000_000)&(test_slice['Note_Yr']!=2010)]

Unnamed: 0,GP,Note_Yr,Note_Mth,NoteDt,LoanAmt,MaturityDt,MaturityMthsQty,PrepayDt,PrepayMthsQty,DefaultDt,DefaultMthsQty,PP_qty,MatBucket,Code,CodeText,Margin,MarginBucket,state_abbreviation,state_name,yr
2422,4648007,2012,5,2012-05-01,3494000.0,2036-09-29,292,2018-02-28,69.0,2014-11-30,30.0,30.0,21+,445110.0,Supermarkets and Other Grocery Retailers (exce...,0.0275,0.025 - 0.0275,TN,Tennessee,2012.0
3461,4634362,2011,8,2011-08-01,2634600.0,2036-08-01,300,2021-12-31,124.0,NaT,,124.0,21+,424480.0,Fresh Fruit and Vegetable Merchant Wholesalers,0.0000,,AZ,Arizona,2011.0
3609,4676072,2012,6,2012-06-01,2726895.0,2037-06-01,300,2018-02-28,68.0,NaT,,68.0,21+,531130.0,Lessors of Miniwarehouses and Self-Storage Units,0.0175,0.015 - 0.0175,OK,Oklahoma,2012.0
3670,4691573,2012,10,2012-10-01,2324271.0,2037-10-01,300,2018-06-30,68.0,NaT,,68.0,21+,524210.0,Insurance Agencies and Brokerages,0.0000,,MN,Minnesota,2012.0
3676,4692126,2013,9,2013-09-01,2534000.0,2039-09-01,312,2015-06-30,21.0,NaT,,21.0,21+,621210.0,Offices of Dentists,0.0175,0.015 - 0.0175,CA,California,2013.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820228,4749044,2014,1,2014-01-02,2867000.0,2039-01-05,300,NaT,,NaT,,,21+,,,0.0000,,KY,Kentucky,
820253,4885214,2016,1,2016-01-01,3050000.0,2042-01-01,312,2021-07-31,66.0,NaT,,66.0,21+,,,0.0100,0.0 - 0.01,CO,Colorado,
820260,4900916,2016,5,2016-05-01,2095000.0,2042-05-31,312,NaT,,NaT,,,21+,,,0.0275,0.025 - 0.0275,OH,Ohio,
820386,17973344,2021,8,2021-08-31,3570000.0,2046-08-23,299,NaT,,NaT,,,21+,,,0.0275,0.025 - 0.0275,NV,Nevada,


In [39]:
# Slice the data frame to only 21+ year maturity loans
data_slice = loan_data[loan_data['MatBucket']=='21+']

### 2. Use pool.py to organize cohorts --> loans are split into yyyy.mm cohorts

In [62]:
from pooler import pool

# Define a function to create static pools of Loans from the DataFrame
def create_pooler(in_df:pd.DataFrame)-> pool.Pooler:
    temp = in_df.set_index('GP')
    temp = temp.to_dict()
    loans_dict = {}
    for gp in temp['NoteDt'].keys():
        loans_dict[str(gp)] = pool.Loan(gp, pd.to_datetime(temp['NoteDt'][gp]))
        loans_dict[str(gp)].maturity_dt = temp['MaturityDt'][gp]
        loans_dict[str(gp)].maturity_mths_qty = temp['MaturityMthsQty'][gp]
        loans_dict[str(gp)].default_dt = temp['DefaultDt'][gp]
        loans_dict[str(gp)].default_mths_qty = temp['DefaultMthsQty'][gp]
        loans_dict[str(gp)].prepay_dt = temp['PrepayDt'][gp]
        loans_dict[str(gp)].prepay_mths_qty = temp['PrepayMthsQty'][gp]

    return pool.Pooler(loans_dict)

data_slice = loan_data[loan_data['MatBucket']=='21+']
data_slice = data_slice[(data_slice['LoanAmt']>0)&(data_slice['LoanAmt']<=500_000)]
my_pooler = create_pooler(data_slice)
my_pooler.build_triangles_counts()

pool_dict = {}
for k, v in my_pooler.triangles.items():
    pool_dict[k] = dict(outstanding=v[0], prepayments=v[1], defaults=v[2])


### 3. Convert Pool dictionaries into Dataframe

In [88]:
# Create Pool Dataframe
df_pool = pd.DataFrame.from_dict( pool_dict, orient='index')
df_pool.index = [float(e) for e in df_pool.index.to_list()]
df_pool = df_pool.sort_index()
# Show the dataframe here
df_pool.head()

Unnamed: 0,outstanding,prepayments,defaults
2000.01,"[246, 246, 246, 246, 245, 244, 244, 244, 243, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0, 1, 4, 0, 1, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, ..."
2000.02,"[195, 195, 195, 195, 195, 195, 195, 194, 194, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, ..."
2000.03,"[241, 241, 241, 241, 241, 241, 240, 240, 238, ...","[0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 1, 1, 2, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."
2000.04,"[249, 249, 249, 249, 249, 249, 246, 246, 245, ...","[0, 0, 0, 0, 0, 3, 0, 1, 1, 1, 0, 1, 2, 2, 2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ..."
2000.05,"[268, 268, 268, 268, 268, 268, 268, 268, 267, ...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 5, 0, 4, 0, 2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, ..."


In [103]:
test1 = df_pool.copy(deep=True)
test1['yr'] = test1.index.astype(int)
test1 = pd.DataFrame(test1[test1['yr']==2021]['prepayments'].to_dict()).transpose()
test1[test1.columns[0:12]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
2021.01,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
2021.03,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0
2021.04,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2021.05,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,5.0,0.0
2021.06,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,2.0,0.0
2021.07,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2021.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2021.09,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2021.1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [91]:
# Format array lengths
max_row_length = df_pool.shape[0]
count = -1
for i, row in df_pool.iterrows():
    count+=1
    for col in df_pool.columns:
        # ----------------------------------------------------------------------------------------------
        #Method 1: This line will simply truncate the array to the right length:
        # ----------------------------------------------------------------------------------------------
        # df_pool.at[i,col] = row[col][:(max_row_length-count)]
        # ----------------------------------------------------------------------------------------------
        #Method 2: Pad arrays with NaNs --> this will truncate the array then fill it back in with NaNs
        # ----------------------------------------------------------------------------------------------
        arr = row[col][:(max_row_length-count)].astype(float)
        padded_arr = np.pad(arr, (0, max_row_length - (max_row_length-count) ), mode='constant', constant_values=np.nan)
        df_pool.at[i,col] = padded_arr


### 4. Group into Annual cohorts and calculate SMM and CPR

### Reshape the Yearly cohorts data from Months on Book to Year on Book  
`[i,...........,n=MoB] ---> applymap(aggregate_method)`  
 
`[[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]...n_years=YoB]`

In [92]:

# First try here
# def aggregate_annual_sums(in_arr):
#     n_years = len(in_arr) // 12
#     arr_2d = in_arr[:n_years*12].reshape(n_years,12)
#     return np.nansum(arr_2d, axis=1)

def outstanding_annual_rundown(in_arr):
    return in_arr[::11]

def aggregate_annual_sums(in_arr):
    remainder = len(in_arr) % 11
    if remainder != 0:
        padding = np.zeros(11 - remainder)
        subsections = np.concatenate([in_arr, padding])
    else:
        subsections = in_arr
    subsections = np.split(subsections, len(subsections) // 11)
    return (np.sum(subsections, axis=1))

def aggregate_annual_averages(in_arr):
    n_years = len(in_arr) // 11
    arr_2d = in_arr[:n_years*11].reshape(n_years,11)
    return np.nanmean(arr_2d, axis=1)

def aggregate_annual_median(in_arr):
    n_years = len(in_arr) // 11
    arr_2d = in_arr[:n_years*11].reshape(n_years,11)
    return np.nanmedian(arr_2d, axis=1)




In [97]:
# This is where we start abstracting away from the Data --> you're now looking at Annual Cohorts grouped together
df_pool['Year'] = df_pool.index.astype(int)
# Filter out years without 12 months of history
vals = df_pool['Year'].value_counts().to_dict()

yr_range = []
for k, v in vals.items():
    if v == 12:
        yr_range.append(k)

df_pool = df_pool[df_pool['Year'].isin(yr_range)]

# Switch to year group
year_grouped = df_pool.groupby('Year')
year_grouped = year_grouped.agg(np.nansum)


In [102]:
def aggregate_annual_sums(in_arr):
    remainder = len(in_arr) % 11
    if remainder != 0:
        padding = np.zeros(11 - remainder)
        subsections = np.concatenate([in_arr, padding])
    else:
        subsections = in_arr
    subsections = np.split(subsections, len(subsections) // 11)
    return (np.nansum(subsections, axis=1))


year_grouped['prepayments'].apply(aggregate_annual_sums)

Year
2000    [67.0, 201.0, 239.0, 244.0, 239.0, 299.0, 257....
2001    [17.0, 95.0, 199.0, 279.0, 304.0, 302.0, 176.0...
2002    [25.0, 108.0, 234.0, 554.0, 433.0, 300.0, 107....
2003    [29.0, 138.0, 432.0, 550.0, 296.0, 138.0, 31.0...
2004    [40.0, 194.0, 402.0, 383.0, 124.0, 40.0, 26.0,...
2005    [41.0, 165.0, 198.0, 137.0, 41.0, 23.0, 28.0, ...
2006    [23.0, 71.0, 73.0, 30.0, 11.0, 27.0, 29.0, 51....
2007    [8.0, 19.0, 20.0, 28.0, 23.0, 40.0, 27.0, 30.0...
2008    [7.0, 6.0, 14.0, 19.0, 40.0, 51.0, 57.0, 56.0,...
2009    [4.0, 7.0, 20.0, 84.0, 66.0, 76.0, 84.0, 91.0,...
2010    [4.0, 22.0, 60.0, 109.0, 132.0, 113.0, 123.0, ...
2011    [10.0, 28.0, 69.0, 119.0, 134.0, 123.0, 117.0,...
2012    [10.0, 32.0, 68.0, 134.0, 116.0, 137.0, 130.0,...
2013    [15.0, 38.0, 95.0, 139.0, 151.0, 143.0, 114.0,...
2014    [1.0, 35.0, 89.0, 194.0, 182.0, 160.0, 117.0, ...
2015    [14.0, 73.0, 143.0, 189.0, 201.0, 156.0, 169.0...
2016    [16.0, 66.0, 122.0, 194.0, 160.0, 181.0, 86.0,...
2017    [

In [95]:

year_grouped['outstanding'] = year_grouped['outstanding'].apply(outstanding_annual_rundown)
year_grouped[['prepayments','defaults']] = year_grouped[['prepayments','defaults']].applymap(aggregate_annual_sums)
year_grouped['cpr'] = (year_grouped['prepayments']+year_grouped['defaults'])/year_grouped['outstanding']
# year_grouped['cpr'] = (1-(1-year_grouped['smm'])**12)
cpr_heat = pd.DataFrame.from_dict(year_grouped['cpr'].to_dict(), orient='index')

In [96]:
year_grouped

Unnamed: 0_level_0,outstanding,prepayments,defaults,cpr
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000,"[2740.0, 2657.0, 2414.0, 2127.0, 1855.0, 1597....","[67.0, 201.0, 239.0, 244.0, 239.0, 299.0, 257....","[16.0, 42.0, 48.0, 28.0, 19.0, 20.0, 17.0, 13....","[0.030291970802919708, 0.0914565299209635, 0.1..."
2001,"[2353.0, 2321.0, 2183.0, 1944.0, 1637.0, 1323....","[17.0, 95.0, 199.0, 279.0, 304.0, 302.0, 176.0...","[15.0, 43.0, 40.0, 28.0, 10.0, 22.0, 13.0, 19....","[0.013599660008499787, 0.05945713054717794, 0...."
2002,"[2607.0, 2576.0, 2444.0, 2186.0, 1610.0, 1165....","[25.0, 108.0, 234.0, 554.0, 433.0, 300.0, 107....","[6.0, 24.0, 24.0, 22.0, 12.0, 18.0, 20.0, 21.0...","[0.011891062523973917, 0.05124223602484472, 0...."
2003,"[2585.0, 2551.0, 2391.0, 1922.0, 1343.0, 1029....","[29.0, 138.0, 432.0, 550.0, 296.0, 138.0, 31.0...","[5.0, 22.0, 37.0, 29.0, 18.0, 40.0, 28.0, 28.0...","[0.013152804642166345, 0.06272050176401411, 0...."
2004,"[2303.0, 2258.0, 2048.0, 1598.0, 1178.0, 1011....","[40.0, 194.0, 402.0, 383.0, 124.0, 40.0, 26.0,...","[5.0, 16.0, 48.0, 37.0, 43.0, 45.0, 33.0, 34.0...","[0.019539730785931395, 0.09300265721877768, 0...."
2005,"[1942.0, 1896.0, 1698.0, 1452.0, 1249.0, 1128....","[41.0, 165.0, 198.0, 137.0, 41.0, 23.0, 28.0, ...","[5.0, 33.0, 48.0, 66.0, 80.0, 50.0, 38.0, 39.0...","[0.02368692070030896, 0.10443037974683544, 0.1..."
2006,"[1412.0, 1375.0, 1265.0, 1121.0, 1003.0, 925.0...","[23.0, 71.0, 73.0, 30.0, 11.0, 27.0, 29.0, 51....","[14.0, 39.0, 71.0, 88.0, 67.0, 54.0, 33.0, 18....","[0.026203966005665724, 0.08, 0.113833992094861..."
2007,"[1127.0, 1109.0, 1028.0, 948.0, 856.0, 785.0, ...","[8.0, 19.0, 20.0, 28.0, 23.0, 40.0, 27.0, 30.0...","[10.0, 62.0, 60.0, 64.0, 48.0, 40.0, 29.0, 10....","[0.015971606033717833, 0.07303877366997295, 0...."
2008,"[1132.0, 1115.0, 1058.0, 999.0, 940.0, 862.0, ...","[7.0, 6.0, 14.0, 19.0, 40.0, 51.0, 57.0, 56.0,...","[10.0, 51.0, 45.0, 40.0, 38.0, 25.0, 16.0, 15....","[0.015017667844522967, 0.051121076233183856, 0..."
2009,"[1195.0, 1190.0, 1169.0, 1121.0, 1014.0, 924.0...","[4.0, 7.0, 20.0, 84.0, 66.0, 76.0, 84.0, 91.0,...","[1.0, 14.0, 28.0, 23.0, 24.0, 13.0, 12.0, 16.0...","[0.0041841004184100415, 0.01764705882352941, 0..."


In [67]:
def enforce_shape(in_df:pd.DataFrame)->pd.DataFrame:
    data = in_df.copy(deep=True)
    last_year = data.index.max()+1
    for i in range(len(data)):
        max_col = last_year - data.index[i]
        data.iloc[i,(max_col):] = np.NaN
    return data

enforce_shape(cpr_heat)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
2000,0.030292,0.091457,0.11889,0.12788,0.139084,0.19975,0.214397,0.218127,0.141401,0.0727,...,0.079518,0.089005,0.091954,0.126582,0.097826,0.128514,0.156682,,,
2001,0.0136,0.059457,0.109482,0.157922,0.191814,0.244898,0.189189,0.159259,0.091043,0.059774,...,0.101299,0.109827,0.090909,0.110714,0.108434,0.099099,,,,
2002,0.011891,0.051242,0.105565,0.263495,0.276398,0.272961,0.149941,0.081944,0.043873,0.06962,...,0.104,0.145833,0.101045,0.139535,0.148649,,,,,
2003,0.013153,0.062721,0.196152,0.301249,0.233805,0.172983,0.06933,0.065657,0.067568,0.072464,...,0.114883,0.115044,0.12,0.106061,,,,,,
2004,0.01954,0.093003,0.219727,0.262829,0.141766,0.084075,0.063715,0.081892,0.089196,0.088276,...,0.099217,0.162319,0.107266,,,,,,,
2005,0.023687,0.10443,0.144876,0.139807,0.096878,0.064716,0.062559,0.072801,0.112323,0.084767,...,0.143852,0.094851,,,,,,,,
2006,0.026204,0.08,0.113834,0.105263,0.077767,0.087568,0.07346,0.088235,0.072931,0.086233,...,0.113095,,,,,,,,,
2007,0.015972,0.073039,0.077821,0.097046,0.082944,0.101911,0.079433,0.061633,0.083744,0.089606,...,,,,,,,,,,
2008,0.015018,0.051121,0.055766,0.059059,0.082979,0.088167,0.092875,0.099579,0.08567,0.088586,...,,,,,,,,,,
2009,0.004184,0.017647,0.041061,0.09545,0.088757,0.09632,0.11497,0.14479,0.131329,0.136612,...,,,,,,,,,,


In [30]:
pd.DataFrame.from_dict({'max': cpr_heat.max(axis=0), 'median': cpr_heat.median(axis=0), 'avg': cpr_heat.mean(axis=0), 'min': cpr_heat.min(axis=0)}).transpose().to_csv('811_mids.csv')


### 5. Create basic triangles

In [31]:
prepays = year_grouped[['prepayments','defaults']].applymap(aggregate_annual_sums).to_dict()
totals = year_grouped[['outstanding']].applymap(outstanding_annual_rundown).to_dict()
totals = totals['outstanding']
totals = (pd.DataFrame.from_dict(totals, orient='index'))
triangles = [totals]
triangles += [pd.DataFrame.from_dict(prepays[k], orient='index') for k in prepays.keys()]

# pd.concat(triangles, axis=0).to_csv('test.csv')

In [32]:
pd.concat(triangles,axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
2000,4939.0,4784.0,4306.0,3765.0,3224.0,2745.0,2159.0,1655.0,1259.0,1059.0,...,630.0,574.0,519.0,466.0,406.0,364.0,318.0,274.0,236.0,
2001,4224.0,4163.0,3922.0,3492.0,2926.0,2279.0,1664.0,1317.0,1081.0,972.0,...,581.0,521.0,467.0,417.0,366.0,320.0,289.0,249.0,,
2002,4929.0,4862.0,4607.0,4107.0,2967.0,2051.0,1429.0,1185.0,1076.0,1006.0,...,570.0,506.0,437.0,388.0,330.0,277.0,255.0,,,
2003,4882.0,4816.0,4518.0,3644.0,2465.0,1800.0,1469.0,1346.0,1253.0,1147.0,...,610.0,533.0,467.0,413.0,372.0,325.0,,,,
2004,4815.0,4724.0,4278.0,3279.0,2350.0,1957.0,1766.0,1613.0,1479.0,1339.0,...,662.0,575.0,490.0,439.0,379.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017,21.0,58.0,103.0,51.0,39.0,,,,,,...,,,,,,,,,,
2018,13.0,53.0,38.0,36.0,,,,,,,...,,,,,,,,,,
2019,4.0,13.0,29.0,,,,,,,,...,,,,,,,,,,
2020,1.0,11.0,,,,,,,,,...,,,,,,,,,,


### 6. Get Line Plots (or just the data for line plots)

In [33]:
def generate_lifetime(cpr_df:pd.DataFrame)-> pd.DataFrame:
    # get the cumulative sum of each row WHILE ignoring NaN values (otherwise the denominator is off)
    cumulative_sum = np.nancumsum(cpr_df.values, axis=1)
    # compute the number of non-NaN values in each row
    num_non_nan = (~np.isnan(cpr_df.values)).cumsum(axis=1)
    # get ROW-WISE average up until the first NaN value is encountered
    row_avg = np.where(np.isnan(cpr_df), np.nan, cumulative_sum / num_non_nan)
    # create new dataframe with row-wise averages
    lifetime_df = pd.DataFrame(row_avg, columns=cpr_df.columns, index=cpr_df.index)
    return lifetime_df

lifetime_cprs = generate_lifetime(cpr_heat)

lifetime_cprs.fillna('')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
2000,0.028908,0.067926,0.095588,0.109065,0.119703,0.135512,0.152552,0.165773,0.166687,0.159305,...,0.133122,0.131684,0.130753,0.130462,0.12858,0.128371,0.128942,0.128907,0.129513,
2001,0.011315,0.036692,0.065382,0.094292,0.12091,0.152553,0.161985,0.166114,0.159212,0.151749,...,0.133065,0.130472,0.129956,0.129552,0.13082,0.12836,0.129758,0.130564,,
2002,0.01456,0.032773,0.065331,0.121369,0.170796,0.197208,0.197018,0.185433,0.173238,0.16325,...,0.141801,0.142,0.140098,0.140672,0.143116,0.140337,0.14496,,,
2003,0.014679,0.042629,0.10217,0.16289,0.186757,0.186395,0.171398,0.158603,0.150871,0.143025,...,0.133326,0.132512,0.132195,0.131106,0.130857,0.130011,,,,
2004,0.017335,0.060835,0.138176,0.182227,0.181021,0.166228,0.156359,0.147618,0.14191,0.138658,...,0.13095,0.132926,0.129882,0.130437,0.131372,,,,,
2005,0.015571,0.065601,0.095944,0.110743,0.109712,0.105464,0.101661,0.099709,0.101247,0.100301,...,0.110234,0.10961,0.112706,0.116785,,,,,,
2006,0.016346,0.059518,0.083044,0.091608,0.093937,0.094837,0.096415,0.098166,0.096982,0.095589,...,0.10784,0.109396,0.11035,,,,,,,
2007,0.015645,0.049623,0.065859,0.076817,0.080279,0.086406,0.086577,0.085639,0.086938,0.087446,...,0.105726,0.108164,,,,,,,,
2008,0.007463,0.033527,0.047035,0.052094,0.060996,0.067995,0.073182,0.079423,0.08271,0.084806,...,0.104132,,,,,,,,,
2009,0.007841,0.015908,0.025247,0.051343,0.066114,0.074524,0.080168,0.089195,0.095134,0.103715,...,,,,,,,,,,


In [34]:
import plotly.express as px

fig = px.line(lifetime_cprs.transpose(), line_shape= 'spline', title="Lifetime Average CPR by Year from Origination", markers=True)
fig.update_layout(yaxis_title='CPR', xaxis_title= 'Year from Origination', yaxis=dict(tickformat='0.0%'))

In [17]:
loan_data['MarginBucket'].value_counts().index.to_list()

['0.025 - 0.0275',
 '0.0275+',
 '0.0175 - 0.02',
 '0.0 - 0.01',
 '0.02 - 0.0225',
 '0.0225 - 0.025',
 '0.0125 - 0.015',
 '0.015 - 0.0175',
 '0.01 - 0.0125']

In [18]:
def subset_dataframe(df, conditions):
    mask = pd.Series(True, index=df.index)
    for col, cond in conditions.items():
        mask &= df[col].isin(cond)
    return df[mask]


In [114]:
# Define a sample dataframe
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'Dave'],
    'age': [25, 30, 35, 40],
    'gender': ['F', 'M', 'M', 'M']
}
df = pd.DataFrame(data)

# Subset the dataframe based on multiple conditions
subset = subset_dataframe(df, {'age': [25, 30], 'gender': ['F']})
print(subset)


    name  age gender
0  Alice   25      F
