### Basic imports

In [19]:
import pandas as pd
import sqlalchemy as sa
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os
from dotenv import load_dotenv
load_dotenv()

# Build the MySQL (pymysql driver) engine from environment variables; load_dotenv() above
# has already been called so values from a local .env file are visible to os.getenv.
# NOTE(review): the values are interpolated verbatim into the URL - a password containing
# URL-reserved characters (e.g. '@' or '/') would presumably need escaping; confirm.
engine = sa.create_engine(
    "mysql+pymysql://{user}:{pw}@{host}/{db}".format(
        user=os.getenv('uname'),
        pw=os.getenv('password'),
        host=os.getenv('host'),
        db=os.getenv('db'),
    )
)


### 1. Read in data from CSV (alternatively, use `pd.read_sql` with credentials loaded from `.env`)

In [20]:
# Load the master loan tape from disk
loan_data = pd.read_csv("raw_data/master_loan_tape.csv")

# Every column whose name ends in 'Dt' is parsed into datetime values
date_cols = [name for name in loan_data.columns if str(name).endswith('Dt')]
for date_col in date_cols:
    loan_data[date_col] = pd.to_datetime(loan_data[date_col])


In [21]:
# Count how many rows of the loan tape fall into each margin bucket
loan_data['MarginBucket'].value_counts()

0.025 - 0.0275    150217
0.0275+           116308
0.0175 - 0.02      84478
0.0 - 0.01         72243
0.02 - 0.0225      59692
0.0225 - 0.025     49980
0.0125 - 0.015     42966
0.015 - 0.0175     31516
0.01 - 0.0125      17405
Name: MarginBucket, dtype: int64

In [22]:
# Restrict the analysis to loans whose maturity bucket is '21+'
is_long_maturity = loan_data['MatBucket'].eq('21+')
data_slice = loan_data[is_long_maturity]

### 2. Use pool.py to organize cohorts --> loans are split into yyyy.mm cohorts

In [23]:
from pooler import pool

# Define a function to create static pools of Loans from the DataFrame
def create_pooler(in_df:pd.DataFrame)-> pool.Pooler:
    """Build a ``pool.Pooler`` from a loan-level DataFrame.

    The frame is indexed by its 'GP' column and converted to a
    ``{column: {gp: value}}`` dictionary. One ``pool.Loan`` is created per GP
    (constructed from the GP and its 'NoteDt' parsed as a datetime), and the
    maturity / default / prepay dates and month counts are copied onto it.

    Parameters
    ----------
    in_df : pd.DataFrame
        Must contain the columns 'GP', 'NoteDt', 'MaturityDt',
        'MaturityMthsQty', 'DefaultDt', 'DefaultMthsQty', 'PrepayDt' and
        'PrepayMthsQty'.

    Returns
    -------
    pool.Pooler
        Pooler constructed from a dict of loans keyed by ``str(GP)``.
    """
    by_column = in_df.set_index('GP').to_dict()
    loans_dict = {}
    for gp in by_column['NoteDt']:
        loan = pool.Loan(gp, pd.to_datetime(by_column['NoteDt'][gp]))
        # Copy the remaining loan attributes straight from the frame
        loan.maturity_dt = by_column['MaturityDt'][gp]
        loan.maturity_mths_qty = by_column['MaturityMthsQty'][gp]
        loan.default_dt = by_column['DefaultDt'][gp]
        loan.default_mths_qty = by_column['DefaultMthsQty'][gp]
        loan.prepay_dt = by_column['PrepayDt'][gp]
        loan.prepay_mths_qty = by_column['PrepayMthsQty'][gp]
        loans_dict[str(gp)] = loan

    return pool.Pooler(loans_dict)

# Pool the '21+' maturity bucket (optionally narrow further by margin bucket)
data_slice = loan_data[loan_data['MatBucket']=='21+']
# data_slice = data_slice[data_slice['MarginBucket']=='0.01 - 0.0125']
my_pooler = create_pooler(data_slice)
my_pooler.build_triangles_counts()

# Each triangle entry is indexed positionally: 0 = outstanding, 1 = prepayments, 2 = defaults
pool_dict = {
    cohort: dict(outstanding=triangle[0], prepayments=triangle[1], defaults=triangle[2])
    for cohort, triangle in my_pooler.triangles.items()
}


### 3. Convert Pool dictionaries into Dataframe

In [24]:
# One row per cohort, one column per triangle (outstanding / prepayments / defaults)
df_pool = pd.DataFrame.from_dict(pool_dict, orient='index')
# Cohort labels become floats so the rows can be ordered numerically
df_pool.index = df_pool.index.map(float)
df_pool = df_pool.sort_index()
# Preview the first rows
df_pool.head()

Unnamed: 0,outstanding,prepayments,defaults
2000.01,"[443, 443, 443, 443, 442, 441, 441, 441, 438, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 6, 0, 2, 7, 0, 4, ...","[0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 2, 0, 2, 2, 1, ..."
2000.02,"[329, 329, 329, 329, 329, 328, 328, 327, 326, ...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 3, 0, 0, 2, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 1, 0, 1, ..."
2000.03,"[454, 454, 454, 454, 454, 452, 450, 450, 448, ...","[0, 0, 0, 0, 2, 1, 0, 2, 0, 3, 1, 1, 5, 0, 7, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 3, 0, 1, ..."
2000.04,"[418, 418, 418, 418, 418, 418, 415, 414, 411, ...","[0, 0, 0, 0, 0, 3, 1, 2, 1, 3, 0, 1, 4, 2, 3, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 1, ..."
2000.05,"[476, 476, 476, 476, 475, 475, 475, 474, 473, ...","[0, 0, 0, 1, 0, 0, 0, 1, 2, 0, 6, 0, 6, 0, 2, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, ..."


In [25]:
# Format array lengths: the row at position `count` keeps its first
# (max_row_length - count) entries and is then NaN-padded by `count` entries.
max_row_length = df_pool.shape[0]
for count, (cohort, row) in enumerate(df_pool.iterrows()):
    keep = max_row_length - count
    for col in df_pool.columns:
        # Alternative (truncate only, no padding):
        #   df_pool.at[cohort, col] = row[col][:keep]
        trimmed = row[col][:keep].astype(float)
        # Pad on the right only; the pad width equals the number of entries cut off above
        df_pool.at[cohort, col] = np.pad(trimmed, (0, count), mode='constant', constant_values=np.nan)


### 4. Group into Annual cohorts and calculate SMM and CPR

In [26]:
# From here on the analysis works on annual cohorts: the integer part of the
# cohort label is taken as the cohort year.
df_pool['Year'] = df_pool.index.astype(int)

# Keep only the years for which exactly 12 cohort rows are present
cohorts_per_year = df_pool['Year'].value_counts().to_dict()
yr_range = [year for year, n_cohorts in cohorts_per_year.items() if n_cohorts == 12]
df_pool = df_pool[df_pool['Year'].isin(yr_range)]

# Aggregate the cohort rows of each year with np.nansum
year_grouped = df_pool.groupby('Year').agg(np.nansum)

# smm = (prepayments + defaults) / outstanding; cpr annualises it as 1 - (1 - smm)^12
exits = year_grouped['prepayments'] + year_grouped['defaults']
year_grouped['smm'] = exits / year_grouped['outstanding']
year_grouped['cpr'] = 1 - (1 - year_grouped['smm'])**12

### Reshape the yearly cohort data from Months on Book (MoB) to Years on Book (YoB)
`[i,...........,n=MoB] ---> applymap(aggregate_method)`  
 
`[[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]...n_years=YoB]`

In [27]:
# Display the annual cohort frame (its cells are array-valued)
year_grouped

Unnamed: 0_level_0,outstanding,prepayments,defaults,smm,cpr
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000,"[4939.0, 4939.0, 4938.0, 4935.0, 4926.0, 4915....","[0.0, 1.0, 3.0, 6.0, 8.0, 9.0, 9.0, 20.0, 7.0,...","[0.0, 0.0, 0.0, 3.0, 3.0, 3.0, 4.0, 6.0, 11.0,...","[0.0, 0.00020247013565499088, 0.00060753341433...","[0.0, 0.0024269378387637985, 0.007266089845122..."
2001,"[4224.0, 4223.0, 4222.0, 4220.0, 4216.0, 4215....","[1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 4.0, 3.0, ...","[0.0, 0.0, 0.0, 2.0, 0.0, 4.0, 2.0, 5.0, 8.0, ...","[0.00023674242424242425, 0.0002367984844896992...","[0.0028372129080893593, 0.002837883881038561, ..."
2002,"[4929.0, 4929.0, 4927.0, 4925.0, 4922.0, 4921....","[0.0, 2.0, 2.0, 3.0, 0.0, 9.0, 3.0, 6.0, 10.0,...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 3.0, 7.0, 3.0, ...","[0.0, 0.0004057618178129438, 0.000405926527298...","[0.0, 0.004858290082515815, 0.0048602577905038..."
2003,"[4882.0, 4881.0, 4879.0, 4878.0, 4872.0, 4872....","[1.0, 2.0, 1.0, 6.0, 0.0, 6.0, 4.0, 11.0, 6.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, ...","[0.00020483408439164277, 0.0004097520999795124...","[0.0024552417404174465, 0.004905959133235771, ..."
2004,"[4815.0, 4815.0, 4815.0, 4812.0, 4807.0, 4800....","[0.0, 0.0, 3.0, 5.0, 2.0, 13.0, 6.0, 11.0, 6.0...","[0.0, 0.0, 0.0, 0.0, 5.0, 2.0, 0.0, 2.0, 2.0, ...","[0.0, 0.0, 0.0006230529595015577, 0.0010390689...","[0.0, 0.0, 0.007451067780664955, 0.01239781631..."
2005,"[4631.0, 4631.0, 4628.0, 4627.0, 4626.0, 4619....","[0.0, 3.0, 1.0, 0.0, 7.0, 6.0, 3.0, 12.0, 6.0,...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 4.0, 3.0, 0.0, ...","[0.0, 0.0006478082487583676, 0.000216076058772...","[0.0, 0.007746061441611496, 0.0025898334586604..."
2006,"[3659.0, 3658.0, 3658.0, 3654.0, 3654.0, 3647....","[1.0, 0.0, 4.0, 0.0, 6.0, 4.0, 2.0, 12.0, 9.0,...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 3.0, 5.0, 7.0, ...","[0.00027329871549603714, 0.0, 0.00109349371241...","[0.0032746593897131637, 0.0, 0.013043293416247..."
2007,"[3058.0, 3058.0, 3057.0, 3056.0, 3055.0, 3051....","[0.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 8.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 2.0, 4.0, 2.0, 3.0, 5.0, ...","[0.0, 0.0003270111183780249, 0.000327118089630...","[0.0, 0.003917083314214476, 0.0039183623585675..."
2008,"[3209.0, 3209.0, 3207.0, 3206.0, 3205.0, 3203....","[0.0, 2.0, 1.0, 1.0, 0.0, 4.0, 0.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 2.0, 3.0, 0.0, 6.0, 1.0, ...","[0.0, 0.0006232471174820816, 0.000311817898347...","[0.0, 0.007453381755477162, 0.0037354042389760..."
2009,"[3057.0, 3057.0, 3056.0, 3054.0, 3054.0, 3051....","[0.0, 1.0, 2.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 2.0, 1.0, 2.0, 1.0, 4.0, ...","[0.0, 0.00032711808963035657, 0.00065445026178...","[0.0, 0.003918362358567573, 0.0078251965780427..."


In [28]:

# First try here
# def aggregate_annual_sums(in_arr):
#     n_years = len(in_arr) // 12
#     arr_2d = in_arr[:n_years*12].reshape(n_years,12)
#     return np.nansum(arr_2d, axis=1)

def outstanding_annual_rundown(in_arr, period=11):
    """Sample the first entry of every bucket of ``period`` consecutive entries.

    Parameters
    ----------
    in_arr : sequence or np.ndarray
        Series to sample; anything that supports slicing.
    period : int, default 11
        Number of entries per bucket (the slice step). The default reproduces
        the stride this notebook has always used.
        NOTE(review): a calendar year of monthly data would be 12 entries -
        confirm that 11 is intended.

    Returns
    -------
    Same type as ``in_arr``
        ``in_arr[0], in_arr[period], in_arr[2 * period], ...``

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be a positive integer")
    return in_arr[::period]

def aggregate_annual_sums(in_arr, period=11):
    """Sum ``in_arr`` in consecutive buckets of ``period`` entries.

    A trailing partial bucket is zero-padded up to ``period`` entries, so it is
    kept (unlike ``aggregate_annual_averages`` / ``aggregate_annual_median``,
    which drop it). A plain sum is used, so any NaN inside a bucket makes that
    bucket's total NaN.

    Parameters
    ----------
    in_arr : array-like
        One-dimensional series of values.
    period : int, default 11
        Number of entries per bucket. The default reproduces the bucket size
        this notebook has always used.
        NOTE(review): a calendar year of monthly data would be 12 entries -
        confirm that 11 is intended.

    Returns
    -------
    np.ndarray
        One total per bucket; an empty array for empty input (previously an
        empty input raised inside ``np.split``).

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be a positive integer")
    values = np.asarray(in_arr)
    if values.size == 0:
        return np.zeros(0)
    # Number of zeros needed to reach the next multiple of `period`
    shortfall = -len(values) % period
    if shortfall:
        values = np.concatenate([values, np.zeros(shortfall)])
    return values.reshape(-1, period).sum(axis=1)

def aggregate_annual_averages(in_arr, period=11):
    """NaN-aware mean of ``in_arr`` over consecutive buckets of ``period`` entries.

    Only complete buckets are used: trailing entries that do not fill a whole
    bucket are dropped. A bucket that is entirely NaN yields NaN.

    Parameters
    ----------
    in_arr : array-like
        One-dimensional series of numeric values.
    period : int, default 11
        Number of entries per bucket. The default reproduces the bucket size
        this notebook has always used.
        NOTE(review): a calendar year of monthly data would be 12 entries -
        confirm that 11 is intended.

    Returns
    -------
    np.ndarray
        One mean per complete bucket (empty if there is no complete bucket).

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be a positive integer")
    values = np.asarray(in_arr, dtype=float)
    n_years = len(values) // period
    arr_2d = values[:n_years * period].reshape(n_years, period)
    # All-NaN buckets are expected for NaN-padded tails; silence numpy's
    # RuntimeWarning here rather than relying on a notebook-wide warning filter.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=RuntimeWarning)
        return np.nanmean(arr_2d, axis=1)

def aggregate_annual_median(in_arr, period=11):
    """NaN-aware median of ``in_arr`` over consecutive buckets of ``period`` entries.

    Only complete buckets are used: trailing entries that do not fill a whole
    bucket are dropped. A bucket that is entirely NaN yields NaN.

    Parameters
    ----------
    in_arr : array-like
        One-dimensional series of numeric values.
    period : int, default 11
        Number of entries per bucket. The default reproduces the bucket size
        this notebook has always used.
        NOTE(review): a calendar year of monthly data would be 12 entries -
        confirm that 11 is intended.

    Returns
    -------
    np.ndarray
        One median per complete bucket (empty if there is no complete bucket).

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be a positive integer")
    values = np.asarray(in_arr, dtype=float)
    n_years = len(values) // period
    arr_2d = values[:n_years * period].reshape(n_years, period)
    # All-NaN buckets are expected for NaN-padded tails; silence numpy's
    # RuntimeWarning here rather than relying on a notebook-wide warning filter.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=RuntimeWarning)
        return np.nanmedian(arr_2d, axis=1)

# Reduce every cohort year's CPR array to one median per bucket, then spread
# the resulting arrays into columns (one row per cohort year).
median_cpr = year_grouped[['cpr']].applymap(aggregate_annual_median)
cpr_heat = pd.DataFrame.from_dict(median_cpr['cpr'].to_dict(), orient='index')

In [29]:
# Display the median CPR table: one row per cohort year, one column per bucket
cpr_heat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
2000,0.028908,0.106945,0.150911,0.149497,0.162254,0.214561,0.254789,0.258323,0.173995,0.092864,...,0.096462,0.108682,0.114919,0.125218,0.092834,0.124183,0.140926,0.128156,0.143449,
2001,0.011315,0.062068,0.122763,0.181021,0.227384,0.310767,0.218576,0.195016,0.103998,0.084582,...,0.118275,0.088993,0.121178,0.122287,0.154905,0.079165,0.159105,0.148311,,
2002,0.01456,0.050985,0.130447,0.289482,0.368504,0.329267,0.195881,0.104339,0.075672,0.073361,...,0.1113,0.145181,0.107756,0.151015,0.189548,0.084753,0.242058,,,
2003,0.014679,0.07058,0.221252,0.34505,0.282223,0.184586,0.08142,0.069037,0.089011,0.072412,...,0.125698,0.119491,0.126804,0.111506,0.126138,0.113083,,,,
2004,0.017335,0.104335,0.292858,0.314378,0.176201,0.092263,0.097142,0.086431,0.096243,0.109391,...,0.122913,0.164545,0.078135,0.140433,0.149142,,,,,
2005,0.015571,0.115632,0.156628,0.155143,0.105587,0.084225,0.07884,0.086044,0.113559,0.091786,...,0.141258,0.099637,0.165336,0.190212,,,,,,
2006,0.016346,0.102689,0.130098,0.117299,0.103255,0.099335,0.105882,0.110424,0.087509,0.083048,...,0.102762,0.134292,0.126581,,,,,,,
2007,0.015645,0.083601,0.09833,0.109691,0.094127,0.117041,0.087607,0.07907,0.097328,0.092015,...,0.181001,0.147178,,,,,,,,
2008,0.007463,0.059591,0.074051,0.067271,0.096603,0.102993,0.104299,0.123112,0.10901,0.10367,...,0.11677,,,,,,,,,
2009,0.007841,0.023975,0.043927,0.12963,0.125199,0.116571,0.114034,0.152379,0.142648,0.180949,...,,,,,,,,,,


In [30]:
# Column-wise summary statistics of the CPR table across cohort years
cpr_summary = pd.DataFrame.from_dict({
    'max': cpr_heat.max(axis=0),
    'median': cpr_heat.median(axis=0),
    'avg': cpr_heat.mean(axis=0),
    'min': cpr_heat.min(axis=0),
})
# One row per statistic in the exported file
cpr_summary.transpose().to_csv('811_mids.csv')


### 5. Create basic triangles

In [31]:
# Bucket totals of prepayments and defaults for every cohort year
prepays = year_grouped[['prepayments','defaults']].applymap(aggregate_annual_sums).to_dict()

# Outstanding counts sampled at the start of each bucket, one row per cohort year
sampled_outstanding = year_grouped[['outstanding']].applymap(outstanding_annual_rundown)
totals = pd.DataFrame.from_dict(sampled_outstanding['outstanding'].to_dict(), orient='index')

# Order: outstanding first, then one triangle per column of `prepays`
triangles = [totals] + [
    pd.DataFrame.from_dict(by_year, orient='index') for by_year in prepays.values()
]

# pd.concat(triangles, axis=0).to_csv('test.csv')

In [32]:
# Stack the outstanding, prepayment and default triangles vertically for display
pd.concat(triangles,axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
2000,4939.0,4784.0,4306.0,3765.0,3224.0,2745.0,2159.0,1655.0,1259.0,1059.0,...,630.0,574.0,519.0,466.0,406.0,364.0,318.0,274.0,236.0,
2001,4224.0,4163.0,3922.0,3492.0,2926.0,2279.0,1664.0,1317.0,1081.0,972.0,...,581.0,521.0,467.0,417.0,366.0,320.0,289.0,249.0,,
2002,4929.0,4862.0,4607.0,4107.0,2967.0,2051.0,1429.0,1185.0,1076.0,1006.0,...,570.0,506.0,437.0,388.0,330.0,277.0,255.0,,,
2003,4882.0,4816.0,4518.0,3644.0,2465.0,1800.0,1469.0,1346.0,1253.0,1147.0,...,610.0,533.0,467.0,413.0,372.0,325.0,,,,
2004,4815.0,4724.0,4278.0,3279.0,2350.0,1957.0,1766.0,1613.0,1479.0,1339.0,...,662.0,575.0,490.0,439.0,379.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017,21.0,58.0,103.0,51.0,39.0,,,,,,...,,,,,,,,,,
2018,13.0,53.0,38.0,36.0,,,,,,,...,,,,,,,,,,
2019,4.0,13.0,29.0,,,,,,,,...,,,,,,,,,,
2020,1.0,11.0,,,,,,,,,...,,,,,,,,,,


### 6. Get Line Plots (or just the data for line plots)

In [33]:
def generate_lifetime(cpr_df:pd.DataFrame)-> pd.DataFrame:
    """Compute the row-wise running mean of a frame, skipping NaN cells.

    For every cell that holds a value, the result is the mean of all non-NaN
    values in the same row from the first column up to and including that
    cell. Cells that are NaN in the input stay NaN in the output; they are
    simply left out of the running mean rather than ending it.

    Parameters
    ----------
    cpr_df : pd.DataFrame
        Numeric frame; columns are averaged cumulatively from left to right.

    Returns
    -------
    pd.DataFrame
        Frame with the same index and columns as ``cpr_df``.
    """
    values = cpr_df.to_numpy(dtype=float)
    is_missing = np.isnan(values)
    # running total per row, treating NaN as 0 so gaps do not poison the sum
    cumulative_sum = np.nancumsum(values, axis=1)
    # number of real observations seen so far in each row (the denominator)
    num_non_nan = (~is_missing).cumsum(axis=1)
    # Divide only where the input has a value. Those cells always have a count
    # of at least 1, so no 0/0 is ever evaluated; all other cells keep the NaN
    # they were initialised with.
    row_avg = np.full(values.shape, np.nan)
    np.divide(cumulative_sum, num_non_nan, out=row_avg, where=~is_missing)
    return pd.DataFrame(row_avg, columns=cpr_df.columns, index=cpr_df.index)

# Running average of the CPR table along each cohort-year row
lifetime_cprs = generate_lifetime(cpr_heat)

# Display only: NaN cells are shown as blanks. The result is not assigned,
# so lifetime_cprs itself keeps its NaNs.
lifetime_cprs.fillna('')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
2000,0.028908,0.067926,0.095588,0.109065,0.119703,0.135512,0.152552,0.165773,0.166687,0.159305,...,0.133122,0.131684,0.130753,0.130462,0.12858,0.128371,0.128942,0.128907,0.129513,
2001,0.011315,0.036692,0.065382,0.094292,0.12091,0.152553,0.161985,0.166114,0.159212,0.151749,...,0.133065,0.130472,0.129956,0.129552,0.13082,0.12836,0.129758,0.130564,,
2002,0.01456,0.032773,0.065331,0.121369,0.170796,0.197208,0.197018,0.185433,0.173238,0.16325,...,0.141801,0.142,0.140098,0.140672,0.143116,0.140337,0.14496,,,
2003,0.014679,0.042629,0.10217,0.16289,0.186757,0.186395,0.171398,0.158603,0.150871,0.143025,...,0.133326,0.132512,0.132195,0.131106,0.130857,0.130011,,,,
2004,0.017335,0.060835,0.138176,0.182227,0.181021,0.166228,0.156359,0.147618,0.14191,0.138658,...,0.13095,0.132926,0.129882,0.130437,0.131372,,,,,
2005,0.015571,0.065601,0.095944,0.110743,0.109712,0.105464,0.101661,0.099709,0.101247,0.100301,...,0.110234,0.10961,0.112706,0.116785,,,,,,
2006,0.016346,0.059518,0.083044,0.091608,0.093937,0.094837,0.096415,0.098166,0.096982,0.095589,...,0.10784,0.109396,0.11035,,,,,,,
2007,0.015645,0.049623,0.065859,0.076817,0.080279,0.086406,0.086577,0.085639,0.086938,0.087446,...,0.105726,0.108164,,,,,,,,
2008,0.007463,0.033527,0.047035,0.052094,0.060996,0.067995,0.073182,0.079423,0.08271,0.084806,...,0.104132,,,,,,,,,
2009,0.007841,0.015908,0.025247,0.051343,0.066114,0.074524,0.080168,0.089195,0.095134,0.103715,...,,,,,,,,,,


In [34]:
import plotly.express as px

# Transposing puts the cohort years in columns, so each cohort year is drawn as its own line
fig = px.line(lifetime_cprs.transpose(), line_shape= 'spline', title="Lifetime Average CPR by Year from Origination", markers=True)
# Set the axis titles and the y-axis tick format; as the cell's last expression this also renders the figure
fig.update_layout(yaxis_title='CPR', xaxis_title= 'Year from Origination', yaxis=dict(tickformat='0.0%'))

In [17]:
# List the distinct margin bucket labels, in the order value_counts() returns them
loan_data['MarginBucket'].value_counts().index.to_list()

['0.025 - 0.0275',
 '0.0275+',
 '0.0175 - 0.02',
 '0.0 - 0.01',
 '0.02 - 0.0225',
 '0.0225 - 0.025',
 '0.0125 - 0.015',
 '0.015 - 0.0175',
 '0.01 - 0.0125']

In [18]:
def subset_dataframe(df, conditions):
    """Return the rows of ``df`` that satisfy every membership condition.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter.
    conditions : dict
        Maps a column name to the collection of values accepted for that
        column; a row is kept only if it matches for all listed columns.

    Returns
    -------
    pd.DataFrame
        The matching rows. With no conditions, every row is returned.
    """
    # Start from "keep everything" and narrow it down one column at a time
    keep = pd.Series(True, index=df.index)
    for column, allowed in conditions.items():
        keep = keep & df[column].isin(allowed)
    return df[keep]


In [114]:
# Small demo frame used to exercise subset_dataframe
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'Dave'],
    age=[25, 30, 35, 40],
    gender=['F', 'M', 'M', 'M'],
)
df = pd.DataFrame(data)

# Keep the rows whose age is 25 or 30 AND whose gender is 'F'
demo_conditions = {'age': [25, 30], 'gender': ['F']}
subset = subset_dataframe(df, demo_conditions)
print(subset)


    name  age gender
0  Alice   25      F
