### Basic imports

In [3]:
import pandas as pd
import sqlalchemy as sa
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os
from dotenv import load_dotenv
load_dotenv()

# Build the MySQL connection URL from the environment values loaded by load_dotenv().
# The user name and password are percent-encoded so that reserved URL characters
# ('@', ':', '/', ...) inside a credential cannot corrupt the connection string.
# str() keeps the previous result for an unset variable (the text "None" is interpolated).
# NOTE(review): if a credential in .env was already percent-encoded by hand, store it raw instead.
from urllib.parse import quote_plus

engine = sa.create_engine(
    "mysql+pymysql://{user}:{pw}@{host}/{db}".format(
        host=os.getenv('host'),
        db=os.getenv('db'),
        user=quote_plus(str(os.getenv('uname'))),
        pw=quote_plus(str(os.getenv('password'))),
    )
)


### 1. Read in data from CSV --> could also use pd.read_sql with the credentials loaded from .env

In [4]:
# Load the master loan tape from disk
loan_data = pd.read_csv("raw_data/master_loan_tape.csv")

# Columns whose name ends in 'Dt' are treated as dates and parsed to datetime
date_cols = [name for name in loan_data.columns if str(name).endswith('Dt')]
for date_col in date_cols:
    parsed = pd.to_datetime(loan_data[date_col])
    loan_data[date_col] = parsed


In [5]:
# Row count for each distinct MarginBucket label in the loan tape
loan_data['MarginBucket'].value_counts()

0.025 - 0.0275    150217
0.0275+           116308
0.0175 - 0.02      84478
0.0 - 0.01         72243
0.02 - 0.0225      59692
0.0225 - 0.025     49980
0.0125 - 0.015     42966
0.015 - 0.0175     31516
0.01 - 0.0125      17405
Name: MarginBucket, dtype: int64

In [6]:
# Slice the data frame to only 21+ year maturity loans
# NOTE(review): the pooling cell further down rebuilds data_slice from loan_data with
# MatBucket == '8-11', and no visible cell reads this '21+' slice before that happens --
# confirm which maturity bucket is intended.
data_slice = loan_data[loan_data['MatBucket']=='21+']

### 2. Use pool.py to organize cohorts --> loans are split into yyyy.mm cohorts

In [137]:
from pooler import pool

# Build static pools of Loans from a loan-level DataFrame
def create_pooler(in_df:pd.DataFrame)-> pool.Pooler:
    """Wrap every loan in ``in_df`` in a ``pool.Loan`` and hand them to a ``pool.Pooler``.

    ``in_df`` must contain the columns ``GP``, ``NoteDt``, ``MaturityDt``,
    ``MaturityMthsQty``, ``DefaultDt``, ``DefaultMthsQty``, ``PrepayDt`` and
    ``PrepayMthsQty``. ``GP`` is used as the loan identifier: each Loan is
    constructed from the GP value and its NoteDt (converted with
    ``pd.to_datetime``), the remaining columns are copied onto the Loan as
    attributes, and the Loan is stored under ``str(GP)``.

    Returns the ``pool.Pooler`` built from that dictionary of loans.
    """
    # {column name: {GP: value}} for every column of the frame
    by_column = in_df.set_index('GP').to_dict()

    # (Loan attribute, source column), in the order the attributes are assigned
    attribute_sources = (
        ('maturity_dt', 'MaturityDt'),
        ('maturity_mths_qty', 'MaturityMthsQty'),
        ('default_dt', 'DefaultDt'),
        ('default_mths_qty', 'DefaultMthsQty'),
        ('prepay_dt', 'PrepayDt'),
        ('prepay_mths_qty', 'PrepayMthsQty'),
    )

    loans_dict = {}
    for gp, note_dt in by_column['NoteDt'].items():
        loan = pool.Loan(gp, pd.to_datetime(note_dt))
        loans_dict[str(gp)] = loan
        for attribute, column in attribute_sources:
            setattr(loan, attribute, by_column[column][gp])

    return pool.Pooler(loans_dict)

# Restrict the tape to one maturity bucket, then to one margin bucket within it
in_maturity_bucket = loan_data['MatBucket']=='8-11'
data_slice = loan_data[in_maturity_bucket]
in_margin_bucket = data_slice['MarginBucket']=='0.01 - 0.0125'
data_slice = data_slice[in_margin_bucket]

# Pool the selected loans and let the pooler build its count triangles
my_pooler = create_pooler(data_slice)
my_pooler.build_triangles_counts()

# Name the three series the pooler stores for each cohort key
pool_dict = {
    cohort: dict(outstanding=series[0], prepayments=series[1], defaults=series[2])
    for cohort, series in my_pooler.triangles.items()
}


### 3. Convert Pool dictionaries into Dataframe

In [138]:
# Assemble the pool DataFrame: one row per cohort key, one column per series
df_pool = pd.DataFrame.from_dict(pool_dict, orient='index')
# Cohort keys are converted to floats so that sort_index orders them numerically
df_pool.index = list(map(float, df_pool.index.to_list()))
df_pool = df_pool.sort_index()
# Preview the first rows
df_pool.head()

Unnamed: 0,outstanding,prepayments,defaults
2000.01,"[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2000.02,"[13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2000.03,"[17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2000.04,"[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2000.05,"[19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, ..."


In [139]:
# Format array lengths
# The row at position `count` (0-based, in index order) keeps only its first
# (max_row_length - count) entries and is then padded back with `count` NaNs.
max_row_length = df_pool.shape[0]
for count, (cohort, row) in enumerate(df_pool.iterrows()):
    keep = max_row_length - count
    for col in df_pool.columns:
        # Alternative (truncate only, no NaN padding):
        #     df_pool.at[cohort, col] = row[col][:keep]
        trimmed = row[col][:keep].astype(float)
        df_pool.at[cohort, col] = np.pad(trimmed, (0, count), mode='constant', constant_values=np.nan)


### 4. Group into Annual cohorts and calculate SMM and CPR

In [140]:
# From here on the data is viewed as annual cohorts rather than individual monthly cohorts:
# the integer part of the cohort key is the year
df_pool['Year'] = df_pool.index.astype(int)

# Keep only the years that have exactly 12 monthly cohorts
vals = df_pool['Year'].value_counts().to_dict()
yr_range = [year for year, n_cohorts in vals.items() if n_cohorts == 12]
df_pool = df_pool[df_pool['Year'].isin(yr_range)]

# Aggregate the monthly cohorts of each year, then derive SMM and CPR = 1 - (1 - SMM)^12
year_grouped = df_pool.groupby('Year').agg(np.nansum)
exits = year_grouped['prepayments'] + year_grouped['defaults']
year_grouped['smm'] = exits / year_grouped['outstanding']
year_grouped['cpr'] = 1 - (1 - year_grouped['smm'])**12

### Reshape the yearly cohort data from Months on Book (MoB) to Years on Book (YoB)  
`[i,...........,n=MoB] ---> applymap(aggregate_method)`  
 
`[[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]...n_years=YoB]`

In [143]:
# Show the annual cohort table (every cell holds an array)
year_grouped

Unnamed: 0_level_0,outstanding,prepayments,defaults,smm,cpr
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000,"[158.0, 158.0, 157.0, 157.0, 157.0, 156.0, 156...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.006329113924050633, 0.0, 0.0, 0.006369...","[0.0, 0.07336055022556498, 0.0, 0.0, 0.0738115..."
2001,"[140.0, 140.0, 140.0, 140.0, 138.0, 138.0, 138...","[0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.014285714285714285, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.15858043015422008, 0.0, 0.0,..."
2003,"[67.0, 67.0, 67.0, 67.0, 67.0, 67.0, 67.0, 67....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2004,"[134.0, 134.0, 134.0, 134.0, 134.0, 134.0, 134...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02238805...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.23792455..."
2005,"[284.0, 284.0, 284.0, 284.0, 283.0, 282.0, 281...","[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 3.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0035211267605633804, 0.00353...","[0.0, 0.0, 0.0, 0.04144475978409767, 0.0415883..."
2006,"[314.0, 314.0, 314.0, 314.0, 314.0, 314.0, 313...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 3.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.003184713375796178...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.037554217635019094..."
2007,"[390.0, 389.0, 389.0, 389.0, 389.0, 388.0, 387...","[1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 4.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","[0.002564102564102564, 0.0, 0.0, 0.0, 0.002570...","[0.03033899317184685, 0.0, 0.0, 0.0, 0.0304158..."
2008,"[262.0, 262.0, 262.0, 262.0, 261.0, 260.0, 260...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.003816793893129771, 0.003831...","[0.0, 0.0, 0.0, 0.04485217247509066, 0.0450204..."
2012,"[58.0, 58.0, 58.0, 58.0, 58.0, 58.0, 58.0, 58....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2013,"[87.0, 87.0, 87.0, 84.0, 83.0, 83.0, 83.0, 83....","[0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.034482758620689655, 0.01190476190...","[0.0, 0.0, 0.3436728025172492, 0.1338648260254..."


In [141]:

# First try here
# def aggregate_annual_sums(in_arr):
#     n_years = len(in_arr) // 12
#     arr_2d = in_arr[:n_years*12].reshape(n_years,12)
#     return np.nansum(arr_2d, axis=1)

def outstanding_annual_rundown(in_arr, period=11):
    """Sample ``in_arr`` at every ``period``-th position, starting with element 0.

    Parameters
    ----------
    in_arr : sequence or np.ndarray
        Series to sample.
    period : int, default 11
        Distance between two sampled positions. Must be >= 1.

    Returns
    -------
    The elements at positions 0, period, 2*period, ... (same type as slicing
    ``in_arr`` yields).

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.

    NOTE(review): the default of 11 is the stride this notebook has always
    used; confirm whether 12 (months per year) was intended.
    """
    if period < 1:
        raise ValueError("period must be a positive integer")
    return in_arr[::period]

def aggregate_annual_sums(in_arr, period=11):
    """Sum ``in_arr`` over consecutive windows of ``period`` elements.

    A trailing partial window is completed with zeros, so it contributes the
    sum of the elements it does contain. NaNs are not skipped: a window that
    contains a NaN sums to NaN.

    Parameters
    ----------
    in_arr : array-like, 1-D
        Series to aggregate.
    period : int, default 11
        Window length. Must be >= 1.

    Returns
    -------
    np.ndarray
        One sum per window; empty when ``in_arr`` is empty.

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.

    NOTE(review): the default of 11 is the window this notebook has always
    used; confirm whether 12 (months per year) was intended.
    """
    if period < 1:
        raise ValueError("period must be a positive integer")
    values = np.asarray(in_arr)
    remainder = len(values) % period
    if remainder != 0:
        # Complete the last window with zeros so the array reshapes cleanly
        values = np.concatenate([values, np.zeros(period - remainder)])
    # reshape(-1, period) also covers the empty case (zero windows)
    return values.reshape(-1, period).sum(axis=1)

def aggregate_annual_averages(in_arr, period=11):
    """Average ``in_arr`` over consecutive full windows of ``period`` elements.

    Elements after the last full window are dropped. NaNs inside a window are
    ignored (``np.nanmean``); a window made only of NaNs yields NaN.

    Parameters
    ----------
    in_arr : array-like, 1-D
        Series to aggregate.
    period : int, default 11
        Window length. Must be >= 1.

    Returns
    -------
    np.ndarray
        One mean per full window; empty when there is no full window.

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.

    NOTE(review): the default of 11 is the window this notebook has always
    used; confirm whether 12 (months per year) was intended.
    """
    if period < 1:
        raise ValueError("period must be a positive integer")
    values = np.asarray(in_arr)
    n_windows = len(values) // period
    windows = values[:n_windows * period].reshape(n_windows, period)
    return np.nanmean(windows, axis=1)

def aggregate_annual_median(in_arr, period=11):
    """Take the median of ``in_arr`` over consecutive full windows of ``period`` elements.

    Elements after the last full window are dropped. NaNs inside a window are
    ignored (``np.nanmedian``); a window made only of NaNs yields NaN.

    Parameters
    ----------
    in_arr : array-like, 1-D
        Series to aggregate.
    period : int, default 11
        Window length. Must be >= 1.

    Returns
    -------
    np.ndarray
        One median per full window; empty when there is no full window.

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.

    NOTE(review): the default of 11 is the window this notebook has always
    used; confirm whether 12 (months per year) was intended.
    """
    if period < 1:
        raise ValueError("period must be a positive integer")
    values = np.asarray(in_arr)
    n_windows = len(values) // period
    windows = values[:n_windows * period].reshape(n_windows, period)
    return np.nanmedian(windows, axis=1)

# Collapse every cohort's CPR array with the median aggregator ...
arr = year_grouped[['cpr']]
cpr_by_year = arr.applymap(aggregate_annual_median).to_dict()['cpr']

# ... then spread the aggregated arrays out: one row per cohort year, one column per window
cpr_heat = pd.DataFrame.from_dict(cpr_by_year, orient='index')

In [142]:
# Show the CPR table built above: one row per cohort year, one column per aggregated window
cpr_heat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
2000,0.0,0.078135,0.170974,0.103798,0.121668,0.135372,0.165109,0.204336,0.256444,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001,0.0,0.086586,0.092595,0.110481,0.236372,0.177269,0.200929,0.227462,0.308754,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2003,0.0,0.0,0.197633,0.0,0.223253,0.0,0.286841,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
2004,0.0,0.169768,0.215283,0.133865,0.264871,0.172197,0.200929,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,,,,
2005,0.041445,0.084455,0.179249,0.107035,0.061677,0.069136,0.071186,0.224639,0.225338,0.364208,...,0.0,0.0,0.0,0.0,,,,,,
2006,0.0,0.079691,0.08566,0.090502,0.053439,0.157887,0.126138,0.155505,0.179919,0.215283,...,0.0,0.0,0.0,,,,,,,
2007,0.030416,0.092122,0.109474,0.082144,0.088501,0.091189,0.148162,0.179919,0.19872,0.273865,...,0.0,0.0,,,,,,,,
2008,0.044852,0.047895,0.049679,0.102909,0.166634,0.142598,0.080223,0.173438,0.122913,0.253752,...,0.0,,,,,,,,,
2012,0.0,0.0,0.0,0.20786,0.0,0.316811,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2013,0.0,0.0,0.150636,0.165109,0.188362,0.0,0.256444,0.308754,0.364208,0.0,...,,,,,,,,,,


In [136]:
# Column-wise summary statistics of cpr_heat, written to CSV with one row per statistic
summary_stats = {
    'max': cpr_heat.max(axis=0),
    'median': cpr_heat.median(axis=0),
    'avg': cpr_heat.mean(axis=0),
    'min': cpr_heat.min(axis=0),
}
pd.DataFrame.from_dict(summary_stats).transpose().to_csv('811_mids.csv')


### 5. Create basic triangles

In [123]:
# Windowed sums of prepayments and defaults: {column name: {cohort year: array}}
prepays = year_grouped[['prepayments','defaults']].applymap(aggregate_annual_sums).to_dict()

# Outstanding counts sampled by outstanding_annual_rundown, one row per cohort year
totals = year_grouped[['outstanding']].applymap(outstanding_annual_rundown).to_dict()['outstanding']
totals = pd.DataFrame.from_dict(totals, orient='index')

# Outstanding triangle first, followed by one triangle per column of `prepays`
triangles = [totals] + [
    pd.DataFrame.from_dict(by_year, orient='index') for by_year in prepays.values()
]

# pd.concat(triangles, axis=0).to_csv('test.csv')

In [125]:
# Stack the triangles on top of each other (row-wise) for display
pd.concat(triangles,axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
2000,4939.0,4784.0,4306.0,3765.0,3224.0,2745.0,2159.0,1655.0,1259.0,1059.0,...,630.0,574.0,519.0,466.0,406.0,364.0,318.0,274.0,236.0,
2001,4224.0,4163.0,3922.0,3492.0,2926.0,2279.0,1664.0,1317.0,1081.0,972.0,...,581.0,521.0,467.0,417.0,366.0,320.0,289.0,249.0,,
2002,4929.0,4862.0,4607.0,4107.0,2967.0,2051.0,1429.0,1185.0,1076.0,1006.0,...,570.0,506.0,437.0,388.0,330.0,277.0,255.0,,,
2003,4882.0,4816.0,4518.0,3644.0,2465.0,1800.0,1469.0,1346.0,1253.0,1147.0,...,610.0,533.0,467.0,413.0,372.0,325.0,,,,
2004,4815.0,4724.0,4278.0,3279.0,2350.0,1957.0,1766.0,1613.0,1479.0,1339.0,...,662.0,575.0,490.0,439.0,379.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017,22.0,75.0,98.0,57.0,36.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
2018,16.0,56.0,39.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
2019,6.0,13.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
2020,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


### 6. Get Line Plots (or just the data for line plots)

In [106]:
def generate_lifetime(cpr_df:pd.DataFrame)-> pd.DataFrame:
    """Return the row-wise running (expanding) average of ``cpr_df``.

    For every row, the value in column j is the mean of all non-NaN values in
    columns 0..j of that row. NaNs are skipped, so they affect neither the
    numerator nor the denominator. Positions that are NaN in ``cpr_df`` stay
    NaN in the result.

    Parameters
    ----------
    cpr_df : pd.DataFrame
        Numeric table; must be convertible to a float array.

    Returns
    -------
    pd.DataFrame
        Same index and columns as ``cpr_df``, holding the running averages.
    """
    values = cpr_df.to_numpy(dtype=float)
    missing = np.isnan(values)

    # running sum that treats NaN as 0, and running count of the non-NaN entries
    running_sum = np.nancumsum(values, axis=1)
    running_count = (~missing).cumsum(axis=1)

    # A row that starts with NaNs has a count of 0 there (0/0); those positions are
    # overwritten with NaN below, so the floating-point warning is silenced locally.
    with np.errstate(divide='ignore', invalid='ignore'):
        running_mean = running_sum / running_count
    running_mean[missing] = np.nan

    return pd.DataFrame(running_mean, columns=cpr_df.columns, index=cpr_df.index)

# Row-wise running average of the CPR table, cohort by cohort
lifetime_cprs = generate_lifetime(cpr_heat)

# Display only: NaNs are shown as blanks; the result is not assigned, so lifetime_cprs keeps its NaNs
lifetime_cprs.fillna('')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
2000,0.028908,0.067926,0.095588,0.109065,0.119703,0.135512,0.152552,0.165773,0.166687,0.159305,...,0.133122,0.131684,0.130753,0.130462,0.12858,0.128371,0.128942,0.128907,0.129513,
2001,0.011315,0.036692,0.065382,0.094292,0.12091,0.152553,0.161985,0.166114,0.159212,0.151749,...,0.133065,0.130472,0.129956,0.129552,0.13082,0.12836,0.129758,0.130564,,
2002,0.01456,0.032773,0.065331,0.121369,0.170796,0.197208,0.197018,0.185433,0.173238,0.16325,...,0.141801,0.142,0.140098,0.140672,0.143116,0.140337,0.14496,,,
2003,0.014679,0.042629,0.10217,0.16289,0.186757,0.186395,0.171398,0.158603,0.150871,0.143025,...,0.133326,0.132512,0.132195,0.131106,0.130857,0.130011,,,,
2004,0.017335,0.060835,0.138176,0.182227,0.181021,0.166228,0.156359,0.147618,0.14191,0.138658,...,0.13095,0.132926,0.129882,0.130437,0.131372,,,,,
2005,0.015571,0.065601,0.095944,0.110743,0.109712,0.105464,0.101661,0.099709,0.101247,0.100301,...,0.110234,0.10961,0.112706,0.116785,,,,,,
2006,0.016346,0.059518,0.083044,0.091608,0.093937,0.094837,0.096415,0.098166,0.096982,0.095589,...,0.10784,0.109396,0.11035,,,,,,,
2007,0.015645,0.049623,0.065859,0.076817,0.080279,0.086406,0.086577,0.085639,0.086938,0.087446,...,0.105726,0.108164,,,,,,,,
2008,0.007463,0.033527,0.047035,0.052094,0.060996,0.067995,0.073182,0.079423,0.08271,0.084806,...,0.104132,,,,,,,,,
2009,0.007841,0.015908,0.025247,0.051343,0.066114,0.074524,0.080168,0.089195,0.095134,0.103715,...,,,,,,,,,,


In [107]:
import plotly.express as px

# Transposed so that each cohort (a row of lifetime_cprs) is passed to px.line as a column
fig = px.line(
    lifetime_cprs.transpose(),
    line_shape= 'spline',
    title="Lifetime Average CPR by Year from Origination",
    markers=True,
)
# Axis titles, plus the tick format string for the y axis
fig.update_layout(
    yaxis_title='CPR',
    xaxis_title= 'Year from Origination',
    yaxis=dict(tickformat='0.0%'),
)

In [121]:
# List of the distinct MarginBucket labels, in the order value_counts() returns them
loan_data['MarginBucket'].value_counts().index.to_list()

['0.025 - 0.0275',
 '0.0275+',
 '0.0175 - 0.02',
 '0.0 - 0.01',
 '0.02 - 0.0225',
 '0.0225 - 0.025',
 '0.0125 - 0.015',
 '0.015 - 0.0175',
 '0.01 - 0.0125']

In [113]:
def subset_dataframe(df, conditions):
    """Return the rows of ``df`` that satisfy every condition in ``conditions``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter.
    conditions : dict
        Maps a column name to the value(s) accepted for that column. The
        accepted values may be list-like (list, tuple, set, ...) or a single
        scalar such as ``'F'`` or ``25``, which is treated as a one-element list.

    Returns
    -------
    pd.DataFrame
        The rows for which each listed column holds one of its accepted
        values. An empty ``conditions`` selects every row.

    Raises
    ------
    KeyError
        If a column named in ``conditions`` is not present in ``df``.
    """
    mask = pd.Series(True, index=df.index)
    for col, accepted in conditions.items():
        # Series.isin rejects scalars (a bare string raises TypeError), so wrap them
        if not pd.api.types.is_list_like(accepted):
            accepted = [accepted]
        mask &= df[col].isin(accepted)
    return df[mask]


In [114]:
# Small example frame used to demonstrate subset_dataframe
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'Dave'],
    age=[25, 30, 35, 40],
    gender=['F', 'M', 'M', 'M'],
)
df = pd.DataFrame(data)

# Keep the rows whose age is 25 or 30 AND whose gender is 'F'
conditions = {'age': [25, 30], 'gender': ['F']}
subset = subset_dataframe(df, conditions)
print(subset)


    name  age gender
0  Alice   25      F
