### Basic imports

In [179]:
import pandas as pd
import sqlalchemy as sa
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os
from dotenv import load_dotenv
load_dotenv()

# Build the connection URL from its components instead of string formatting, so that
# credentials containing reserved characters (e.g. '@', ':' or '/') are escaped correctly.
# All connection settings are read from environment variables (load_dotenv() is called above).
engine = sa.create_engine(
    sa.engine.URL.create(
        drivername="mysql+pymysql",
        username=os.getenv('uname'),
        password=os.getenv('password'),
        host=os.getenv('host'),
        database=os.getenv('db'),
    )
)


### 1. Read in data from CSV (alternatively, use `pd.read_sql` with the auth info loaded from `.env`)

In [180]:
# Load the raw loan-level data from disk.
loan_data = pd.read_csv("raw_data/loans_v2.csv")
# Columns whose name ends in 'Dt' are treated as dates and parsed to datetime.
date_cols = [name for name in loan_data.columns if str(name).endswith('Dt')]
for col in date_cols:
    loan_data[col] = pd.to_datetime(loan_data[col])


In [181]:
def subset_dataframe(df, conditions, inverse=False):
    """Filter ``df`` by membership tests on several columns at once.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter.
    conditions : dict
        Maps a column name to a list-like of values, as accepted by ``Series.isin``.
    inverse : bool, default False
        When falsy, keep rows whose value is in the given values for *every* column.
        When truthy, keep rows whose value is in the given values for *none* of the columns.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` selected by the combined boolean mask.
    """
    keep = pd.Series(True, index=df.index)
    for column, values in conditions.items():
        matches = df[column].isin(values)
        # AND the per-column test (or its negation) into the running mask.
        keep &= ~matches if inverse else matches
    return df[keep]

In [200]:
# Loans in the '21+' maturity bucket with a note year of 2021 or earlier.
test_slice = loan_data[(loan_data['MatBucket'] == '21+') & (loan_data['Note_Yr'] <= 2021)]
# Rows whose Code is 721110 are excluded (inverse=True below).
conditions = {'Code': [721110]}

test1 = subset_dataframe(test_slice, conditions, inverse=True)
# Display the remaining loans with a LoanAmt of at most 2,500,000.
test1.loc[test1['LoanAmt'] <= 2500000]


Unnamed: 0,GP,Note_Yr,Note_Mth,NoteDt,LoanAmt,MaturityDt,MaturityMthsQty,PrepayDt,PrepayMthsQty,DefaultDt,DefaultMthsQty,PP_qty,MatBucket,Code,CodeText,Margin,MarginBucket,state_abbreviation,state_name,yr
190,4435462,2008,4,2008-04-30,378000.00,2031-03-30,275,2019-12-11,139.0,2019-08-23,135.0,135.0,21+,811198.0,All Other Automotive Repair and Maintenance,0.0200,0.0175 - 0.02,GA,Georgia,2008.0
221,1483193,2000,2,2000-02-29,1000000.00,2025-01-27,298,2002-10-22,31.0,2000-12-20,9.0,9.0,21+,722511.0,Full-Service Restaurants,0.0175,0.015 - 0.0175,IN,Indiana,2000.0
228,1477340,2000,1,2000-01-31,1500000.00,2022-01-31,264,2009-05-27,111.0,2000-09-22,7.0,7.0,21+,441222.0,Boat Dealers,0.0125,0.01 - 0.0125,NY,New York,2000.0
235,1478838,2000,3,2000-03-31,1000000.00,2025-03-31,300,2007-07-16,87.0,2001-01-18,9.0,9.0,21+,457120.0,Other Gasoline Stations,0.0275,0.025 - 0.0275,TX,Texas,2000.0
358,4417863,2008,1,2008-01-31,1910922.62,2031-03-15,277,2018-10-24,128.0,2009-10-26,20.0,20.0,21+,812910.0,Pet Care (except Veterinary) Services,0.0000,,CO,Colorado,2008.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820396,17982124,2021,9,2021-09-30,215000.00,2046-09-21,299,NaT,,NaT,,,21+,,,0.0275,0.025 - 0.0275,AZ,Arizona,
820404,17985370,2021,9,2021-09-30,499948.12,2046-09-27,299,NaT,,NaT,,,21+,,,0.0250,0.0225 - 0.025,TX,Texas,
820422,17992033,2021,12,2021-12-16,1007700.00,2046-12-09,299,NaT,,NaT,,,21+,,,0.0160,0.015 - 0.0175,OH,Ohio,
820423,17993054,2021,12,2021-12-16,441891.16,2046-12-08,299,NaT,,NaT,,,21+,,,0.0000,,TX,Texas,


In [39]:
# Restrict the loan data to the '21+' maturity bucket
data_slice = loan_data.loc[loan_data['MatBucket'] == '21+']

### 2. Use pool.py to organize cohorts --> loans are split into yyyy.mm cohorts

In [183]:
from pooler import pool

def create_pooler(in_df:pd.DataFrame)-> pool.Pooler:
    """Build a ``pool.Pooler`` from a loan-level DataFrame.

    The frame is indexed by its 'GP' column. For every GP a ``pool.Loan`` is created
    from the GP and its 'NoteDt' (converted with ``pd.to_datetime``), and the maturity,
    default and prepay date / month-count columns are copied onto the loan as attributes.

    Parameters
    ----------
    in_df : pd.DataFrame
        Must contain 'GP', 'NoteDt', 'MaturityDt', 'MaturityMthsQty', 'DefaultDt',
        'DefaultMthsQty', 'PrepayDt' and 'PrepayMthsQty'.

    Returns
    -------
    pool.Pooler
        Constructed from a dict of loans keyed by ``str(GP)``.
    """
    # {column -> {GP -> value}}
    by_column = in_df.set_index('GP').to_dict()
    # Loan attribute name -> source column, applied in this order.
    attribute_sources = {
        'maturity_dt': 'MaturityDt',
        'maturity_mths_qty': 'MaturityMthsQty',
        'default_dt': 'DefaultDt',
        'default_mths_qty': 'DefaultMthsQty',
        'prepay_dt': 'PrepayDt',
        'prepay_mths_qty': 'PrepayMthsQty',
    }
    loans_dict = {}
    for gp in by_column['NoteDt']:
        loan = pool.Loan(gp, pd.to_datetime(by_column['NoteDt'][gp]))
        loans_dict[str(gp)] = loan
        for attribute, column in attribute_sources.items():
            setattr(loan, attribute, by_column[column][gp])

    return pool.Pooler(loans_dict)

# Pooling universe: '21+' maturity bucket, 0 < LoanAmt <= 2,500,000, Code other than 721110.
data_slice = loan_data[loan_data['MatBucket'].eq('21+')]
data_slice = data_slice[data_slice['LoanAmt'].gt(0) & data_slice['LoanAmt'].le(2_500_000)]
data_slice = data_slice[data_slice['Code'].ne(721110)]
my_pooler = create_pooler(data_slice)
my_pooler.build_triangles_counts()

# Label the first three entries of each triangle as outstanding / prepayments / defaults.
pool_dict = {
    k: dict(outstanding=v[0], prepayments=v[1], defaults=v[2])
    for k, v in my_pooler.triangles.items()
}


### 3. Convert Pool dictionaries into Dataframe

In [184]:
# One row per cohort key, one column per triangle (outstanding / prepayments / defaults)
df_pool = pd.DataFrame.from_dict(pool_dict, orient='index')
# Cast the cohort keys to float so that sorting the index is numeric
df_pool.index = df_pool.index.map(float)
df_pool = df_pool.sort_index()
# Preview the first rows
df_pool.head()

Unnamed: 0,outstanding,prepayments,defaults
2000.01,"[411, 411, 411, 411, 410, 409, 409, 409, 406, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 6, 0, 2, 7, 0, 4, ...","[0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 2, 0, 2, 2, 1, ..."
2000.02,"[310, 310, 310, 310, 310, 309, 309, 308, 308, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 3, 0, 0, 1, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 0, 1, 0, 1, ..."
2000.03,"[421, 421, 421, 421, 421, 419, 417, 417, 415, ...","[0, 0, 0, 0, 2, 1, 0, 2, 0, 3, 1, 1, 4, 0, 7, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 3, 0, 1, ..."
2000.04,"[389, 389, 389, 389, 389, 389, 386, 385, 383, ...","[0, 0, 0, 0, 0, 3, 1, 1, 1, 3, 0, 1, 4, 2, 3, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 1, ..."
2000.05,"[439, 439, 439, 439, 438, 438, 438, 437, 436, ...","[0, 0, 0, 1, 0, 0, 0, 1, 2, 0, 6, 0, 5, 0, 2, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, ..."


In [186]:
# Format array lengths
# The cohort at position `count` (0-based, in index order) keeps its first
# (max_row_length - count) entries and is then padded with `count` trailing NaNs.
# Alternative (truncate only, no padding):
#     df_pool.at[i, col] = row[col][:(max_row_length - count)]
max_row_length = df_pool.shape[0]
for count, (i, row) in enumerate(df_pool.iterrows()):
    keep = max_row_length - count
    for col in df_pool.columns:
        trimmed = row[col][:keep].astype(float)
        df_pool.at[i, col] = np.pad(trimmed, (0, count), mode='constant', constant_values=np.nan)


### 4. Group into Annual cohorts and calculate SMM and CPR

### Reshape the yearly cohort data from Months on Book (MoB) to Years on Book (YoB)  
`[i,...........,n=MoB] ---> applymap(aggregate_method)`  
 
`[[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]...n_years=YoB]`

In [187]:

# First try here
# def aggregate_annual_sums(in_arr):
#     n_years = len(in_arr) // 12
#     arr_2d = in_arr[:n_years*12].reshape(n_years,12)
#     return np.nansum(arr_2d, axis=1)

def outstanding_annual_rundown(in_arr, period=11):
    """Sample an outstanding-count array once per aggregation window.

    Parameters
    ----------
    in_arr : sliceable sequence (e.g. np.ndarray)
        Values to sample.
    period : int, default 11
        Window length; every ``period``-th element is taken, starting at index 0.
        NOTE(review): the default is 11, matching the hard-coded window used by the
        other aggregation helpers in this notebook, although the commented-out first
        attempt used 12 - confirm which is intended.

    Returns
    -------
    Same type as ``in_arr[::period]``.

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be a positive integer")
    return in_arr[::period]

def aggregate_annual_sums(in_arr, period=11):
    """Sum a 1-D array over consecutive windows of ``period`` elements, ignoring NaNs.

    A trailing partial window is kept and summed over the values it does contain.
    NaNs count as zero (``np.nansum``), which matters because the arrays fed to this
    helper are NaN-padded; this also matches the later redefinition of this function
    in the notebook.

    Parameters
    ----------
    in_arr : array-like
        1-D numeric values.
    period : int, default 11
        Window length.

    Returns
    -------
    np.ndarray
        Float array of length ``ceil(len(in_arr) / period)``; empty for empty input.

    Raises
    ------
    ValueError
        If ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be a positive integer")
    values = np.asarray(in_arr, dtype=float)
    n_windows = -(-values.size // period)  # ceiling division
    # Zero-fill up to a whole number of windows so the reshape is always valid.
    padded = np.zeros(n_windows * period)
    padded[:values.size] = values
    return np.nansum(padded.reshape(n_windows, period), axis=1)

def aggregate_annual_averages(in_arr):
    """Mean of each complete 11-element window of ``in_arr``, ignoring NaNs.

    Elements beyond the last complete window are dropped. Returns a 1-D array
    with one value per complete window.
    """
    full_windows = len(in_arr) // 11
    usable = in_arr[:full_windows * 11]
    return np.nanmean(usable.reshape(full_windows, 11), axis=1)

def aggregate_annual_median(in_arr):
    """Median of each complete 11-element window of ``in_arr``, ignoring NaNs.

    Elements beyond the last complete window are dropped. Returns a 1-D array
    with one value per complete window.
    """
    full_windows = len(in_arr) // 11
    usable = in_arr[:full_windows * 11]
    return np.nanmedian(usable.reshape(full_windows, 11), axis=1)




In [188]:
# From here on the analysis works on annual cohorts rather than the monthly ones.
# The integer part of the cohort key is used as the cohort year.
df_pool['Year'] = df_pool.index.astype(int)
# Keep only years that have exactly 12 cohort rows.
vals = df_pool['Year'].value_counts().to_dict()
yr_range = [year for year, n_rows in vals.items() if n_rows == 12]

df_pool = df_pool[df_pool['Year'].isin(yr_range)]

# Collapse the rows of each year into a single annual row.
year_grouped = df_pool.groupby('Year').agg(np.nansum)


In [189]:
def aggregate_annual_sums(in_arr):
    """Sum ``in_arr`` over consecutive 11-element windows, treating NaNs as zero.

    If the length is not a multiple of 11, the array is first extended with zeros
    so the trailing partial window is summed as well.
    """
    overflow = len(in_arr) % 11
    padded = in_arr if overflow == 0 else np.concatenate([in_arr, np.zeros(11 - overflow)])
    windows = np.split(padded, len(padded) // 11)
    return np.nansum(windows, axis=1)

# Sample the outstanding counts once per window and sum the exits over the same windows.
year_grouped['outstanding'] = year_grouped['outstanding'].apply(outstanding_annual_rundown)
exit_cols = ['prepayments', 'defaults']
year_grouped[exit_cols] = year_grouped[exit_cols].applymap(aggregate_annual_sums)
# CPR per window = (prepayments + defaults) / outstanding
exits = year_grouped['prepayments'] + year_grouped['defaults']
year_grouped['cpr'] = exits / year_grouped['outstanding']
# Alternative (annualising a monthly SMM): year_grouped['cpr'] = (1-(1-year_grouped['smm'])**12)
def enforce_shape(in_df:pd.DataFrame)->pd.DataFrame:
    """Blank out the cells of a cohort table that lie beyond each cohort's horizon.

    For a row labelled ``y``, columns at positions ``>= (max(index) + 1 - y)`` are set
    to NaN, so the row with the largest label keeps only its first column and every
    earlier row keeps one more.

    Parameters
    ----------
    in_df : pd.DataFrame
        Frame with a numeric index; it is not modified.

    Returns
    -------
    pd.DataFrame
        A deep copy of ``in_df`` with the out-of-horizon cells set to NaN.
    """
    data = in_df.copy(deep=True)
    last_year = data.index.max() + 1
    for row_pos, cohort_year in enumerate(data.index):
        first_blank_col = last_year - cohort_year
        # np.nan rather than the np.NaN alias, which was removed in NumPy 2.0.
        data.iloc[row_pos, first_blank_col:] = np.nan
    return data

def generate_lifetime(cpr_df:pd.DataFrame)-> pd.DataFrame:
    """Row-wise running average of the non-NaN values in ``cpr_df``.

    Each non-NaN cell is replaced by the mean of all non-NaN values in its row up to
    and including that column. Cells that are NaN in the input stay NaN in the output.
    The result keeps the index and columns of ``cpr_df``.
    """
    values = cpr_df.values
    missing = np.isnan(values)
    # Running numerator: NaNs contribute nothing to the cumulative sum.
    running_total = np.nancumsum(values, axis=1)
    # Running denominator: number of non-NaN values seen so far in the row.
    running_count = np.cumsum(~missing, axis=1)
    averaged = np.where(missing, np.nan, running_total / running_count)
    return pd.DataFrame(averaged, columns=cpr_df.columns, index=cpr_df.index)

# Expand the per-cohort CPR arrays into a cohort x period table and blank out the
# cells beyond each cohort's horizon.
cpr_heat = enforce_shape(pd.DataFrame.from_dict(year_grouped['cpr'].to_dict(), orient='index'))
lifetime_cprs = generate_lifetime(cpr_heat)

# Same expansion for the outstanding / prepayment / default counts.
outstanding_triangle, prepays, defaults = (
    enforce_shape(pd.DataFrame.from_dict(year_grouped[column].to_dict(), orient='index'))
    for column in ('outstanding', 'prepayments', 'defaults')
)
triangles = [outstanding_triangle, prepays, defaults]
# The three triangles stacked on top of each other, in the order listed above.
totals = pd.concat(triangles, axis=0)

# Per-period summary statistics across cohorts.
# Bug fix: the frame is kept in `min_max_median` and written out afterwards. Previously the
# name was bound to the return value of `.to_csv(path)`, which is None, so the later
# `min_max_median.to_csv(...)` raised AttributeError and totals.csv was never written.
min_max_median = pd.DataFrame.from_dict({
    'max': cpr_heat.max(axis=0),
    'median': cpr_heat.median(axis=0),
    'avg': cpr_heat.mean(axis=0),
    'min': cpr_heat.min(axis=0),
}).transpose()

# Make sure the output directory exists before writing into it.
os.makedirs('manual_outputs', exist_ok=True)
min_max_median.to_csv('811_mids.csv')
cpr_heat.to_csv('manual_outputs/cpr_heat.csv')
lifetime_cprs.to_csv('manual_outputs/lifetime.csv')
min_max_median.to_csv('manual_outputs/min_max_mids.csv')
totals.to_csv('manual_outputs/totals.csv')

Year
2000    [97.0, 340.0, 401.0, 425.0, 409.0, 499.0, 431....
2001    [23.0, 138.0, 311.0, 454.0, 536.0, 510.0, 280....
2002    [42.0, 180.0, 399.0, 1008.0, 782.0, 517.0, 178...
2003    [51.0, 226.0, 736.0, 1010.0, 552.0, 234.0, 48....
2004    [70.0, 373.0, 839.0, 791.0, 269.0, 83.0, 50.0,...
2005    [64.0, 368.0, 511.0, 310.0, 88.0, 53.0, 55.0, ...
2006    [48.0, 185.0, 160.0, 62.0, 42.0, 59.0, 72.0, 1...
2007    [21.0, 49.0, 36.0, 43.0, 48.0, 77.0, 77.0, 77....
2008    [17.0, 11.0, 30.0, 54.0, 101.0, 135.0, 149.0, ...
2009    [7.0, 18.0, 39.0, 211.0, 202.0, 187.0, 192.0, ...
2010    [7.0, 44.0, 155.0, 364.0, 343.0, 323.0, 315.0,...
2011    [21.0, 64.0, 230.0, 401.0, 399.0, 363.0, 347.0...
2012    [23.0, 83.0, 198.0, 400.0, 381.0, 387.0, 373.0...
2013    [34.0, 113.0, 249.0, 473.0, 502.0, 429.0, 376....
2014    [18.0, 108.0, 328.0, 599.0, 603.0, 544.0, 342....
2015    [38.0, 173.0, 427.0, 662.0, 693.0, 441.0, 511....
2016    [40.0, 175.0, 443.0, 687.0, 464.0, 604.0, 255....
2017    [

### 5. Create basic triangles

### 6. Get Line Plots (or just the data for line plots)

In [33]:
def generate_lifetime(cpr_df:pd.DataFrame)-> pd.DataFrame:
    """Row-wise running average of the non-NaN values in ``cpr_df``.

    Each non-NaN cell is replaced by the mean of all non-NaN values in its row up to
    and including that column. Cells that are NaN in the input stay NaN in the output.
    The result keeps the index and columns of ``cpr_df``.
    """
    values = cpr_df.values
    missing = np.isnan(values)
    # Running numerator: NaNs contribute nothing to the cumulative sum.
    running_total = np.nancumsum(values, axis=1)
    # Running denominator: number of non-NaN values seen so far in the row.
    running_count = np.cumsum(~missing, axis=1)
    averaged = np.where(missing, np.nan, running_total / running_count)
    return pd.DataFrame(averaged, columns=cpr_df.columns, index=cpr_df.index)

# Running lifetime-average CPR for every cohort
lifetime_cprs = generate_lifetime(cpr_df=cpr_heat)

# Display only: show missing cells as blanks (lifetime_cprs itself is left unchanged)
lifetime_cprs.fillna(value='')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
2000,0.028908,0.067926,0.095588,0.109065,0.119703,0.135512,0.152552,0.165773,0.166687,0.159305,...,0.133122,0.131684,0.130753,0.130462,0.12858,0.128371,0.128942,0.128907,0.129513,
2001,0.011315,0.036692,0.065382,0.094292,0.12091,0.152553,0.161985,0.166114,0.159212,0.151749,...,0.133065,0.130472,0.129956,0.129552,0.13082,0.12836,0.129758,0.130564,,
2002,0.01456,0.032773,0.065331,0.121369,0.170796,0.197208,0.197018,0.185433,0.173238,0.16325,...,0.141801,0.142,0.140098,0.140672,0.143116,0.140337,0.14496,,,
2003,0.014679,0.042629,0.10217,0.16289,0.186757,0.186395,0.171398,0.158603,0.150871,0.143025,...,0.133326,0.132512,0.132195,0.131106,0.130857,0.130011,,,,
2004,0.017335,0.060835,0.138176,0.182227,0.181021,0.166228,0.156359,0.147618,0.14191,0.138658,...,0.13095,0.132926,0.129882,0.130437,0.131372,,,,,
2005,0.015571,0.065601,0.095944,0.110743,0.109712,0.105464,0.101661,0.099709,0.101247,0.100301,...,0.110234,0.10961,0.112706,0.116785,,,,,,
2006,0.016346,0.059518,0.083044,0.091608,0.093937,0.094837,0.096415,0.098166,0.096982,0.095589,...,0.10784,0.109396,0.11035,,,,,,,
2007,0.015645,0.049623,0.065859,0.076817,0.080279,0.086406,0.086577,0.085639,0.086938,0.087446,...,0.105726,0.108164,,,,,,,,
2008,0.007463,0.033527,0.047035,0.052094,0.060996,0.067995,0.073182,0.079423,0.08271,0.084806,...,0.104132,,,,,,,,,
2009,0.007841,0.015908,0.025247,0.051343,0.066114,0.074524,0.080168,0.089195,0.095134,0.103715,...,,,,,,,,,,


In [34]:
import plotly.express as px

# Transpose so that the cohorts become the columns handed to px.line.
lifetime_by_period = lifetime_cprs.transpose()
fig = px.line(
    lifetime_by_period,
    line_shape='spline',
    title="Lifetime Average CPR by Year from Origination",
    markers=True,
)
# Label the axes and show the y-axis as percentages with one decimal place.
fig.update_layout(yaxis_title='CPR', xaxis_title='Year from Origination', yaxis={'tickformat': '0.0%'})

In [17]:
# Distinct MarginBucket labels, in the order returned by value_counts()
margin_bucket_counts = loan_data['MarginBucket'].value_counts()
margin_bucket_counts.index.tolist()

['0.025 - 0.0275',
 '0.0275+',
 '0.0175 - 0.02',
 '0.0 - 0.01',
 '0.02 - 0.0225',
 '0.0225 - 0.025',
 '0.0125 - 0.015',
 '0.015 - 0.0175',
 '0.01 - 0.0125']

In [18]:
def subset_dataframe(df, conditions, inverse=False):
    """Return the rows of ``df`` selected by membership tests on several columns.

    This name is also defined earlier in the notebook with an ``inverse`` option; the
    parameter is accepted here as well so that this later definition does not break
    calls that pass it.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter.
    conditions : dict
        Maps a column name to a list-like of values, as accepted by ``Series.isin``.
    inverse : bool, default False
        When falsy, keep rows whose value is in the given values for *every* column.
        When truthy, keep rows whose value is in the given values for *none* of the columns.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` selected by the combined boolean mask.
    """
    mask = pd.Series(True, index=df.index)
    for col, cond in conditions.items():
        matches = df[col].isin(cond)
        mask &= ~matches if inverse else matches
    return df[mask]


In [114]:
# Toy frame used to sanity-check subset_dataframe
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'Dave'],
    age=[25, 30, 35, 40],
    gender=['F', 'M', 'M', 'M'],
)
df = pd.DataFrame(data)

# A row is kept only if it passes every condition: age in [25, 30] AND gender in ['F']
subset = subset_dataframe(df, {'age': [25, 30], 'gender': ['F']})
print(subset)


    name  age gender
0  Alice   25      F
