### Basic imports

In [212]:
import pandas as pd
import sqlalchemy as sa
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os
from dotenv import load_dotenv
load_dotenv()

# Build the MySQL engine from the 'uname', 'password', 'host' and 'db' environment variables
# (load_dotenv() is called just above).
# NOTE(review): the values are interpolated verbatim — a password containing '@', ':' or '/'
# would yield a malformed URL; confirm the credentials are URL-safe.
engine = sa.create_engine(
    f"mysql+pymysql://{os.getenv('uname')}:{os.getenv('password')}"
    f"@{os.getenv('host')}/{os.getenv('db')}"
)


### 1. Read in data from CSV (alternatively, use `pd.read_sql` with the engine above, with credentials loaded from `.env`)

In [270]:
loan_data = pd.read_csv("raw_data/loans_v2.csv")
# Columns whose name ends in 'Dt' are parsed into datetime values
date_cols = [c for c in loan_data.columns if str(c).endswith('Dt')]
for col in date_cols:
    loan_data[col] = pd.to_datetime(loan_data[col])


In [214]:
def subset_dataframe(df, conditions, inverse=False):
    """Filter a DataFrame on several columns at once.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter.
    conditions : dict
        Maps a column name to the collection of values tested with ``isin``.
    inverse : bool, default False
        When falsy, a row is kept only if every listed column holds one of
        its listed values. When truthy, a row is kept only if none of the
        listed columns holds one of its listed values.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` selected by the combined mask.
    """
    keep = pd.Series(True, index=df.index)
    for column, allowed in conditions.items():
        hits = df[column].isin(allowed)
        # Each column narrows the mask further (logical AND across columns)
        keep &= ~hits if inverse else hits
    return df[keep]

In [292]:
# Scratch exploration: loans in the '8-11' maturity bucket, noted in 2021 or earlier,
# that have a non-null industry description
test_slice = loan_data[
    (loan_data['MatBucket'] == '8-11')
    & (loan_data['Note_Yr'] <= 2021)
    & loan_data['CodeText'].notna()
]
# conditions = {'Code':[721110]}
# test1 = subset_dataframe(test_slice,conditions, inverse=True)
# Display only the rows whose description contains 'Fitness'
test_slice.loc[test_slice['CodeText'].str.contains('Fitness')]
# test_slice[test_slice['LoanAmt']<200000]


Unnamed: 0,GP,Note_Yr,Note_Mth,NoteDt,LoanAmt,MaturityDt,MaturityMthsQty,PrepayDt,PrepayMthsQty,DefaultDt,DefaultMthsQty,PP_qty,MatBucket,Code,CodeText,Margin,MarginBucket,state_abbreviation,state_name,yr
327,4300417,2006,6,2006-06-30,7500.0,2016-06-30,120,2007-12-17,17.0,2007-04-20,9.0,9.0,8-11,713940.0,Fitness and Recreational Sports Centers,0.0450,0.0275+,TX,Texas,2006.0
657,2030105,2001,5,2001-05-31,132200.0,2009-05-31,96,2009-05-31,96.0,NaT,,96.0,8-11,713940.0,Fitness and Recreational Sports Centers,0.0250,0.0225 - 0.025,CA,California,2001.0
1061,2360609,2003,4,2003-04-30,154900.0,2013-04-30,120,2006-03-31,35.0,NaT,,35.0,8-11,713940.0,Fitness and Recreational Sports Centers,0.0225,0.02 - 0.0225,CA,California,2003.0
1352,3966691,2004,6,2004-06-30,278000.0,2014-06-30,120,2006-06-30,24.0,NaT,,24.0,8-11,713940.0,Fitness and Recreational Sports Centers,0.0250,0.0225 - 0.025,WA,Washington,2004.0
1931,4340297,2007,2,2007-02-28,80000.0,2015-04-28,98,2011-06-14,51.0,2009-08-03,29.0,29.0,8-11,713940.0,Fitness and Recreational Sports Centers,0.0200,0.0175 - 0.02,WA,Washington,2007.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738671,10439288,2020,9,2020-09-30,225800.0,2030-07-30,118,NaT,,NaT,,,8-11,713940.0,Fitness and Recreational Sports Centers,0.0275,0.025 - 0.0275,MN,Minnesota,2020.0
738685,10567226,2020,8,2020-08-31,340400.0,2030-08-07,119,NaT,,NaT,,,8-11,713940.0,Fitness and Recreational Sports Centers,0.0275,0.025 - 0.0275,CO,Colorado,2020.0
738750,10701723,2021,2,2021-02-03,225300.0,2030-12-17,118,NaT,,NaT,,,8-11,713940.0,Fitness and Recreational Sports Centers,0.0275,0.025 - 0.0275,OH,Ohio,2021.0
738851,17645973,2021,6,2021-06-30,405000.0,2031-06-01,119,NaT,,NaT,,,8-11,713940.0,Fitness and Recreational Sports Centers,0.0100,0.0 - 0.01,KY,Kentucky,2021.0


In [269]:
# Industry description for every row of the slice carrying code 624410
test_slice.loc[test_slice['Code'] == 624410, 'CodeText']

504       Child Care Services
1087      Child Care Services
1590      Child Care Services
2481      Child Care Services
2492      Child Care Services
                 ...         
738623    Child Care Services
738627    Child Care Services
738891    Child Care Services
738948    Child Care Services
738994    Child Care Services
Name: CodeText, Length: 3735, dtype: object

In [216]:
# Keep only the loans in the '21+' maturity bucket (21+ year maturities)
data_slice = loan_data.loc[loan_data['MatBucket'] == '21+']

### 2. Use pool.py to organize cohorts --> loans are split into yyyy.mm cohorts

In [248]:
# '21+' maturity bucket, loan amount above 2.5M, industry code 721110
data_slice = loan_data[
    (loan_data['MatBucket'] == '21+')
    & (loan_data['LoanAmt'] > 2_500_000)
    & (loan_data['Code'] == 721110)
]
# Displayed only: the Note_Yr filter below is not assigned back to data_slice
data_slice.loc[data_slice['Note_Yr'] <= 2021]
# (data_slice['LoanAmt']>0)&

Unnamed: 0,GP,Note_Yr,Note_Mth,NoteDt,LoanAmt,MaturityDt,MaturityMthsQty,PrepayDt,PrepayMthsQty,DefaultDt,DefaultMthsQty,PP_qty,MatBucket,Code,CodeText,Margin,MarginBucket,state_abbreviation,state_name,yr
3848,4729548,2013,8,2013-08-01,4000000.0,2038-08-01,300,2018-06-30,58.0,NaT,,58.0,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0125,0.01 - 0.0125,TX,Texas,2013.0
4339,4885629,2016,2,2016-02-01,2880000.0,2041-02-01,300,2020-01-31,47.0,NaT,,47.0,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0150,0.0125 - 0.015,AZ,Arizona,2016.0
4495,4925852,2015,8,2015-08-01,2801800.0,2041-02-01,306,2019-10-31,50.0,NaT,,50.0,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0125,0.01 - 0.0125,OR,Oregon,2015.0
4555,4941051,2016,7,2016-07-19,4400000.0,2041-07-19,300,2020-04-30,45.0,NaT,,45.0,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0225,0.02 - 0.0225,NJ,New Jersey,2016.0
4727,4983334,2017,3,2017-03-21,4225000.0,2042-02-21,299,2022-06-06,62.0,NaT,,62.0,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0275,0.025 - 0.0275,KY,Kentucky,2017.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
738501,5129682,2020,1,2020-01-31,2602500.0,2044-09-26,295,NaT,,NaT,,,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0150,0.0125 - 0.015,GA,Georgia,2020.0
738549,5137719,2019,11,2019-11-30,4435000.0,2044-11-20,299,NaT,,NaT,,,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0150,0.0125 - 0.015,MI,Michigan,2019.0
738720,10651432,2020,9,2020-09-30,4300000.0,2045-09-16,299,NaT,,NaT,,,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0125,0.01 - 0.0125,WA,Washington,2020.0
738780,12642889,2021,3,2021-03-25,3800000.0,2046-02-23,298,NaT,,NaT,,,21+,721110.0,Hotels (except Casino Hotels) and Motels,0.0250,0.0225 - 0.025,NJ,New Jersey,2021.0


In [240]:
from pooler import pool

# Build a pool.Pooler holding one pool.Loan per row of a loan DataFrame
def create_pooler(in_df:pd.DataFrame)-> pool.Pooler:
    """Turn a loan DataFrame into a ``pool.Pooler``.

    ``in_df`` must provide the columns 'GP', 'NoteDt', 'MaturityDt',
    'MaturityMthsQty', 'DefaultDt', 'DefaultMthsQty', 'PrepayDt' and
    'PrepayMthsQty'. One ``pool.Loan`` is created per 'GP' value from
    ``(gp, note date)``, and the remaining fields are attached as attributes.

    Returns a ``pool.Pooler`` built from a dict of those loans keyed by
    ``str(gp)``.
    """
    # {column name -> {gp -> value}}
    by_column = in_df.set_index('GP').to_dict()
    loans_dict = {}
    for gp in by_column['NoteDt']:
        loan = pool.Loan(gp, pd.to_datetime(by_column['NoteDt'][gp]))
        loan.maturity_dt = by_column['MaturityDt'][gp]
        loan.maturity_mths_qty = by_column['MaturityMthsQty'][gp]
        loan.default_dt = by_column['DefaultDt'][gp]
        loan.default_mths_qty = by_column['DefaultMthsQty'][gp]
        loan.prepay_dt = by_column['PrepayDt'][gp]
        loan.prepay_mths_qty = by_column['PrepayMthsQty'][gp]
        loans_dict[str(gp)] = loan

    return pool.Pooler(loans_dict)

# Build the pooler for the selected loans and have it compute its count triangles
my_pooler = create_pooler(data_slice)
my_pooler.build_triangles_counts()

# Label the first three entries of each triangle, keyed as in my_pooler.triangles
pool_dict = {
    cohort: dict(outstanding=tri[0], prepayments=tri[1], defaults=tri[2])
    for cohort, tri in my_pooler.triangles.items()
}


### 3. Convert Pool dictionaries into Dataframe

In [241]:
# One row per pool_dict key; keys are converted to float so the rows sort numerically
df_pool = pd.DataFrame.from_dict(pool_dict, orient='index')
df_pool.index = df_pool.index.map(float)
df_pool = df_pool.sort_index()
# Preview the first rows
df_pool.head()

Unnamed: 0,outstanding,prepayments,defaults
2000.04,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2000.05,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2010.11,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2010.12,"[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2011.01,"[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [242]:
# Format array lengths: for the row at position `count`, keep the first
# (number of rows - count) entries of every array and append `count` NaNs.
max_row_length = df_pool.shape[0]
count = -1  # keeps its value only when df_pool has no rows
for count, (i, row) in enumerate(df_pool.iterrows()):
    for col in df_pool.columns:
        # Alternative (truncate only, no NaN padding):
        # df_pool.at[i,col] = row[col][:(max_row_length-count)]
        arr = row[col][:max_row_length - count].astype(float)
        # The number of NaNs appended equals the row position
        padded_arr = np.pad(arr, (0, count), mode='constant', constant_values=np.nan)
        df_pool.at[i,col] = padded_arr


### 4. Group into Annual cohorts and calculate SMM and CPR

### Reshape each yearly cohort's data from months on book (MoB) to years on book (YoB)  
`[i,...........,n=MoB] ---> applymap(aggregate_method)`  
 
`[[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]`  
`[i,...]...n_years=YoB]`

In [243]:

# First try here
# def aggregate_annual_sums(in_arr):
#     n_years = len(in_arr) // 12
#     arr_2d = in_arr[:n_years*12].reshape(n_years,12)
#     return np.nansum(arr_2d, axis=1)

def outstanding_annual_rundown(in_arr, period=11):
    """Sample the outstanding balance at the start of each period.

    Parameters
    ----------
    in_arr : sequence supporting extended slicing (e.g. np.ndarray)
        Outstanding values, one per position.
    period : int, default 11
        Stride between sampled positions. The default keeps the notebook's
        original behaviour.
        NOTE(review): an "annual" stride would normally be 12 — confirm that
        11 is intended; the sum/average helpers in this notebook use 11 too.

    Returns
    -------
    The elements of ``in_arr`` at positions 0, period, 2*period, ...
    """
    return in_arr[::period]

def aggregate_annual_sums(in_arr, period=11):
    """Sum an array in consecutive chunks of ``period`` elements, ignoring NaNs.

    A trailing partial chunk is zero-padded, so it contributes the sum of the
    elements it does have. NaNs count as zero (``np.nansum``), which matches
    the redefinition of this function in a later cell of this notebook — keep
    the two in sync, or drop one of them.

    Parameters
    ----------
    in_arr : array-like
        One-dimensional values.
    period : int, default 11
        Chunk length. The default keeps the notebook's original behaviour.
        NOTE(review): confirm 11 (rather than 12) is the intended length.

    Returns
    -------
    np.ndarray
        One sum per chunk; empty when ``in_arr`` is empty.
    """
    values = np.asarray(in_arr)
    # Number of zeros needed to reach the next multiple of `period`
    shortfall = -len(values) % period
    if shortfall:
        values = np.concatenate([values, np.zeros(shortfall)])
    # An explicit row count keeps the reshape valid for empty input
    return np.nansum(values.reshape(len(values) // period, period), axis=1)

def aggregate_annual_averages(in_arr):
    """NaN-aware mean of each complete 11-element chunk of ``in_arr``.

    Elements past the last complete chunk are dropped. Returns one mean per
    chunk as a 1-D array.
    """
    whole_chunks, _ = divmod(len(in_arr), 11)
    trimmed = in_arr[:whole_chunks * 11]
    return np.nanmean(trimmed.reshape(whole_chunks, 11), axis=1)

def aggregate_annual_median(in_arr):
    """NaN-aware median of each complete 11-element chunk of ``in_arr``.

    Elements past the last complete chunk are dropped. Returns one median per
    chunk as a 1-D array.
    """
    full_blocks = len(in_arr) // 11
    usable = full_blocks * 11
    blocks = in_arr[:usable].reshape(full_blocks, 11)
    return np.nanmedian(blocks, axis=1)




In [244]:
# From here on the data is viewed as annual cohorts grouped together:
# the integer part of the yyyy.mm index is the cohort year
df_pool['Year'] = df_pool.index.astype(int)

# Keep only the years that have exactly 12 monthly rows
vals = df_pool['Year'].value_counts().to_dict()
yr_range = [year for year, n_rows in vals.items() if n_rows == 12]
df_pool = df_pool[df_pool['Year'].isin(yr_range)]

# Collapse the monthly rows into one row per year
year_grouped = df_pool.groupby('Year').agg(np.nansum)


In [245]:
def aggregate_annual_sums(in_arr, period=11):
    """Sum an array in consecutive chunks of ``period`` elements, ignoring NaNs.

    A trailing partial chunk is zero-padded, so it contributes the sum of the
    elements it does have. NaNs count as zero (``np.nansum``). Unlike a
    ``np.split``-based version, empty input yields an empty result rather than
    an error.

    Parameters
    ----------
    in_arr : array-like
        One-dimensional values.
    period : int, default 11
        Chunk length. The default keeps the notebook's original behaviour.
        NOTE(review): confirm 11 (rather than 12) is the intended length.

    Returns
    -------
    np.ndarray
        One sum per chunk; empty when ``in_arr`` is empty.
    """
    values = np.asarray(in_arr)
    # Number of zeros needed to reach the next multiple of `period`
    shortfall = -len(values) % period
    if shortfall:
        values = np.concatenate([values, np.zeros(shortfall)])
    # An explicit row count keeps the reshape valid for empty input
    return np.nansum(values.reshape(len(values) // period, period), axis=1)

# NOTE: the statements below overwrite year_grouped's columns in place, so running this
# cell twice re-applies the reductions to arrays that were already reduced.
# Reduce each year's outstanding array with outstanding_annual_rundown.
year_grouped['outstanding'] = year_grouped['outstanding'].apply(outstanding_annual_rundown)
# Reduce each year's prepayment and default arrays with aggregate_annual_sums.
year_grouped[['prepayments','defaults']] = year_grouped[['prepayments','defaults']].applymap(aggregate_annual_sums)
# Per-year 'cpr' array: (prepayments + defaults) / outstanding.
year_grouped['cpr'] = (year_grouped['prepayments']+year_grouped['defaults'])/year_grouped['outstanding']
# Alternative kept for reference (annualises a monthly 'smm' column, which is not computed here):
# year_grouped['cpr'] = (1-(1-year_grouped['smm'])**12)
def enforce_shape(in_df:pd.DataFrame)->pd.DataFrame:
    """Blank out the lower-right part of a cohort table so it forms a triangle.

    For the row labelled ``y``, every column at position
    ``(max index label + 1) - y`` or later is set to NaN. The input frame is
    not modified; a deep copy is returned.

    Parameters
    ----------
    in_df : pd.DataFrame
        Table whose index labels support subtraction (the notebook passes
        integer cohort years).

    Returns
    -------
    pd.DataFrame
        Copy of ``in_df`` with the trailing cells of each row set to NaN.
    """
    data = in_df.copy(deep=True)
    last_year = data.index.max()+1
    for row_pos, cohort_year in enumerate(data.index):
        max_col = last_year - cohort_year
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        data.iloc[row_pos, max_col:] = np.nan
    return data

def generate_lifetime(cpr_df:pd.DataFrame)-> pd.DataFrame:
    """Row-wise running average of ``cpr_df`` that skips NaN cells.

    Each non-NaN cell becomes the mean of all non-NaN values in its row up to
    and including that column. Cells that are NaN in the input stay NaN. The
    result keeps the input's index and columns.
    """
    values = cpr_df.values
    missing = np.isnan(values)
    # NaNs add nothing to the running total ...
    running_total = np.nancumsum(values, axis=1)
    # ... and are not counted in the running denominator
    running_count = np.cumsum(~missing, axis=1)
    averaged = np.where(missing, np.nan, running_total / running_count)
    return pd.DataFrame(averaged, columns=cpr_df.columns, index=cpr_df.index)

# CPR table: one row per year, columns taken from the positions of each 'cpr' array
cpr_heat = enforce_shape(
    pd.DataFrame.from_dict(year_grouped['cpr'].to_dict(), orient='index')
)
lifetime_cprs = generate_lifetime(cpr_heat)

# Expand each count column into its own masked table, in the order
# outstanding, prepayments, defaults
triangles = [
    enforce_shape(pd.DataFrame.from_dict(year_grouped[name].to_dict(), orient='index'))
    for name in ('outstanding', 'prepayments', 'defaults')
]
prepays, defaults = triangles[1], triangles[2]
# Stack the three tables vertically for a single export
totals = pd.concat(triangles, axis=0)

# Column-wise summary statistics of the CPR table, one row per statistic
min_max_median = pd.DataFrame.from_dict({
    'max': cpr_heat.max(axis=0),
    'median': cpr_heat.median(axis=0),
    'avg': cpr_heat.mean(axis=0),
    'min': cpr_heat.min(axis=0),
}).transpose()

# Write the results to CSV
cpr_heat.to_csv('manual_outputs/cpr_heat.csv')
lifetime_cprs.to_csv('manual_outputs/lifetime.csv')
min_max_median.to_csv('manual_outputs/min_max_mids.csv')
totals.to_csv('manual_outputs/totals.csv')

### 5. Create basic triangles

### 6. Get Line Plots (or just the data for line plots)

In [229]:
def generate_lifetime(cpr_df:pd.DataFrame)-> pd.DataFrame:
    """Cumulative row-wise mean of ``cpr_df``, ignoring NaN cells.

    For every row, a non-NaN cell is replaced by the average of the non-NaN
    values from the first column through that cell. NaN cells stay NaN. Index
    and columns are carried over unchanged.
    """
    raw = cpr_df.values
    is_nan = np.isnan(raw)
    # Running numerator (NaN treated as 0) and running count of real values
    totals_so_far = np.nancumsum(raw, axis=1)
    seen_so_far = (~is_nan).cumsum(axis=1)
    row_avg = np.where(is_nan, np.nan, totals_so_far / seen_so_far)
    lifetime_df = pd.DataFrame(row_avg, columns=cpr_df.columns, index=cpr_df.index)
    return lifetime_df

# Recompute the lifetime averages from the CPR table
lifetime_cprs = generate_lifetime(cpr_heat)

# Display with empty strings in place of NaN (lifetime_cprs itself is unchanged)
lifetime_cprs.fillna(value='')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
2011,0.0,0.014599,0.043567,0.079746,0.104003,0.115702,0.127075,0.136676,0.144525,0.157765,0.149225,,,
2012,0.011364,0.030586,0.052649,0.086362,0.110847,0.125938,0.141487,0.145108,0.159426,0.168012,,,,
2013,0.01105,0.027871,0.050745,0.08013,0.103014,0.120467,0.126776,0.131002,0.132872,,,,,
2014,0.00823,0.020713,0.048859,0.07921,0.099206,0.114362,0.124733,0.13521,,,,,,
2015,0.001887,0.025518,0.042857,0.088177,0.107764,0.11085,0.1218,,,,,,,
2016,0.00534,0.021462,0.048711,0.082801,0.091813,0.112981,,,,,,,,
2017,0.007519,0.029643,0.061484,0.075794,0.107267,,,,,,,,,
2018,0.009655,0.025023,0.03942,0.077462,,,,,,,,,,
2019,0.010622,0.016047,0.058243,,,,,,,,,,,
2020,0.009375,0.035445,,,,,,,,,,,,


In [34]:
import plotly.express as px

# One line per row of lifetime_cprs (the frame is transposed so its columns run along the x-axis)
fig = px.line(
    lifetime_cprs.T,
    line_shape='spline',
    title="Lifetime Average CPR by Year from Origination",
    markers=True,
)
# Label the axes and format the y-axis ticks as percentages
fig.update_layout(
    yaxis_title='CPR',
    xaxis_title='Year from Origination',
    yaxis=dict(tickformat='0.0%'),
)

In [17]:
# Distinct margin buckets, in value_counts() order
list(loan_data['MarginBucket'].value_counts().index)

['0.025 - 0.0275',
 '0.0275+',
 '0.0175 - 0.02',
 '0.0 - 0.01',
 '0.02 - 0.0225',
 '0.0225 - 0.025',
 '0.0125 - 0.015',
 '0.015 - 0.0175',
 '0.01 - 0.0125']

In [18]:
def subset_dataframe(df, conditions, inverse=False):
    """Return the rows of ``df`` selected by per-column value lists.

    This redefinition keeps the optional ``inverse`` flag accepted by the
    earlier definition of the same name in this notebook, so re-running this
    cell does not silently remove that capability.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter.
    conditions : dict
        Maps a column name to the collection of values tested with ``isin``.
    inverse : bool, default False
        When falsy, keep rows where every listed column holds one of its
        listed values. When truthy, keep rows where none of the listed
        columns holds one of its listed values.

    Returns
    -------
    pd.DataFrame
        The selected rows; all rows when ``conditions`` is empty.
    """
    mask = pd.Series(True, index=df.index)
    for col, cond in conditions.items():
        matches = df[col].isin(cond)
        # Conditions on different columns combine with logical AND
        mask &= ~matches if inverse else matches
    return df[mask]


In [114]:
# Small hand-made frame for exercising subset_dataframe
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'Dave'],
    age=[25, 30, 35, 40],
    gender=['F', 'M', 'M', 'M'],
)
df = pd.DataFrame(data)

# Keep rows whose age is 25 or 30 AND whose gender is 'F'
subset = subset_dataframe(df, {'age': [25, 30], 'gender': ['F']})
print(subset)


    name  age gender
0  Alice   25      F
