In [1]:
import numpy as np
import pandas as pd
# Silence pandas' chained-assignment (SettingWithCopy) warning notebook-wide; cells
# below assign new columns on filtered slices of a DataFrame.
# NOTE(review): this also hides genuine chained-assignment mistakes.
pd.options.mode.chained_assignment = None  # default='warn'

from scipy.optimize import curve_fit, minimize

import plotly
from plotly import graph_objects as go
import plotly.io as pio

# change default plotly theme
pio.templates.default = "plotly_white"

In [2]:
# Model inputs table; later filtered by its `market` column to read recovery rates,
# opex cost per loan and cost of capital.
inputs = pd.read_csv('pltv_inputs.csv')
# Cohort-level loan data (see data.info() below for the columns).
data = pd.read_csv('ke_data.csv')

In [3]:
# Inspect column names, non-null counts and dtypes of the loaded data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   First Loan Local Disbursement Month  181 non-null    object 
 1   Months Since First Loan Disbursed    181 non-null    int64  
 2   Count First Loans                    181 non-null    int64  
 3   Count Borrowers                      181 non-null    int64  
 4   Count Loans                          181 non-null    int64  
 5   Total Amount                         181 non-null    int64  
 6   Total Interest Assessed              181 non-null    int64  
 7   Total Rollover Charged               181 non-null    int64  
 8   Total Rollover Reversed              181 non-null    int64  
 9   Default Rate Amount 7D               158 non-null    float64
 10  Default Rate Amount 30D              148 non-null    float64
 11  Default Rate Amount 51D         

In [4]:
data = (
    data
    # fix date inconsistencies: zero-pad month labels written as YYYY-M
    .replace({
        '2020-9': '2020-09',
        '2021-4': '2021-04',
        '2021-5': '2021-05',
        '2021-6': '2021-06',
        '2021-7': '2021-07',
        '2021-8': '2021-08',
        '2021-9': '2021-09',
    })
    # sort by cohort month, then by months since first disbursement
    .sort_values(['First Loan Local Disbursement Month',
                  'Months Since First Loan Disbursed'])
    # remove all columns calculated through looker (everything after the 51D default rate)
    .loc[:, :"Default Rate Amount 51D"]
)

In [5]:
# Preview the first rows after the replace / sort / column trim above.
data.head()

Unnamed: 0,First Loan Local Disbursement Month,Months Since First Loan Disbursed,Count First Loans,Count Borrowers,Count Loans,Total Amount,Total Interest Assessed,Total Rollover Charged,Total Rollover Reversed,Default Rate Amount 7D,Default Rate Amount 30D,Default Rate Amount 51D
0,2020-09,0,7801,7801,13156,48361000,6540240,681325,81520,0.155382,0.121192,0.113031
15,2020-09,1,0,4481,5697,34490000,4660880,416544,32387,0.130661,0.101823,0.095738
2,2020-09,2,0,3661,4310,31461000,4297310,401077,30617,0.139719,0.103958,0.094792
9,2020-09,3,0,3050,3599,30482000,4178400,343062,17629,0.125111,0.089089,0.084399
10,2020-09,4,0,2549,2985,29303000,3964590,300262,920,0.11372,0.08675,0.081658


## pLTV

In [6]:
# KSH to USD conversion factor: the sBG feature methods multiply KSH amounts by this
# value when `to_usd` is set.
# NOTE(review): hard-coded rate — its source and date are not recorded in this notebook.
ksh_usd = 0.00925

In [48]:
class sBG:
    """
    sBG model class containing all functionality for creating, analyzing, and backtesting
    the sBG model.

    The class wraps a cohort-level DataFrame and derives the per-cohort feature
    columns used by the pLTV model. Several methods read two notebook-level
    globals rather than instance state:

    * ``inputs``  - table of model inputs with a ``market`` column
    * ``ksh_usd`` - KSH to USD conversion factor

    Attributes
    ----------
    data : pandas.DataFrame
        Cleaned data; after ``generate_features`` it also holds the feature columns.
    cohorts : list of pandas.DataFrame
        One feature frame per cohort (set by ``generate_features``).
    """

    # Market whose row is looked up in the global `inputs` table.
    MARKET = 'ke'

    def __init__(self, data):
        """
        Store ``data`` and clean it.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain 'First Loan Local Disbursement Month',
            'Months Since First Loan Disbursed' and 'Default Rate Amount 51D'.
        """
        self.data = data

        self.clean_data()

        # Snapshot of the cleaned rows: generate_features() overwrites self.data,
        # so it needs a stable input to stay repeatable.
        self._feature_source = self.data.copy()

    def clean_data(self):
        """Normalise cohort labels, sort the rows, drop trailing columns, add ``cohort``."""
        # fix date inconsistencies
        self.data = self.data.replace({'2021-9': '2021-09', '2021-8': '2021-08',
                                       '2021-7': '2021-07', '2021-6': '2021-06',
                                       '2021-5': '2021-05', '2021-4': '2021-04',
                                       '2020-9': '2020-09'})

        # sort by months since first disbursement
        self.data = self.data.sort_values(['First Loan Local Disbursement Month',
                                           'Months Since First Loan Disbursed'])

        # remove all columns calculated through looker
        # (.copy() so the column added below is written to an independent frame)
        self.data = self.data.loc[:, :"Default Rate Amount 51D"].copy()

        # add more convenient cohort column
        self.data['cohort'] = self.data['First Loan Local Disbursement Month']

    def _market_input(self, column):
        """
        Return the scalar ``column`` value for ``MARKET`` from the global ``inputs`` table.

        Raises
        ------
        ValueError
            If ``inputs`` does not hold exactly one row for the market.
        """
        values = inputs.loc[inputs.market == self.MARKET, column]
        if len(values) != 1:
            raise ValueError(
                "expected exactly one '{}' row in inputs for column '{}', found {}".format(
                    self.MARKET, column, len(values)))
        # explicit element access; float(<Series>) is deprecated in pandas 2.x
        return float(values.iloc[0])

    # --- DATA FUNCTIONS --- #
    def borrower_retention(self, cohort_data):
        """Borrowers in each row as a percentage of the cohort's maximum borrower count."""
        return 100*cohort_data['Count Borrowers']/cohort_data['Count Borrowers'].max()

    def borrower_survival(self, cohort_data):
        """Borrowers in each row as a percentage of the previous row (first row is NaN)."""
        return 100*cohort_data['Count Borrowers']/cohort_data['Count Borrowers'].shift(1)

    def loans_per_borrower(self, cohort_data):
        """Loan count divided by borrower count, row by row."""
        return cohort_data['Count Loans']/cohort_data['Count Borrowers']

    def loan_size(self, cohort_data, to_usd):
        """Total amount per loan; multiplied by the global ``ksh_usd`` when ``to_usd`` is truthy."""
        df = cohort_data['Total Amount']/cohort_data['Count Loans']
        if to_usd:
            df *= ksh_usd
        return df

    def interest_rate(self, cohort_data):
        """Interest assessed as a percentage of total amount."""
        return 100*cohort_data['Total Interest Assessed']/cohort_data['Total Amount']

    def default_rate(self, cohort_data, period=7):
        """
        Default rate in percent at ``period`` days past due.

        Parameters
        ----------
        cohort_data : pandas.DataFrame
            Rows of a single cohort. ``period=51`` additionally needs the
            ``default_rate_7dpd`` column and ``period=365`` the
            ``default_rate_51dpd`` column (``generate_features`` creates them in
            that order).
        period : {7, 51, 365}
            * 7   - observed 7D rate, missing values treated as 0.
            * 51  - observed 51D rate; missing values are derived from the 7dpd
                    rate reduced by the 7-30 and 30-51 recovery inputs.
            * 365 - always derived from the 51dpd rate reduced by the
                    ``recovery_51_`` input.

        Raises
        ------
        ValueError
            For any other ``period`` (previously this silently returned None).
        """
        if period == 7:
            return 100*cohort_data['Default Rate Amount 7D'].fillna(0)

        if period == 51:
            default_rate = 100*cohort_data['Default Rate Amount 51D']

            recovery_rate_51 = self._market_input('recovery_7-30') + \
                self._market_input('recovery_30-51')

            ## fill null 51dpd values with 7dpd values based on recovery rates
            loans = cohort_data['Count Loans']
            defaulted = loans*(cohort_data['default_rate_7dpd']/100)
            derived_51dpd = 100*(defaulted - defaulted*recovery_rate_51)/loans

            return default_rate.fillna(derived_51dpd)

        if period == 365:
            # all-NaN series: there is no observed 365dpd column, so every value
            # below comes from the derived rate
            default_rate = np.nan*cohort_data['Default Rate Amount 51D']

            recovery_rate_365 = self._market_input('recovery_51_')

            ## fill null 365dpd values with 51dpd values based on recovery rates
            loans = cohort_data['Count Loans']
            defaulted = loans*(cohort_data['default_rate_51dpd']/100)
            derived_365dpd = 100*(defaulted - defaulted*recovery_rate_365)/loans

            return default_rate.fillna(derived_365dpd)

        raise ValueError("period must be 7, 51 or 365, got {!r}".format(period))

    def loans_per_original(self, cohort_data):
        """Loan count per borrower of the cohort's maximum borrower count."""
        return cohort_data['Count Loans']/cohort_data['Count Borrowers'].max()

    def origination_per_original(self, cohort_data, to_usd):
        """Total amount per original borrower; converted with ``ksh_usd`` when ``to_usd``."""
        df = cohort_data['Total Amount']/cohort_data['Count Borrowers'].max()
        if to_usd:
            df *= ksh_usd
        return df

    def revenue_per_original(self, cohort_data, to_usd):
        """
        (Interest assessed + rollover charged - rollover reversed) per original
        borrower; converted with ``ksh_usd`` when ``to_usd``.
        """
        revenue = cohort_data['Total Interest Assessed'] + \
            cohort_data['Total Rollover Charged'] - \
            cohort_data['Total Rollover Reversed']
        df = revenue/cohort_data['Count Borrowers'].max()
        if to_usd:
            df *= ksh_usd
        return df

    def credit_margin(self, cohort_data):
        """
        Revenue per original minus the 365dpd default share of
        (origination + revenue) per original.

        Requires the ``revenue_per_original``, ``origination_per_original`` and
        ``default_rate_365dpd`` columns.
        """
        return cohort_data['revenue_per_original'] - \
                (cohort_data['origination_per_original'] + cohort_data['revenue_per_original'])* \
                cohort_data['default_rate_365dpd']/100

    def opex_per_original(self, cohort_data):
        """
        Operating cost per original borrower: per-loan opex times loans per
        original, plus one twelfth of the cost-of-capital input times
        origination per original.
        """
        opex_cost_per_loan = self._market_input('opex cost per loan')
        cost_of_capital = self._market_input('cost of capital')/12

        return opex_cost_per_loan*cohort_data['loans_per_original'] + \
            cost_of_capital*cohort_data['origination_per_original']

    def generate_features(self, to_usd=True):
        """
        Generate all features required for pLTV model.

        Features are computed cohort by cohort, with the last two rows of every
        cohort dropped first. The result replaces ``self.data`` and the
        per-cohort frames are stored in ``self.cohorts``.

        The computation always starts from the cleaned snapshot taken in
        ``__init__``, so calling this method repeatedly gives the same result
        instead of trimming two more rows per cohort on every call.

        Parameters
        ----------
        to_usd : bool, default True
            Convert monetary features with the global ``ksh_usd`` factor.
        """
        month_col = 'First Loan Local Disbursement Month'

        # instances built without __init__ have no snapshot; fall back to self.data
        source = getattr(self, '_feature_source', None)
        if source is None:
            source = self.data

        cohorts = []

        # for each cohort
        for cohort in source.loc[:, month_col].unique():
            # .copy() so the feature columns are added to an independent frame
            cohort_data = source[source[month_col] == cohort].iloc[:-2, :].copy()

            # data functions (order matters: later features read earlier columns)
            cohort_data['borrower_retention'] = self.borrower_retention(cohort_data)
            cohort_data['borrower_survival'] = self.borrower_survival(cohort_data)
            cohort_data['loans_per_borrower'] = self.loans_per_borrower(cohort_data)
            cohort_data['loan_size'] = self.loan_size(cohort_data, to_usd)
            cohort_data['interest_rate'] = self.interest_rate(cohort_data)
            cohort_data['default_rate_7dpd'] = self.default_rate(cohort_data, period=7)
            cohort_data['default_rate_51dpd'] = self.default_rate(cohort_data, period=51)
            cohort_data['default_rate_365dpd'] = self.default_rate(cohort_data, period=365)
            cohort_data['loans_per_original'] = self.loans_per_original(cohort_data)
            cohort_data['origination_per_original'] = self.origination_per_original(cohort_data, to_usd)
            cohort_data['revenue_per_original'] = self.revenue_per_original(cohort_data, to_usd)
            cohort_data['CM$_per_original'] = self.credit_margin(cohort_data)
            cohort_data['opex_per_original'] = self.opex_per_original(cohort_data)
            cohort_data['LTV_per_original'] = cohort_data['CM$_per_original'] - cohort_data['opex_per_original']
            cohort_data['CM%_per_original'] = 100*cohort_data['LTV_per_original']/cohort_data['revenue_per_original']

            # reset the index and append the data
            cohorts.append(cohort_data.reset_index(drop=True))

        self.cohorts = cohorts
        self.data = pd.concat(cohorts, axis=0)

    def plot_cohorts(self, param):
        """
        Generate scatter plot for a specific parameter.

        Draws one trace per cohort, with the row position within the cohort on
        the x axis and the ``param`` column on the y axis.

        Parameters
        ----------
        param : str
            Name of a column of ``self.data``.
        """
        month_col = 'First Loan Local Disbursement Month'

        traces = []
        # for each cohort
        for cohort in self.data.loc[:, month_col].unique():
            # restart the index at 0 so x is the position within the cohort
            curve = self.data.loc[self.data[month_col] == cohort, param].reset_index(drop=True)
            traces.append(go.Scatter(name=cohort, x=curve.index, y=curve))

        fig = go.Figure(traces)
        fig.update_layout(xaxis=dict(title='Month Since Disbursement'),
                         yaxis=dict(title=param))

        fig.show()
        

**TODO:** Review the `revenue_per_original` calculation — the values are far off. It uses a different formula than Liang's; understand his formula first.

In [49]:
# Build the model wrapper; the constructor runs clean_data() on `data`.
m = sBG(data)

In [50]:
# Compute the per-cohort feature columns (to_usd defaults to True); results land in
# m.data and m.cohorts.
m.generate_features()

In [51]:
# One trace per cohort of the `borrower_survival` feature (percent of the previous month's borrowers).
m.plot_cohorts('borrower_survival')

### Power law regression

In [11]:
def power_law(x, A, B):
    """
    Evaluate the power-law curve ``A * x**B``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the curve.
    A : float
        Scale factor.
    B : float
        Exponent.

    Returns
    -------
    numpy.ndarray or numpy.float64
        Curve values with the shape of ``x``.
    """
    # A plain list only worked when B happened to be a NumPy scalar (through its
    # reflected power); converting up front makes Python floats work as well.
    x = np.asarray(x, dtype=float)
    return A*x**B

In [12]:
# Observed retention of the 2020-09 cohort from index label 1 onward, without gaps.
arr = (
    m.data
    .loc[m.data['First Loan Local Disbursement Month'] == '2020-09']
    .loc[1:, 'borrower_retention']
    .dropna()
)
# Least-squares fit of A and B, using the index as the x values.
power_param, power_cov = curve_fit(power_law, arr.index, arr)

In [13]:
# Forecast horizon: months 1 through 24.
x = [*range(1, 25)]
# Evaluate the fitted curve (A, B) over the horizon.
power_fit = power_law(x, *power_param)

In [14]:
# Fitted power-law values over the horizon `x` (1-24).
power_fit

array([62.00792042, 42.82217139, 34.48413233, 29.57264734, 26.25010798,
       23.81446459, 21.93233679, 20.42263254, 19.17747563, 18.12811356,
       17.22840234, 16.44607782, 15.75781833, 15.14629549, 14.59833181,
       14.10370587, 13.65435215, 13.24381052, 12.866838  , 12.5191295 ,
       12.19711287, 11.89779617, 11.61865206, 11.35752914])

In [15]:
# Observed retention against the power-law fit.
traces = [
    go.Scatter(name='actual', x=arr.index, y=arr),
    go.Scatter(name='power-law', x=x, y=power_fit)
]

fig = go.Figure(traces)
# label the axes so the figure can be read on its own
fig.update_layout(xaxis=dict(title='Months Since First Loan'),
                  yaxis=dict(title='Borrower Retention (%)'))

fig.show()

### sBG probabilistic model

sBG model assumptions:
1. The propensity of one customer to drop out is independent of the behavior of every other customer.

https://faculty.wharton.upenn.edu/wp-content/uploads/2012/04/Fader_hardie_jim_07.pdf

In [16]:
# initial guesses @ alpha and beta
# NOTE(review): the minimize() calls in this notebook pass their own literal starting
# point, so these two names are not read by the fits.
alpha = 1
beta = 1

In [17]:
def _as_period(t, minimum):
    """Return ``t`` as an int; reject non-integral values and values below ``minimum``."""
    period = int(t)
    if period != t or period < minimum:
        raise ValueError("t must be an integer >= {}, got {!r}".format(minimum, t))
    return period


def _churn_probabilities(horizon, alpha, beta):
    """Return ``[P(T=1), ..., P(T=horizon)]`` using the sBG forward recursion."""
    probs = []
    prob = None
    for t in range(1, horizon + 1):
        if t == 1:
            prob = alpha/(alpha + beta)
        else:
            prob = prob * (beta+t-2)/(alpha+beta+t-1)
        probs.append(prob)
    return probs


def p(t, alpha, beta):
    """
    Probability that a customer drops out in period ``t`` (``t >= 1``).

    ``p(1) = alpha/(alpha+beta)`` and
    ``p(t) = p(t-1) * (beta+t-2)/(alpha+beta+t-1)``.

    Raises
    ------
    ValueError
        If ``t`` is not an integer >= 1 (the recursive form never terminated
        for such values).
    """
    return _churn_probabilities(_as_period(t, 1), alpha, beta)[-1]


def s(t, alpha, beta):
    """
    Probability that a customer is still active after period ``t`` (``t >= 0``).

    ``s(0) = 1`` and ``s(t) = s(t-1) - p(t)``.

    Raises
    ------
    ValueError
        If ``t`` is not an integer >= 0.
    """
    survival = 1
    for prob in _churn_probabilities(_as_period(t, 0), alpha, beta):
        survival = survival - prob
    return survival


def log_likelihood(params, counts=None):
    """
    Negative sBG log-likelihood of a cohort's active-customer counts.

    Parameters
    ----------
    params : sequence of two floats
        ``(alpha, beta)``.
    counts : array-like, optional
        Active customers per period, starting with period 0. When omitted the
        notebook-level global ``c`` is used, which keeps existing
        ``minimize(log_likelihood, x0, ...)`` calls working.

    Returns
    -------
    float
        ``-(sum_t (counts[t-1]-counts[t]) * log p(t) + counts[-1] * log s(T))``
        with ``T = len(counts) - 1``.

    Raises
    ------
    ValueError
        If fewer than two counts are supplied.
    """
    alpha, beta = params
    if counts is None:
        counts = c
    # positional values: the result must not depend on the Series' index labels
    counts = np.asarray(counts, dtype=float)
    if counts.ndim != 1 or counts.size < 2:
        raise ValueError("counts must be one-dimensional with at least two entries")

    horizon = counts.size - 1
    ll = 0
    survival = 1
    for t, prob in enumerate(_churn_probabilities(horizon, alpha, beta), start=1):
        # customers lost during period t
        ll += (counts[t-1]-counts[t])*np.log(prob)
        survival = survival - prob
    # customers still active at the end of the observation window
    ll += counts[-1]*np.log(survival)
    return -ll

In [71]:
# Cohort (first-loan disbursement month) used by the single-cohort cells below.
cohort = '2021-01'

In [72]:
# Borrower counts of the selected cohort, re-indexed from 0.
# log_likelihood() reads this global `c`.
c = (
    m.data
    .loc[m.data['First Loan Local Disbursement Month'] == cohort, 'Count Borrowers']
    .reset_index(drop=True)
)

In [73]:
# since we're working with logs, we need alpha and beta strictly > 0. Optimizer
# bounds are inclusive, so a lower bound of 0 would still allow alpha == 0 or
# beta == 0 and therefore log(0); use a small positive floor instead.
bounds = ((1e-6, None), (1e-6, None))

results = minimize(log_likelihood, np.array([1,1]), bounds=bounds)
results

      fun: 19059.167023391776
 hess_inv: <2x2 LbfgsInvHessProduct with dtype=float64>
      jac: array([0.00582077, 0.00218279])
  message: 'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 33
      nit: 7
     njev: 11
   status: 0
  success: True
        x: array([0.66914057, 0.78384057])

In [74]:
# Fitted sBG parameters.
alpha_opt, beta_opt = results.x

# Survival probability for every month in `x`, expressed in percent.
sBG_forecast = [100*s(i, alpha_opt, beta_opt) for i in x]

In [77]:
# Observed retention of the selected cohort from index label 1 onward, without gaps.
arr = (
    m.data
    .loc[m.data['First Loan Local Disbursement Month'] == cohort]
    .loc[1:, 'borrower_retention']
    .dropna()
)

# Refit the power law for this cohort and evaluate it over the horizon `x`.
power_param, power_cov = curve_fit(power_law, arr.index, arr)
power_fit = power_law(x, *power_param)

In [78]:
# Observed retention against both fitted models (all three series are in percent).
traces = [
    go.Scatter(name='actual', x=arr.index, y=arr),
    go.Scatter(name='power-law', x=x, y=power_fit),
    go.Scatter(name='sBG', x=x, y=sBG_forecast)
]

fig = go.Figure(traces)
fig.update_layout(xaxis=dict(title='Months Since First Loan'),
                  yaxis=dict(title='Borrower Retention (%)'))

fig.show()

#### Scale forecast

In [142]:
forecast = []

# forecast horizon: months 1-24
x = list(range(1, 25))

# alpha and beta must stay strictly positive (log_likelihood takes logs). Optimizer
# bounds are inclusive, so use a small positive floor rather than 0.
bounds = ((1e-6, None), (1e-6, None))

# all-NaN frame indexed 0-24, used only to stretch each cohort's index to the
# full horizon when concatenated column-wise below
holder_df = pd.DataFrame(np.nan, index=range(0,25), columns=['null'])

df = m.data[['cohort', 'Count Borrowers']]
for cohort in df.cohort.unique():
    # .copy() so the `data_type` column below is added to an independent frame
    c_data = df[df.cohort==cohort].copy()
    # cohort size at index label 0, used to scale survival probabilities to counts
    n = c_data.loc[0, 'Count Borrowers']

    # cohorts with 3 or fewer rows are skipped
    if len(c_data) > 3:
        # log_likelihood() reads the global `c`
        c = c_data['Count Borrowers']

        # fit model
        results = minimize(log_likelihood, np.array([1,1]), bounds=bounds)

        # forecast
        sBG_forecast = pd.DataFrame(
            [n*s(i, results.x[0], results.x[1]) for i in x],
            index=x, columns=['Count Borrowers'])

        c_data['data_type'] = 'actual'

        # extend the index, then label the added rows
        c_data = pd.concat([c_data, holder_df], axis=1).drop('null', axis=1)
        c_data['cohort'] = c_data['cohort'].ffill()
        c_data['data_type'] = c_data['data_type'].fillna('forecast')

        # remaining NaNs are the missing borrower counts; fill them from the
        # forecast frame (aligned on index and column name)
        forecast.append(c_data.fillna(sBG_forecast))

forecast = pd.concat(forecast)

In [151]:
# Cohort to display.
cohort = '2021-02'

df = forecast[forecast.cohort == cohort]

# One trace per data type found in the frame.
traces = [
    go.Scatter(
        name=dtype,
        x=df[df.data_type == dtype].index,
        y=df[df.data_type == dtype]['Count Borrowers'],
        mode='markers+lines',
    )
    for dtype in df.data_type.unique()
]

fig = go.Figure(traces)
fig.update_layout(
    xaxis={'title': 'Months Since Disbursement'},
    yaxis={'title': 'Count Borrowers'},
)

fig.show()

In [137]:
# Inspect the combined rows for one cohort.
# NOTE(review): the saved output below (execution count 137) has no `data_type`
# column, so it predates the current forecast cell (142) — re-run to refresh it.
forecast[forecast.cohort=='2021-10']

Unnamed: 0,cohort,Count Borrowers
0,2021-10,8858.0
1,2021-10,4319.0
2,2021-10,3082.0
3,2021-10,2511.0
4,2021-10,2139.822141
5,2021-10,1886.187356
6,2021-10,1698.664673
7,2021-10,1553.234426
8,2021-10,1436.468255
9,2021-10,1340.215476


### Backtest Framework

In [26]:
def test(actual, forecast, metric='rmse'):
    """
    Test forecast performance against actuals using method defined by metric.
    """
    if metric=='rmse':
        error = np.sqrt(sum((forecast[:len(actual)] - actual)**2))
    elif metric=='mae':
        error = np.mean(forecast[:len(actual)] - actual)
    return error

In [27]:
# Score the power-law fit against the observed retention in `arr`.
test(arr, power_fit, metric='rmse') 

7.911571109795891

In [28]:
# Score the sBG forecast against the same observations (default metric: 'rmse').
test(arr, sBG_forecast)

6.382120180468375

### Test all cohorts

In [36]:
test_vals = {}

# forecast horizon (months 1-24), shared by both models
x = list(range(1,25))

# since we're working with logs, alpha and beta must be strictly > 0. Optimizer
# bounds are inclusive, so use a small positive floor rather than 0.
bounds = ((1e-6, None), (1e-6, None))

for cohort in m.data.cohort.unique():
    cohort_rows = m.data[m.data.cohort==cohort]

    # power-law: observed retention without the first row
    arr = cohort_rows['borrower_retention'][1:].dropna()

    # cohorts with fewer than 3 observations are not scored
    if len(arr) < 3:
        continue

    power_param, power_cov = curve_fit(power_law, arr.index, arr)
    power_fit = power_law(x, power_param[0], power_param[1])

    # sbg: log_likelihood() reads the global `c`
    c = cohort_rows['Count Borrowers'].reset_index(drop=True)

    results = minimize(log_likelihood, np.array([1,1]), bounds=bounds)

    sBG_forecast = [100*s(i, results.x[0], results.x[1]) for i in x]

    test_vals[cohort] = {'power-law': test(arr, power_fit, metric='rmse'),
                         'sbg': test(arr, sBG_forecast)}

test_vals

{'2020-09': {'power-law': 9.916461969436115, 'sbg': 5.942824530450512},
 '2020-10': {'power-law': 8.878383231455054, 'sbg': 6.802539463007343},
 '2020-11': {'power-law': 7.41916331953283, 'sbg': 5.531285348505579},
 '2020-12': {'power-law': 6.790207358997688, 'sbg': 5.201175246265757},
 '2021-01': {'power-law': 7.911571109795891, 'sbg': 6.382120180468375},
 '2021-02': {'power-law': 5.44267072441055, 'sbg': 3.835302216820124},
 '2021-03': {'power-law': 7.761078473343302, 'sbg': 6.929119754451952},
 '2021-04': {'power-law': 5.722561737841632, 'sbg': 4.604031032253486},
 '2021-05': {'power-law': 6.2827668657365425, 'sbg': 5.168228245303411},
 '2021-06': {'power-law': 4.683240567322109, 'sbg': 3.9029195927544857},
 '2021-07': {'power-law': 4.955487707285651, 'sbg': 4.4694318959938215},
 '2021-08': {'power-law': 4.515225858421871, 'sbg': 4.553677985892246},
 '2021-09': {'power-law': 1.9865115724016997, 'sbg': 1.7385765475413657},
 '2021-10': {'power-law': 0.1274764675060846, 'sbg': 0.278399

In [42]:
# Check the cohort labels (and per-cohort index) held in m.data.
m.data['First Loan Local Disbursement Month']

0    2020-09
1    2020-09
2    2020-09
3    2020-09
4    2020-09
      ...   
0    2021-11
1    2021-11
2    2021-11
0    2021-12
1    2021-12
Name: First Loan Local Disbursement Month, Length: 149, dtype: object

In [None]:
# Forecasting method to use per feature column of m.data.
forecast_methods = {
    'Count Borrowers': 'sbg',
    # key fixed from 'loans_per_borrwer' so it matches the feature column name
    'loans_per_borrower': 'power-law'
}

In [None]:
# Borrower counts of the 2020-09 cohort.
m.data[m.data['First Loan Local Disbursement Month']=='2020-09']['Count Borrowers']

#### Questions/Concerns

1. Is there a framework that's been developed/used to back test this forecast model?
2. Filter out bad cohorts. Why are some starting at 0? (e.g. 2021-07)
3. Why are the last few months not included? Why not just omit the final incomplete month?


In [None]:
# NOTE(review): `new_data` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — presumably `data` or `m.data` was meant; confirm.
new_data[new_data['First Loan Local Disbursement Month'] == '2021-07']