In [None]:
%matplotlib inline
figsize = (10,3)

# Assess Input Datasets  

In [None]:
import pandas, numpy
#show floats with thousands separators and 4 decimals in every rendered table
pandas.options.display.float_format = '{:,.4f}'.format
data_folder = '../data'
date_format='%Y-%m-%d' #truncate datetimes to dates
#identifier columns are read as strings so they are never coerced to numbers
id_columns = ["id","company_id","invoice_id","account_id","customer_id"]
id_column_types = {column: str for column in id_columns}
#options shared by both CSV reads
csv_options = dict(na_values='inf', dtype=id_column_types, date_format=date_format)
invoices = pandas.read_csv(f'{data_folder}/invoice.csv',
                           parse_dates=['invoice_date', 'due_date', 'cleared_date'], **csv_options)
payments = pandas.read_csv(f'{data_folder}/invoice_payments.csv',
                           parse_dates=['transaction_date'], **csv_options)
#row counts of the two inputs
len(invoices), len(payments)

## Determine Data Definitions and Relationships

***Define Datasets & Their Relationships***

We have two input datasets: invoices and their payments.

- Payments are amounts in time, which are directly mapped to companies. 
- Invoices can have multiple payments, but usually only have 1. 
- All payments have invoices, but not all invoices have payments.  

In [None]:
invoices.rename(columns={"id":"invoice_id"}, inplace=True)
invoices.dtypes

In [None]:
payments.dtypes

In [None]:
#The join key will be invoice_id, so it must be unique (and it is).
invoices.invoice_id.value_counts(dropna=False).value_counts(dropna=False)\
.to_frame(name="ids").rename_axis('invoices_per_id')

In [None]:
#all payments are represented in both datasets 
len(set(payments.invoice_id) - set(invoices.invoice_id))

In [None]:
#7% of invoices do not have payments yet
len(set(invoices.invoice_id) - set(payments.invoice_id))/invoices.__len__()

In [None]:
#invoices usually have one payment but may have more
payments.invoice_id.value_counts(dropna=False).value_counts(dropna=False, normalize=True)\
.to_frame(name="invoices").rename_axis('payments_per_invoice')

***Define Entities & Their Relationships***

- Company: business entity for which Tesorio is forecasting cash collected. There are only two. Each company collects using multiple currencies from multiple customers. 
- Account: **In this limited dataset, accounts and companies are synonymous, so we ignore accounts.**  
- Customer: metadata about an invoice which is specific to each company. Each customer can have multiple currencies.

In [None]:
invoices.groupby("company_id")[["customer_id","currency"]].nunique()

In [None]:
invoices.groupby("customer_id").company_id.nunique().value_counts()\
.to_frame(name='customers').rename_axis('companies_per_customer')

In [None]:
invoices.groupby(["customer_id"]).currency.nunique().value_counts()\
.to_frame(name='customers').rename_axis('currencies_per_customer')

In [None]:
invoices.groupby("company_id").account_id.nunique().to_frame(name="unique_accounts")

In [None]:
invoices.groupby("account_id").company_id.nunique().value_counts()\
.to_frame(name='count').rename_axis('companies_per_account')

## Identify Data Cleaning Needs

***Payments***

Transaction data begins in 2011 and ends 2021-05-18. We will assume this is when the data was pulled. Based on volume, the last month appears to have an unrepresentative set of payments, so we filter out those payments. 

In [None]:
#bucket each payment into the first day of its calendar month
payments['transaction_month'] = payments.transaction_date.dt.to_period('M').dt.to_timestamp()
#the plotted series is the number of distinct invoices paid in each month
payments.groupby("transaction_month").invoice_id.nunique()\
.plot(kind='area', title="Payments by Transaction Month", figsize=figsize)

In [None]:
payments = payments[payments.transaction_month<payments.transaction_month.max()]
last_complete_transaction_month = payments.transaction_month.max()
first_transaction_month = payments.transaction_month.min()
first_transaction_month, last_complete_transaction_month

In [None]:
#converted_amount is reliable
(((payments.amount * payments.root_exchange_rate_value) - payments.converted_amount).abs()).max()

In [None]:
payments[payments.amount.isnull()!=payments.converted_amount.isnull()].__len__()

In [None]:
payment_stats = payments.describe(include='all')
payment_stats.loc['% populated'] = payment_stats.loc['count']/payments.__len__()
payment_stats

In [None]:
payments.select_dtypes(include='float').hist(bins=50, figsize=figsize, layout=(1,3))

***Invoice Dates***

- Date range is different from payments. Filter dates out of range to ensure invoices have complete payment history.
- Invoices can be opened with a due date that is already in the past. Filter these out. 
- Invoices due 4+ months after opened are negligible: remove. 
- Invoices cleared before or 13+ months after opened are negligible: remove. 

In [None]:
invoices['invoice_month'] = invoices.invoice_date.dt.to_period('M').dt.to_timestamp()
invoices['due_month'] = invoices.due_date.dt.to_period('M').dt.to_timestamp()
invoices.groupby("invoice_month").invoice_id.count()\
.plot(kind='area', title="Invoices by Invoice Month", figsize=figsize)

In [None]:
#opened after they were due, or before the payment data begins - filter
opened_after_due = invoices.invoice_month.dt.to_period('M')>invoices.due_month.dt.to_period('M')
opened_before_payments = invoices.invoice_month<first_transaction_month
print(len(invoices.loc[opened_after_due]), len(invoices.loc[opened_before_payments]))
#due_month already holds the month-start of due_date, so it is compared directly.
#.copy() makes the filtered frame independent, so later cells can add columns
#without triggering SettingWithCopyWarning.
invoices = invoices.loc[(invoices.invoice_month<=invoices.due_month) &
                        (invoices.invoice_month>=first_transaction_month)].copy()

In [None]:
#need to filter out invoices due in the future relative to the payment date window
invoices.groupby("due_month").invoice_id.count()\
.plot(kind='area', title="Invoices by Due Month", figsize=figsize)

In [None]:
invoices['months_allowed'] = invoices.due_month.dt.to_period('M') - invoices.invoice_month.dt.to_period('M')
invoices.months_allowed = invoices.months_allowed.map(lambda m: m.n if not pandas.isnull(m) else None)
#almost all invoices are due immediately or within 3 months. 
invoices.months_allowed.value_counts(normalize=True, dropna=False)

In [None]:
#keep invoices due within 3 months of opening; .copy() so the next cell can add
#columns to an independent frame instead of a slice (avoids SettingWithCopyWarning)
invoices = invoices[invoices.months_allowed<=3].copy()

In [None]:
invoices['months_to_clear'] = invoices.cleared_date.dt.to_period('M') \
- invoices.invoice_date.dt.to_period('M')
invoices.months_to_clear = invoices.months_to_clear.map(lambda m: m.n if not pandas.isnull(m) else None)
#almost all invoices are cleared within a year. 
# filter out ones that cleared before or 13+ months after they opened
invoices.months_to_clear.value_counts(normalize=True, dropna=False).head(20)

In [None]:
#keep invoices with no clear month or cleared 0-12 months after opening; .copy() so
#later cells assign columns on an independent frame (avoids SettingWithCopyWarning)
invoices = invoices[(invoices.months_to_clear.isnull()) | (invoices.months_to_clear.between(0,12))].copy()

In [None]:
invoices_stats = invoices.describe(include='all')
invoices_stats.loc['% populated'] = invoices_stats.loc['count']/invoices.__len__()
invoices_stats

***Invoice status vs cleared date***

All invoices have a date cleared, even the 3% that are open. Although these cases span multiple invoice dates, the cleared_date is all one value, which is in the future relative to when the data was pulled, so we remove it. 

In [None]:
invoices.loc[invoices.cleared_date.isnull()].__len__()

In [None]:
invoices.status.value_counts(normalize=True, dropna=False).to_frame(name="% of Invoices")

In [None]:
open_status = (invoices.status=='OPEN')
invoices.loc[invoices.cleared_date.isnull() != open_status,['status','cleared_date']]\
.value_counts(dropna=False)

In [None]:
#all open invoices have the same cleared date, which is after the latest payment data 
invoices.loc[open_status].cleared_date.value_counts(dropna=False)

In [None]:
#this is in spite of having multiple invoice dates
invoices.loc[open_status].invoice_date.agg(['min', 'max'])

In [None]:
#month-start of the date each invoice was cleared
invoices['cleared_month'] = invoices.cleared_date.dt.to_period('M').dt.to_timestamp()
#last month we will generate a forecast
#To ensure completeness, the months we use for modeling must fall within the payments data date range.
#capping at the last complete payment month is the same as taking the earlier of the two months per row
invoices['final_month_open'] = invoices.cleared_month.clip(upper=last_complete_transaction_month)
#open invoices keep final_month_open but lose their cleared date and month
for cleared_column in ('cleared_date', 'cleared_month'):
    invoices.loc[open_status, cleared_column] = None

***Exchange Rate***

USD exchange rate has some inaccurate outliers: remove. The exchange rate on the invoice seems to be taken from when the invoice is opened. 

In [None]:
invoices.query("currency=='USD'").root_exchange_rate_value.describe(percentiles=[0.0001,0.9999])

In [None]:
invoices.query("currency=='USD' and root_exchange_rate_value<0.7").__len__()

In [None]:
invoices.query("currency=='USD' and root_exchange_rate_value>1.3").__len__()

In [None]:
#drop USD invoices whose exchange rate falls outside 0.7-1.3; other currencies are untouched.
#.copy() so later cells add columns to an independent frame (avoids SettingWithCopyWarning)
invoices = invoices.loc[(invoices.currency!='USD') | (invoices.root_exchange_rate_value.between(0.7,1.3))].copy()

In [None]:
usd_currency_ranges = invoices.query("currency=='USD'").groupby("invoice_month").root_exchange_rate_value\
.agg(['min','max'])
usd_currency_ranges['spread_ratio'] = usd_currency_ranges['max']/usd_currency_ranges['min']
usd_currency_ranges.spread_ratio.plot(kind='line', title="USD Exchange Rate Spread Ratio By Invoice Month",
                                       figsize=figsize)

In [None]:
#min/max USD exchange rate among invoices cleared in each month
usd_currency_ranges = invoices.query("currency=='USD'").groupby("cleared_month").root_exchange_rate_value\
.agg(['min','max'])
#keep the ratios in chronological order: sorting by value would order a
#time-series line plot by ratio rather than by month
(usd_currency_ranges['max']/usd_currency_ranges['min']).sort_index()\
.plot(kind='line', title="USD Exchange Rate Spread Ratio By Cleared Month", figsize=figsize)

***Merging & Checking for Consistency***

- Companies are consistent between payments and invoices, when payments are present. 
- Amounts and exchange rates are expected to differ between invoices and payments. 

In [None]:
invoices['converted_amount'] = invoices.amount_inv * invoices.root_exchange_rate_value

In [None]:
invoice_payments = invoices.rename(columns={"amount_inv":"amount"})\
.merge(payments, on="invoice_id", how='left', suffixes=('_inv', '_pmt'))
# #update cleared_date to be up to the point in time of the payment
# cleared_in_future = invoice_payments.cleared_date>invoice_payments.transaction_date
# invoice_payments.loc[cleared_in_future,'cleared_month'] = None
# invoice_payments.loc[cleared_in_future,'months_to_clear'] = None
invoice_payments.drop(columns=['cleared_date'], inplace=True)

In [None]:
invoice_payments.invoice_id.nunique()

In [None]:
#no payment is more than the invoice amount in the original currency
invoice_payments.loc[invoice_payments.amount_pmt>invoice_payments.amount_inv].__len__()

In [None]:
#columns present in both inputs come out of the merge as <name>_inv / <name>_pmt pairs
duplicated_columns = [col.replace('_pmt','') for col in invoice_payments.columns if col.endswith('_pmt')]
for col in duplicated_columns:
    #amounts and exchange rates are expected to differ between invoice and payment
    if col not in ('amount', 'converted_amount', 'root_exchange_rate_value'):
        pmt_values = invoice_payments[f"{col}_pmt"]
        inv_values = invoice_payments[f"{col}_inv"]
        #only rows that actually have a payment can disagree with the invoice
        inconsistent_rows = invoice_payments.loc[pmt_values.notnull() & pmt_values.ne(inv_values)]
        print(f"{col}: {len(inconsistent_rows)/len(invoice_payments)} inconsistent rows after merge")

In [None]:
invoice_payments = invoice_payments.drop(columns=['company_id_pmt'])\
.rename(columns={"company_id_inv":"company_id"})

## Establish Business Rules

- Payments are collected based on the invoice currency.
- Payments cannot be more than what is owed: ignore payments that exceed amount remaining.
- Payments can be less than what is owed:
    - Payments can be a negligible % of the invoice, but are always non-zero.
    - Consider an invoice "collected" when paid > 99.99%.
- Once collected, an invoice is cleared:
    - Invoices can be cleared prematurely without being collected. 
    - When an invoice is cleared prematurely, it won't be collected.
- Regardless of status, invoices can have zero payments. 

In [None]:
#using USD to determine collection would mean 9% of payments are greater than their invoice amount
(invoice_payments.converted_amount_pmt>invoice_payments.converted_amount_inv).mean()

In [None]:
# Roughly 12% of payments are partial
# amount_pmt_pct is each payment as a fraction of its invoice amount, in the invoice currency
invoice_payments['amount_pmt_pct'] = (invoice_payments.amount_pmt/invoice_payments.amount_inv)
# share of payments (rows without a payment are dropped) that cover less than the full invoice
(invoice_payments.amount_pmt_pct.dropna()<1).mean()

In [None]:
#on average, a payment is 94% of the invoice
invoice_payments.amount_pmt_pct.mean()

In [None]:
# negligible amounts are a small fraction of payments
invoice_payments.amount_pmt_pct.min(), (invoice_payments.amount_pmt_pct<=0.0001).mean()

In [None]:
invoice_payments.sort_values(by=['invoice_id','transaction_date'], inplace=True)
#round to eliminate the impact of negligible payments
#hence, an invoice is "collected" when paid > 99.99%.
invoice_payments['amount_pmt_pct_cum'] = invoice_payments.groupby("invoice_id").amount_pmt_pct.cumsum()\
.fillna(0).round(4)

In [None]:
#small percent of payments represent overpayments - filter out
print((invoice_payments.amount_pmt_pct_cum>1).mean())
invoice_payments = invoice_payments[invoice_payments.amount_pmt_pct_cum<=1].copy()
#dedupe by invoice id and payment month, using the last transaction for each
invoice_payments.drop_duplicates(subset=['invoice_id','transaction_month'], keep='last', inplace=True)
#dedupe by invoice id and cumulative amount paid, using the first transaction for each (dupes are very rare)
invoice_payments.drop_duplicates(subset=['invoice_id','amount_pmt_pct_cum'], keep='first', inplace=True)
# invoice is collected if/when payments accumulate to the invoice amount in the original currency.
collected = invoice_payments.amount_pmt_pct_cum==1
invoice_payments['collected'] = collected
invoice_payments.loc[collected, 'collected_month'] = invoice_payments.loc[collected, 'transaction_month']
assert invoice_payments.groupby("invoice_id").collected_month.nunique().max()==1

In [None]:
#small percent collected before active - filter out
print((invoice_payments.collected_month<invoice_payments.invoice_month).mean())
invoice_payments = invoice_payments[(invoice_payments.collected_month>=invoice_payments.invoice_month) |
                                    (invoice_payments.collected_month.isnull())]

***Cleared vs Collected***

In [None]:
#Only one invoice was collected after being cleared. Remove as an outlier.
print(invoice_payments.query("cleared_month<collected_month").__len__())
invoice_payments = invoice_payments[~(invoice_payments.cleared_month<invoice_payments.collected_month)]

In [None]:
#2 records are collected but have 'OPEN' status / no cleared date  - remove
print(invoice_payments.loc[(invoice_payments.collected) & (invoice_payments.cleared_month.isnull())]\
.status.value_counts())
invoice_payments = invoice_payments.loc[(invoice_payments.collected==False) | 
                                        (invoice_payments.cleared_month.isnull()==False)]

In [None]:
invoice_payments.__len__(), invoice_payments.invoice_id.nunique()

In [None]:
assert invoice_payments.groupby("invoice_id").cleared_month.nunique().max()==1
assert invoice_payments.groupby("invoice_id").collected_month.nunique().max()==1

In [None]:
invoice_end_state = invoice_payments.drop_duplicates(subset='invoice_id', keep='last').copy()
#all collected invoices are cleared. however, not all cleared invoices were collected. 
#the 60% of uncollected invoices that were cleared took over 2x as long to clear vs collected ones
#this suggests that they're being cleared as part of a separate process. 
invoice_end_state['cleared'] = invoice_end_state.status=='CLEARED'
invoice_end_state['periods_to_clear'] = invoice_end_state.months_to_clear/(invoice_end_state.months_allowed+1)
invoice_end_state.groupby("collected", as_index=False)[['cleared','months_to_clear','periods_to_clear']].mean()

In [None]:
# 92% of invoices are collected 
invoice_end_state.collected.mean()

In [None]:
#95% of cleared invoices are collected, representing 96% of totals on average. 
invoice_end_state.loc[invoice_end_state.status=='CLEARED',['collected','amount_pmt_pct_cum']].mean()

# Structure Input Data for Modeling

- The model will handle OPEN invoices and classify how many months in the future they will be collected. 
    - Define an invoice as open/active between its invoice date and date cleared or collected, whichever is first. 
- Forecasts will be generated once at the beginning of each month. To model the data, we select a representative forecast month by randomly sampling one past month per invoice:
    - Begin forecast window when the invoice is active and the payments data is complete
    - End forecast window when invoice is collected. 
    - For open invoices to be included in model training, end the forecast window when payments data ends. 


- Normalizing by company:
    - Tesorio's pricing model is subscription, so Tesorio's revenue will be loosely based on paying customers per period. 
    - Normalizing USD amounts by company means each company will be of equal value each month they retain with Tesorio, regardless of their total cash flow. 

In [None]:
import random

def select_forecast_month(invoice_id, invoice_month, max_forecast_month):
    """Pick one pseudo-random forecast month for an invoice.

    The candidate months run from the later of ``invoice_month`` and the
    module-level ``first_transaction_month`` through ``max_forecast_month``,
    at monthly frequency. The generator is seeded with ``invoice_id``, so every
    call with the same id and the same window returns the same month.

    Returns the chosen monthly period, or None when ``max_forecast_month`` is
    null or the window contains no months.
    """
    if pandas.isnull(max_forecast_month):
        return None
    #the window opens once the invoice is active and the payments data is complete
    window_start = max(invoice_month,first_transaction_month)
    candidate_months = pandas.period_range(window_start, max_forecast_month, freq='M')
    if len(candidate_months)==0:
        return None
    #seeding by invoice id keeps the forecast month consistent across an invoice's payment rows
    return random.Random(invoice_id).choice(candidate_months)

select_forecast_month = numpy.vectorize(select_forecast_month)
invoice_payments['forecast_month_collected'] = select_forecast_month(invoice_payments.invoice_id, 
                                                           invoice_payments.invoice_month, 
                                                           invoice_payments.collected_month)
invoice_payments['forecast_month_uncollected'] = select_forecast_month(invoice_payments.invoice_id, 
                                                           invoice_payments.invoice_month, 
                                                           invoice_payments.final_month_open)
assert invoice_payments.groupby("invoice_id").forecast_month_collected.nunique().max()==1
assert invoice_payments.groupby("invoice_id").forecast_month_uncollected.nunique().max()==1

In [None]:
invoice_payments.__len__(), invoice_payments.invoice_id.nunique()

In [None]:
#should be the same date ranges for both options
#NOTE(review): .max() on the element-wise comparison passes when EITHER the min or the
#max matches; if both bounds must match, this should be .all() - confirm against the data
assert (invoice_payments.forecast_month_uncollected.agg(['min','max']).values==\
invoice_payments.forecast_month_collected.agg(['min','max']).values).max()

In [None]:
import random

def process_model_inputs(invoice_payments, current_state_month=None):
    """Build one row per invoice describing its state as of a forecast month.

    Parameters
    ----------
    invoice_payments : DataFrame
        One row per invoice and payment month, ordered so that the last row of
        each invoice is its final state. Must contain the columns read below
        (invoice_id, transaction_month, amount_pmt_pct_cum, collected_month,
        final_month_open, forecast_month_collected, forecast_month_uncollected
        and the invoice attributes listed in ``columns_to_model``).
    current_state_month : monthly Period, optional
        When given, every invoice is scored as of this month (live scoring).
        When omitted, each invoice uses its own pre-selected forecast month
        (``forecast_month_collected``, falling back to
        ``forecast_month_uncollected``) for training.

    Returns
    -------
    DataFrame
        One row per invoice with the remaining invoice fraction before the
        forecast month and at the final state, the months between them, the
        resulting ``collected_per_month`` rate, the company-normalised invoice
        size and the month counts relative to the forecast month.
    """
    #last record for each invoice has columns associated with collected date
    invoice_point_in_time = invoice_payments.drop_duplicates(subset='invoice_id', keep='last').copy()
    if current_state_month is not None: #prepare for live scoring on trained model
        forecast_month = current_state_month
    else: #use random valid past month per invoice to train model
        forecast_month = invoice_point_in_time.forecast_month_collected.fillna(
            invoice_point_in_time.forecast_month_uncollected)
    invoice_point_in_time['forecast_month'] = forecast_month
    final_state_columns = ['amount_pmt_pct_cum',"collected_month","final_month_open"]
    invoice_point_in_time = invoice_point_in_time[["invoice_id","forecast_month"]+final_state_columns].dropna(
        subset=["invoice_id","forecast_month"]
    )
    #left frame = final state per invoice (names unchanged);
    #right frame = every payment row (overlapping names get '_at_payment')
    invoice_payments_point_in_time = invoice_point_in_time.merge(
        invoice_payments,on="invoice_id", how="inner", suffixes=('','_at_payment')
    ).sort_values(by=['invoice_id','transaction_month'])
    #last payment record for each invoice
    invoices_to_score = invoice_payments_point_in_time.drop_duplicates(subset='invoice_id', keep='last')
    #payment rows dated before the forecast month
    paid_before_forecast = (
        invoice_payments_point_in_time.transaction_month<
        invoice_payments_point_in_time.forecast_month.dt.to_timestamp()
    )
    #last payment state prior to the forecast month. This must read the per-payment running
    #total: the unsuffixed amount_pmt_pct_cum is the FINAL state, which would make every
    #invoice with an earlier payment look as if nothing was left to collect.
    last_prior_payment_state = invoice_payments_point_in_time.loc[
        paid_before_forecast, ['invoice_id','amount_pmt_pct_cum_at_payment','transaction_month']
    ].drop_duplicates(subset='invoice_id', keep='last').rename(
        columns={'amount_pmt_pct_cum_at_payment':'amount_pmt_pct_cum_prior',
                 'transaction_month':'transaction_month_prior'}
    )
    invoices_to_score = invoices_to_score.merge(last_prior_payment_state, how='left', on='invoice_id')
    #no prior payment means the whole invoice was still outstanding
    invoices_to_score['prior_remaining_inv_pct'] = 1 - invoices_to_score.amount_pmt_pct_cum_prior.fillna(0)
    #invoices_to_score holds each invoice's last payment row, so its running total is the final state
    invoices_to_score['final_remaining_inv_pct'] = 1 - invoices_to_score.amount_pmt_pct_cum_at_payment.fillna(0)
    #months from the forecast month to the final state, counting the forecast month itself
    invoices_to_score['months_to_final_state'] =  (
        invoices_to_score.collected_month.fillna(invoices_to_score.final_month_open).dt.to_period('M')
        - invoices_to_score.forecast_month).map(lambda m: m.n+1 if not pandas.isnull(m) else 1)
    invoices_to_score['collected_per_month'] = (
        (invoices_to_score.prior_remaining_inv_pct - invoices_to_score.final_remaining_inv_pct)/
        invoices_to_score.months_to_final_state
    )
    columns_to_model = ['invoice_id','invoice_month','forecast_month','due_month','months_allowed',
                         'amount_inv','converted_amount_inv', 'currency','company_id','customer_id',
                         'collected_month','prior_remaining_inv_pct','final_remaining_inv_pct',
                         'months_to_final_state','collected_per_month', 'status']
    invoices_to_score = invoices_to_score[columns_to_model]
    #normalize USD amounts by company
    totals_by_company = invoices_to_score.groupby("company_id", as_index=False).converted_amount_inv.sum()
    invoices_to_score = invoices_to_score.merge(totals_by_company,on="company_id", suffixes=('','_company'))
    invoices_to_score['inv_pct_of_company_total'] = (
        invoices_to_score.converted_amount_inv/invoices_to_score.converted_amount_inv_company
    )
    invoices_to_score = invoices_to_score.drop(columns=["converted_amount_inv"])
    #date quantities, counted in months relative to the forecast month
    invoices_to_score['months_open'] = (invoices_to_score.forecast_month \
                                          - invoices_to_score.invoice_month.dt.to_period('M'))\
                                                    .map(lambda m: m.n+1 if not pandas.isnull(m) else None)
    invoices_to_score['month_due'] = (invoices_to_score.due_month.dt.to_period('M') \
                                      - invoices_to_score.forecast_month)\
                                                    .map(lambda m: m.n+1 if not pandas.isnull(m) else None)
    invoices_to_score['forecast_month'] = invoices_to_score.forecast_month.dt.to_timestamp()
    # late invoices should not impact the % of the invoice due per month.
    invoices_to_score['due_per_month'] = 1/invoices_to_score.month_due.clip(lower=1)
    return invoices_to_score

invoices_to_model = process_model_inputs(invoice_payments)
#one record per invoice
assert invoices_to_model.groupby("invoice_id").count().max().max()==1
#always has a collection rate
assert (invoices_to_model.collected_per_month.isnull()).sum()==0

In [None]:
open_invoices_to_model = invoices_to_model.query("status=='OPEN'")
assert open_invoices_to_model.collected_month.count()==0
#open_invoices_to_model[['invoice_month','due_month','forecast_month']].agg(['min','max','count'])

#todo: this should be a test
current_open_invoices = process_model_inputs(invoice_payments.query("status=='OPEN'"), 
                                 current_state_month=last_complete_transaction_month.to_period('M'))
#one record per invoice
assert current_open_invoices.groupby("invoice_id").count().max().max()==1
assert current_open_invoices.collected_month.count()==0
current_open_invoices.drop(columns=['collected_month','collected_per_month'], inplace=True)
#current_open_invoices[['invoice_month','due_month','forecast_month']].agg(['min','max','count'])

# Analyze and Refine Data to Be Modeled


In [None]:
#Collection speed slows down sharply for invoices due a month or more in the future
invoices_to_model.groupby("month_due").collected_per_month.mean()\
.plot(figsize=figsize, title="Average % Collected Per Month by Month Due")

In [None]:
#Long left tail is clipped when calculating % of invoice due per month looking forward
invoices_to_model.month_due.plot(kind='hist', bins=14, figsize=figsize, 
 title="Month Due Relative to Forecast Date", density=True)

In [None]:
# 2% of invoices are due after the payments data time period. 
# invoices due in the future have low collection velocity. 
# need to be mindful of how this will impact trends
print((invoices_to_model.due_month>last_complete_transaction_month).mean())
#invoices_to_model = invoices_to_model[invoices_to_model.due_month<=last_complete_transaction_month]

In [None]:
invoices_to_model.drop(columns=['status']).describe(include='all', percentiles=[])\
.T.drop(columns=['50%','std','top','freq'])

In [None]:
invoices_to_model.collected_per_month\
.plot(kind='hist', bins=14, figsize=figsize, title="Average % Collected Per Month")

In [None]:
invoices_to_model.due_per_month\
.plot(kind='hist', bins=14, figsize=figsize, title="Average % Due Per Month")

In [None]:
invoices_to_model['uncollected'] = invoices_to_model.collected_month.isnull()

***By Dates***

In [None]:
invoices_to_model.groupby("forecast_month").invoice_id.count()\
.plot(kind='area', title="Invoices by Forecast Month", figsize=figsize)

In [None]:
invoices_to_model.groupby("invoice_month").invoice_id.count()\
.plot(kind='area', title="Invoices by Invoice Month", figsize=figsize)

In [None]:
invoices_to_model.groupby("due_month").invoice_id.count()\
.plot(kind='area', title="Invoices by Due Month", figsize=figsize)

***Trends in Variable to Be Modeled***

Aside from invoices that were not due when this data was pulled, invoices are getting collected more quickly, leading to a greater percentage collected over time - a good sign for Tesorio's business. Collections are not keeping pace with due dates yet due to uncollected invoices, but the trend is positive.

In [None]:
invoices_to_model.groupby("due_month")[['collected_per_month','due_per_month']].mean()\
.plot(kind='line', title="Average % Collected and Due Per Month", figsize=figsize)

In [None]:
invoices_to_model.groupby("due_month").uncollected.mean()\
.plot(kind='line', title="% Invoices Uncollected by Due Month", figsize=figsize)

***By Currency***

Some currencies have very low collection rates and take multiple billing periods to collect. This may be due to currency fluctuations.

In [None]:
invoices_to_model.groupby("currency")\
.agg({"collected_per_month":["mean","std"],"invoice_id":"count","uncollected":"mean"})\
.sort_values(by=('uncollected','mean'), ascending=False)

***Distributions by Entity***

We have trouble collecting from some customers, regardless of their currency. 

In [None]:
customer_averages = invoices_to_model.set_index("customer_id").select_dtypes(include=['float','int','boolean'])\
.reset_index().groupby("customer_id").mean()
customer_averages.hist(bins=50, figsize=(10,14), layout=(4,3))

In [None]:
customer_stats = invoices_to_model.groupby("customer_id").uncollected.agg(['count','mean'])\
.add_prefix('uncollected_').sort_values(by="uncollected_mean", ascending=False)
customer_stats.query("uncollected_count>=30").uncollected_mean\
.plot(kind='hist', figsize=figsize, title="Customers with 30+ Invoices: % Uncollected", bins=50)

In [None]:
western_customer_stats = invoices_to_model.query("currency in ('USD','EUR','GBP')")\
.groupby("customer_id").uncollected.agg(['count','mean'])\
.add_prefix('uncollected_').sort_values(by="uncollected_mean", ascending=False)
western_customer_stats.query("uncollected_count>=30").uncollected_mean\
.plot(kind='hist', figsize=figsize, title="US and European Customers with 30+ Invoices: % Uncollected", bins=50)

# Business Analysis

## Establish Business Motivation for ML

Cash collections don't follow due dates. On average:

- 6% of total cash due each month is unpaid, equating to a \$407K average deficit.
- Total cash collected each month is 9% off from the amount due, equating to a \$520K average difference in cash flow. 

We also see that we need to filter out the first few months of due dates, which have unusually high differences.

In [None]:
#invoice_payments repeats an invoice once per payment month, so reduce it to one row
#per invoice before summing; otherwise invoices paid in instalments are counted twice
amount_due = invoice_payments.drop_duplicates(subset='invoice_id')\
.groupby("due_month").converted_amount_inv.sum().to_frame(name="amount_due_usd")
#cash received per month, indexed under the same name so the two frames line up by month
amount_paid = payments.groupby("transaction_month").converted_amount.sum()\
.rename_axis("due_month").to_frame(name="amount_paid_usd")
business_motivation = amount_due.join(amount_paid, how='inner')
#keep months inside the payments window (the last complete month itself is excluded)
business_motivation = business_motivation.loc[
    (business_motivation.index>=first_transaction_month) &
    (business_motivation.index<last_complete_transaction_month)
].copy()
business_motivation['pct_unpaid'] = 1 - (business_motivation.amount_paid_usd/business_motivation.amount_due_usd)
business_motivation['unpaid'] = business_motivation.amount_due_usd - business_motivation.amount_paid_usd

In [None]:
business_motivation.pct_unpaid.plot(figsize=figsize, title="% Unpaid (USD Due)")

In [None]:
business_motivation.pct_unpaid.mean(), business_motivation.pct_unpaid.abs().mean()

In [None]:
business_motivation.unpaid.mean(), business_motivation.unpaid.abs().mean()

In [None]:
invoices_to_model.query("due_month<'2011-10-01'").__len__()

## Define & Quantify Benchmark for Data Science

Companies will be more likely to retain if their monthly forecast error is low as a percentage of their total cash flow.

As a benchmark, we use due date in place of the forecast, filtering out the first few invoice months, which have abnormally high error rates. Weighting companies equally, on average, a company's monthly cash flow is 9.7% off from their cash due.

In [None]:
#filter out dates with high variation
#.copy() so the columns added in later cells land on an independent frame
#rather than on a slice of the unfiltered data (avoids SettingWithCopyWarning)
invoices_to_model = invoices_to_model.query("due_month>='2011-10-01'").copy()

In [None]:
invoices_to_model.__len__(), invoices_to_model.inv_pct_of_company_total.sum()

In [None]:
from sklearn.metrics import mean_absolute_error
#enables time-based split
invoices_to_model['forecast_date_fold']=(invoices_to_model.forecast_month.rank(pct=True)*6).round()

In [None]:
#benchmark on the most recent forecast-date folds (the same rows used as the validation split)
benchmark = invoices_to_model.query("forecast_date_fold>4")\
[['due_per_month','collected_per_month','inv_pct_of_company_total']].copy()
#1/rate turns a per-month rate into a month number. A zero rate has no month, so
#infinities become NaN (not None, which would turn the column into object dtype).
benchmark['month_collected'] = (1/benchmark.collected_per_month)\
.replace([numpy.inf,-numpy.inf],numpy.nan).round(0)
benchmark['month_due'] = (1/benchmark.due_per_month)\
.replace([numpy.inf,-numpy.inf],numpy.nan).round(0)
#company-normalised invoice weight due in each month vs collected in each month.
#NOTE(review): the default inner merge drops months present on only one side - confirm that is intended
benchmark = benchmark.groupby("month_due", as_index=False).inv_pct_of_company_total.sum()\
.merge(
    benchmark.groupby("month_collected", as_index=False).inv_pct_of_company_total.sum().rename(
        columns={"month_collected":"month_due"}
    ), on="month_due", suffixes=('_due','')
)
benchmark

In [None]:
#means method
benchmark_pct_diff = mean_absolute_error(benchmark.inv_pct_of_company_total, 
                                         benchmark.inv_pct_of_company_total_due)/benchmark.inv_pct_of_company_total_due.mean()
benchmark_pct_diff

In [None]:
#sums method
benchmark['abs_diff'] = (benchmark.inv_pct_of_company_total \
                                  - benchmark.inv_pct_of_company_total_due).abs()
benchmark.abs_diff.sum()/benchmark.inv_pct_of_company_total_due.sum()

## Demonstrate Minimum Potential Impact of ML 

Turning this into an ML task without any additional feature engineering, a company's average % difference from cash flow forecasted is 27% lower than the benchmark. 

In [None]:
import h2o
from h2o.automl import H2OAutoML

h2o.init(
    nthreads=-1,     # number of threads when launching a new H2O server
    max_mem_size=12  # in gigabytes
)

In [None]:
#invoice weight is based on the company-adjusted amount. unit is number of rows for related ML parameters
invoices_to_model['inv_company_weight'] = invoices_to_model.inv_pct_of_company_total*invoices_to_model.invoice_id.nunique()\
/invoices_to_model.company_id.nunique()
invoices_to_model.inv_company_weight.sum(), invoices_to_model.__len__()

In [None]:
id_columns_h2o = [col for col in id_columns if col in invoices_to_model.columns]
invoices_to_model_h2o = h2o.H2OFrame(invoices_to_model,
           column_types=dict(zip(id_columns_h2o,["string"] * len(id_columns_h2o))))

In [None]:
#time-based split: cross-validating on future data relative to what is being trained
train = invoices_to_model_h2o[invoices_to_model_h2o['forecast_date_fold'] <= 3]
blend = invoices_to_model_h2o[(invoices_to_model_h2o['forecast_date_fold'] > 3) \
                              & (invoices_to_model_h2o['forecast_date_fold'] <= 4)]
valid = invoices_to_model_h2o[invoices_to_model_h2o['forecast_date_fold'] > 4]

In [None]:
#target: fraction of the invoice collected per month between the forecast month and its final state
y_numeric='collected_per_month'
#predictors: invoice attributes plus the payment state as of the forecast month
x = ['months_allowed','amount_inv','inv_pct_of_company_total','currency','months_open','due_per_month',
     'prior_remaining_inv_pct']
#NOTE(review): 'huber' selects the Huber loss, it is not itself a bi-modal distribution;
#presumably it was chosen because the target is bi-modal - confirm
#hyperparameter tuning is addressed by using AutoML and specifying sort and stopping metrics. 
#train, blend, and validation dataframes are binned sequentially by forecast month
#this enforces the time-based split during hyperparameter tuning. 
#NOTE(review): no seed is passed and the run is capped by wall-clock time, so results may vary between runs
aml = H2OAutoML(max_runtime_secs=60, distribution='huber', sort_metric='mae', stopping_metric='mae',
                stopping_tolerance=0.01)
#rows are weighted by the company-adjusted invoice weight
aml_model = aml.train(x=x , y=y_numeric, training_frame=train, blending_frame=blend, validation_frame=valid, 
                      weights_column='inv_company_weight') 

In [None]:
aml_model

In [None]:
aml_model.mae(),aml_model.mae(valid=True)

In [None]:
aml_model.r2(),aml_model.r2(valid=True)

In [None]:
#actual target, invoice weight and model prediction for every validation row
validation_results = valid[[y_numeric,'inv_pct_of_company_total']].cbind(aml_model.predict(valid)).as_data_frame()
validation_results.rename(columns={"predict":"predict_collected_per_month"}, inplace=True)
#1/rate turns a per-month rate into a month number. A zero rate has no month, so
#infinities become NaN (not None, which would turn the column into object dtype).
validation_results['month_collected'] = (1/validation_results.collected_per_month)\
.replace([numpy.inf,-numpy.inf],numpy.nan).round(0)
validation_results['predict_month_collected'] = (1/validation_results.predict_collected_per_month)\
.replace([numpy.inf,-numpy.inf],numpy.nan).round(0)
#company-normalised invoice weight predicted for each month vs actually collected in each month.
#NOTE(review): the default inner merge drops months present on only one side - confirm that is intended
validation_results = validation_results.groupby("predict_month_collected", as_index=False).inv_pct_of_company_total.sum()\
.merge(
    validation_results.groupby("month_collected", as_index=False).inv_pct_of_company_total.sum().rename(
        columns={"month_collected":"predict_month_collected"}
    ), on="predict_month_collected", suffixes=('_predict','')
)
validation_results

In [None]:
#means method
ml_pct_diff = mean_absolute_error(validation_results.inv_pct_of_company_total, 
                                  validation_results.inv_pct_of_company_total_predict)\
/validation_results.inv_pct_of_company_total_predict.mean()
ml_pct_diff

In [None]:
#sums method
validation_results['abs_diff'] = (validation_results.inv_pct_of_company_total \
                                  - validation_results.inv_pct_of_company_total_predict).abs()
float(validation_results.abs_diff.sum()/validation_results.inv_pct_of_company_total_predict.sum())

In [None]:
ml_pct_diff, benchmark_pct_diff