In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Input Datasets  

In [None]:
import pandas

data_folder = '../data'
# strftime pattern applied to every parsed date column.
# `%m` is the month directive; `%M` means minutes and would parse the month
# field as minutes, leaving every date in the default month.
date_format = '%Y-%m-%d'  # truncate datetimes to dates

# Invoice records; the literal string 'inf' is read as a missing value.
invoices = pandas.read_csv(
    data_folder + '/invoice.csv',
    na_values='inf',
    parse_dates=['invoice_date', 'due_date', 'cleared_date'],
    date_format=date_format,
)
# Payment transactions, parsed with the same NA marker and date pattern.
payments = pandas.read_csv(
    data_folder + '/invoice_payments.csv',
    na_values='inf',
    parse_dates=['transaction_date'],
    date_format=date_format,
)

## Dataset Definitions & Relationships

We have two input datasets: invoices and their payments.
- Payments are dated amounts that are mapped directly to companies. 
- An invoice can have multiple payments, but most have only one. 

In [None]:
# Column dtypes of the invoices frame.
invoices.dtypes

In [None]:
# Column dtypes of the payments frame.
payments.dtypes

In [None]:
# The join key will be invoice_id, so it must be unique (and it is).
# Index = number of rows sharing one id, value = how many ids have that count.
rows_per_id = invoices['id'].value_counts(dropna=False)
(
    rows_per_id
    .value_counts(dropna=False)
    .to_frame(name="ids")
    .rename_axis('invoices_per_id')
)

In [None]:
# Index = number of payments on one invoice, value = how many invoices have that count.
(
    payments['invoice_id']
    .value_counts(dropna=False)
    .value_counts(dropna=False)
    .to_frame(name="invoices")
    .rename_axis('payments_per_invoice')
)

### Transforming Dates to Quantities

In [None]:
# Time from invoice date to due date.
invoice_time_allowed = invoices['due_date'].sub(invoices['invoice_date'])
# Time from invoice date to cleared date.
invoice_time_open = invoices['cleared_date'].sub(invoices['invoice_date'])
# Time open beyond the time allowed; negative when cleared before the due date.
invoice_time_late = invoice_time_open.sub(invoice_time_allowed)

In [None]:
def _days_or_none(delta):
    """Return the whole-day component of a timedelta, or None when it is missing."""
    return None if pandas.isnull(delta) else delta.days


def _months_or_none(offset):
    """Return the multiple ``n`` of a period difference, or None when it is missing."""
    return None if pandas.isnull(offset) else offset.n


def _month_gap(later, earlier):
    """Difference in monthly periods between two datetime columns, as a count of months."""
    gap = later.dt.to_period('M') - earlier.dt.to_period('M')
    return gap.map(_months_or_none)


# Day-level quantities derived from the timedelta series.
invoices['days_allowed'] = invoice_time_allowed.map(_days_or_none)
invoices['days_open'] = invoice_time_open.map(_days_or_none)
invoices['days_late'] = invoice_time_late.map(_days_or_none)

# Month-level quantities: differences between calendar-month periods.
invoices['months_allowed'] = _month_gap(invoices.due_date, invoices.invoice_date)
invoices['months_open'] = _month_gap(invoices.cleared_date, invoices.invoice_date)
invoices['months_late'] = _month_gap(invoices.cleared_date, invoices.due_date)

## Entity Definitions & Relationships

- Company: business entity for which Tesorio is forecasting cash collected. There are only two. Each company collects using multiple currencies from multiple customers. 
- Account: **In this limited dataset, accounts and companies are synonymous, so we ignore accounts.**  
- Customer: metadata about an invoice which is specific to each company. 

In [None]:
# Distinct customers and distinct currencies per company.
by_company = invoices.groupby("company_id")
by_company[["customer_id", "currency"]].nunique()

In [None]:
# Index = number of companies a customer appears under, value = number of such customers.
(
    invoices
    .groupby("customer_id")["company_id"]
    .nunique()
    .value_counts()
    .to_frame(name='customers')
    .rename_axis('companies_per_customer')
)

In [None]:
# Distinct account ids per company.
accounts_per_company = invoices.groupby("company_id")["account_id"].nunique()
accounts_per_company.to_frame(name="unique_accounts")

In [None]:
# Index = number of companies an account appears under, value = number of such accounts.
(
    invoices
    .groupby("account_id")["company_id"]
    .nunique()
    .value_counts()
    .to_frame(name='count')
    .rename_axis('companies_per_account')
)

## Data Cleaning Needs

### Payments

Transaction data begins in 2011 and ends on 2021-01-31. 

In [None]:
# Number of payment rows.
len(payments)

In [None]:
# Summary statistics for every payments column, plus a populated-share row.
payment_stats = payments.describe(include='all')
n_payments = len(payments)
# Stored as a fraction of rows (count / total), despite the '%' in the label.
payment_stats.loc['% populated'] = payment_stats.loc['count'] / n_payments
payment_stats

In [None]:
# converted_amount is reliable
# Largest absolute gap between amount * exchange rate and the stored converted_amount.
expected_converted = payments['amount'] * payments['root_exchange_rate_value']
(expected_converted - payments['converted_amount']).abs().max()

In [None]:
# One histogram per float column of payments, laid out in a single row.
payment_floats = payments.select_dtypes(include='float')
payment_floats.hist(bins=50, figsize=(12, 3), layout=(1, 3))

### Invoices

In [None]:
# Summary statistics for every invoices column, plus a populated-share row.
invoices_stats = invoices.describe(include='all')
n_invoices = len(invoices)
# Stored as a fraction of rows (count / total), despite the '%' in the label.
invoices_stats.loc['% populated'] = invoices_stats.loc['count'] / n_invoices
invoices_stats

In [None]:
# opened outside of payment data time period
# Result: (invoices dated after the last payment, invoices dated before the first payment)
first_payment_date = payments['transaction_date'].min()
last_payment_date = payments['transaction_date'].max()
opened_after = invoices['invoice_date'] > last_payment_date
opened_before = invoices['invoice_date'] < first_payment_date
(len(invoices.loc[opened_after]), len(invoices.loc[opened_before]))

### Cleared vs Open 

- Open invoices still have a date cleared

In [None]:
# Share of invoices per status value (missing statuses included).
status_share = invoices['status'].value_counts(normalize=True, dropna=False)
status_share.to_frame(name="% of Invoices")

In [None]:
# Cleared dates recorded on invoices whose status is OPEN (missing dates included).
is_open = invoices['status'] == 'OPEN'
invoices.loc[is_open, 'cleared_date'].value_counts(dropna=False)

In [None]:
# Rows where "cleared_date is missing" and "status is OPEN" disagree.
cleared_missing = invoices['cleared_date'].isnull()
status_open = invoices['status'] == 'OPEN'
disagree = cleared_missing != status_open
invoices.loc[disagree, ['status', 'cleared_date']].value_counts(dropna=False)

### Date Relationships for Cleared Invoices

In [None]:
# Keep only CLEARED invoices; status is constant afterwards, so drop the column.
cleared_invoices = (
    invoices
    .query("status=='CLEARED'")
    .drop(columns=['status'])
)

In [None]:
# invoices are either cleared around the normal billing cycle or a year later
days_open = cleared_invoices['days_open']
days_open.plot(
    kind='hist',
    bins=365,
    title="Cleared Invoices: Days Open",
    figsize=(12, 3),
)

In [None]:
# Fraction of all invoices that are cleared.
len(cleared_invoices) / len(invoices)

In [None]:
# invoices are either open only in the month they became active or a year later
cleared_invoices['months_open'].value_counts(normalize=True, dropna=False)

In [None]:
# invoices are either due the same month they became active or a year later
cleared_invoices['months_allowed'].value_counts(normalize=True, dropna=False)

In [None]:
# Share of cleared invoices per months_late value (missing values included).
cleared_invoices['months_late'].value_counts(normalize=True, dropna=False)

In [None]:
# Months late relative to months allowed. The +1 keeps the denominator non-zero
# when months_allowed is 0 (presumably the intent - confirm).
allowed_plus_one = cleared_invoices['months_allowed'] + 1
cleared_invoices['months_late_vs_allowed'] = cleared_invoices['months_late'] / allowed_plus_one
cleared_invoices['months_late_vs_allowed'].value_counts(normalize=True, dropna=False)

In [None]:
# One histogram per float column of the cleared invoices.
cleared_floats = cleared_invoices.select_dtypes(include=['float'])
cleared_floats.hist(bins=50, figsize=(12, 9))

### Exchange Rate

Exchange rates change for both payments and open invoices. Customers would expect to pay the amount they were originally invoiced in their own currency, not the USD amount originally invoiced. Therefore, we should use raw amounts to determine how much is paid vs due. 

In [None]:
# USD is not always 1 - it varies a lot
# Per-currency summary of the exchange rate, then max/min ratio as a spread measure.
currency_ranges = (
    cleared_invoices
    .groupby("currency")["root_exchange_rate_value"]
    .describe(percentiles=[])
)
spread_ratio = currency_ranges['max'] / currency_ranges['min']
spread_ratio.sort_values().plot(kind='bar', title="Exchange Rate Spread Ratio")

In [None]:
# a significant % of cleared USD invoices have an exchange rate unequal to 1
# Restrict to USD invoices with 0-12 months allowed and at most 12 months late;
# copy so the new column is added to an independent frame.
cleared_invoices_usd = (
    cleared_invoices
    .query("currency=='USD' and months_allowed>=0 and months_allowed<=12 and months_late<=12")
    .copy()
)
cleared_invoices_usd['exchange_rate_is_1'] = cleared_invoices_usd['root_exchange_rate_value'].eq(1)
# Mean of the boolean flag = share of these invoices with a rate of exactly 1.
cleared_invoices_usd['exchange_rate_is_1'].mean()

In [None]:
# USD exchange rate variations from 1 tend to be invoices which took longer to clear
# This suggests that the invoice exchange rate is "current state data."
def _approx_months(delta):
    """Convert a timedelta to months using 30-day blocks and the built-in round()."""
    return round(delta.days / 30)


time_to_clear = cleared_invoices_usd['cleared_date'] - cleared_invoices_usd['invoice_date']
cleared_invoices_usd['months_to_clear'] = time_to_clear.map(_approx_months)
cleared_invoices_usd['months_to_clear'].value_counts(normalize=True, dropna=False)

In [None]:
# Mean months to clear and row count, split by whether the exchange rate equals 1.
by_rate_flag = cleared_invoices_usd.groupby("exchange_rate_is_1")
by_rate_flag["months_to_clear"].agg(['mean', 'count'])

In [None]:
# Share of invoices with rate == 1 per months_to_clear, with the tails pooled at -1 and 13.
clipped_months = cleared_invoices_usd['months_to_clear'].clip(lower=-1, upper=13)
share_rate_is_1 = cleared_invoices_usd.groupby(clipped_months)['exchange_rate_is_1'].mean()
share_rate_is_1.plot(
    title='% of USD Invoices With Exchange Rate Equal to 1',
    figsize=(12, 3),
)

### Merge

- 18% of payments are partial. 
- Exchange rates vary across payments.
- Companies are consistent between payments and invoices, when payments are present

In [None]:
# all payment invoices are represented in both datasets
# Count of invoice ids referenced by payments that are absent from invoices.
paid_invoice_ids = set(payments['invoice_id'])
known_invoice_ids = set(invoices['id'])
len(paid_invoice_ids - known_invoice_ids)

In [None]:
# 7% of invoices do not have payments yet
# Invoice ids with no matching payment, as a fraction of all invoice rows.
unpaid_ids = set(invoices['id']) - set(payments['invoice_id'])
len(unpaid_ids) / len(invoices)

In [None]:
# Left-join payments onto invoices so invoices without payments are kept.
# Renaming amount_inv to amount makes it collide with the payments column of the
# same name, so the merge suffixes label both sides (_inv / _pmt).
invoices_renamed = invoices.rename(columns={"id":"invoice_id","amount_inv":"amount"})
invoice_payments = invoices_renamed.merge(
    payments,
    on="invoice_id",
    how='left',
    suffixes=('_inv', '_pmt'),
)

In [None]:
# Columns with a '_pmt' suffix in the merged frame; each is compared with its '_inv' twin.
# Strip only the trailing suffix: str.replace would also delete a '_pmt' occurring
# earlier in a column name and then look up a column that does not exist.
pmt_suffix = '_pmt'
duplicated_columns = [
    col[:-len(pmt_suffix)]
    for col in invoice_payments.columns
    if col.endswith(pmt_suffix)
]
for col in duplicated_columns:
    # NaN != NaN evaluates to True, so rows with a missing value on either side
    # are counted as inconsistent.
    inconsistent_rows = invoice_payments.loc[invoice_payments[col + '_pmt'] != invoice_payments[col + '_inv']]
    print(f"{col}: {len(inconsistent_rows)/len(invoice_payments)} inconsistent rows in merged dataset")

In [None]:
# Payment-side company ids on rows where the two company ids differ
# (value_counts omits missing values by default).
company_mismatch = invoice_payments.query("company_id_pmt!=company_id_inv")
company_mismatch['company_id_pmt'].value_counts()

## Adding Analytical Variables 

### Date Quantity Variables

#### Broken Down By Period

- What period level should we use (day, week, or month)?
- Create periods from invoice date to close date.
- Rolling payment window: due_date - current period
- Rolling days open: cleared_date - current period

# Metadata Calculations & Cleaning

- Totals, Uniques, Averages, Ranges, Outliers, Missings
- Variables: Invoices, USD Amounts, Cleared/Open, Due Date, Invoice Date, Transaction Date, Customers, Companies, Accounts

# Notes

## Notable entities

e.g. customers with notable values

## Sparsity

### Entities

### Date Periods

## Trends Over Time

# Analysis

## Business Motivation

Cash collections don't follow due dates

## Data Science Benchmark

Define & Quantify: customers' mean absolute % error each period from cash due.

## Data Science Target

Which outcome variable is best? Candidates:
- Days Late
- Days Open (Total and Rolling)
- Days Open as a % of Payment Window (Total and Rolling)
- Days Late as a % of Payment Window (Total and Rolling)