In [None]:
# 02_reconciliation.ipynb
# 3-way reconciliation: payments_system vs bank_statement vs refunds
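# Outputs (written to /mnt/data/outputs/): reconciliation_summary.csv,
# missing_settlements.csv, refunded_detailed.csv, refunds_unmatched.csv,
# top_refunded_customers.csv, revenue_waterfall.csv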

import pandas as pd
from pathlib import Path

# Root folder the notebook reads from and writes under; change if needed.
data_dir = Path('/mnt/data')
# Every file this notebook saves goes into this sub-folder.
out_dir = data_dir / 'outputs'
# parents=True: without it mkdir raises FileNotFoundError when data_dir (or any
# ancestor) does not exist yet. exist_ok=True keeps re-runs from failing when
# the folder is already there.
out_dir.mkdir(parents=True, exist_ok=True)


In [None]:
def _load_extract(file_name, timestamp_column):
    """Read one CSV from data_dir, parsing `timestamp_column` as datetimes."""
    return pd.read_csv(data_dir / file_name, parse_dates=[timestamp_column])


# One frame per source system; each file has its own timestamp column name.
payments = _load_extract('payments_system.csv', 'timestamp')
bank = _load_extract('bank_statement.csv', 'bank_timestamp')
refunds = _load_extract('refunds.csv', 'refund_timestamp')

# Quick row-count check so an empty or truncated extract is noticed early.
print(f'Loaded rows -> payments: {len(payments)} bank: {len(bank)} refunds: {len(refunds)}')


In [None]:
# Standardize dtypes: every amount column becomes float and the shared join key
# `transaction_id` becomes str in all three frames, so later merges and isin()
# checks compare like with like.
# NOTE(review): astype(str) turns a missing id into the literal text 'nan' —
# confirm the extracts never contain null transaction ids.
for _frame, _amount_col in (
    (payments, 'amount'),
    (bank, 'bank_amount'),
    (refunds, 'refund_amount'),
):
    _frame[_amount_col] = _frame[_amount_col].astype(float)
    _frame['transaction_id'] = _frame['transaction_id'].astype(str)


In [None]:
# Left-join payments onto the bank statement. indicator=True adds a `_merge`
# column telling, per payment row, whether a bank row with the same
# transaction_id was found ('both') or not ('left_only').
p_b = pd.merge(payments, bank, how='left', on='transaction_id', indicator=True)

_merge_state = p_b['_merge']
# Payments that never show up on the bank statement.
missing_settlements = p_b.loc[_merge_state == 'left_only'].copy()
# Payments that have at least one bank row.
matched_settlements = p_b.loc[_merge_state == 'both'].copy()

print(f'Missing settlements (in payments but not in bank): {len(missing_settlements)}')
print(f'Matched settlements: {len(matched_settlements)}')


In [None]:
# Collapse refunds to one row per transaction: a transaction may have been
# refunded several times, so total the amounts and keep the earliest timestamp.
refunds_agg = (
    refunds
    .groupby('transaction_id')
    .agg(
        refund_amount=('refund_amount', 'sum'),
        refund_timestamp=('refund_timestamp', 'min'),
    )
    .reset_index()
)

# Attach the per-transaction refund totals to every payment; payments without
# any refund get NaN in refund_amount and are dropped from `refunded`.
p_r = pd.merge(payments, refunds_agg, how='left', on='transaction_id', indicator=True)
refunded = p_r.dropna(subset=['refund_amount']).copy()
print(f'Refunded transactions found in payments dataset: {len(refunded)}')


In [None]:
# Over-refund: the refund amount recorded for the transaction is larger than
# the payment amount.
refunded['over_refund_flag'] = refunded['refund_amount'].gt(refunded['amount'])

# Refund issued against a payment whose status is FAILED.
_is_failed = refunded['status'].eq('FAILED')
_has_refund = refunded['refund_amount'].notna()
refunded['refund_on_failed'] = _is_failed & _has_refund

# Refund rows whose transaction_id does not appear in the payments extract at all.
_has_payment = refunds['transaction_id'].isin(payments['transaction_id'])
refunds_only = refunds.loc[~_has_payment].copy()

print(f"Over-refunds: {refunded['over_refund_flag'].sum()}")
print(f"Refunds on FAILED txns: {refunded['refund_on_failed'].sum()}")
print(f'Refunds without any payments record (refunds_only): {len(refunds_only)}')


In [None]:
# --- Amount per leak type ----------------------------------------------------
# Over-refund loss: refunded amount in excess of the payment amount, summed
# over the flagged rows only.
_over_refunded = refunded.loc[refunded['over_refund_flag']]
total_overrefund = _over_refunded['refund_amount'].sum() - _over_refunded['amount'].sum()
# Refunds paid out against FAILED payments: the whole refund counts as lost.
total_refund_on_failed = refunded.loc[refunded['refund_on_failed'], 'refund_amount'].sum()
# Refunds with no payment record: the whole refund counts as lost.
total_unmatched_refunds = refunds_only['refund_amount'].sum()

# Missing settlements: payments that have no bank row. The amount reported is
# the payment amount of exactly those rows, so that the count and the amount in
# the summary describe the same population (and agree with the revenue
# waterfall, which uses the same figure).
# NOTE(review): every payment without a bank row is counted here whatever its
# status — confirm whether FAILED payments should be excluded.
miss_settlement_count = len(missing_settlements)
total_missing_settlements = missing_settlements['amount'].sum()

# Net (system amount - bank amount) over payments that DO have a bank row.
# This measures amount discrepancies on settled payments; it was previously
# reported as the "Missing Settlements" amount, which mixed two different sets
# of rows. Kept as a variable for inspection, but no longer written to the summary.
matched_amount_diff = (matched_settlements['amount'] - matched_settlements['bank_amount']).sum()

# One row per leak type: label, number of affected records, amount in KES.
summary = pd.DataFrame([
    ['Over-Refunds', int(refunded['over_refund_flag'].sum()), float(total_overrefund)],
    ['Refunds on Failed Transactions', int(refunded['refund_on_failed'].sum()), float(total_refund_on_failed)],
    ['Unmatched Refunds', int(len(refunds_only)), float(total_unmatched_refunds)],
    ['Missing Settlements', int(miss_settlement_count), float(total_missing_settlements)]
], columns=['leak_type', 'count', 'amount_lost_kes'])

# Save outputs
summary.to_csv(out_dir / 'reconciliation_summary.csv', index=False)
missing_settlements.to_csv(out_dir / 'missing_settlements.csv', index=False)
refunded.to_csv(out_dir / 'refunded_detailed.csv', index=False)
refunds_only.to_csv(out_dir / 'refunds_unmatched.csv', index=False)

summary


In [None]:
# Ten customers with the largest total refunded amount, alongside the total
# payment amount of their refunded transactions and how many of them there were.
top_refunded_customers = (
    refunded
    .groupby('customer_id')
    .agg(
        refund_amount=('refund_amount', 'sum'),
        amount=('amount', 'sum'),
        refund_count=('transaction_id', 'count'),
    )
    .sort_values('refund_amount', ascending=False)
    .head(10)
)

# customer_id is the index here, so it is written as the first CSV column.
top_refunded_customers.to_csv(out_dir / 'top_refunded_customers.csv')
top_refunded_customers


In [None]:
# Example waterfall figures: start from everything in the payments extract,
# then take off all refunds and the payments that never reached the bank.
total_processed = payments['amount'].sum()
total_refunds = refunds['refund_amount'].sum()
if 'amount' in missing_settlements.columns:
    total_missing_settlements = missing_settlements['amount'].sum()
else:
    # No payment amount column available on the merged frame: treat as nothing missing.
    total_missing_settlements = 0.0
net_collected = total_processed - total_refunds - total_missing_settlements

# Metric label -> value in KES, in the order the steps should be displayed.
_waterfall_steps = {
    'Total Processed': total_processed,
    'Total Refunds': total_refunds,
    'Missing Settlements': total_missing_settlements,
    'Net Collected': net_collected,
}
waterfall = pd.DataFrame(list(_waterfall_steps.items()), columns=['metric', 'kes'])

waterfall.to_csv(out_dir / 'revenue_waterfall.csv', index=False)
waterfall
