In [27]:
import pandas as pd

In [None]:
# Inventory the working directory, then load whichever known CSVs are present.
import os
import glob


def read_csv_parsing_dates(fname, date_cols=('txn_date',)):
    """Read a CSV, parsing `date_cols` as dates when the file has those columns.

    Parameters
    ----------
    fname : str
        Path of the CSV file to read.
    date_cols : sequence of str
        Column names handed to `parse_dates`.

    Returns
    -------
    pandas.DataFrame
        The loaded frame. If the date-aware read raised `ValueError`
        (pandas does this when a `parse_dates` column is absent), the file
        is read again without date parsing and a notice is printed, so the
        downgrade to unparsed dates is visible instead of silent.

    Raises
    ------
    Exception
        Whatever the plain re-read raises: a failure unrelated to date
        parsing fails the second read as well and is therefore not hidden.
    """
    try:
        return pd.read_csv(fname, parse_dates=list(date_cols), dayfirst=False)
    except ValueError as exc:
        print(f'Date parsing skipped for {fname}: {exc}')
        return pd.read_csv(fname)


print('Working directory:', os.getcwd())
print('Listing files in working directory:')
for f in sorted(os.listdir('.')):
    print('-', f)

# Attempt to load known CSVs and expose them as variables `api` and `bank` if present
for name, fname in [('api', 'api_source.csv'), ('bank', 'bank_settlement.csv')]:
    if not os.path.exists(fname):
        print(f'Not found: {fname}')
        continue

    df = read_csv_parsing_dates(fname)
    print(f'Loaded {fname} -> {name}: shape={df.shape}')
    try:
        display(df.head())
    except NameError:
        # `display` is injected by IPython/Jupyter; fall back to plain text elsewhere.
        print(df.head().to_string())
    # Bind dynamically because the variable only exists when its file does.
    globals()[name] = df

# Detect PDFs in the folder
pdfs = glob.glob('*.pdf')
if pdfs:
    print('PDFs detected:', pdfs)
else:
    print('No PDF files detected in working directory')

In [28]:

# Read both reconciliation inputs, asking pandas to parse their date columns
# while loading (the bank file carries an extra settlement date).
api = pd.read_csv(
    "api_source.csv",
    parse_dates=['txn_date'],
)
bank = pd.read_csv(
    "bank_settlement.csv",
    parse_dates=['txn_date', 'settlement_date'],
)

# Show each schema first, then a five-row preview of each frame.
print("API columns:", api.columns)
print("BANK columns:", bank.columns)
print(api.head(), bank.head(), sep="\n")


API columns: Index(['txn_id', 'utr', 'amount', 'txn_date', 'status'], dtype='object')
BANK columns: Index(['txn_id', 'utr', 'amount', 'txn_date', 'status', 'settlement_date'], dtype='object')
   txn_id           utr  amount   txn_date   status
0       1  LEWCTMBSBPCC    1299 2025-11-03  SUCCESS
1       2  P4DHW8UN9B5K     299 2025-10-26  SUCCESS
2       3  EDK4FUWMTCBG     999 2025-11-03  SUCCESS
3       4  XPLUQK3YHUS6     999 2025-10-29  SUCCESS
4       5  ZK9DP2ERBX2U    1299 2025-11-02  SUCCESS
   txn_id           utr  amount   txn_date   status settlement_date
0       1  LEWCTMBSBPCC    1299 2025-11-03  SUCCESS      2025-11-03
1       2  P4DHW8UN9B5K     299 2025-10-26  SUCCESS      2025-10-27
2       3  EDK4FUWMTCBG     999 2025-11-03   FAILED      2025-11-04
3       4  XPLUQK3YHUS6     999 2025-10-29  SUCCESS      2025-10-29
4       5  ZK9DP2ERBX2U    1299 2025-11-02  SUCCESS      2025-11-02


In [29]:
# 1) Ingest both files: standardize dtypes and ensure date parsing.

def standardize_txn_frame(frame, date_cols):
    """Return a cleaned copy of a transaction frame ready for key-based matching.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with at least `utr`, `status`, `amount`, and `txn_date` columns
        (names are matched after lower-casing and stripping the headers).
    date_cols : list of str
        Columns that must end up with a datetime dtype.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified. Headers are lower-cased and
        stripped, `utr` is stripped text, `status` is lower-cased and
        stripped, every column in `date_cols` is datetime, and rows repeating
        the same (`utr`, `txn_date`, `amount`) are dropped, keeping the first.

    Raises
    ------
    ValueError
        Typically, if a value in a `date_cols` column cannot be parsed as a
        date (other parsing errors may propagate from `pd.to_datetime`).
        Failing here is intentional: an unparsed date column would otherwise
        only surface later as a confusing merge problem.
    """
    out = frame.copy()
    out.columns = out.columns.str.lower().str.strip()

    # astype(str) turns a missing UTR into the literal text 'nan'; mask those
    # rows back to missing so they are not mistaken for a real reference.
    utr_raw = out['utr']
    out['utr'] = utr_raw.astype(str).str.strip().where(utr_raw.notna())

    out['status'] = out['status'].str.lower().str.strip()

    # read_csv(parse_dates=...) can leave a column as plain objects when some
    # values do not parse; enforce the dtype here. Already-datetime columns
    # pass through unchanged.
    for col in date_cols:
        out[col] = pd.to_datetime(out[col])

    return out.drop_duplicates(subset=['utr', 'txn_date', 'amount'])


api = standardize_txn_frame(api, date_cols=['txn_date'])
bank = standardize_txn_frame(bank, date_cols=['txn_date', 'settlement_date'])


In [32]:
# Exact reconciliation: a row pairs up only when UTR, amount and transaction
# date all agree between the two sources.
exact_keys = ['utr', 'amount', 'txn_date']

# De-duplicate each side on the join keys so a key appears at most once per side.
api_exact = api.drop_duplicates(subset=exact_keys).copy()
bank_exact = bank.drop_duplicates(subset=exact_keys).copy()

# Inner join keeps only keys present in both; columns that exist on both
# sides but are not keys get the _api / _bank suffixes.
exact_match = api_exact.merge(
    bank_exact,
    how='inner',
    on=exact_keys,
    suffixes=('_api', '_bank'),
)




In [None]:
# Tag every joined row with its reconciliation bucket.
# NOTE(review): status is not one of the join keys, so rows whose status_api
# and status_bank differ are labelled 'exact_match' too — confirm that is intended.
exact_match = exact_match.assign(category='exact_match')

# Report how many rows matched, then preview the first few.
print("Exact matches:", len(exact_match))
display(exact_match.head())

✅ Exact matches: 72


Unnamed: 0,txn_id_api,utr,amount,txn_date,status_api,txn_id_bank,status_bank,settlement_date,category
0,1,LEWCTMBSBPCC,1299,2025-11-03,success,1,success,2025-11-03,exact_match
1,2,P4DHW8UN9B5K,299,2025-10-26,success,2,success,2025-10-27,exact_match
2,3,EDK4FUWMTCBG,999,2025-11-03,success,3,failed,2025-11-04,exact_match
3,4,XPLUQK3YHUS6,999,2025-10-29,success,4,success,2025-10-29,exact_match
4,5,ZK9DP2ERBX2U,1299,2025-11-02,success,5,success,2025-11-02,exact_match
