In [3]:
import pandas as pd
from collections import defaultdict

# Collect every payee from all sheets of the expenses workbook, group the
# distinct names by their leading characters, and print each group that
# contains more than one name (candidate spellings of the same payee).

# Number of leading characters two payees must share to land in one group.
PREFIX_LENGTH = 8

# Initialize a list to store all payees
all_payees = []

# Open the workbook in a context manager so its file handle is released as
# soon as every sheet has been read.
with pd.ExcelFile('data/Expenses-2025.12.xlsx') as xls:
    # Get all sheet names
    sheet_names = xls.sheet_names

    # Loop through each sheet and extract the 'Payee' column
    for sheet_name in sheet_names:
        try:
            df = pd.read_excel(xls, sheet_name=sheet_name)
            if 'Payee' in df.columns:
                # Coerce to str: a non-text cell would otherwise break the
                # sorting and prefix slicing further down.
                all_payees.extend(map(str, df['Payee'].dropna().tolist()))
        except Exception as e:
            # Deliberate best effort: report the unreadable sheet and carry
            # on with the remaining ones.
            print(f"Could not process sheet {sheet_name}: {e}")

# Remove duplicates
unique_payees = sorted(set(all_payees))

# Group by the first PREFIX_LENGTH characters
grouped_payees = defaultdict(list)
for payee in unique_payees:
    grouped_payees[payee[:PREFIX_LENGTH]].append(payee)

# Filter for groups with more than one payee
similar_payees = {
    prefix: names for prefix, names in grouped_payees.items() if len(names) > 1
}

# Print the results
for common_chars, payees in similar_payees.items():
    print(f"Common characters: '{common_chars}', Count: {len(payees)}")
    for payee in payees:
        print(f"  - {payee}")

Common characters: '7-ELEVEN', Count: 2
  - 7-ELEVEN 23020           SEATTLE      WA
  - 7-ELEVEN T3 #03-12  SINGAPORE           SG
Common characters: 'AAA WA M', Count: 2
  - AAA WA MEMBERSHIP 00     800-562-2582 WA
  - AAA WA MEMBERSHIP 01     800-562-2582 WA
Common characters: 'AIRBNB *', Count: 2
  - AIRBNB * HMKNBQB23F AIRBNB.COM CA
  - AIRBNB * HMSBE25BKD AIRBNB.COM CA
Common characters: 'ALASKA A', Count: 9
  - ALASKA AIR 0272105259538SEATTLE WA
  - ALASKA AIR 0272105260139SEATTLE WA
  - ALASKA AIR 0272105260140SEATTLE WA
  - ALASKA AIR 0272105262086SEATTLE WA
  - ALASKA AIR 0272106640952SEATTLE WA
  - ALASKA AIR 0272109051092SEATTLE WA
  - ALASKA AIR 0272394527601SEATTLE WA
  - ALASKA AIR SEATTLE WA
  - ALASKA AIRLINES     SEATTLE             WA
Common characters: 'AMAZON M', Count: 31
  - AMAZON MARKEPLACE NA PA
  - AMAZON MKTPL*061WN5NY3   Amzn.com/billWA
  - AMAZON MKTPL*273Q84KG3   Amzn.com/billWA
  - AMAZON MKTPL*6E4ZE8763   Amzn.com/billWA
  - AMAZON MKTPL*EC40N97Z3   Amz