# In [None]:
import re
import pandas as pd
# Load the inflow and outflow tables from their parquet files.
Inflows = pd.read_parquet("/uss/hdsi-prismdata/q1-ucsd-inflows.pqt")
Outflows = pd.read_parquet("/uss/hdsi-prismdata/q1-ucsd-outflows.pqt")

# Keep only the memo text of outflow rows whose memo differs from the category.
Diff = Outflows.loc[Outflows["memo"].ne(Outflows["category"]), "memo"]
# Compile once for speed.
#
# RULES is an ordered list of (compiled pattern, replacement) pairs. Order
# matters: every rule sees the output of the rules before it, so a specific
# pattern has to run before any generic pattern that would destroy the text it
# needs (masked phone numbers before the generic X-mask stripper, merchant
# domains before the generic URL stripper, and so on).


def _merchant(name: str) -> str:
    """Return regex source matching *name* bare or as a host (``www.<name>.com/path``).

    The fragment is meant to be compiled with ``re.I``; the caller appends the
    closing ``\\b`` (and any merchant-specific suffix).
    """
    return (
        r"\b(?:https?://)?(?:[A-Z0-9-]+\.)*"                     # optional scheme + sub-domains
        r"(?:" + name + r")"
        r"(?:\.(?:com|org|net|edu|io|co)\b(?:/\S*)?)?"            # optional TLD + path
    )


RULES = [
    # 0) Normalize whitespace & weird spaces first
    (re.compile(r"\u00A0"), " "),                                # non-breaking space
    (re.compile(r"\s{2,}"), " "),

    # 1) Structured tokens that contain mask characters. These run before the
    #    generic X-mask stripper in (2), which would otherwise chop them into
    #    fragments ("XXX-XXX-", "O*12- -34") that no later rule matches.
    (re.compile(r"\b(?:\d{3}-\d{3}-\d{2,4}|XXX-XXX-XXXX)\b", re.I), " "),  # 888-802-3080, truncated 888-802-30, masked
    (re.compile(r"\b\d{3}-\d{4}\b"), " "),                       # 802-3080 (no area code)
    (re.compile(r"\bO\*\d{2}-X{5,}-\d{2,}\b", re.I), " "),       # O*12-XXXXX-34 order ids

    # 2) Remove mask tokens & generic IDs
    (re.compile(r"(?:#)?X{4,}", re.I), " "),                     # XXXX, #XXXX, XXXXXXXXX...
    (re.compile(r"\bRF#\w+\b", re.I), " "),
    (re.compile(r"\bMCC\b\s*\w+", re.I), " "),
    (re.compile(r"\bSEQ#?\s*\w+\b", re.I), " "),
    (re.compile(r"\bC#\**\s*\w+\b", re.I), " "),
    (re.compile(r"\bID:\s*\w+\b", re.I), " "),
    (re.compile(r"\bOrder Number\b.*?(?=\b[A-Z]|$)", re.I), " "),

    # 3) Remove dates & times. The parenthetical stamp goes first, while the
    #    date that anchors it is still intact.
    (re.compile(r"\(\d{2}/\d{2}/\d{2}.*?\)"), " "),              # parenthetical stamps
    (re.compile(r"\b\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b"), " "),    # 03/25, 03/25/22, 03-25-22
    (re.compile(r"\b\d{1,2}:\d{2}(?::\d{2})?\s*[ap]?\b", re.I), " "),  # 12:03, 07:07p
    # Left after the generic date rule on purpose: run earlier it would strip
    # "DATE 03/25" out of "DATE 03/25/22" and strand the year.
    (re.compile(r"\bDATE\b\s*\d{2}[/-]\d{2}\b", re.I), " "),

    # 4) Drop billing tails before merchant canonicalization can turn them
    #    into extra merchant tokens. APPLE.COM/BILL is deliberately not listed:
    #    it is often the only merchant text in the memo, so (5) maps it to "apple".
    (re.compile(r"\b(?:AMZN\.COM\/BILL|AMZNAMZN\.COM\s*BILL|AMAZON\.COM\/BILL)\b", re.I), " "),
    (re.compile(r"\bG\.CO\/HELPPAY#?", re.I), " "),              # no trailing \b: '#' ends the token
    (re.compile(r"\bSUPPORT\.GOOGLE?(?:\.COM(?:\/\S*)?)?\b", re.I), " "),

    # 5) Canonicalize common merchants (keep signal). Runs before the generic
    #    URL stripper so "AMAZON.COM*..." / "APPLE.COM/BILL" become a merchant
    #    name instead of being erased; most specific patterns first.
    (re.compile(r"\bAmazon\s+Prime\*?[A-Z0-9-]*\b", re.I), " amazon prime "),
    (re.compile(r"\bAMZN\s+MKTP\s+US\b", re.I), " amazon "),
    (re.compile(_merchant(r"AMAZON|AMZN") + r"(?:\*[A-Z0-9-]*)?\b", re.I), " amazon "),
    (re.compile(_merchant(r"APPLE") + r"(?:\/BILL)?\b", re.I), " apple "),
    (re.compile(_merchant(r"Microsoft") + r"\b|\bMSBILL\.INFO\b", re.I), " microsoft "),
    (re.compile(r"\bGoogle\s*Play\b|" + _merchant(r"GOOGLE|YOUTUBE") + r"\b", re.I), " google "),
    (re.compile(r"\bCASH\s*APP\*?", re.I), " cash app "),
    (re.compile(_merchant(r"PLAYSTATION(?:\s*NETWORK|NETWOR)?") + r"\b|\bSIE\b", re.I), " playstation "),
    (re.compile(_merchant(r"EBAY") + r"\b", re.I), " ebay "),
    (re.compile(_merchant(r"AFTERPAY") + r"\b", re.I), " afterpay "),

    # 6) Remove any remaining URLs/domains
    (re.compile(r"\b(?:https?:\/\/)?[A-Za-z0-9.-]+\.(?:com|org|net|edu|io|co)\b(?:\/[^\s]*)?", re.I), " "),

    # 7) Remove banking boilerplate & transport words
    (re.compile(r"\b(?:PURCHASE|AUTHORIZED|RECURRING|PAYMENT|WITHDRAWAL|DEBIT|CREDIT|POS|CHECKCARD|DBT|CRD|PIN|SIG|CARD|NBR|TRANS(?:ACTION)?|EFF\.?|DATE|TIME)\b", re.I), " "),
    # Colon-terminated field labels get their own rule without a trailing \b:
    # a word boundary after ':' only exists when a word character follows
    # immediately, so "DES: PAYROLL" would otherwise be left untouched.
    (re.compile(r"\b(?:DES|INDN|WEB(?: ID)?|CCD(?: ID)?):", re.I), " "),
    (re.compile(r"\b(?:VISA|MASTERCARD|AMEX|DISC(?:OVER)?|DDA)\b", re.I), " "),
    (re.compile(r"\b(?:POS|PIN|MAC|MWBTGD|EXA|SIG|DDA|ACH)\b", re.I), " "),

    # 8) Toss order/line id styles that remain
    (re.compile(r"\b[A-Z]{1,3}\d{2,4}\b(?=\s|$)"), " "),         # OK251, CA133, C718
    (re.compile(r"\b[0-9A-Z]{4,}\b(?= ?/ ?WA| ?/ ?US)", re.I), " "),

    # 9) Remove state abbreviations as stand-alone tokens (optional—comment if you want them)
    (re.compile(r"\b(AL|AK|AZ|AR|CA|CO|CT|DC|DE|FL|GA|HI|IA|ID|IL|IN|KS|KY|LA|MA|MD|ME|MI|MN|MO|MS|MT|NC|ND|NE|NH|NJ|NM|NV|NY|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VA|VT|WA|WI|WV|WY)\b"), " "),

    # 10) Punctuation/space tidy
    (re.compile(r"[|#%*()_=+;,/\\]+"), " "),
    (re.compile(r"[-]{2,}"), " "),
    (re.compile(r"\s{2,}"), " "),
    (re.compile(r"^\s+|\s+$"), ""),                               # trim
]

def clean_memo_series(s: pd.Series) -> pd.Series:
    """Return a normalized, lower-cased copy of a memo column.

    Each ``(pattern, replacement)`` pair in the module-level ``RULES`` list is
    applied, in order, to the string form of every value. The result is then
    lower-cased, runs of whitespace are collapsed to a single space, and
    leading/trailing whitespace is trimmed.

    Args:
        s: Memo values. Non-string entries are converted with ``astype(str)``.

    Returns:
        A Series with the same index as ``s``. Entries that were missing in
        ``s`` (``None``/``NaN``/``pd.NA``) come back as empty strings.
    """
    # Remember which rows were missing: astype(str) may render them as the
    # literal words "nan" / "None", which would otherwise be cleaned and
    # returned as if they were real memo text.
    missing = s.isna()

    cleaned = s.astype(str)
    for pat, repl in RULES:
        cleaned = cleaned.str.replace(pat, repl, regex=True)

    # Lowercase & final collapse
    cleaned = cleaned.str.lower().str.replace(r"\s{2,}", " ", regex=True).str.strip()
    return cleaned.mask(missing, "")

# Move the index into a column and attach a cleaned copy of each memo
# next to the original text.
hi = Diff.reset_index().assign(
    memo_clean=lambda frame: clean_memo_series(frame["memo"])
)