# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

On branch main
Your branch is ahead of 'origin/main' by 25 commits.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   Kyle_Choi.ipynb
	deleted:    memos_cleaned.csv

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	memos_P1.csv
	sample.csv

no changes added to commit (use "git add" and/or "git commit -a")


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=1)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in#.sample(100_000).copy()
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0          AMAZON.COM*2X0A SEATTLE /WA US CARD PURCHASE
1     Amazon.com*1Q44K99 04-27 Amzn.com/bill WA XXXX...
2     DBT CRD XXXX 09/17/22 XXXXXXX TST* EGGS UP GRI...
3     DDA PURCHASE *XXXX XXXXXX EMS STORES 100 ARTHU...
4     FIVERR XXXXXXXXXX NY                         0...
5     PAYPAL INST XFER MICROSOFT ULTIM WEB ID: PAYPA...
6     PURCHASE AUTHORIZED ON 04/12 CHIPOTLE ONLINE C...
7     PURCHASE AUTHORIZED ON 04/22 CIRCLE K XXXXX XX...
8     PURCHASE AUTHORIZED ON 04/22 CONVENIENCE ST St...
9     PURCHASE AUTHORIZED ON 08/31 BUFFALO WILD WING...
10    PURCHASE AUTHORIZED ON 10/21 eBay O*01-XXXXX-1...
11    PURCHASE AUTHORIZED ON 12/02 CHEDDARS XXXXXXXX...
12    PURCHASE AUTHORIZED ON 12/03 WALMART.COM XXX-X...
13    PURCHASE AUTHORIZED ON 12/07 AMZN Mktp US*7H17...
14    PURCHASE AUTHORIZED ON 12/11 COSTCO WHSE #XXXX...
15    RECURRING PAYMENT AUTHORIZED ON 02/01 DollarSh...
16    RECURRING PAYMENT AUTHORIZED ON 08/28 APPLE.CO...
17            TARGET CARD SRVC BILL PAY ********

## 2. Define Regex Rules

In [6]:
STATE_LIST = [ 
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO) 
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER) 
    "KS", "KY", 
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA 
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA) 
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY" 
] 
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
CITY_LIST = [
    "SAN", "MIAMI", "DIEGO", "PHOENIX", "SEATTLE", "HOUSTON", "SANTA", "ORLANDO", 
    "CHICAGO", "LAS", "ATLANTA", "LOS", "MESA", "VEGAS", "CHARLOTTE", "TAMPA", 
    "GREENVILLE", "BROOKLYN", "DENVER", "ANGELES", "ANTONIO", "MEMPHIS", "YORK",
    "RICHMOND", "BEACH", "PALM", "FORT", "ST", "LAKE", "WEST", "DES", "PARK",
    "HILL", "NORTH", "SPRING", "CREEK", "SAINT", "RIVER", "SOUTH", "MYERS",
    # Added from n-grams
    "CITY", "NEW", "TROY", "VALLEY", "PORT"
]
CITY_REGEX = r"\b(" + "|".join(CITY_LIST) + r")\b"

In [45]:
# From 1-gram
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "CRD", "PUR", "PIN", "SIG", "LLC", "SIGNATURE", "WEB", "PAYMENT",
    "ACH", "DEB", "INTL", "RECURRING", "DIGIT", "ONLINE", "PROTECTION", "VSA",
    "800", "888", "WITHDRAWAL", "INDN", "STORE", "STORES", "RESTAURANT", "SUPER",
    "BEAUTY", "DELI",
    "GROCERY", "NAILS", "STOP", "BUSINESS", "PARKING", "PET", "GARDEN", "FIL",
    "POS", "PURCHASE", "DEBIT", "MARKET", "FOOD", "VENDING", 
    "POINT", "BIG", "NON", "TR", "SUP", "CENTER"
    # 'XXX'
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [9]:
REGEX_PRE = [ 
    # === 0) Normalize spaces first === 
    (r"\u00A0", " "), # Replace non-breaking space with regular space 
    (r"\s{2,}", " "), # Collapse multiple spaces into one 

    # === 1) “Authorized / Recurring” headers === 
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "), 
    (r"\bRECURRING\s+PYMT\b", " "), 

    # === 2) Card & mask boilerplate === 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "), 
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "), 
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "), 
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "), 
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "), 
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "), 
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "), 
    (r"\bF[XF]{4,}\b", " "), # Handle FXXXXX, FXXXX
    (r"\bXXX\b", " "), # Handle masked number from n-grams
    (r"X{4,}", " "), # Remove generic masked numbers 
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "), 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords 
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start 
    (r"^\s*REC\s+POS\b", " "), 
    (r"^\s*RECURRING\b", " "), 

    # === 2.5) Prefix Normalization === 
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM" 
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR 

    # === 3) State + mask tails === 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "), 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "), 

    # === 4) Dates/times === 
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025 
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM 
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM 

    # === 4.5) Noise numbers (from n-grams) ===
    (r"\b(?:00|10|15|16|20|365)\b", " "),

    # === 5) Merchant-terminal boilerplate === 
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "), 
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "), 
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "), 
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID... 
     
    # === 6) Misc tails === 
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "), 
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "), 
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "), 
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "), 
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "), 
    (r"\bBILL\s+PAYMENT\b", " "), 
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "), 
    (r"-\s*MEMO=", " "), 
    (r"(?:USA|US)$", " "), # Remove USA or US at the end 
    (r"\s+FSP$", " "), 
    (r"\bL\d{3}\b", " "), # Handle L340

    # === 7) Phone numbers === 
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212 
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212 
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212 
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "), 

    # === 8) URLs/domains === 
    (r"^\.COM\s+BILL\b.*", " "), 
    (r"\.COM\b", " "), # Remove .COM anywhere
    (r"\s+COM$", " "), # Remove .COM at end of string

    # === 9) State abbreviations === 
    (STATE_REGEX, " "), # Remove standalone state codes 
    
    # === 9.5) City abbreviations ===
    (CITY_REGEX, " "), # Remove standalone city names
    
    # === 10) Noise Words (from 1-grams) ===
    (NOISE_WORDS_REGEX, " "),

    # === 11) Final Tidy (Punctuation) === 
    (r"[|%_=;\\/]+", " "), # Remove misc separators 
    (r"[-]{2,}", " "), # Collapse multiple hyphens
    (r"^\s*-\s*|\s*-\s*$", " "), # Remove leading/trailing hyphens
]

In [10]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADDY
    re.compile(r".*(GODADDY\.COM|GODADDY)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # === NEW PAYPAL RULES ===
    # Handle 'PAYPAL *... MERCHANT' (from INST XFER)
    re.compile(r"^PAYPAL\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    # Handle 'PAYPAL : MERCHANT' (from INST XFER / ID:)
    re.compile(r"^PAYPAL\s*:\s*([A-Z\s0-9'.-]+).*"),

    # === NEW: High-Frequency Full Merchant Names (from 1-grams) ===
    re.compile(r"^(AFTERPAY)\b.*"),
    re.compile(r"^(ALDI)\b.*"),
    re.compile(r"^(AMZN(?:\s*MKTP)?)\b.*"), # Handles amzn, amzn mktp
    re.compile(r"^(AMAZON(?:\.COM|\s+PRIME)?)\b.*"), # Handles amazon, amazon prime
    re.compile(r"^(APPLE(?:\.COM)?)\b.*"),
    re.compile(r"^(BRIGIT)\b.*"),
    re.compile(r"^(BURGER\s+KING)\b.*"),
    re.compile(r"^(CASH\s+APP)\b.*"),
    re.compile(r"^(CHICK-FIL-A)\b.*"),
    re.compile(r"^(CHIPOTLE)\b.*"),
    re.compile(r"^(CIRCLE\s+K)\b.*"),
    re.compile(r"^(COSTCO)\b.*"),
    re.compile(r"^(DAIRY\s+QUEEN)\b.*"),
    re.compile(r"^(DOLLAR\s+GENERAL)\b.*"),
    re.compile(r"^(DOLLAR\s+TREE)\b.*"),
    re.compile(r"^(DOORDASH)\b.*"),
    re.compile(r"^(DUNKIN)\b.*"),
    re.compile(r"^(EBAY)\b.*"), # Added from n-grams
    re.compile(r"^(ETSY)\b.*"),
    re.compile(r"^(FAMILY\s+DOLLAR)\b.*"),
    re.compile(r"^(FOOD\s+LION)\b.*"),
    re.compile(r"^(FRYS)\b.*"),
    re.compile(r"^(GOOGLE)\b.*"),
    re.compile(r"^(HELPPAY)\b.*"),
    re.compile(r"^(HOME\s+DEPOT)\b.*"),
    re.compile(r"^(INSTACART)\b.*"),
    re.compile(r"^(KFC)\b.*"),
    re.compile(r"^(KROGER)\b.*"),
    re.compile(r"^(LITTLE\s+CAESARS)\b.*"),
    re.compile(r"^(LOWE'?S)\b.*"),
    re.compile(r"^(LYFT)\b.*"),
    re.compile(r"^(MCDONALD'?S)\b.*"),
    re.compile(r"^(MICROSOFT)\b.*"),
    re.compile(r"^(PUBLIX)\b.*"),
    re.compile(r"^(ROSS)\b.*"),
    re.compile(r"^(SAFEWAY)\b.*"),
    re.compile(r"^(SAMS\s*CLUB|SAMSCLUB)\b.*"), # Updated from n-grams
    re.compile(r"^(SHOPRITE)\b.*"),
    re.compile(r"^(SONIC)\b.*"),
    re.compile(r"^(STARBUCKS)\b.*"),
    re.compile(r"^(SUBWAY)\b.*"),
    re.compile(r"^(TACO\s+BELL)\b.*"),
    re.compile(r"^(TARGET)\b.*"),
    re.compile(r"^(UBER(?:\s+EATS)?)\b.*"),
    re.compile(r"^(USPS)\b.*"),
    re.compile(r"^(VONS)\b.*"),
    # Updated from n-grams (wal, mart, wm, superc, supercenter)
    re.compile(r"^(WALMART(?:\s*SUPERCENTER|\s*SUPER\s*C)?|WM\s*SUPERCENTER|WAL-MART|WAL\s*MART)\b.*"),
    re.compile(r"^(WENDY'?S)\b.*"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s09'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === 7-ELEVEN RULES ===
    re.compile(r"^(7(?:-ELEVEN|\s+11))\s*\*?#?.*"),
    
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # === UPDATED RULE ORDER ===
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # NEW: Rule for HOME* AHS.COM pattern
    re.compile(r"^HOME\s*\*?\s*([A-Z\s0-9'.-]+?)(?:\s+[A-Z]{2,})?.*"),
    
    # === TST RULES ===
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s09'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s09'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # Generic PAYPAL rule, catches PAYPAL*MERCHANT
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"), 

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0EXAMPLE'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [11]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)

memos = memos.str.replace(NOISE_WORDS_REGEX, " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 25.8 s, sys: 202 ms, total: 26 s
Wall time: 26 s


In [40]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            groups = match.groups()
            if groups:
                return groups[-1].strip()
    return memo

df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 4.53 s, sys: 781 µs, total: 4.53 s
Wall time: 4.53 s


In [13]:
# df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [14]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [44]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
369607,PURCHASE AUTHORIZED ON 10/16 DD/BR #XXXXXX Q35...,DDBR # Q35 DUNEDIN S,DDBR
162257,FRYS-FOOD XXXX E. GREE SCOTTSDALE AZ 12/13,FRYS-FOOD E. GREE SCOTTSDALE,FRYS
430789,RECURRING PAYMENT AUTHORIZED ON 12/03 HELP.HBO...,HELP.HBOMAX HTTPSHBOMAX.C S,HELP.HBOMAX HTTPSHBOMAX.C S
357727,PURCHASE AUTHORIZED ON 09/16 KATAPULT NY SXXXX...,KATAPULT S,KATAPULT S
267125,PURCHASE AUTHORIZED ON 01/21 SAVOY TIVOLI SAN ...,SAVOY TIVOLI FRANCISCO S,SAVOY TIVOLI FRANCISCO S
441771,SHEIN.COM SHEIN.COM DE 03/27,SHEIN SHEIN,SHEIN SHEIN
450653,STARBUCKS 800-782-728 XXX-XXX-XXXX WA 0...,STARBUCKS XXX-XXX,STARBUCKS
331191,PURCHASE AUTHORIZED ON 07/09 SWA*EARLYBRDXXXXX...,SWA*EARLYBRD XXX-XXX- S,SWA
477706,UPROMISE INVMNTS ACHCNTRIBS XXXXXX XXXXXXXXXXX...,UPROMISE INVMNTS ACHCNTRIBS 266,UPROMISE INVMNTS ACHCNTRIBS 266
395158,PURCHASE AUTHORIZED ON 12/19 WAL-MART #XXXX LI...,WAL-MART # LIBERTY P,WAL-MART


In [39]:
df[df['memo'].str.contains('GRILL')]

Unnamed: 0,memo,memo_pre,memo_post
2014,1 LB SUSHI & GRILL MIAMI FL 0...,1 LB SUSHI & GRILL,1 LB SUSHI & GRILL
2124,110 GRILL MIDDLETOWN MIDDLETOWN NY 1...,110 GRILL MIDDLETOWN MIDDLETOWN,110 GRILL MIDDLETOWN MIDDLETOWN
5052,7 LEGUAS MEXICAN GRILL HUNTSVILL,7 LEGUAS MEXICAN GRILL HUNTSVILL,7 LEGUAS MEXICAN GRILL HUNTSVILL
7980,ABALONETTI BAR & GRILL XXX-XXXXXXX CA 1...,ABALONETTI BAR & GRILL XXX,ABALONETTI BAR & GRILL XXX
9720,AFRA GRILL COLUMBUS OH 0...,AFRA GRILL COLUMBUS,AFRA GRILL COLUMBUS
...,...,...,...
526103,ZIPPS SPORTS GRILL TEMPE AZ 05/31,ZIPPS SPORTS GRILL TEMPE,ZIPPS SPORTS GRILL TEMPE
526104,ZIPPS SPORTS GRILL TEMPE AZ 06/29,ZIPPS SPORTS GRILL TEMPE,ZIPPS SPORTS GRILL TEMPE
526105,ZIPPS SPORTS GRILL TEMPE AZ 08/09,ZIPPS SPORTS GRILL TEMPE,ZIPPS SPORTS GRILL TEMPE
527181,debit card FUJI GRILL BREA 101 W IMPERIAL HWY ...,FUJI GRILL BREA 101 W IMPERIAL HWY STE BRE 1 2,FUJI GRILL BREA 101 W IMPERIAL HWY STE BRE 1 2


In [16]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [17]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

MCDONALD'S
APPLE
TARGET
STARBUCKS
AMZN
SUBWAY
POS
USPS
BRIGIT
COSTCO
ROSS
KFC
ETSY
SHOPRITE
CHIPOTLE
MCDONALDS
GODADDY
SALLY
AMZNFREETIME
CCBILL
PETCO
WINN-DIXIE
SHIPT
CHEWY
GOFAN
BASKIN
RALPHS
SLICE
BETMGM
IHOP
INTUIT
STEAK-N-SHAKE
DILLONS
STATERBRO
POPEYES
*STARBUCKS
VANS
SKILLZ
MGM
FRG
P
TOLLWAY-AUTOREPLEN
.KOHLS
PRICELN
BANFIELD-
RIOT
GAMESTOP
MCW
SAVEMART
VERIZONWRLSS
BELK
OCULUS
FOODMAXX
MARINA
RAINBOW
CRYPTO
WALGREENS
CANVA
BASHAS''
ABC
GNC
MEIJER
CLAIRE'S
L
SEZZLE
D
STAPLES
FOOD4LESS
QFC
NYTIMES
*EBAY
FIV
V
GOODWILL
AMZ
CHECKERS
*UBER
TILLYS
UPS
SHOPIFY
IBI
*MICROSOFT
ETT
OTT
BLUESKY
DROPBOX
T
POTBELLY
EL
CLEO
JACK'S
A
WEGMANS
ABCMOUSE
JOURNEYS
QUADPAY
ROSES
LUCKY
EVI
RVT
ENMARKET
NORTON
SHEIN
EA
FBPAY
CRT
HLLFRSH
GERALD
*STEAM
PARADISE-MOOREMOORESVILLE
OPC
DRI
TLG
EBAY
REI
EXPRESS
LJS
NORDSTROM
SEDANOS
DEBIT
ZTL
FH
EZPASS
M
NEWSSTAND
E-Z
EVERYPLATE
PARKMOBILE
UBR
SOUTHWES
DAVIS
G
PACSUN
C
RGP
MOE'S
FRED-MEYER
TRTHFDR
PAR
ALEX
EPC
SEI
LIQUOR
TLF
PCH
GLOSS
OUTLET
CMS
BUCKLE
WIX


In [18]:
df[df['memo_post'] == '']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
8876,ACH/DISCOVER,,
52207,Brooklyn,,
71408,CHECKCARD XXXX ARXXXX,,
74774,CHECKCARD XXXX CK XXXXXXX,,
74926,CHECKCARD XXXX CO CHARLOTTE CHARLOTTE NC XXXXX...,,
...,...,...,...
523840,XXXXXXXXXXX XXXXXX,,
523860,XXXXXXXXXXXX,,
524051,XXXXXXXXXXXXX,,
524215,XXXXXXXXXXXXXXX,,


In [19]:
df[df['memo_post'] == '']

Unnamed: 0,memo,memo_pre,memo_post
8876,ACH/DISCOVER,,
52207,Brooklyn,,
71408,CHECKCARD XXXX ARXXXX,,
74774,CHECKCARD XXXX CK XXXXXXX,,
74926,CHECKCARD XXXX CO CHARLOTTE CHARLOTTE NC XXXXX...,,
...,...,...,...
523840,XXXXXXXXXXX XXXXXX,,
523860,XXXXXXXXXXXX,,
524051,XXXXXXXXXXXXX,,
524215,XXXXXXXXXXXXXXX,,


In [20]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
1842,02-20-22 SEATTLE WA XXXX AMAZON.COM*1B8II62G0 ...,AMAZON *1B8II62G0 SUNTRUST DEBIT PURCHASE,AMAZON
1843,02-20-22 SEATTLE WA XXXX AMAZON.COM*1B9CD29N0 ...,AMAZON *1B9CD29N0 SUNTRUST DEBIT PURCHASE,AMAZON
1844,02-20-22 SEATTLE WA XXXX AMAZON.COM*1I32Q2NG1 ...,AMAZON *1I32Q2NG1 SUNTRUST DEBIT PURCHASE,AMAZON
1845,02-20-22 SEATTLE WA XXXX AMAZON.COM*1I58N0TJ1 ...,AMAZON *1I58N0TJ1 SUNTRUST DEBIT PURCHASE,AMAZON
1846,02-20-22 SEATTLE WA XXXX AMAZON.COM*1IXXXXAV1 ...,AMAZON *1I AV1 SUNTRUST DEBIT PURCHASE,AMAZON
...,...,...,...
528725,www.Playgr* Bidiboo.Co,.PLAYGR* BIDIBOO.CO,.PLAYGR
528726,www.Playgr* Littlemiss,.PLAYGR* LITTLEMISS,.PLAYGR
528732,www.Styles* Luv N Hair,.STYLES* LUV N HAIR,.STYLES
528733,www.Stylese* West Brim,.STYLESE* BRIM,.STYLESE


In [21]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

496717                     [.GBPRIMEP, GBP, GOOD L BANGKOK]
9219                       [ADOBE , XXX-XXX- ADOBE.LY ENUS]
39265                           [AMAZON ,  E3LH3 AMZN BILL]
404368                           [AMAZON ,  H9 AMZN BILLWA]
510938                              [AMAZON , 0N4WC WACARD]
64258                   [AMAZON , 1K4ZD3XE2 AM AMZN BILLWA]
35690                                  [AMAZON , 2C8VX39R2]
35819                        [AMAZON , 2F2PE0DW3 AMZN BILL]
35846                                  [AMAZON , 2G4CJ8D80]
15069                           [AMAZON , 2X0A US PURCHASE]
380880                       [AMAZON , 3Q2I67N AMZN BILL S]
32083                            [AMAZON , 576QN AMZN BILL]
65424                   [AMAZON , 642AK6TE3 AM AMZN BILLWA]
36504                      [AMAZON , 7L85S8S53 AMZN BILLWA]
15483                                  [AMAZON , 936OR9RU3]
517849                             [AMAZON , AZ7 AMZN BILL]
264897                       [AMAZON , D

# Phase 2: Extract & Analyze N-Grams

In [22]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [23]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [24]:
%%time
corpus = df['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=100)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=100)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=100)
print(f"--- N-gram Analysis Complete ---")

Analyzing 528766 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 9.97 s, sys: 8.25 ms, total: 9.98 s
Wall time: 9.98 s


In [25]:
top_1grams
ngrams_1 = []
for ngram, value in top_1grams:
    if ngram.upper() not in NOISE_WORDS:
        ngrams_1 += [ngram]

In [37]:
top_1grams

[('xxx', 34685),
 ('amazon', 32484),
 ('amzn', 22493),
 ('cash', 21287),
 ('app', 19346),
 ('mktp', 18994),
 ('mart', 14572),
 ('wal', 13290),
 ('pos', 13087),
 ('doordash', 12164),
 ('purchase', 10634),
 ('mcdonald', 8857),
 ('market', 8845),
 ('debit', 8140),
 ('dollar', 8070),
 ('apple', 8000),
 ('google', 6644),
 ('food', 6249),
 ('target', 6034),
 ('wm', 5818),
 ('pizza', 5157),
 ('taco', 4719),
 ('city', 4689),
 ('prime', 4169),
 ('cafe', 3903),
 ('starbucks', 3855),
 ('king', 3853),
 ('burger', 3499),
 ('mobile', 3455),
 ('el', 3400),
 ('shop', 3304),
 ('circle', 3293),
 ('superc', 3280),
 ('publix', 3279),
 ('00', 3277),
 ('general', 3180),
 ('new', 3142),
 ('liquor', 3100),
 ('uber', 3059),
 ('afterpay', 3000),
 ('sup', 2895),
 ('bar', 2809),
 ('ebay', 2731),
 ('vending', 2700),
 ('bell', 2659),
 ('express', 2618),
 ('subway', 2594),
 ('15', 2591),
 ('troy', 2574),
 ('chick', 2569),
 ('valley', 2563),
 ('kroger', 2546),
 ('20', 2536),
 ('supercenter', 2441),
 ('house', 2375),


In [27]:
ngrams_1

['xxx',
 'amazon',
 'amzn',
 'cash',
 'app',
 'mktp',
 'mart',
 'wal',
 'pos',
 'doordash',
 'purchase',
 'mcdonald',
 'market',
 'debit',
 'dollar',
 'apple',
 'google',
 'food',
 'target',
 'wm',
 'pizza',
 'taco',
 'city',
 'prime',
 'cafe',
 'starbucks',
 'king',
 'burger',
 'mobile',
 'el',
 'shop',
 'circle',
 'superc',
 'publix',
 '00',
 'general',
 'new',
 'liquor',
 'uber',
 'afterpay',
 'sup',
 'bar',
 'ebay',
 'vending',
 'bell',
 'express',
 'subway',
 '15',
 'troy',
 'chick',
 'valley',
 'kroger',
 '20',
 'supercenter',
 'house',
 'dunkin',
 '365',
 'little',
 'home',
 'grill',
 'family',
 '16',
 'depot',
 'tr',
 'usps',
 'coffee',
 'brigit',
 'mexican',
 '10',
 'wine',
 'instacart',
 'club',
 'walmart',
 'frys',
 'ross',
 'microsoft',
 'aldi',
 'costco',
 'caesars',
 'foods',
 'sams',
 'kfc',
 'smoke',
 'fresh',
 'center',
 'sonic',
 'point',
 'lyft',
 'wendy',
 'com',
 'safeway',
 'etsy',
 'queen',
 'shoprite',
 'big',
 'port',
 'samsclub',
 'helppay',
 'liquors',
 'non'

In [38]:
ngrams_1

['xxx',
 'amazon',
 'amzn',
 'cash',
 'app',
 'mktp',
 'mart',
 'wal',
 'pos',
 'doordash',
 'purchase',
 'mcdonald',
 'market',
 'debit',
 'dollar',
 'apple',
 'google',
 'food',
 'target',
 'wm',
 'pizza',
 'taco',
 'city',
 'prime',
 'cafe',
 'starbucks',
 'king',
 'burger',
 'mobile',
 'el',
 'shop',
 'circle',
 'superc',
 'publix',
 '00',
 'general',
 'new',
 'liquor',
 'uber',
 'afterpay',
 'sup',
 'bar',
 'ebay',
 'vending',
 'bell',
 'express',
 'subway',
 '15',
 'troy',
 'chick',
 'valley',
 'kroger',
 '20',
 'supercenter',
 'house',
 'dunkin',
 '365',
 'little',
 'home',
 'grill',
 'family',
 '16',
 'depot',
 'tr',
 'usps',
 'coffee',
 'brigit',
 'mexican',
 '10',
 'wine',
 'instacart',
 'club',
 'walmart',
 'frys',
 'ross',
 'microsoft',
 'aldi',
 'costco',
 'caesars',
 'foods',
 'sams',
 'kfc',
 'smoke',
 'fresh',
 'center',
 'sonic',
 'point',
 'lyft',
 'wendy',
 'com',
 'safeway',
 'etsy',
 'queen',
 'shoprite',
 'big',
 'port',
 'samsclub',
 'helppay',
 'liquors',
 'non'

In [28]:
top_2grams

[('amzn mktp', 18946),
 ('cash app', 18686),
 ('xxx xxx', 13278),
 ('wal mart', 11511),
 ('amazon prime', 3851),
 ('wm superc', 3278),
 ('superc wal', 3277),
 ('dollar general', 3050),
 ('mobile purchase', 2991),
 ('mart sup', 2692),
 ('wm supercenter', 2408),
 ('taco bell', 2408),
 ('365 market', 2205),
 ('debit purchase', 1966),
 ('dollar tr', 1835),
 ('burger king', 1828),
 ('little caesars', 1529),
 ('home depot', 1482),
 ('family dollar', 1306),
 ('sams club', 1092),
 ('market 432', 1031),
 ('hunt valley', 999),
 ('uber eats', 947),
 ('food lion', 946),
 ('wal wal', 880),
 ('dairy queen', 791),
 ('jack box', 774),
 ('dollar tree', 770),
 ('winco foods', 733),
 ('carls jr', 728),
 ('purchase sign', 725),
 ('sign based', 725),
 ('del taco', 694),
 ('panda express', 678),
 ('nayax vending', 669),
 ('food mart', 667),
 ('dollar ge', 663),
 ('apple xxx', 663),
 ('snack soda', 658),
 ('bath body', 651),
 ('20 00', 644),
 ('cash 20', 641),
 ('market work', 585),
 ('soda vending', 568),
 

In [29]:
top_3grams

[('wm superc wal', 3277),
 ('superc wal mart', 2766),
 ('wal mart sup', 2692),
 ('365 market 432', 1031),
 ('wal wal mart', 875),
 ('mobile purchase sign', 725),
 ('purchase sign based', 725),
 ('apple xxx xxx', 663),
 ('cash 20 00', 640),
 ('market work renton', 566),
 ('snack soda vending', 557),
 ('market xxx xxx', 507),
 ('dollar ge dg', 502),
 ('365 market xxx', 497),
 ('xxx xxx troy', 492),
 ('365 market 43', 469),
 ('market 43 troy', 466),
 ('market 432 32', 462),
 ('432 32 troy', 455),
 ('bath body works', 450),
 ('market 432 329', 441),
 ('432 329 troy', 439),
 ('cash 40 00', 400),
 ('domino xxx xxx', 380),
 ('klover app boost', 363),
 ('merchant issued target', 363),
 ('fresh market coffe', 339),
 ('market coffe waxahachie', 339),
 ('point sale debitl340', 335),
 ('xxx xxx 15', 316),
 ('help hbomax httpshbomax', 311),
 ('otha kimbrough pos', 280),
 ('routs farmers market', 270),
 ('float corp payments', 263),
 ('network xxx xxx', 261),
 ('aystation network xxx', 252),
 ('debi

In [30]:
# Use 1 grams to find prefixes