# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main 6e6896a] regex
 1 file changed, 9502 insertions(+), 9050 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in#.sample(100_000).copy()
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     AFTERPAY 08-13 185-XXXXXXXX CA XXXX DEBIT CARD...
1     Amazon Prime*1UXXXXX Amzn.com/bill WA        0...
2                     CACTUS CARLOS LLC Sedona AZ 08/21
3     CHECKCARD XXXX GOOGLE *FI WM498R g.co/helppay#...
4     CHECKCARD XXXX H-E-B GAS/CARWASH KATY TX XXXXX...
5         CHECKCARD XXXX WM SUPERCENTER FAYETTEVILLE,NC
6                                   Chief's Bbq & Grill
7     DBT Purchase CASH APP*KANAKA |CASH APP*KANAKA ...
8     DD'S DISCOUNTS 09/14 #XXXXXXXXX PURCHASE DD'S ...
9     DISHACoatesville SABO                        0...
10    Direct Debit: Home River Group, Web Pmts , 851...
11    HARBOR FREIGHT TOOLS 3 ORANGE CITY FL        0...
12                                        King CA 08/28
13          PARKCOLUMBUS.COM MOB parkmobilecom OH 10/10
14    POS Debit - Visa Check Card XXXX - TST* E.A.T....
15    PURCHASE AUTHORIZED ON 06/11 CASH APP*KYLE XXX...
16    PURCHASE AUTHORIZED ON 08/29 WAL-MART Wal-Mart...
17    PURCHASE AUTHORIZED ON 09/24 VALLEY HOUSE 

## 2. Define Regex Rules

In [6]:
STATE_LIST = [ 
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO) 
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER) 
    "KS", "KY", 
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA 
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA) 
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY" 
] 
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
CITY_LIST = [
    "SAN", "MIAMI", "DIEGO", "PHOENIX", "SEATTLE", "HOUSTON", "SANTA", "ORLANDO", 
    "CHICAGO", "LAS", "ATLANTA", "LOS", "MESA", "VEGAS", "CHARLOTTE", "TAMPA", 
    "GREENVILLE", "BROOKLYN", "DENVER", "ANGELES", "ANTONIO", "MEMPHIS", "YORK",
    "RICHMOND", "BEACH", "PALM", "FORT", "ST", "LAKE", "WEST", "DES", "PARK",
    "HILL", "NORTH", "SPRING", "CREEK", "SAINT", "RIVER", "SOUTH", "MYERS",
    # Added from n-grams
    "CITY", "NEW", "TROY", "VALLEY", "PORT"
]
CITY_REGEX = r"\b(" + "|".join(CITY_LIST) + r")\b"

In [8]:
# From 1-gram
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "CRD", "PUR", "PIN", "SIG", "LLC", "SIGNATURE", "WEB", "PAYMENT",
    "ACH", "DEB", "INTL", "RECURRING", "DIGIT", "ONLINE", "PROTECTION", "VSA",
    "800", "888", "WITHDRAWAL", "INDN", "STORE", "STORES", "RESTAURANT", "SUPER",
    "BEAUTY", "DELI", "MART", "CENTER", "SUPERC",
    "GROCERY", "NAILS", "STOP", "BUSINESS", "PARKING", "PET", "GARDEN", "FIL",
    "POS", "PURCHASE", "DEBIT", "MARKET", "FOOD", "VENDING", 
    "POINT", "BIG", "NON", "TR", "SUP", "CENTER"
    # 'XXX'
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [9]:
REGEX_PRE = [ 
    # === 0) Normalize spaces first === 
    (r"\u00A0", " "), # Replace non-breaking space with regular space 
    (r"\s{2,}", " "), # Collapse multiple spaces into one 

    # === 1) “Authorized / Recurring” headers === 
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "), 
    (r"\bRECURRING\s+PYMT\b", " "), 

    # === 2) Card & mask boilerplate === 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "), 
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "), 
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "), 
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "), 
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "), 
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "), 
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "), 
    (r"\bF[XF]{4,}\b", " "), # Handle FXXXXX, FXXXX
    (r"\bXXX\b", " "), # Handle masked number from n-grams
    (r"X{4,}", " "), # Remove generic masked numbers 
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "), 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords 
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start 
    (r"^\s*REC\s+POS\b", " "), 
    (r"^\s*RECURRING\b", " "), 

    # === 2.5) Prefix Normalization === 
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM" 
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR 

    # === 3) State + mask tails === 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "), 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "), 

    # === 4) Dates/times === 
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025 
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM 
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM 

    # === 4.5) Noise numbers (from n-grams) ===
    (r"\b(?:00|10|15|16|20|365)\b", " "),

    # === 5) Merchant-terminal boilerplate === 
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "), 
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "), 
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "), 
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID... 
     
    # === 6) Misc tails === 
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "), 
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "), 
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "), 
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "), 
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "), 
    (r"\bBILL\s+PAYMENT\b", " "), 
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "), 
    (r"-\s*MEMO=", " "), 
    (r"(?:USA|US)$", " "), # Remove USA or US at the end 
    (r"\s+FSP$", " "), 
    (r"\bL\d{3}\b", " "), # Handle L340

    # === 7) Phone numbers === 
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212 
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212 
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212 
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "), 

    # === 8) URLs/domains === 
    (r"^\.COM\s+BILL\b.*", " "), 
    (r"\.COM\b", " "), # Remove .COM anywhere
    (r"\s+COM$", " "), # Remove .COM at end of string

    # === 9) State abbreviations === 
    (STATE_REGEX, " "), # Remove standalone state codes 
    
    # === 9.5) City abbreviations ===
    (CITY_REGEX, " "), # Remove standalone city names
    
    # === 10) Noise Words (from 1-grams) ===
    (NOISE_WORDS_REGEX, " "),

    # === 11) Final Tidy (Punctuation) === 
    (r"[|%_=;\\/]+", " "), # Remove misc separators 
    (r"[-]{2,}", " "), # Collapse multiple hyphens
    (r"^\s*-\s*|\s*-\s*$", " "), # Remove leading/trailing hyphens
]

In [10]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADDY
    re.compile(r".*(GODADDY\.COM|GODADDY)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # === NEW PAYPAL RULES ===
    # Handle 'PAYPAL *... MERCHANT' (from INST XFER)
    re.compile(r"^PAYPAL\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    # Handle 'PAYPAL : MERCHANT' (from INST XFER / ID:)
    re.compile(r"^PAYPAL\s*:\s*([A-Z\s0-9'.-]+).*"),

    # === NEW: High-Frequency Full Merchant Names (from 1-grams) ===
    re.compile(r"^(AFTERPAY)\b.*"),
    re.compile(r"^(ALDI)\b.*"),
    re.compile(r"^(AMZN(?:\s*MKTP)?)\b.*"), # Handles amzn, amzn mktp
    re.compile(r"^(AMAZON(?:\.COM|\s+PRIME)?)\b.*"), # Handles amazon, amazon prime
    re.compile(r"^(APPLE(?:\.COM)?)\b.*"),
    re.compile(r"^(BRIGIT)\b.*"),
    re.compile(r"^(BURGER\s+KING)\b.*"),
    re.compile(r"^(CASH\s+APP)\b.*"),
    re.compile(r"^(CHICK-FIL-A)\b.*"),
    re.compile(r"^(CHIPOTLE)\b.*"),
    re.compile(r"^(CIRCLE\s+K)\b.*"),
    re.compile(r"^(COSTCO)\b.*"),
    re.compile(r"^(DAIRY\s+QUEEN)\b.*"),
    re.compile(r"^(DOLLAR\s+GENERAL)\b.*"),
    re.compile(r"^(DOLLAR\s+TREE)\b.*"),
    re.compile(r"^(DOORDASH)\b.*"),
    re.compile(r"^(DUNKIN)\b.*"),
    re.compile(r"^(EBAY)\b.*"), # Added from n-grams
    re.compile(r"^(ETSY)\b.*"),
    re.compile(r"^(FAMILY\s+DOLLAR)\b.*"),
    re.compile(r"^(FOOD\s+LION)\b.*"),
    re.compile(r"^(FRYS)\b.*"),
    re.compile(r"^(GOOGLE)\b.*"),
    re.compile(r"^(HELPPAY)\b.*"),
    re.compile(r"^(HOME\s+DEPOT)\b.*"),
    re.compile(r"^(INSTACART)\b.*"),
    re.compile(r"^(KFC)\b.*"),
    re.compile(r"^(KROGER)\b.*"),
    re.compile(r"^(LITTLE\s+CAESARS)\b.*"),
    re.compile(r"^(LOWE'?S)\b.*"),
    re.compile(r"^(LYFT)\b.*"),
    re.compile(r"^(MCDONALD'?S)\b.*"),
    re.compile(r"^(MICROSOFT)\b.*"),
    re.compile(r"^(PUBLIX)\b.*"),
    re.compile(r"^(ROSS)\b.*"),
    re.compile(r"^(SAFEWAY)\b.*"),
    re.compile(r"^(SAMS\s*CLUB|SAMSCLUB)\b.*"), # Updated from n-grams
    re.compile(r"^(SHOPRITE)\b.*"),
    re.compile(r"^(SONIC)\b.*"),
    re.compile(r"^(STARBUCKS)\b.*"),
    re.compile(r"^(SUBWAY)\b.*"),
    re.compile(r"^(TACO\s+BELL)\b.*"),
    re.compile(r"^(TARGET)\b.*"),
    re.compile(r"^(UBER(?:\s+EATS)?)\b.*"),
    re.compile(r"^(USPS)\b.*"),
    re.compile(r"^(VONS)\b.*"),
    # Updated from n-grams (wal, mart, wm, superc, supercenter)
    re.compile(r"^(WALMART(?:\s*SUPERCENTER|\s*SUPER\s*C)?|WM\s*SUPERCENTER|WAL-MART|WAL\s*MART)\b.*"),
    re.compile(r"^(WENDY'?S)\b.*"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s09'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === 7-ELEVEN RULES ===
    re.compile(r"^(7(?:-ELEVEN|\s+11))\s*\*?#?.*"),
    
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # === UPDATED RULE ORDER ===
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # NEW: Rule for HOME* AHS.COM pattern
    re.compile(r"^HOME\s*\*?\s*([A-Z\s0-9'.-]+?)(?:\s+[A-Z]{2,})?.*"),
    
    # === TST RULES ===
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s09'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s09'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # Generic PAYPAL rule, catches PAYPAL*MERCHANT
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"), 

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0EXAMPLE'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [11]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)

memos = memos.str.replace(NOISE_WORDS_REGEX, " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 26.4 s, sys: 169 ms, total: 26.6 s
Wall time: 26.6 s


In [12]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            # Use the last non-null group if multiple are present
            # (handles the GODADDY rule)
            groups = match.groups()
            if groups:
                return groups[-1].strip()
    return memo

df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 4.65 s, sys: 24.2 ms, total: 4.68 s
Wall time: 4.67 s


In [13]:
df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [14]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [35]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
1145,#XXXXXX KROGER # COLONIAL AVE ROANOKE VA,# KROGER # COLONIAL AVE ROANOKE,# KROGER # COLONIAL AVE ROANOKE
142714,Debit Card Purchase 08/16 04:26p #XXXX FRIDA R...,04:26P # FRIDA,04:26P # FRIDA
98504,CKCD DEBIT XXXX 01/31 20:13 AFTERPAY XXX-XXXXX...,:13 AFTERPAY,:13 AFTERPAY
13055,AMAZON MUSIC*2DXXXXCM2 888-8 02-XXXX WAXXX...,AMAZON MUSIC*2D CM2 -8 02,AMAZON
271577,PURCHASE AUTHORIZED ON 02/03 CASH APP*MONICA H...,CASH APP*MONICA HA S,CASH APP
126998,DEBIT CARD PURCHASE DD DOORDASH SHAKESHAC,DD DOORDASH SHAKESHAC,DOORDASH SHAKESHAC
333974,PURCHASE AUTHORIZED ON 07/17 IC* INSTACART*XXX...,IC* INSTACART* HTTPSINSTACAR S,INSTACART
114422,DBT CRD XXXX 02/24/22 XXXXXXXX PRODIGY SOLUTIO...,PRODIGY SOLUTIONS INC 866- #,PRODIGY SOLUTIONS INC 866-
464003,THE FRESH MARKE XXXX N. FEDERAL HIGH FORT LAUDERD,THE FRESH MARKE N. FEDERAL HIGH LAUDERD,THE FRESH MARKE N. FEDERAL HIGH LAUDERD
374078,PURCHASE AUTHORIZED ON 10/27 WENDY'S #XXXX MIA...,WENDY'S # S,WENDY'S


In [17]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [36]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

MCDONALD'S
APPLE
TARGET
STARBUCKS
AMZN
EBAY
SUBWAY
USPS
BRIGIT
ROSS
COSTCO
KFC
ETSY
SHOPRITE
LION
CHIPOTLE
A
432-329
MCDONALDS
EXPRESS
GODADDY
AMZNFREETIME
SALLY
CCBILL
CHEWY
PETCO
WINN-DIXIE
GOFAN
SHIPT
BASKIN
RALPHS
SLICE
BETMGM
DOMINO'S
IHOP
DOLLAR
DILLONS
CRYPTO
P
INTUIT
STEAK-N-SHAKE
TOLLWAY-AUTOREPLEN
DEPOT
POPEYES
STATERBRO
NORTHGATE
VANS
S
MGM
SKILLZ
PRICELN
GAMESTOP
43
FRG
HUNT
.KOHLS
BANFIELD-
D
RIOT
VERIZONWRLSS
OCULUS
SMITHS
MCW
SAVEMART
BELK
RAINBOW
MARINA
FOODMAXX
MEIJER
ABC
CANVA
GNC
WALGREENS
BASHAS''
SHEIN
C
STAPLES
CLAIRE'S
L
BELL''S
SEZZLE
FOOD4LESS
QFC
*EBAY
AMZ
SEI
FIV
NYTIMES
GOODWILL
V
CHECKERS
UPS
TILLYS
T
CLEO
IBI
SHOPIFY
OTT
ETT
POTBELLY
DROPBOX
BLUESKY
SOUTHWES
*UBER
EL
JOURNEYS
WEGMANS
PARKMOBILE
JACK'S
ABCMOUSE
EZPASS
QUADPAY
ROSES
LUCKY
EVI
AYSTATIONNETWORK
LIQUOR
UNITED
RVT
ENMARKET
REI
CRT
NORTON
FBPAY
EA
GERALD
HLLFRSH
ZULILY
NORDSTROM
*STEAM
PARADISE-MOOREMOORESVILLE
DRI
LJS
OPC
TLG
FRED-MEYER
*MICROSOFT
STEAM
NIKE
ZTL
PACSUN
SEDANOS
FH
EVERYPLATE
TLF


In [19]:
df[df['memo_post'] == '']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
3820,365 MARKET,,
3821,365 MARKET 888,,
4048,365 MARKET TROY MI,,
4049,365 MARKET XXX XXX-XXXX,,
4050,365 MARKET XXX XXX-XXXX TROY MI,,
...,...,...,...
523840,XXXXXXXXXXX XXXXXX,,
523860,XXXXXXXXXXXX,,
524051,XXXXXXXXXXXXX,,
524215,XXXXXXXXXXXXXXX,,


In [20]:
df[df['memo_post'] == '']

Unnamed: 0,memo,memo_pre,memo_post
3820,365 MARKET,,
3821,365 MARKET 888,,
4048,365 MARKET TROY MI,,
4049,365 MARKET XXX XXX-XXXX,,
4050,365 MARKET XXX XXX-XXXX TROY MI,,
...,...,...,...
523840,XXXXXXXXXXX XXXXXX,,
523860,XXXXXXXXXXXX,,
524051,XXXXXXXXXXXXX,,
524215,XXXXXXXXXXXXXXX,,


In [21]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
1842,02-20-22 SEATTLE WA XXXX AMAZON.COM*1B8II62G0 ...,AMAZON *1B8II62G0 SUNTRUST,AMAZON
1843,02-20-22 SEATTLE WA XXXX AMAZON.COM*1B9CD29N0 ...,AMAZON *1B9CD29N0 SUNTRUST,AMAZON
1844,02-20-22 SEATTLE WA XXXX AMAZON.COM*1I32Q2NG1 ...,AMAZON *1I32Q2NG1 SUNTRUST,AMAZON
1845,02-20-22 SEATTLE WA XXXX AMAZON.COM*1I58N0TJ1 ...,AMAZON *1I58N0TJ1 SUNTRUST,AMAZON
1846,02-20-22 SEATTLE WA XXXX AMAZON.COM*1IXXXXAV1 ...,AMAZON *1I AV1 SUNTRUST,AMAZON
...,...,...,...
528725,www.Playgr* Bidiboo.Co,.PLAYGR* BIDIBOO.CO,.PLAYGR
528726,www.Playgr* Littlemiss,.PLAYGR* LITTLEMISS,.PLAYGR
528732,www.Styles* Luv N Hair,.STYLES* LUV N HAIR,.STYLES
528733,www.Stylese* West Brim,.STYLESE* BRIM,.STYLESE


In [22]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

7973                     [AB ,  OSCKLAMALL INTERNET]
422685                    [ADOBE ,  ADOBE.LY ENUS S]
71792                  [AMAZON , 0R9JE8 AMZN BILLWA]
14328                                [AMAZON , 1 C0]
64047                  [AMAZON , 138RM7 AMZN BILLWA]
362862                  [AMAZON , 147C8 AMZN BILL S]
296026                [AMAZON , 1H1AV57 AMZN BILL S]
35216           [AMAZON , 1U8SR1F80 AMZN BILL : :17]
35305                           [AMAZON , 1W6QN3Y22]
14312                           [AMAZON , 1X1RI3NI0]
14620                           [AMAZON , 2C16M1EE0]
35642                      [AMAZON , 2C1Q AMZN BILL]
363828                [AMAZON , 2C73W69 AMZN BILL S]
65300            [AMAZON , 331LV0JE3 AM AMZN BILLWA]
501137                [AMAZON , 3Q68H4NH3 US 25 # #]
228608                     [AMAZON , 4G8WX3R AMZN B]
36536                 [AMAZON , 861AT2KL3 AMZN BILL]
414156                  [AMAZON , BV8H64R AMZN BILL]
392947                [AMAZON , C20PT94 AMZN B

# Phase 2: Extract & Analyze N-Grams

In [23]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [24]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [25]:
%%time
corpus = df['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=100)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=100)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=100)
print(f"--- N-gram Analysis Complete ---")

Analyzing 528766 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 9.64 s, sys: 4.26 ms, total: 9.64 s
Wall time: 9.64 s


In [26]:
top_1grams
ngrams_1 = []
for ngram, value in top_1grams:
    if ngram.upper() not in NOISE_WORDS:
        ngrams_1 += [ngram]

In [27]:
top_1grams

[('amazon', 32494),
 ('amzn', 22483),
 ('cash', 21139),
 ('app', 19351),
 ('mktp', 18994),
 ('mart', 14566),
 ('wal', 13232),
 ('doordash', 12164),
 ('mcdonald', 8859),
 ('dollar', 8070),
 ('apple', 7942),
 ('google', 6652),
 ('target', 5990),
 ('wm', 5834),
 ('pizza', 5166),
 ('taco', 4720),
 ('prime', 4169),
 ('cafe', 3909),
 ('king', 3858),
 ('starbucks', 3855),
 ('burger', 3505),
 ('mobile', 3464),
 ('el', 3408),
 ('shop', 3309),
 ('circle', 3301),
 ('superc', 3285),
 ('publix', 3279),
 ('general', 3181),
 ('liquor', 3079),
 ('uber', 3034),
 ('bar', 3031),
 ('afterpay', 2997),
 ('ebay', 2737),
 ('bell', 2659),
 ('express', 2620),
 ('subway', 2597),
 ('chick', 2569),
 ('kroger', 2546),
 ('supercenter', 2457),
 ('house', 2393),
 ('dunkin', 2319),
 ('little', 2254),
 ('home', 2246),
 ('grill', 2237),
 ('family', 2206),
 ('depot', 1999),
 ('usps', 1907),
 ('coffee', 1888),
 ('mexican', 1810),
 ('instacart', 1804),
 ('brigit', 1801),
 ('wine', 1775),
 ('club', 1755),
 ('walmart', 1739),

In [28]:
ngrams_1

['amazon',
 'amzn',
 'cash',
 'app',
 'mktp',
 'mart',
 'wal',
 'doordash',
 'mcdonald',
 'dollar',
 'apple',
 'google',
 'target',
 'wm',
 'pizza',
 'taco',
 'prime',
 'cafe',
 'king',
 'starbucks',
 'burger',
 'mobile',
 'el',
 'shop',
 'circle',
 'superc',
 'publix',
 'general',
 'liquor',
 'uber',
 'bar',
 'afterpay',
 'ebay',
 'bell',
 'express',
 'subway',
 'chick',
 'kroger',
 'supercenter',
 'house',
 'dunkin',
 'little',
 'home',
 'grill',
 'family',
 'depot',
 'usps',
 'coffee',
 'mexican',
 'instacart',
 'brigit',
 'wine',
 'club',
 'walmart',
 'frys',
 'ross',
 'aldi',
 'microsoft',
 'costco',
 'foods',
 'caesars',
 'kfc',
 'sams',
 'fresh',
 'smoke',
 'sonic',
 'lyft',
 'wendy',
 'safeway',
 'etsy',
 'queen',
 'shoprite',
 'helppay',
 'samsclub',
 'com',
 'jacksonville',
 'liquors',
 'del',
 'hunt',
 'chicken',
 'lowe',
 'nnt',
 '432',
 'spa',
 'street',
 'vons',
 'dairy',
 'lion',
 'tree',
 'eats',
 'sushi',
 'sports',
 'arbys',
 'jack',
 'box',
 'cmsvend',
 'papa',
 'can

In [29]:
ngrams_1

['amazon',
 'amzn',
 'cash',
 'app',
 'mktp',
 'mart',
 'wal',
 'doordash',
 'mcdonald',
 'dollar',
 'apple',
 'google',
 'target',
 'wm',
 'pizza',
 'taco',
 'prime',
 'cafe',
 'king',
 'starbucks',
 'burger',
 'mobile',
 'el',
 'shop',
 'circle',
 'superc',
 'publix',
 'general',
 'liquor',
 'uber',
 'bar',
 'afterpay',
 'ebay',
 'bell',
 'express',
 'subway',
 'chick',
 'kroger',
 'supercenter',
 'house',
 'dunkin',
 'little',
 'home',
 'grill',
 'family',
 'depot',
 'usps',
 'coffee',
 'mexican',
 'instacart',
 'brigit',
 'wine',
 'club',
 'walmart',
 'frys',
 'ross',
 'aldi',
 'microsoft',
 'costco',
 'foods',
 'caesars',
 'kfc',
 'sams',
 'fresh',
 'smoke',
 'sonic',
 'lyft',
 'wendy',
 'safeway',
 'etsy',
 'queen',
 'shoprite',
 'helppay',
 'samsclub',
 'com',
 'jacksonville',
 'liquors',
 'del',
 'hunt',
 'chicken',
 'lowe',
 'nnt',
 '432',
 'spa',
 'street',
 'vons',
 'dairy',
 'lion',
 'tree',
 'eats',
 'sushi',
 'sports',
 'arbys',
 'jack',
 'box',
 'cmsvend',
 'papa',
 'can

In [30]:
top_2grams

[('amzn mktp', 18946),
 ('cash app', 18691),
 ('wal mart', 11491),
 ('amazon prime', 3851),
 ('wm superc', 3278),
 ('superc wal', 3277),
 ('dollar general', 3050),
 ('wm supercenter', 2424),
 ('taco bell', 2408),
 ('burger king', 1830),
 ('little caesars', 1529),
 ('home depot', 1482),
 ('family dollar', 1306),
 ('sams club', 1093),
 ('uber eats', 947),
 ('wal wal', 880),
 ('dairy queen', 792),
 ('jack box', 774),
 ('dollar tree', 770),
 ('winco foods', 733),
 ('carls jr', 728),
 ('mobile sign', 725),
 ('sign based', 725),
 ('del taco', 694),
 ('panda express', 678),
 ('dollar ge', 663),
 ('snack soda', 658),
 ('bath body', 652),
 ('work renton', 566),
 ('king soopers', 557),
 ('smoke shop', 556),
 ('panera bread', 533),
 ('routs farmers', 523),
 ('king soop', 521),
 ('ge dg', 505),
 ('waffle house', 482),
 ('quick chek', 478),
 ('pizza hut', 475),
 ('432 32', 462),
 ('body works', 453),
 ('432 329', 441),
 ('papa john', 427),
 ('total wine', 412),
 ('salt cit', 402),
 ('cash 40', 401)

In [38]:
ngrams_2 = [ngram for ngram, count in top_2grams]
ngrams_2

['amzn mktp',
 'cash app',
 'wal mart',
 'amazon prime',
 'wm superc',
 'superc wal',
 'dollar general',
 'wm supercenter',
 'taco bell',
 'burger king',
 'little caesars',
 'home depot',
 'family dollar',
 'sams club',
 'uber eats',
 'wal wal',
 'dairy queen',
 'jack box',
 'dollar tree',
 'winco foods',
 'carls jr',
 'mobile sign',
 'sign based',
 'del taco',
 'panda express',
 'dollar ge',
 'snack soda',
 'bath body',
 'work renton',
 'king soopers',
 'smoke shop',
 'panera bread',
 'routs farmers',
 'king soop',
 'ge dg',
 'waffle house',
 'quick chek',
 'pizza hut',
 '432 32',
 'body works',
 '432 329',
 'papa john',
 'total wine',
 'salt cit',
 'cash 40',
 'ingles markets',
 'coca cola',
 'klover app',
 'smart final',
 'harbor freight',
 'help hbomax',
 'raising cane',
 'merchant issued',
 'app boost',
 'issued target',
 'stewarts shop',
 'aystation network',
 'cvs pharmacy',
 'winn dixie',
 'trader joe',
 'buc ee',
 'fresh coffe',
 'coffe waxahachie',
 'exchg rte',
 'sale debitl

In [31]:
top_3grams

[('wm superc wal', 3277),
 ('superc wal mart', 2766),
 ('wal wal mart', 875),
 ('mobile sign based', 725),
 ('dollar ge dg', 502),
 ('bath body works', 451),
 ('klover app boost', 363),
 ('merchant issued target', 363),
 ('fresh coffe waxahachie', 339),
 ('help hbomax httpshbomax', 312),
 ('float corp payments', 263),
 ('mx nu peso', 223),
 ('ref auth purchdate', 215),
 ('extra daily spend', 209),
 ('disney plus burbank', 207),
 ('sports bar jacksonville', 203),
 ('desc entry descr', 193),
 ('trace eed ind', 193),
 ('orig desc entry', 191),
 ('fred meye fred', 188),
 ('bcn mx nu', 184),
 ('rappi restaurantes bogot', 183),
 ('restaurantes bogot col', 183),
 ('harbor freight tools', 180),
 ('mission lane vis', 178),
 ('stewarts shop 329', 176),
 ('danfoss cafe ames', 176),
 ('nayax 24 hunt', 169),
 ('nayax 14 hunt', 165),
 ('wal mart wal', 163),
 ('lane vis mission', 162),
 ('tijuana bcn mx', 158),
 ('routs farmers mkt', 157),
 ('beneva tobacco sarasota', 154),
 ('pizza hut https', 149),

In [32]:
# Use 1 grams to find prefixes