# Phase 1: Preprocessing with Regular Expressions

In [1]:
# import subprocess
# subprocess.run(['git', 'pull'])

In [2]:
# subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
# subprocess.run(['git', 'commit', '-m', 'regex'])

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in.sample(100_000).copy()
row_count = df.size
row_count

100000

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     7-ELEVEN SUNNYVALE CA                        0...
1     7ELEVEN-FCTI 09/30 #XXXXXXXXX WITHDRWL XXXXX T...
2              ACH WEB MICROSOFT ULTIM PAYPAL INST XFER
3                     AMAZON.COM*1J97C3CS3 AMZN.COM/BIL
4     AMAZON.COM*2D1A866C0 SEATTLE WACard XXXX/Withd...
5     ANWAR INC. 07/22 #XXXXXXXXX PURCHASE XXXX S ZA...
6                                          AVAS FLOWERS
7     CASH APP*DIDIER GAR XXXXXXXXXX CA            0...
8     CHECKCARD XXXX BOMBORA BEACHWEAR HAVASU CITY A...
9     DDA PUR XXXXXX STAR DONUTS Pine Grove CA SQ *S...
10    Debit Purchase -visa Card XXXXgoodwill Maplewo...
11    GOODWILL OTTAW 03/06 #XXXXXXXXX PURCHASE 501 W...
12                                                GROVE
13                                     La Ventana Dalla
14                                    MEZE GREEK FUSION
15    POS Debit - Visa Check Card XXXX - AMAZON PRIM...
16    PURCHASE AUTHORIZED ON 01/26 AMZN Mktp US*VV40...
17    PURCHASE AUTHORIZED ON 03/15 AMZN Mktp US*

## 2. Define Regex Rules

In [6]:
STATE_LIST = [ 
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO) 
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER) 
    "KS", "KY", 
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA 
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA) 
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY" 
] 
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
CITY_LIST = [
    "SAN", "MIAMI", "DIEGO", "PHOENIX", "SEATTLE", "HOUSTON", "SANTA", "ORLANDO", 
    "CHICAGO", "LAS", "ATLANTA", "LOS", "MESA", "VEGAS", "CHARLOTTE", "TAMPA", 
    "GREENVILLE", "BROOKLYN", "DENVER", "ANGELES", "ANTONIO", "MEMPHIS", "YORK",
    "RICHMOND", "BEACH", "PALM", "FORT", "ST", "LAKE", "WEST", "DES", "PARK",
    "HILL", "NORTH", "SPRING", "CREEK", "SAINT", "RIVER", "SOUTH", "MYERS",
    # Added from 1-grams
    "CITY", "NEW", "TROY", "VALLEY", "PORT",
    # Added from 2-grams
    "WAXAHACHIE", "EL CAJON", "PASO ROBLES", "BUENA VI", "CHULA VISTA", 
    "BOCA RATON", "PINE PLAINS", "HIGHLANDS RAN", "RENTON", "SALT", "CIT", "BOGOT"
]
CITY_REGEX = r"\b(" + "|".join(CITY_LIST) + r")\b"

In [8]:
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "CRD", "PUR", "PIN", "SIG", "LLC", "SIGNATURE", "WEB", "PAYMENT",
    "ACH", "DEB", "INTL", "RECURRING", "DIGIT", "ONLINE", "PROTECTION", "VSA",
    "800", "888", "WITHDRAWAL", "INDN", "STORE", "STORES", "RESTAURANT", 
    "STOP", "BUSINESS", "PARKING", "FIL",
    # Added from 1-gram
    "POS", "PURCHASE", "DEBIT", "NON", "TR", "SUP",
    # Added from 2-grams
    "HELP", "HTTPSHBOMAX", "HTTPSINSTACAR", 'SUPER', 'MART', 'CENTER'
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [9]:
REGEX_PRE = [ 
    # === 0) Normalize spaces first === 
    (r"\u00A0", " "), # Replace non-breaking space with regular space 
    (r"\s{2,}", " "), # Collapse multiple spaces into one 

    # === 1) “Authorized / Recurring” headers === 
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "), 
    (r"\bRECURRING\s+PYMT\b", " "), 

    # === 2) Card & mask boilerplate === 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "), 
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "), 
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "), 
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "), 
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "), 
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "), 
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "), 
    (r"\bF[XF]{4,}\b", " "), # Handle FXXXXX, FXXXX
    (r"\bXXX\b", " "), # Handle masked number from n-grams
    (r"X{4,}", " "), # Remove generic masked numbers 
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "), 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords 
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start 
    (r"^\s*REC\s+POS\b", " "), 
    (r"^\s*RECURRING\b", " "), 

    # === 2.5) Prefix Normalization === 
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM" 
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR 

    # === 3) State + mask tails === 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "), 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "), 

    # === 4) Dates/times === 
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025 
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM 
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM 

    # === 4.5) Noise numbers (from n-grams) ===
    (r"\b(?:00|10|15|16|20|365)\b", " "),

    # === 5) Merchant-terminal boilerplate === 
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "), 
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "), 
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "), 
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID... 
     
    # === 6) Misc tails === 
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "), 
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "), 
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "), 
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "), 
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "), 
    (r"\bBILL\s+PAYMENT\b", " "), 
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "), 
    (r"-\s*MEMO=", " "), 
    (r"(?:USA|US)$", " "), # Remove USA or US at the end 
    (r"\s+FSP$", " "), 
    (r"\bL\d{3}\b", " "), # Handle L340

    # === 7) Phone numbers === 
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212 
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212 
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212 
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "), 

    # === 8) URLs/domains === 
    (r"^\.COM\s+BILL\b.*", " "), 
    (r"\.COM\b", " "), # Remove .COM anywhere
    (r"\s+COM$", " "), # Remove .COM at end of string

    # === 9) State abbreviations === 
    (STATE_REGEX, " "), # Remove standalone state codes 
    
    # === 9.5) City abbreviations ===
    (CITY_REGEX, " "), # Remove standalone city names
    
    # === 10) Noise Words (from 1-grams) ===
    (NOISE_WORDS_REGEX, " "),

    # === 11) Final Tidy (Punctuation) === 
    (r"[|%_=;\\/]+", " "), # Remove misc separators 
    (r"[-]{2,}", " "), # Collapse multiple hyphens
    (r"^\s*-\s*|\s*-\s*$", " "), # Remove leading/trailing hyphens
]

In [10]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADDY
    re.compile(r".*(GODADDY\.COM|GODADDY)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # === NEW PAYPAL RULES ===
    # Handle 'PAYPAL *... MERCHANT' (from INST XFER)
    re.compile(r"^PAYPAL\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    # Handle 'PAYPAL : MERCHANT' (from INST XFER / ID:)
    re.compile(r"^PAYPAL\s*:\s*([A-Z\s0-9'.-]+).*"),

    # === NEW: High-Frequency Full Merchant Names (from 1-grams) ===
    re.compile(r"^(AFTERPAY)\b.*"),
    re.compile(r"^(ALDI)\b.*"),
    re.compile(r"^(AMZN(?:\s*MKTP)?)\b.*"), # Handles amzn, amzn mktp
    re.compile(r"^(AMAZON(?:\.COM|\s+PRIME)?)\b.*"), # Handles amazon, amazon prime
    re.compile(r"^(APPLE(?:\.COM)?)\b.*"),
    re.compile(r"^(BRIGIT)\b.*"),
    re.compile(r"^(BURGER\s+KING)\b.*"),
    re.compile(r"^(CASH\s+APP)\b.*"),
    re.compile(r"^(CHICK-FIL-A)\b.*"),
    re.compile(r"^(CHIPOTLE)\b.*"),
    re.compile(r"^(CIRCLE\s+K)\b.*"),
    re.compile(r"^(COSTCO)\b.*"),
    re.compile(r"^(DAIRY\s+QUEEN)\b.*"),
    re.compile(r"^(DOLLAR\s+GENERAL)\b.*"),
    re.compile(r"^(DOLLAR\s+TREE)\b.*"),
    re.compile(r"^(DOORDASH)\b.*"),
    re.compile(r"^(DUNKIN)\b.*"),
    re.compile(r"^(EBAY)\b.*"), # Added from n-grams
    re.compile(r"^(ETSY)\b.*"),
    re.compile(r"^(FAMILY\s+DOLLAR)\b.*"),
    re.compile(r"^(FOOD\s+LION)\b.*"),
    re.compile(r"^(FRYS)\b.*"),
    re.compile(r"^(GOOGLE)\b.*"),
    re.compile(r"^(HELPPAY)\b.*"),
    re.compile(r"^(HOME\s+DEPOT)\b.*"),
    re.compile(r"^(INSTACART)\b.*"),
    re.compile(r"^(KFC)\b.*"),
    re.compile(r"^(KROGER)\b.*"),
    re.compile(r"^(LITTLE\s+CAESARS)\b.*"),
    re.compile(r"^(LOWE'?S)\b.*"),
    re.compile(r"^(LYFT)\b.*"),
    re.compile(r"^(MCDONALD'?S)\b.*"),
    re.compile(r"^(MICROSOFT)\b.*"),
    re.compile(r"^(PUBLIX)\b.*"),
    re.compile(r"^(ROSS)\b.*"),
    re.compile(r"^(SAFEWAY)\b.*"),
    re.compile(r"^(SAMS\s*CLUB|SAMSCLUB)\b.*"), # Updated from n-grams
    re.compile(r"^(SHOPRITE)\b.*"),
    re.compile(r"^(SONIC)\b.*"),
    re.compile(r"^(STARBUCKS)\b.*"),
    re.compile(r"^(SUBWAY)\b.*"),
    re.compile(r"^(TACO\s+BELL)\b.*"),
    re.compile(r"^(TARGET)\b.*"),
    re.compile(r"^(UBER(?:\s+EATS)?)\b.*"),
    re.compile(r"^(USPS)\b.*"),
    re.compile(r"^(VONS)\b.*"),
    # Updated from n-grams (wal, mart, wm, superc, supercenter)
    re.compile(r"^(WALMART(?:\s*SUPERCENTER|\s*SUPER\s*C)?|WM\s*SUPERCENTER|WAL-MART|WAL\s*MART)\b.*"),
    re.compile(r"^(WENDY'?S)\b.*"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s09'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === 7-ELEVEN RULES ===
    re.compile(r"^(7(?:-ELEVEN|\s+11))\s*\*?#?.*"),
    
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # === UPDATED RULE ORDER ===
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # NEW: Rule for HOME* AHS.COM pattern
    re.compile(r"^HOME\s*\*?\s*([A-Z\s0-9'.-]+?)(?:\s+[A-Z]{2,})?.*"),
    
    # === TST RULES ===
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s09'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s09'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # Generic PAYPAL rule, catches PAYPAL*MERCHANT
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"), 

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0EXAMPLE'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [11]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)

memos = memos.str.replace(NOISE_WORDS_REGEX, " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 5.36 s, sys: 25.8 ms, total: 5.38 s
Wall time: 5.38 s


In [12]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            # Use the last non-null group if multiple are present
            # (handles the GODADDY rule)
            groups = match.groups()
            if groups:
                return groups[-1].strip()
    return memo

df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 976 ms, sys: 4.28 ms, total: 980 ms
Wall time: 978 ms


In [13]:
df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [14]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [15]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
43461,BEST LIFE NUTRITION gosq.com AZ 06/24,BEST LIFE NUTRITION GOSQ,BEST LIFE NUTRITION GOSQ
151095,Doordash*Miami Grill,DOORDASH* GRILL,DOORDASH
350205,PURCHASE AUTHORIZED ON 08/28 DD DOORDASH RITAS...,DD DOORDASH RITASI - - S,DOORDASH RITASI - - S
448837,SQ *FOOD DOODS LLC WHITTIER CA 0...,SQ *FOOD DOODS WHITTIER,FOOD DOODS WHITTIER
166363,GLACIER WATER VENDIN WINSTON SALEM NC 07/10,GLACIER WATER VENDIN WINSTON SALEM,GLACIER WATER VENDIN WINSTON SALEM
171059,Gmart,GMART,GMART
187625,KROGER CO 369 STOCKBRIDGE GA,KROGER 369 STOCKBRIDGE,KROGER
511609,Withdrawal TARGET.COM * / XXX-XXX-XXXX MN Date...,TARGET * - - 25 #,TARGET
490042,WCA WALNUT CREEK,WCA WALNUT,WCA WALNUT
496534,WT XXXXXX-XXXXXX BANK OF CHINA /BNF=JinHua Fei...,WT - BANK OF CHINA BNF JINHUA FEIRUI VEHICLE ....,WT - BANK OF CHINA BNF JINHUA FEIRUI VEHICLE ....


In [16]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [17]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

MCDONALD'S
APPLE
TARGET
WAL-
STARBUCKS
AMZN
EBAY
SUBWAY
USPS
ROSS
COSTCO
BRIGIT
KFC
ETSY
SHOPRITE
CHIPOTLE
MARKET
A
MCDONALDS
EXPRESS
MARKET@WORK
GODADDY
CCBILL
AMZNFREETIME
BASKIN
PETCO
CHEWY
SLICE
DOMINO'S
SHIPT
SAVE
WINN-DIXIE
TIGER
BETMGM
GOFAN
DOLLAR
FOOD
STEAK-N-SHAKE
RALPHS
RIOT
DILLONS
POPEYES
IHOP
CRYPTO
TOLLWAY-AUTOREPLEN
INTUIT
MGM
PRICELN
VANS
VERIZONWRLSS
CLAIRE'S
.KOHLS
BANFIELD-PET
WALGREENS
MARINA
SAVEMART
STATERBRO
RAINBOW
FOODMAXX
SKILLZ
BASHAS''
FRG
P
NORTON
GAMESTOP
GNC
OCULUS
VENDING
BELK
QUADPAY
FIV
WAL
T
SOUTHWES
SHOPIFY
EVI
D
GERALD
AMZ
QFC
MEIJER
*UBER
SEI
ROSES
E-Z
SHEIN
CHECKERS
C
UNITED
OTT
MCW
EZPASS
SEZZLE
V
REI
L
TLF
CANVA
NORDSTROM
IBI
*EBAY
STAPLES
FH
WEGMANS
JOURNEYS
POTBELLY
RONAN
ANCESTRY
GLOSS
JACK'S
DROPBOX
GOODWILL
ABC
BLUESKY
EL
DRI
SEDANOS
HLLFRSH
NEWSSTAND
LIQUOR
ETT
FRED-MEYER
NIKE
LEGALSHIELD
RUBIO'S
ARBY'S
FOOD4LESS
FAMOUSFOOTWEAR
TILLYS
CLEO
S
OPC
HUDSON
ABCMOUSE
ZTL
STEAM
COSMOPROF
TCB
LJS
BUCKLE
FBPAY
NYTIMES
AYSTATIONNETWORK
TOMMY'S
POSH

In [18]:
print(df[df['memo_post'] == ''].to_string())

                                                                                            memo                                                       memo_pre memo_post
219428         PAYPAL DES:INST XFER ID:GOOGLE GOOGLE_Y INDN:JAVARIS FLOWERS CO ID:PAYPALSI77 WEB         PAYPAL : :GOOGLE GOOGLE Y :JAVARIS FLOWERS :PAYPALSI77          
219476          PAYPAL DES:INST XFER ID:MICROSOFT ULTIM INDN:SHAWN WOODHULL CO ID:PAYPALSI77 WEB          PAYPAL : :MICROSOFT ULTIM :SHAWN WOODHULL :PAYPALSI77          
219430          PAYPAL DES:INST XFER ID:GOOGLE GOOGLE_Y INDN:SHAWN WOODHULL CO ID:PAYPALSI77 WEB          PAYPAL : :GOOGLE GOOGLE Y :SHAWN WOODHULL :PAYPALSI77          
219520                 PAYPAL DES:INST XFER ID:UBER CASH INDN:ALICIA FARNUM CO ID:PAYPALSI77 WEB                 PAYPAL : :UBER CASH :ALICIA FARNUM :PAYPALSI77          
449726                                                        SQ *SODA & S'MO Eagle River AK USA                                                SQ * &

In [19]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
34242,Amazon.com*033SX3G73 Amzn.com/bill WA 07/23,AMAZON *033SX3G73 AMZN BILL,AMAZON
470521,TST* GRAZIANOS MARKET B MIAMI FL 01/20,TST* GRAZIANOS MARKET B,GRAZIANOS MARKET B
143105,Debit Card TST* Frenchys Outpost B Dunedin FL ...,TST* FRENCHYS OUTPOST B DUNEDIN,FRENCHYS OUTPOST B DUNEDIN
136620,DOORDASH*F SAN FRANCISC CA,DOORDASH*F FRANCISC,DOORDASH
481882,VISA - 05/03 DOORDASH*THE BURGER DE WWW.DOORDA...,DOORDASH*THE BURGER .DOORDASH.,DOORDASH
...,...,...,...
374744,PURCHASE AUTHORIZED ON 10/29 Intuit *ProSeries...,INTUIT *PROSERIES - - S,INTUIT
278942,PURCHASE AUTHORIZED ON 02/23 CKE*BISBEE COFFEE...,CKE*BISBEE COFFEE BISBEE S,BISBEE COFFEE BISBEE S
356509,PURCHASE AUTHORIZED ON 09/13 AMAZON.COM*1M3CB4...,AMAZON *1M3CB43 AMZN BILL S,AMAZON
141869,Debit Card Debit TARGET.COM * XXX-XXX-XXXX MN ...,TARGET * - - #,TARGET


In [20]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

17279                    [AMAZON ,  0T80 AMZN. COM BILLWA]
66846                         [AMAZON ,  D AM AMZN BILLWA]
64014                  [AMAZON , 0R4TX0I33 AM AMZN BILLWA]
64122                  [AMAZON , 1A59M2DE0 AM AMZN BILLWA]
13842                               [AMAZON , 1D7II0FI3 A]
34618                       [AMAZON , 1H2PX4ES3 AMZN BILL]
294182                      [AMAZON , 1H6TC6Z AMZN BILL S]
289296                      [AMAZON , 1N1L60X AMZN BILL S]
31904                         [AMAZON , 1O7NE56 AMZN BILL]
14223                                 [AMAZON , 1S2JM5IU3]
506216                      [AMAZON , 1V3T06PE1 AMZN BILL]
259290                          [AMAZON , 1V6TI50M1 , STR]
35344                       [AMAZON , 1X6PN3FT0 AMZN BILL]
64709                        [AMAZON , 278JD0 AMZN BILLWA]
64721                  [AMAZON , 285E86NN3 AM AMZN BILLWA]
329711                      [AMAZON , 290OR3M AMZN BILL S]
35701                       [AMAZON , 2D0PE5CF1 AMZN BIL

# Phase 2: Extract & Analyze N-Grams

In [21]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [22]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [23]:
%%time
corpus = df['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=100)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=100)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=100)
print(f"--- N-gram Analysis Complete ---")

Analyzing 100000 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 1.89 s, sys: 4.42 ms, total: 1.89 s
Wall time: 1.89 s


In [24]:
top_1grams
ngrams_1 = []
for ngram, value in top_1grams:
    if ngram.upper() not in NOISE_WORDS:
        ngrams_1 += [ngram]

In [25]:
top_1grams

[('amazon', 6029),
 ('amzn', 4288),
 ('cash', 3983),
 ('app', 3631),
 ('mktp', 3615),
 ('wal', 2572),
 ('doordash', 2325),
 ('mcdonald', 1724),
 ('market', 1694),
 ('dollar', 1540),
 ('apple', 1477),
 ('google', 1270),
 ('target', 1157),
 ('food', 1144),
 ('wm', 1107),
 ('pizza', 944),
 ('taco', 942),
 ('prime', 780),
 ('cafe', 770),
 ('king', 766),
 ('starbucks', 720),
 ('burger', 663),
 ('mobile', 630),
 ('general', 623),
 ('shop', 622),
 ('liquor', 614),
 ('circle', 607),
 ('superc', 606),
 ('publix', 598),
 ('afterpay', 592),
 ('uber', 584),
 ('bar', 568),
 ('ebay', 557),
 ('el', 549),
 ('bell', 544),
 ('express', 510),
 ('vending', 498),
 ('kroger', 488),
 ('house', 485),
 ('supercenter', 475),
 ('chick', 472),
 ('subway', 467),
 ('little', 432),
 ('grill', 423),
 ('dunkin', 419),
 ('home', 408),
 ('family', 406),
 ('usps', 364),
 ('depot', 363),
 ('coffee', 350),
 ('wine', 350),
 ('club', 345),
 ('instacart', 341),
 ('mexican', 337),
 ('brigit', 326),
 ('frys', 318),
 ('microsoft

In [26]:
ngrams_1

['amazon',
 'amzn',
 'cash',
 'app',
 'mktp',
 'wal',
 'doordash',
 'mcdonald',
 'market',
 'dollar',
 'apple',
 'google',
 'target',
 'food',
 'wm',
 'pizza',
 'taco',
 'prime',
 'cafe',
 'king',
 'starbucks',
 'burger',
 'mobile',
 'general',
 'shop',
 'liquor',
 'circle',
 'superc',
 'publix',
 'afterpay',
 'uber',
 'bar',
 'ebay',
 'el',
 'bell',
 'express',
 'vending',
 'kroger',
 'house',
 'supercenter',
 'chick',
 'subway',
 'little',
 'grill',
 'dunkin',
 'home',
 'family',
 'usps',
 'depot',
 'coffee',
 'wine',
 'club',
 'instacart',
 'mexican',
 'brigit',
 'frys',
 'microsoft',
 'beauty',
 'foods',
 'ross',
 'aldi',
 'sams',
 'caesars',
 'costco',
 'fresh',
 'kfc',
 'sonic',
 'walmart',
 'deli',
 'nails',
 'smoke',
 'big',
 'lyft',
 'queen',
 'wendy',
 'point',
 'safeway',
 'samsclub',
 'etsy',
 'helppay',
 'del',
 'nnt',
 'com',
 'jacksonville',
 'american',
 'grocery',
 'lowe',
 'bakery',
 'sports',
 'arbys',
 'shoprite',
 'eats',
 'dairy',
 '432',
 'liquors',
 'tree',
 'sp

In [27]:
ngrams_1

['amazon',
 'amzn',
 'cash',
 'app',
 'mktp',
 'wal',
 'doordash',
 'mcdonald',
 'market',
 'dollar',
 'apple',
 'google',
 'target',
 'food',
 'wm',
 'pizza',
 'taco',
 'prime',
 'cafe',
 'king',
 'starbucks',
 'burger',
 'mobile',
 'general',
 'shop',
 'liquor',
 'circle',
 'superc',
 'publix',
 'afterpay',
 'uber',
 'bar',
 'ebay',
 'el',
 'bell',
 'express',
 'vending',
 'kroger',
 'house',
 'supercenter',
 'chick',
 'subway',
 'little',
 'grill',
 'dunkin',
 'home',
 'family',
 'usps',
 'depot',
 'coffee',
 'wine',
 'club',
 'instacart',
 'mexican',
 'brigit',
 'frys',
 'microsoft',
 'beauty',
 'foods',
 'ross',
 'aldi',
 'sams',
 'caesars',
 'costco',
 'fresh',
 'kfc',
 'sonic',
 'walmart',
 'deli',
 'nails',
 'smoke',
 'big',
 'lyft',
 'queen',
 'wendy',
 'point',
 'safeway',
 'samsclub',
 'etsy',
 'helppay',
 'del',
 'nnt',
 'com',
 'jacksonville',
 'american',
 'grocery',
 'lowe',
 'bakery',
 'sports',
 'arbys',
 'shoprite',
 'eats',
 'dairy',
 '432',
 'liquors',
 'tree',
 'sp

In [28]:
top_2grams

[('amzn mktp', 3608),
 ('cash app', 3506),
 ('amazon prime', 717),
 ('wm superc', 605),
 ('superc wal', 605),
 ('dollar general', 593),
 ('taco bell', 485),
 ('wm supercenter', 473),
 ('burger king', 340),
 ('wal wal', 290),
 ('little caesars', 288),
 ('home depot', 266),
 ('family dollar', 240),
 ('sams club', 211),
 ('market 432', 197),
 ('uber eats', 190),
 ('food lion', 169),
 ('dollar tree', 154),
 ('dairy queen', 151),
 ('jack box', 147),
 ('del taco', 146),
 ('panda express', 143),
 ('winco foods', 134),
 ('mobile sign', 132),
 ('sign based', 132),
 ('carls jr', 124),
 ('king soopers', 118),
 ('bath body', 118),
 ('dollar ge', 115),
 ('smoke shop', 114),
 ('routs farmers', 114),
 ('nayax vending', 109),
 ('market work', 108),
 ('panera bread', 106),
 ('king soop', 105),
 ('fresh market', 103),
 ('waffle house', 98),
 ('pizza hut', 96),
 ('432 32', 90),
 ('point sale', 89),
 ('quick chek', 87),
 ('canteen vending', 85),
 ('432 329', 84),
 ('market 43', 82),
 ('ge dg', 81),
 ('pap

In [29]:
ngrams_2 = [ngram for ngram, count in top_2grams]
ngrams_2

['amzn mktp',
 'cash app',
 'amazon prime',
 'wm superc',
 'superc wal',
 'dollar general',
 'taco bell',
 'wm supercenter',
 'burger king',
 'wal wal',
 'little caesars',
 'home depot',
 'family dollar',
 'sams club',
 'market 432',
 'uber eats',
 'food lion',
 'dollar tree',
 'dairy queen',
 'jack box',
 'del taco',
 'panda express',
 'winco foods',
 'mobile sign',
 'sign based',
 'carls jr',
 'king soopers',
 'bath body',
 'dollar ge',
 'smoke shop',
 'routs farmers',
 'nayax vending',
 'market work',
 'panera bread',
 'king soop',
 'fresh market',
 'waffle house',
 'pizza hut',
 '432 32',
 'point sale',
 'quick chek',
 'canteen vending',
 '432 329',
 'market 43',
 'ge dg',
 'papa john',
 'body works',
 'farmers market',
 'sally beauty',
 'ingles markets',
 'market coffe',
 'cash 40',
 'aystation network',
 'smart final',
 'royal house',
 'winn dixie',
 'wine spirits',
 'merchant issued',
 'issued target',
 'harbor freight',
 'klover app',
 'academy sports',
 'otha kimbrough',
 'cvs

In [30]:
top_3grams

[('wm superc wal', 605),
 ('mobile sign based', 132),
 ('market 432 32', 90),
 ('market 432 329', 84),
 ('dollar ge dg', 81),
 ('bath body works', 78),
 ('fresh market coffe', 73),
 ('merchant issued target', 67),
 ('klover app boost', 64),
 ('point sale debitl340', 62),
 ('routs farmers market', 59),
 ('float corp payments', 50),
 ('wal wal sams', 45),
 ('ref auth purchdate', 44),
 ('quick chek food', 43),
 ('routs farmers mkt', 42),
 ('danfoss cafe ames', 41),
 ('mission lane vis', 40),
 ('mx nu peso', 39),
 ('extra daily spend', 39),
 ('wal wal sto', 39),
 ('sports bar jacksonville', 39),
 ('lane vis mission', 36),
 ('rrn cash app', 34),
 ('bcn mx nu', 34),
 ('fred meye fred', 34),
 ('rappi restaurantes col', 33),
 ('disney plus burbank', 32),
 ('portillos hot dogs', 32),
 ('tijuana bcn mx', 31),
 ('harbor freight tools', 31),
 ('ollies bargain outlet', 30),
 ('stewarts shop 329', 30),
 ('nayax vending 14', 29),
 ('vending 14 hunt', 29),
 ('buffalo wild wings', 29),
 ('dutchbrosll g

In [31]:
# Use 1 grams to find prefixes