# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main cfaca58] regex
 1 file changed, 738 insertions(+), 12487 deletions(-)
 rewrite Week 2/Haris_Saif.ipynb (91%)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in#.sample(100_000).copy()
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     ALEXS TOBACCO & BE 06-12 SPRINGHILL TN XXXX DE...
1                  Amazon.com*HW63IXXXX AMZN.COM/BILLWA
2     B C RETAIL INC PASOROB PASO ROBLES CAXXXXXX  0...
3              BARROS PIZZA - N 7TH ST PHOENIX AZ 02/01
4     BASHAS' LITCHFIELDPK AZ              XXXXXX  0...
5                BOTTLE HOUSE LIQUORS WHITTIER CA 01/19
6     CHECKCARD XXXX CARDINAL HOME CENTER - MADISON ...
7          Card purchase / TINA OMLID XXXX (XXXX-10-06)
8              DOLLAR TR XXXX MORSE R COLUMBUS OH 01/04
9                                Dd *Doordash Holidayoi
10    GIANT XXXX 205 GLEN DR MANCHESTER PA 08/09 Pur...
11                                JALISCO FAIRWAY GRILL
12    POS DEB XXXX 09/14/22 XXXXXXX APPLE COM BILL A...
13    POS Withdrawal - NOMNOM 24 0 - MEMO=POS Withdr...
14    PURCHASE AUTHORIZED ON 01/31 SALSAS TAQUERIA X...
15    PURCHASE AUTHORIZED ON 02/13 USA*VEND AT AIR S...
16    PURCHASE AUTHORIZED ON 02/16 BUC-EE'S #40 KATY...
17    PURCHASE AUTHORIZED ON 06/14 7-ELEVEN MCKI

## 2. Define Regex Rules

In [6]:
STATE_LIST = [ 
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO) 
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER) 
    "KS", "KY", 
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA 
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA) 
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY" 
] 
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
CITY_LIST = [
    "SAN", "MIAMI", "DIEGO", "PHOENIX", "SEATTLE", "HOUSTON", "SANTA", "ORLANDO", 
    "CHICAGO", "LAS", "ATLANTA", "LOS", "MESA", "VEGAS", "CHARLOTTE", "TAMPA", 
    "GREENVILLE", "BROOKLYN", "DENVER", "ANGELES", "ANTONIO", "MEMPHIS", "YORK",
    "RICHMOND", "BEACH", "PALM", "FORT", "ST", "LAKE", "WEST", "DES", "PARK",
    "HILL", "NORTH", "SPRING", "CREEK", "SAINT", "RIVER", "SOUTH", "MYERS",
    # Added from 1-grams
    "CITY", "NEW", "TROY", "VALLEY", "PORT",
    # Added from 2-grams
    "WAXAHACHIE", "EL CAJON", "PASO ROBLES", "BUENA VI", "CHULA VISTA", 
    "BOCA RATON", "PINE PLAINS", "HIGHLANDS RAN", "RENTON", "SALT", "CIT", "BOGOT"
]
CITY_REGEX = r"\b(" + "|".join(CITY_LIST) + r")\b"

In [8]:
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "CRD", "PUR", "PIN", "SIG", "LLC", "SIGNATURE", "WEB", "PAYMENT",
    "ACH", "DEB", "INTL", "RECURRING", "DIGIT", "ONLINE", "PROTECTION", "VSA",
    "800", "888", "WITHDRAWAL", "INDN", "STORE", "STORES", "RESTAURANT", 
    "BEAUTY", "DELI",
    "GROCERY", "NAILS", "STOP", "BUSINESS", "PARKING", "PET", "GARDEN", "FIL",
    # Added from 1-grams (and curated)
    "POS", "PURCHASE", "DEBIT", "VENDING", "POINT", "BIG", "NON", "TR", "SUP",
    # Added from 2-grams
    "SNACK", "SODA", "HELP", "HTTPSHBOMAX", "HTTPSINSTACAR", 'SUPER', 'MART', 'CENTER']
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [9]:
REGEX_PRE = [ 
    # === 0) Normalize spaces first === 
    (r"\u00A0", " "), # Replace non-breaking space with regular space 
    (r"\s{2,}", " "), # Collapse multiple spaces into one 

    # === 1) “Authorized / Recurring” headers === 
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "), 
    (r"\bRECURRING\s+PYMT\b", " "), 

    # === 2) Card & mask boilerplate === 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "), 
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "), 
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "), 
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "), 
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "), 
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "), 
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "), 
    (r"\bF[XF]{4,}\b", " "), # Handle FXXXXX, FXXXX
    (r"\bXXX\b", " "), # Handle masked number from n-grams
    (r"X{4,}", " "), # Remove generic masked numbers 
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "), 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords 
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start 
    (r"^\s*REC\s+POS\b", " "), 
    (r"^\s*RECURRING\b", " "), 

    # === 2.5) Prefix Normalization === 
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM" 
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR 

    # === 3) State + mask tails === 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "), 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "), 

    # === 4) Dates/times === 
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025 
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM 
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM 

    # === 4.5) Noise numbers (from n-grams) ===
    (r"\b(?:00|10|15|16|20|365)\b", " "),

    # === 5) Merchant-terminal boilerplate === 
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "), 
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "), 
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "), 
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID... 
     
    # === 6) Misc tails === 
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "), 
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "), 
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "), 
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "), 
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "), 
    (r"\bBILL\s+PAYMENT\b", " "), 
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "), 
    (r"-\s*MEMO=", " "), 
    (r"(?:USA|US)$", " "), # Remove USA or US at the end 
    (r"\s+FSP$", " "), 
    (r"\bL\d{3}\b", " "), # Handle L340

    # === 7) Phone numbers === 
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212 
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212 
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212 
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "), 

    # === 8) URLs/domains === 
    (r"^\.COM\s+BILL\b.*", " "), 
    (r"\.COM\b", " "), # Remove .COM anywhere
    (r"\s+COM$", " "), # Remove .COM at end of string

    # === 9) State abbreviations === 
    (STATE_REGEX, " "), # Remove standalone state codes 
    
    # === 9.5) City abbreviations ===
    (CITY_REGEX, " "), # Remove standalone city names
    
    # === 10) Noise Words (from 1-grams) ===
    (NOISE_WORDS_REGEX, " "),

    # === 11) Final Tidy (Punctuation) === 
    (r"[|%_=;\\/]+", " "), # Remove misc separators 
    (r"[-]{2,}", " "), # Collapse multiple hyphens
    (r"^\s*-\s*|\s*-\s*$", " "), # Remove leading/trailing hyphens
]

In [10]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADDY
    re.compile(r".*(GODADDY\.COM|GODADDY)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # === NEW PAYPAL RULES ===
    # Handle 'PAYPAL *... MERCHANT' (from INST XFER)
    re.compile(r"^PAYPAL\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    # Handle 'PAYPAL : MERCHANT' (from INST XFER / ID:)
    re.compile(r"^PAYPAL\s*:\s*([A-Z\s0-9'.-]+).*"),

    # === NEW: High-Frequency Full Merchant Names (from 1-grams) ===
    re.compile(r"^(AFTERPAY)\b.*"),
    re.compile(r"^(ALDI)\b.*"),
    re.compile(r"^(AMZN(?:\s*MKTP)?)\b.*"), # Handles amzn, amzn mktp
    re.compile(r"^(AMAZON(?:\.COM|\s+PRIME)?)\b.*"), # Handles amazon, amazon prime
    re.compile(r"^(APPLE(?:\.COM)?)\b.*"),
    re.compile(r"^(BRIGIT)\b.*"),
    re.compile(r"^(BURGER\s+KING)\b.*"),
    re.compile(r"^(CASH\s+APP)\b.*"),
    re.compile(r"^(CHICK-FIL-A)\b.*"),
    re.compile(r"^(CHIPOTLE)\b.*"),
    re.compile(r"^(CIRCLE\s+K)\b.*"),
    re.compile(r"^(COSTCO)\b.*"),
    re.compile(r"^(DAIRY\s+QUEEN)\b.*"),
    re.compile(r"^(DOLLAR\s+GENERAL)\b.*"),
    re.compile(r"^(DOLLAR\s+TREE)\b.*"),
    re.compile(r"^(DOORDASH)\b.*"),
    re.compile(r"^(DUNKIN)\b.*"),
    re.compile(r"^(EBAY)\b.*"), # Added from n-grams
    re.compile(r"^(ETSY)\b.*"),
    re.compile(r"^(FAMILY\s+DOLLAR)\b.*"),
    re.compile(r"^(FOOD\s+LION)\b.*"),
    re.compile(r"^(FRYS)\b.*"),
    re.compile(r"^(GOOGLE)\b.*"),
    re.compile(r"^(HELPPAY)\b.*"),
    re.compile(r"^(HOME\s+DEPOT)\b.*"),
    re.compile(r"^(INSTACART)\b.*"),
    re.compile(r"^(KFC)\b.*"),
    re.compile(r"^(KROGER)\b.*"),
    re.compile(r"^(LITTLE\s+CAESARS)\b.*"),
    re.compile(r"^(LOWE'?S)\b.*"),
    re.compile(r"^(LYFT)\b.*"),
    re.compile(r"^(MCDONALD'?S)\b.*"),
    re.compile(r"^(MICROSOFT)\b.*"),
    re.compile(r"^(PUBLIX)\b.*"),
    re.compile(r"^(ROSS)\b.*"),
    re.compile(r"^(SAFEWAY)\b.*"),
    re.compile(r"^(SAMS\s*CLUB|SAMSCLUB)\b.*"), # Updated from n-grams
    re.compile(r"^(SHOPRITE)\b.*"),
    re.compile(r"^(SONIC)\b.*"),
    re.compile(r"^(STARBUCKS)\b.*"),
    re.compile(r"^(SUBWAY)\b.*"),
    re.compile(r"^(TACO\s+BELL)\b.*"),
    re.compile(r"^(TARGET)\b.*"),
    re.compile(r"^(UBER(?:\s+EATS)?)\b.*"),
    re.compile(r"^(USPS)\b.*"),
    re.compile(r"^(VONS)\b.*"),
    # Updated from n-grams (wal, mart, wm, superc, supercenter)
    re.compile(r"^(WALMART(?:\s*SUPERCENTER|\s*SUPER\s*C)?|WM\s*SUPERCENTER|WAL-MART|WAL\s*MART)\b.*"),
    re.compile(r"^(WENDY'?S)\b.*"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s09'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === 7-ELEVEN RULES ===
    re.compile(r"^(7(?:-ELEVEN|\s+11))\s*\*?#?.*"),
    
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # === UPDATED RULE ORDER ===
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # NEW: Rule for HOME* AHS.COM pattern
    re.compile(r"^HOME\s*\*?\s*([A-Z\s0-9'.-]+?)(?:\s+[A-Z]{2,})?.*"),
    
    # === TST RULES ===
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s09'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s09'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # Generic PAYPAL rule, catches PAYPAL*MERCHANT
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"), 

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0EXAMPLE'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [11]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)

memos = memos.str.replace(NOISE_WORDS_REGEX, " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 26.5 s, sys: 178 ms, total: 26.7 s
Wall time: 26.7 s


In [12]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            # Use the last non-null group if multiple are present
            # (handles the GODADDY rule)
            groups = match.groups()
            if groups:
                return groups[-1].strip()
    return memo

df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 4.73 s, sys: 12.2 ms, total: 4.74 s
Wall time: 4.74 s


In [13]:
df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [14]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [33]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
433234,RIVER CITY MARKET CHICAGO IL 0...,,
357241,PURCHASE AUTHORIZED ON 09/15 CHIPOTLE XXXX STO...,CHIPOTLE STOCKTON S,CHIPOTLE
309586,PURCHASE AUTHORIZED ON 05/13 COSTCO WHSE #XXXX...,COSTCO WHSE # VICTORVILLE P,COSTCO
503285,Withdrawal DEBIT CHIP / DD DOORDASH MORAINEFI ...,DD DOORDASH MORAINEFI,DOORDASH MORAINEFI
167239,GOOGLE *BUMBLE,GOOGLE *BUMBLE,GOOGLE
265027,PURCHASE AUTHORIZED ON 01/15 KIDS FOOT LOCKER ...,KIDS FOOT LOCKER WACO P,KIDS FOOT LOCKER WACO P
234970,POS Debit - Visa Check Card XXXX - MINITZ BUFF...,MINITZ BUFFALO NIAGARA FALLSN,MINITZ BUFFALO NIAGARA FALLSN
258138,PUBLIX SUPER M 12/19 #XXXXXXXXX PURCHASE XXXX ...,PUBLIX M # LYONS RD COCONUT,PUBLIX
313799,PURCHASE AUTHORIZED ON 05/24 Wal-Mart Store VI...,WAL- VISALIA P,WAL- VISALIA P
492756,WINE AND SPIRITS XXXX BRISTOL PA 06/27,WINE AND SPIRITS BRISTOL,WINE AND SPIRITS BRISTOL


In [16]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [34]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

MCDONALD'S
APPLE
TARGET
WAL-
STARBUCKS
AMZN
EBAY
SUBWAY
USPS
BRIGIT
ROSS
COSTCO
KFC
ETSY
SHOPRITE
CHIPOTLE
A
432-329
MCDONALDS
@WORK
EXPRESS
GODADDY
AMZNFREETIME
SALLY
CCBILL
CHEWY
PETCO
SAVE
WINN-DIXIE
TIGER
GOFAN
SHIPT
BASKIN
RALPHS
SLICE
DOMINO'S
S
IHOP
BETMGM
DOLLAR
DILLONS
CRYPTO
P
FOOD
INTUIT
STEAK-N-SHAKE
TOLLWAY-AUTOREPLEN
STATERBRO
POPEYES
NORTHGATE
VANS
MGM
SKILLZ
PRICELN
GAMESTOP
FRG
43
.KOHLS
HUNT
D
BANFIELD-
VERIZONWRLSS
RIOT
OCULUS
WAL
MCW
SAVEMART
RAINBOW
BELK
C
FOODMAXX
MARINA
CANVA
ABC
BASHAS''
WALGREENS
MEIJER
GNC
SHEIN
STAPLES
L
CLAIRE'S
SEZZLE
FOOD4LESS
QFC
*EBAY
AMZ
SEI
GOODWILL
FIV
NYTIMES
V
CHECKERS
UPS
TILLYS
T
CLEO
SHOPIFY
IBI
OTT
ETT
POTBELLY
BLUESKY
DROPBOX
*UBER
SOUTHWES
EL
JOURNEYS
PARKMOBILE
E-Z
JACK'S
WEGMANS
ABCMOUSE
QUADPAY
EZPASS
ROSES
LUCKY
EVI
G
AYSTATIONNETWORK
LIQUOR
ENMARKET
UNITED
RVT
REI
CRT
NORTON
KIMS
EA
FBPAY
GERALD
YPSILANTI
ZULILY
NORDSTROM
HLLFRSH
*STEAM
PARADISE-MOOREMOORESVILLE
TLG
OPC
DRI
LJS
*MICROSOFT
FRED-MEYER
OUTLET
NIKE
STEAM
ZTL


In [35]:
df[df['memo_post'] == '@WORK']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
82341,CHECKCARD XXXX MARKET@WORK XXXXX RENTON WA XXX...,@WORK,@WORK
82342,CHECKCARD XXXX MARKET@WORK XXXXXXXXXX RENTON W...,@WORK,@WORK
111166,Check Card Purchase / MARKET@WORK XXXXXXXXXX R...,@WORK,@WORK
111167,Check Card Purchase / MARKET@WORK XXXXXXXXXX R...,@WORK,@WORK
111168,Check Card Purchase / MARKET@WORK XXXXXXXXXX R...,@WORK,@WORK
...,...,...,...
196084,MARKET@WORK XXXXXXXXXX RENTON WA 12/30,@WORK,@WORK
206958,Market@Work,@WORK,@WORK
254703,POS Withdrawal MARKET@WORK / XXXXXXXXXX,@WORK,@WORK
417519,Purchase MARKET@WORK XXXXXXXXXX,@WORK,@WORK


In [19]:
df[df['memo_post'] == '']

Unnamed: 0,memo,memo_pre,memo_post
3820,365 MARKET,,
3821,365 MARKET 888,,
4048,365 MARKET TROY MI,,
4049,365 MARKET XXX XXX-XXXX,,
4050,365 MARKET XXX XXX-XXXX TROY MI,,
...,...,...,...
523840,XXXXXXXXXXX XXXXXX,,
523860,XXXXXXXXXXXX,,
524051,XXXXXXXXXXXXX,,
524215,XXXXXXXXXXXXXXX,,


In [20]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
1842,02-20-22 SEATTLE WA XXXX AMAZON.COM*1B8II62G0 ...,AMAZON *1B8II62G0 SUNTRUST,AMAZON
1843,02-20-22 SEATTLE WA XXXX AMAZON.COM*1B9CD29N0 ...,AMAZON *1B9CD29N0 SUNTRUST,AMAZON
1844,02-20-22 SEATTLE WA XXXX AMAZON.COM*1I32Q2NG1 ...,AMAZON *1I32Q2NG1 SUNTRUST,AMAZON
1845,02-20-22 SEATTLE WA XXXX AMAZON.COM*1I58N0TJ1 ...,AMAZON *1I58N0TJ1 SUNTRUST,AMAZON
1846,02-20-22 SEATTLE WA XXXX AMAZON.COM*1IXXXXAV1 ...,AMAZON *1I AV1 SUNTRUST,AMAZON
...,...,...,...
528725,www.Playgr* Bidiboo.Co,.PLAYGR* BIDIBOO.CO,.PLAYGR
528726,www.Playgr* Littlemiss,.PLAYGR* LITTLEMISS,.PLAYGR
528732,www.Styles* Luv N Hair,.STYLES* LUV N HAIR,.STYLES
528733,www.Stylese* West Brim,.STYLESE* BRIM,.STYLESE


In [21]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

424172                               [ADOBE , PR CREATIV S]
524273                     [AMAZON , 0X5WB1OH3 AMZN BILLWA]
35361                            [AMAZON , 1 FE2 AMZN BILL]
13725                                  [AMAZON , 133RT1H22]
34369                        [AMAZON , 142YC3YI0 AMZN BILL]
64238                   [AMAZON , 1J8B45JC3 AM AMZN BILLWA]
34775                        [AMAZON , 1K4AO16X1 AMZN BILL]
31886                          [AMAZON , 1L4IB2M AMZN BILL]
288519                       [AMAZON , 1N8PW99 AMZN BILL S]
14138                                  [AMAZON , 1O9RU1MX1]
35260                        [AMAZON , 1V81V3KF1 AMZN BILL]
64502                                        [AMAZON , 1X2]
35359                        [AMAZON , 1X9KG0T92 AMZN BILL]
14807                  [AMAZON , 2E9H505T0 AMZNAMZN BILLWA]
15016                                    [AMAZON , 2R5A US]
501143                       [AMAZON , 619PC71R3 US 25 # #]
228630        [AMAZON , 6B0IR4U AMZN B J

# Phase 2: Extract & Analyze N-Grams

In [22]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [23]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [24]:
%%time
corpus = df['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=100)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=100)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=100)
print(f"--- N-gram Analysis Complete ---")

Analyzing 528766 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 9.58 s, sys: 334 µs, total: 9.58 s
Wall time: 9.58 s


In [25]:
top_1grams
ngrams_1 = []
for ngram, value in top_1grams:
    if ngram.upper() not in NOISE_WORDS:
        ngrams_1 += [ngram]

In [26]:
top_1grams

[('amazon', 32494),
 ('amzn', 22483),
 ('cash', 21185),
 ('app', 19351),
 ('mktp', 18994),
 ('wal', 13737),
 ('doordash', 12164),
 ('mcdonald', 8859),
 ('dollar', 8070),
 ('apple', 7966),
 ('google', 6652),
 ('food', 6246),
 ('target', 5990),
 ('wm', 5836),
 ('pizza', 5166),
 ('taco', 4720),
 ('prime', 4169),
 ('cafe', 3909),
 ('king', 3858),
 ('starbucks', 3855),
 ('burger', 3505),
 ('mobile', 3462),
 ('shop', 3310),
 ('circle', 3301),
 ('superc', 3285),
 ('publix', 3279),
 ('general', 3181),
 ('el', 3084),
 ('liquor', 3079),
 ('uber', 3034),
 ('bar', 3031),
 ('afterpay', 2997),
 ('ebay', 2737),
 ('bell', 2659),
 ('express', 2622),
 ('subway', 2597),
 ('chick', 2569),
 ('kroger', 2546),
 ('supercenter', 2459),
 ('house', 2393),
 ('dunkin', 2319),
 ('little', 2254),
 ('home', 2253),
 ('grill', 2237),
 ('family', 2206),
 ('depot', 1999),
 ('usps', 1907),
 ('coffee', 1888),
 ('instacart', 1812),
 ('mexican', 1810),
 ('brigit', 1801),
 ('wine', 1775),
 ('club', 1755),
 ('walmart', 1739),


In [27]:
ngrams_1

['amazon',
 'amzn',
 'cash',
 'app',
 'mktp',
 'wal',
 'doordash',
 'mcdonald',
 'dollar',
 'apple',
 'google',
 'food',
 'target',
 'wm',
 'pizza',
 'taco',
 'prime',
 'cafe',
 'king',
 'starbucks',
 'burger',
 'mobile',
 'shop',
 'circle',
 'superc',
 'publix',
 'general',
 'el',
 'liquor',
 'uber',
 'bar',
 'afterpay',
 'ebay',
 'bell',
 'express',
 'subway',
 'chick',
 'kroger',
 'supercenter',
 'house',
 'dunkin',
 'little',
 'home',
 'grill',
 'family',
 'depot',
 'usps',
 'coffee',
 'instacart',
 'mexican',
 'brigit',
 'wine',
 'club',
 'walmart',
 'frys',
 'ross',
 'aldi',
 'microsoft',
 'sams',
 'costco',
 'foods',
 'caesars',
 'kfc',
 'fresh',
 'smoke',
 'sonic',
 'lyft',
 'wendy',
 'safeway',
 'etsy',
 'queen',
 'shoprite',
 'helppay',
 'samsclub',
 'jacksonville',
 'com',
 'liquors',
 'del',
 'hunt',
 'chicken',
 'lowe',
 'nnt',
 '432',
 'spa',
 'street',
 'vons',
 'dairy',
 'lion',
 'tree',
 'eats',
 'sushi',
 'sports',
 'arbys',
 'jack',
 'box',
 'cmsvend',
 'papa',
 'can

In [28]:
ngrams_1

['amazon',
 'amzn',
 'cash',
 'app',
 'mktp',
 'wal',
 'doordash',
 'mcdonald',
 'dollar',
 'apple',
 'google',
 'food',
 'target',
 'wm',
 'pizza',
 'taco',
 'prime',
 'cafe',
 'king',
 'starbucks',
 'burger',
 'mobile',
 'shop',
 'circle',
 'superc',
 'publix',
 'general',
 'el',
 'liquor',
 'uber',
 'bar',
 'afterpay',
 'ebay',
 'bell',
 'express',
 'subway',
 'chick',
 'kroger',
 'supercenter',
 'house',
 'dunkin',
 'little',
 'home',
 'grill',
 'family',
 'depot',
 'usps',
 'coffee',
 'instacart',
 'mexican',
 'brigit',
 'wine',
 'club',
 'walmart',
 'frys',
 'ross',
 'aldi',
 'microsoft',
 'sams',
 'costco',
 'foods',
 'caesars',
 'kfc',
 'fresh',
 'smoke',
 'sonic',
 'lyft',
 'wendy',
 'safeway',
 'etsy',
 'queen',
 'shoprite',
 'helppay',
 'samsclub',
 'jacksonville',
 'com',
 'liquors',
 'del',
 'hunt',
 'chicken',
 'lowe',
 'nnt',
 '432',
 'spa',
 'street',
 'vons',
 'dairy',
 'lion',
 'tree',
 'eats',
 'sushi',
 'sports',
 'arbys',
 'jack',
 'box',
 'cmsvend',
 'papa',
 'can

In [29]:
top_2grams

[('amzn mktp', 18946),
 ('cash app', 18691),
 ('amazon prime', 3851),
 ('wm superc', 3278),
 ('superc wal', 3277),
 ('dollar general', 3050),
 ('wm supercenter', 2426),
 ('taco bell', 2408),
 ('burger king', 1830),
 ('little caesars', 1529),
 ('wal wal', 1528),
 ('home depot', 1482),
 ('family dollar', 1306),
 ('sams club', 1093),
 ('food lion', 950),
 ('uber eats', 947),
 ('dairy queen', 792),
 ('jack box', 774),
 ('dollar tree', 770),
 ('winco foods', 733),
 ('carls jr', 728),
 ('mobile sign', 725),
 ('sign based', 725),
 ('del taco', 694),
 ('panda express', 678),
 ('dollar ge', 663),
 ('bath body', 652),
 ('king soopers', 557),
 ('smoke shop', 556),
 ('panera bread', 533),
 ('routs farmers', 523),
 ('king soop', 521),
 ('ge dg', 505),
 ('waffle house', 482),
 ('quick chek', 478),
 ('pizza hut', 475),
 ('432 32', 462),
 ('body works', 453),
 ('432 329', 441),
 ('papa john', 427),
 ('total wine', 412),
 ('cash 40', 407),
 ('ingles markets', 392),
 ('coca cola', 386),
 ('klover app', 

In [30]:
ngrams_2 = [ngram for ngram, count in top_2grams]
ngrams_2

['amzn mktp',
 'cash app',
 'amazon prime',
 'wm superc',
 'superc wal',
 'dollar general',
 'wm supercenter',
 'taco bell',
 'burger king',
 'little caesars',
 'wal wal',
 'home depot',
 'family dollar',
 'sams club',
 'food lion',
 'uber eats',
 'dairy queen',
 'jack box',
 'dollar tree',
 'winco foods',
 'carls jr',
 'mobile sign',
 'sign based',
 'del taco',
 'panda express',
 'dollar ge',
 'bath body',
 'king soopers',
 'smoke shop',
 'panera bread',
 'routs farmers',
 'king soop',
 'ge dg',
 'waffle house',
 'quick chek',
 'pizza hut',
 '432 32',
 'body works',
 '432 329',
 'papa john',
 'total wine',
 'cash 40',
 'ingles markets',
 'coca cola',
 'klover app',
 'smart final',
 'harbor freight',
 'raising cane',
 'merchant issued',
 'app boost',
 'issued target',
 'stewarts shop',
 'aystation network',
 'cvs pharmacy',
 'winn dixie',
 'trader joe',
 'buc ee',
 'exchg rte',
 'fresh coffe',
 'sale debitl340',
 'wal sams',
 'royal house',
 'sports bar',
 'amzn com',
 'tropical smooth

In [31]:
top_3grams

[('wm superc wal', 3277),
 ('mobile sign based', 725),
 ('dollar ge dg', 502),
 ('bath body works', 451),
 ('klover app boost', 363),
 ('merchant issued target', 363),
 ('wal wal sams', 264),
 ('float corp payments', 263),
 ('mx nu peso', 231),
 ('quick chek food', 224),
 ('ref auth purchdate', 215),
 ('extra daily spend', 209),
 ('disney plus burbank', 207),
 ('sports bar jacksonville', 203),
 ('wal wal sto', 202),
 ('desc entry descr', 193),
 ('trace eed ind', 193),
 ('bcn mx nu', 192),
 ('orig desc entry', 191),
 ('fred meye fred', 188),
 ('rappi restaurantes col', 183),
 ('harbor freight tools', 180),
 ('mission lane vis', 178),
 ('stewarts shop 329', 176),
 ('danfoss cafe ames', 176),
 ('nayax 24 hunt', 169),
 ('tijuana bcn mx', 166),
 ('nayax 14 hunt', 165),
 ('lane vis mission', 162),
 ('routs farmers mkt', 157),
 ('beneva tobacco sarasota', 154),
 ('pizza hut https', 149),
 ('youtube tv helppay', 149),
 ('dutchbrosll grants pass', 145),
 ('portillos hot dogs', 144),
 ('el paso 

In [32]:
# Use 1 grams to find prefixes