# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main 69f1ae9] regex
 1 file changed, 7039 insertions(+), 7428 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in.sample(100_000).copy()
row_count = df.size
row_count

100000

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0                                AMZN Mktp US 1A9RP2W20
1             BASHAS'' #046 QUEEN CREEK AZ XXXXXX 12/12
2     CASH APP*TOXCINIA B XXXXXXXXXX CA            1...
3                CATFISH COFFEEHOUSE CLEARLAKE CA 05/13
4                                   COMPARK CONVENIENCE
5     DEBIT CARD PURCHASE STARBUCKS STORE XXXX WAXAH...
6     Debit Card purchase BULL CITY FLAVORS LLC XXXX...
7     Debit Purchase -visa Card XXXXtarget 00simi Va...
8                HELP.HBOMAX.COM HTTPSHBOMAX.C NY 03/26
9                                    HY-WIT CONVENIENCE
10    PIETROS PIZZA MILWAUKIE OR / 03-23-XXXX DEBIT ...
11    PUBLIX #XXXX PUBLIX #XXXX ORLANDO FLUS XXXXXX ...
12    PURCHASE AUTHORIZED ON 04/20 THRIF-TEE FOOD DA...
13    PURCHASE AUTHORIZED ON 06/25 CASH APP*GILEAD M...
14    PURCHASE AUTHORIZED ON 09/16 BROWN COW DRIVE P...
15    PURCHASE AUTHORIZED ON 10/13 DOORDASH*TACO BEL...
16    PURCHASE AUTHORIZED ON 11/11 DOORDASH*PERKINS ...
17                                     Papa Murp

## 2. Define Regex Rules

In [6]:
STATE_LIST = [ 
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO) 
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER) 
    "KS", "KY", 
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA 
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA) 
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY" 
] 
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
CITY_LIST = [
    "SAN", "MIAMI", "DIEGO", "PHOENIX", "SEATTLE", "HOUSTON", "SANTA", "ORLANDO", 
    "CHICAGO", "LAS", "ATLANTA", "LOS", "MESA", "VEGAS", "CHARLOTTE", "TAMPA", 
    "GREENVILLE", "BROOKLYN", "DENVER", "ANGELES", "ANTONIO", "MEMPHIS", "YORK",
    "RICHMOND", "BEACH", "PALM", "FORT", "ST", "LAKE", "WEST", "DES", "PARK",
    "HILL", "NORTH", "SPRING", "CREEK", "SAINT", "RIVER", "SOUTH", "MYERS",
    # Added from 1-grams
    "CITY", "NEW", "TROY", "VALLEY", "PORT",
    # Added from 2-grams
    "WAXAHACHIE", "EL CAJON", "PASO ROBLES", "BUENA VI", "CHULA VISTA", 
    "BOCA RATON", "PINE PLAINS", "HIGHLANDS RAN", "RENTON", "SALT", "CIT", "BOGOT"
]
CITY_REGEX = r"\b(" + "|".join(CITY_LIST) + r")\b"

In [8]:
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "CRD", "PUR", "PIN", "SIG", "LLC", "SIGNATURE", "WEB", "PAYMENT",
    "ACH", "DEB", "INTL", "RECURRING", "DIGIT", "ONLINE", "PROTECTION", "VSA",
    "800", "888", "WITHDRAWAL", "INDN", "STORE", "STORES", "RESTAURANT", 
    "STOP", "BUSINESS", "PARKING", "FIL",
    # Added from 1-gram
    "POS", "PURCHASE", "DEBIT", "NON", "TR", "SUP",
    # Added from 2-grams
    "SNACK", "SODA", "HELP", "HTTPSHBOMAX", "HTTPSINSTACAR", 'SUPER', 'MART', 'CENTER'
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [9]:
REGEX_PRE = [ 
    # === 0) Normalize spaces first === 
    (r"\u00A0", " "), # Replace non-breaking space with regular space 
    (r"\s{2,}", " "), # Collapse multiple spaces into one 

    # === 1) “Authorized / Recurring” headers === 
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "), 
    (r"\bRECURRING\s+PYMT\b", " "), 

    # === 2) Card & mask boilerplate === 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "), 
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "), 
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "), 
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "), 
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "), 
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "), 
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "), 
    (r"\bF[XF]{4,}\b", " "), # Handle FXXXXX, FXXXX
    (r"\bXXX\b", " "), # Handle masked number from n-grams
    (r"X{4,}", " "), # Remove generic masked numbers 
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "), 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords 
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start 
    (r"^\s*REC\s+POS\b", " "), 
    (r"^\s*RECURRING\b", " "), 

    # === 2.5) Prefix Normalization === 
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM" 
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR 

    # === 3) State + mask tails === 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "), 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "), 

    # === 4) Dates/times === 
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025 
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM 
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM 

    # === 4.5) Noise numbers (from n-grams) ===
    (r"\b(?:00|10|15|16|20|365)\b", " "),

    # === 5) Merchant-terminal boilerplate === 
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "), 
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "), 
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "), 
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID... 
     
    # === 6) Misc tails === 
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "), 
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "), 
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "), 
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "), 
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "), 
    (r"\bBILL\s+PAYMENT\b", " "), 
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "), 
    (r"-\s*MEMO=", " "), 
    (r"(?:USA|US)$", " "), # Remove USA or US at the end 
    (r"\s+FSP$", " "), 
    (r"\bL\d{3}\b", " "), # Handle L340

    # === 7) Phone numbers === 
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212 
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212 
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212 
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "), 

    # === 8) URLs/domains === 
    (r"^\.COM\s+BILL\b.*", " "), 
    (r"\.COM\b", " "), # Remove .COM anywhere
    (r"\s+COM$", " "), # Remove .COM at end of string

    # === 9) State abbreviations === 
    (STATE_REGEX, " "), # Remove standalone state codes 
    
    # === 9.5) City abbreviations ===
    (CITY_REGEX, " "), # Remove standalone city names
    
    # === 10) Noise Words (from 1-grams) ===
    (NOISE_WORDS_REGEX, " "),

    # === 11) Final Tidy (Punctuation) === 
    (r"[|%_=;\\/]+", " "), # Remove misc separators 
    (r"[-]{2,}", " "), # Collapse multiple hyphens
    (r"^\s*-\s*|\s*-\s*$", " "), # Remove leading/trailing hyphens
]

In [10]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADDY
    re.compile(r".*(GODADDY\.COM|GODADDY)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # === NEW PAYPAL RULES ===
    # Handle 'PAYPAL *... MERCHANT' (from INST XFER)
    re.compile(r"^PAYPAL\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    # Handle 'PAYPAL : MERCHANT' (from INST XFER / ID:)
    re.compile(r"^PAYPAL\s*:\s*([A-Z\s0-9'.-]+).*"),

    # === NEW: High-Frequency Full Merchant Names (from 1-grams) ===
    re.compile(r"^(AFTERPAY)\b.*"),
    re.compile(r"^(ALDI)\b.*"),
    re.compile(r"^(AMZN(?:\s*MKTP)?)\b.*"), # Handles amzn, amzn mktp
    re.compile(r"^(AMAZON(?:\.COM|\s+PRIME)?)\b.*"), # Handles amazon, amazon prime
    re.compile(r"^(APPLE(?:\.COM)?)\b.*"),
    re.compile(r"^(BRIGIT)\b.*"),
    re.compile(r"^(BURGER\s+KING)\b.*"),
    re.compile(r"^(CASH\s+APP)\b.*"),
    re.compile(r"^(CHICK-FIL-A)\b.*"),
    re.compile(r"^(CHIPOTLE)\b.*"),
    re.compile(r"^(CIRCLE\s+K)\b.*"),
    re.compile(r"^(COSTCO)\b.*"),
    re.compile(r"^(DAIRY\s+QUEEN)\b.*"),
    re.compile(r"^(DOLLAR\s+GENERAL)\b.*"),
    re.compile(r"^(DOLLAR\s+TREE)\b.*"),
    re.compile(r"^(DOORDASH)\b.*"),
    re.compile(r"^(DUNKIN)\b.*"),
    re.compile(r"^(EBAY)\b.*"), # Added from n-grams
    re.compile(r"^(ETSY)\b.*"),
    re.compile(r"^(FAMILY\s+DOLLAR)\b.*"),
    re.compile(r"^(FOOD\s+LION)\b.*"),
    re.compile(r"^(FRYS)\b.*"),
    re.compile(r"^(GOOGLE)\b.*"),
    re.compile(r"^(HELPPAY)\b.*"),
    re.compile(r"^(HOME\s+DEPOT)\b.*"),
    re.compile(r"^(INSTACART)\b.*"),
    re.compile(r"^(KFC)\b.*"),
    re.compile(r"^(KROGER)\b.*"),
    re.compile(r"^(LITTLE\s+CAESARS)\b.*"),
    re.compile(r"^(LOWE'?S)\b.*"),
    re.compile(r"^(LYFT)\b.*"),
    re.compile(r"^(MCDONALD'?S)\b.*"),
    re.compile(r"^(MICROSOFT)\b.*"),
    re.compile(r"^(PUBLIX)\b.*"),
    re.compile(r"^(ROSS)\b.*"),
    re.compile(r"^(SAFEWAY)\b.*"),
    re.compile(r"^(SAMS\s*CLUB|SAMSCLUB)\b.*"), # Updated from n-grams
    re.compile(r"^(SHOPRITE)\b.*"),
    re.compile(r"^(SONIC)\b.*"),
    re.compile(r"^(STARBUCKS)\b.*"),
    re.compile(r"^(SUBWAY)\b.*"),
    re.compile(r"^(TACO\s+BELL)\b.*"),
    re.compile(r"^(TARGET)\b.*"),
    re.compile(r"^(UBER(?:\s+EATS)?)\b.*"),
    re.compile(r"^(USPS)\b.*"),
    re.compile(r"^(VONS)\b.*"),
    # Updated from n-grams (wal, mart, wm, superc, supercenter)
    re.compile(r"^(WALMART(?:\s*SUPERCENTER|\s*SUPER\s*C)?|WM\s*SUPERCENTER|WAL-MART|WAL\s*MART)\b.*"),
    re.compile(r"^(WENDY'?S)\b.*"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s09'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === 7-ELEVEN RULES ===
    re.compile(r"^(7(?:-ELEVEN|\s+11))\s*\*?#?.*"),
    
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # === UPDATED RULE ORDER ===
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # NEW: Rule for HOME* AHS.COM pattern
    re.compile(r"^HOME\s*\*?\s*([A-Z\s0-9'.-]+?)(?:\s+[A-Z]{2,})?.*"),
    
    # === TST RULES ===
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s09'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s09'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # Generic PAYPAL rule, catches PAYPAL*MERCHANT
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"), 

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0EXAMPLE'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [11]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)

memos = memos.str.replace(NOISE_WORDS_REGEX, " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 5.36 s, sys: 33.6 ms, total: 5.4 s
Wall time: 5.4 s


In [12]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            # Use the last non-null group if multiple are present
            # (handles the GODADDY rule)
            groups = match.groups()
            if groups:
                return groups[-1].strip()
    return memo

df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 921 ms, sys: 4.57 ms, total: 926 ms
Wall time: 924 ms


In [13]:
df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [14]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [15]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
17449,AMAZON.COM*ZU9BF13F3 SEATTLE WA Card XXXX,AMAZON *ZU9BF13F3,AMAZON
25576,AMZN Mktp US*YB7MP3T Amzn.com/bill WA 0...,AMZN MKTP US*YB7MP3T AMZN BILL,AMZN MKTP
271506,PURCHASE AUTHORIZED ON 02/03 AMZN Mktp US*GU15...,AMZN MKTP US*GU15K AMZN BILL S,AMZN MKTP
39958,Amznfreetime G S Waamznfreetim,AMZNFREETIME G S WAAMZNFREETIM,AMZNFREETIME G S WAAMZNFREETIM
47378,BUC-EE'S #42 05/03 #XXXXXXXXX PURCHASE XXXXX C...,BUC-EE'S #42 # COUNTY RD 6,BUC-EE'S
107297,Cash App*Chace Massop,CASH APP*CHACE MASSOP,CASH APP
329232,PURCHASE AUTHORIZED ON 07/04 SUPER DUPER LIQUO...,DUPER LIQUOR INC BEA P,DUPER LIQUOR INC BEA P
152643,EL GALLO GIRO ANAHEIM CA 0...,EL GALLO GIRO ANAHEIM,EL GALLO GIRO ANAHEIM
465786,THE UPS STORE XXXX XXX-XXXXXXX CA 0...,THE UPS,THE UPS
335586,PURCHASE AUTHORIZED ON 07/21 Wal-Mart Super Ce...,WAL- MARCOS P,WAL- MARCOS P


In [16]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [17]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

MCDONALD'S
APPLE
TARGET
WAL-
STARBUCKS
AMZN
EBAY
SUBWAY
BRIGIT
USPS
ROSS
COSTCO
KFC
ETSY
SHOPRITE
CHIPOTLE
MARKET
A
MARKET@WORK
MCDONALDS
AMZNFREETIME
CHEWY
SAVE
GODADDY
EXPRESS
PETCO
DOLLAR
CCBILL
INTUIT
BETMGM
SLICE
DILLONS
WINN-DIXIE
IHOP
GOFAN
DOMINO'S
RALPHS
TIGER
POPEYES
CRYPTO
BELK
WAL
VANS
SHIPT
BASKIN
HUNT
S
ABC
P
SAVEMART
STEAK-N-SHAKE
.KOHLS
FOODMAXX
TOLLWAY-AUTOREPLEN
RAINBOW
WALGREENS
OCULUS
PRICELN
FOOD
MEIJER
L
BASHAS''
D
SHEIN
STATERBRO
GAMESTOP
FRG
VERIZONWRLSS
STAPLES
BANFIELD-PET
CANVA
QFC
CHECKERS
*UBER
SOUTHWES
UPS
UNITED
V
EA
NYTIMES
JOURNEYS
AMZ
GNC
OTT
GOODWILL
*EBAY
SKILLZ
ROSES
RIOT
TILLYS
REI
MARINA
BLUESKY
MGM
E-Z
SEDANOS
TLG
QUADPAY
GERALD
FH
G
OPC
POTBELLY
UBR
CLAIRE'S
DRI
EVI
ZTL
NIKE
T
AYSTATIONNETWORK
WEGMANS
STEAM
FIESTA
*MICROSOFT
ENMARKET
PACSUN
FOOD4LESS
FRED-MEYER
SEI
IBI
FBPAY
#
DROPBOX
LUCKY
CKO
RGP
CRT
EPC
BLIZ
BUCKLE
SHOPIFY
HANNAFORD
LIQUOR
HARDEES
ANCESTRY
NEWSSTAND
NORTON
GO
C
LJS
HLLFRSH
ECHST.NET
BOXYCHARM
TRTHFDR
NORDSTROM
SEZZLE
TOMMY'S


In [32]:
print(df[df['memo_post'] == ''].to_string())

                                                                                                memo                                                  memo_pre memo_post
87680                          CHECKCARD XXXX SOUTH BEACH PARK BOCA RATON FL XXXXXXXXXXXXXXXXXXXXXXX                                                                    
523814                                                                                    XXXXXXXXXX                                                                    
219525                         PAYPAL DES:INST XFER ID:UBER INDN:BRIAN VOORHEES CO ID:PAYPALSI77 WEB                PAYPAL : :UBER :BRIAN VOORHEES :PAYPALSI77          
219477                PAYPAL DES:INST XFER ID:MICROSOFT XBOX INDN:ALICIA FARNUM CO ID:PAYPALSI77 WEB       PAYPAL : :MICROSOFT XBOX :ALICIA FARNUM :PAYPALSI77          
219356                   PAYPAL DES:INST XFER ID:AIRBNB INDN:DEANNA MCCHRISTIAN CO ID:PAYPALSI77 WEB          PAYPAL : :AIRBNB :DEANNA MCCHRISTIAN :PAYPALS

In [19]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
385336,PURCHASE AUTHORIZED ON 11/25 TST* Sidelines Gr...,TST* SIDELINES GRI CANTON S,SIDELINES GRI CANTON S
123439,DDA PURCHASE *XXXX XXXXXXXX RUTTER S 92 ...,DDA * RUTTER S 92 KUTZTOWN *,A
426074,RECURRING PAYMENT AUTHORIZED ON 07/07 Microsof...,MICROSOFT*ULTIMATE MSBILL.INFO S,MICROSOFT
413148,Point of Sale Debit L340 DATE 12-26 SUMUP *WAK...,SUMUP *WAKE FORESTWAKE FOREST,SUMUP
446087,SP * ENERGY FIT WEAR XXXXXXXXXX,SP * ENERGY FIT WEAR,ENERGY FIT WEAR
...,...,...,...
137050,DOORDASH*MARBLE SLAB C,DOORDASH*MARBLE SLAB C,DOORDASH
427123,RECURRING PAYMENT AUTHORIZED ON 08/12 GOOGLE *...,GOOGLE *SCOPELY - - S,GOOGLE
137968,DOORDASH*YOLK WWW.DOORDASH. CA 0...,DOORDASH*YOLK .DOORDASH.,DOORDASH
91463,CHECKCARD XXXX TST* PARIS BAGUETTE - H MESA AZ...,TST* PARIS BAGUETTE - H,PARIS BAGUETTE - H


In [20]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

9230                     [ADOBE ,  - - TRACER: AI ,RR]
39305                    [AMAZON ,  N16I0 AMZN BILLWA]
145492                      [AMAZON , 148OPAMZN BILWA]
289635                  [AMAZON , 169PR3T AMZN BILL S]
34450                   [AMAZON , 196PK5B73 AMZN BILL]
64159              [AMAZON , 1F59V0GC1 AM AMZN BILLWA]
34598                   [AMAZON , 1F91I0OY1 AMZN BILL]
408902        [AMAZON , 1H7N AMAZON WAUS ## AJ8P01RE1]
64345              [AMAZON , 1O4P51CY0 AM AMZN BILLWA]
35159           [AMAZON , 1R95G0MG0 AMZN BILL : 23:03]
110507                     [AMAZON , 1T LC3 AMZN BILL]
64751              [AMAZON , 296ZA41V1 AM AMZN BILLWA]
50184                             [AMAZON , 2C4KQ38N2]
65050                 [AMAZON , 2G ZT0 AM AMZN BILLWA]
65017              [AMAZON , 2G6OP9XQ0 AM AMZN BILLWA]
35955                                  [AMAZON , 2L9N]
35976          [AMAZON , 2P1C01QZ1 AMZN BILL 07:55A #]
313535                     [AMAZON , 2R T AMZN BILL S]
15218     

# Phase 2: Extract & Analyze N-Grams

In [21]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [22]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [23]:
%%time
corpus = df['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=100)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=100)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=100)
print(f"--- N-gram Analysis Complete ---")

Analyzing 100000 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 1.88 s, sys: 1.05 ms, total: 1.88 s
Wall time: 1.87 s


In [24]:
top_1grams
ngrams_1 = []
for ngram, value in top_1grams:
    if ngram.upper() not in NOISE_WORDS:
        ngrams_1 += [ngram]

In [25]:
top_1grams

[('amazon', 6128),
 ('amzn', 4263),
 ('cash', 3997),
 ('app', 3675),
 ('mktp', 3612),
 ('wal', 2658),
 ('doordash', 2266),
 ('market', 1723),
 ('mcdonald', 1648),
 ('apple', 1523),
 ('dollar', 1519),
 ('google', 1209),
 ('food', 1171),
 ('wm', 1134),
 ('target', 1118),
 ('pizza', 938),
 ('taco', 918),
 ('prime', 827),
 ('starbucks', 772),
 ('cafe', 763),
 ('king', 745),
 ('superc', 656),
 ('burger', 654),
 ('shop', 642),
 ('publix', 637),
 ('mobile', 633),
 ('general', 613),
 ('circle', 606),
 ('liquor', 604),
 ('uber', 579),
 ('bar', 566),
 ('el', 538),
 ('express', 534),
 ('afterpay', 517),
 ('bell', 517),
 ('ebay', 500),
 ('house', 469),
 ('chick', 469),
 ('supercenter', 467),
 ('subway', 466),
 ('kroger', 465),
 ('dunkin', 456),
 ('little', 435),
 ('home', 420),
 ('grill', 415),
 ('family', 412),
 ('brigit', 382),
 ('depot', 366),
 ('usps', 355),
 ('coffee', 354),
 ('wine', 340),
 ('walmart', 333),
 ('mexican', 328),
 ('instacart', 319),
 ('frys', 314),
 ('club', 310),
 ('beauty', 

In [26]:
ngrams_1

['amazon',
 'amzn',
 'cash',
 'app',
 'mktp',
 'wal',
 'doordash',
 'market',
 'mcdonald',
 'apple',
 'dollar',
 'google',
 'food',
 'wm',
 'target',
 'pizza',
 'taco',
 'prime',
 'starbucks',
 'cafe',
 'king',
 'superc',
 'burger',
 'shop',
 'publix',
 'mobile',
 'general',
 'circle',
 'liquor',
 'uber',
 'bar',
 'el',
 'express',
 'afterpay',
 'bell',
 'ebay',
 'house',
 'chick',
 'supercenter',
 'subway',
 'kroger',
 'dunkin',
 'little',
 'home',
 'grill',
 'family',
 'brigit',
 'depot',
 'usps',
 'coffee',
 'wine',
 'walmart',
 'mexican',
 'instacart',
 'frys',
 'club',
 'beauty',
 'aldi',
 'ross',
 'costco',
 'sonic',
 'caesars',
 'kfc',
 'smoke',
 'sams',
 'foods',
 'microsoft',
 'deli',
 'etsy',
 'wendy',
 'safeway',
 'fresh',
 'lyft',
 'nails',
 'helppay',
 'liquors',
 'chicken',
 'grocery',
 'jacksonville',
 'queen',
 'shoprite',
 'com',
 'samsclub',
 'sports',
 'nnt',
 'del',
 'garden',
 'pet',
 'tree',
 'spa',
 'lowe',
 'canteen',
 '432',
 'box',
 'arbys',
 'street',
 'eats'

In [27]:
ngrams_1

['amazon',
 'amzn',
 'cash',
 'app',
 'mktp',
 'wal',
 'doordash',
 'market',
 'mcdonald',
 'apple',
 'dollar',
 'google',
 'food',
 'wm',
 'target',
 'pizza',
 'taco',
 'prime',
 'starbucks',
 'cafe',
 'king',
 'superc',
 'burger',
 'shop',
 'publix',
 'mobile',
 'general',
 'circle',
 'liquor',
 'uber',
 'bar',
 'el',
 'express',
 'afterpay',
 'bell',
 'ebay',
 'house',
 'chick',
 'supercenter',
 'subway',
 'kroger',
 'dunkin',
 'little',
 'home',
 'grill',
 'family',
 'brigit',
 'depot',
 'usps',
 'coffee',
 'wine',
 'walmart',
 'mexican',
 'instacart',
 'frys',
 'club',
 'beauty',
 'aldi',
 'ross',
 'costco',
 'sonic',
 'caesars',
 'kfc',
 'smoke',
 'sams',
 'foods',
 'microsoft',
 'deli',
 'etsy',
 'wendy',
 'safeway',
 'fresh',
 'lyft',
 'nails',
 'helppay',
 'liquors',
 'chicken',
 'grocery',
 'jacksonville',
 'queen',
 'shoprite',
 'com',
 'samsclub',
 'sports',
 'nnt',
 'del',
 'garden',
 'pet',
 'tree',
 'spa',
 'lowe',
 'canteen',
 '432',
 'box',
 'arbys',
 'street',
 'eats'

In [28]:
top_2grams

[('amzn mktp', 3603),
 ('cash app', 3532),
 ('amazon prime', 761),
 ('wm superc', 655),
 ('superc wal', 655),
 ('dollar general', 583),
 ('taco bell', 471),
 ('wm supercenter', 460),
 ('burger king', 355),
 ('wal wal', 293),
 ('little caesars', 282),
 ('home depot', 276),
 ('family dollar', 238),
 ('market 432', 190),
 ('sams club', 183),
 ('uber eats', 181),
 ('food lion', 166),
 ('jack box', 146),
 ('panda express', 145),
 ('dollar tree', 143),
 ('winco foods', 141),
 ('dairy queen', 139),
 ('bath body', 134),
 ('dollar ge', 127),
 ('market work', 126),
 ('del taco', 120),
 ('carls jr', 119),
 ('mobile sign', 118),
 ('sign based', 118),
 ('smoke shop', 117),
 ('king soop', 104),
 ('ge dg', 101),
 ('king soopers', 100),
 ('panera bread', 96),
 ('waffle house', 96),
 ('papa john', 95),
 ('market 43', 91),
 ('coca cola', 90),
 ('body works', 90),
 ('quick chek', 90),
 ('fresh market', 86),
 ('pizza hut', 86),
 ('total wine', 85),
 ('432 329', 85),
 ('432 32', 84),
 ('routs farmers', 82)

In [29]:
ngrams_2 = [ngram for ngram, count in top_2grams]
ngrams_2

['amzn mktp',
 'cash app',
 'amazon prime',
 'wm superc',
 'superc wal',
 'dollar general',
 'taco bell',
 'wm supercenter',
 'burger king',
 'wal wal',
 'little caesars',
 'home depot',
 'family dollar',
 'market 432',
 'sams club',
 'uber eats',
 'food lion',
 'jack box',
 'panda express',
 'dollar tree',
 'winco foods',
 'dairy queen',
 'bath body',
 'dollar ge',
 'market work',
 'del taco',
 'carls jr',
 'mobile sign',
 'sign based',
 'smoke shop',
 'king soop',
 'ge dg',
 'king soopers',
 'panera bread',
 'waffle house',
 'papa john',
 'market 43',
 'coca cola',
 'body works',
 'quick chek',
 'fresh market',
 'pizza hut',
 'total wine',
 '432 329',
 '432 32',
 'routs farmers',
 'klover app',
 'cash 40',
 'aystation network',
 'harbor freight',
 'ingles markets',
 'cvs pharmacy',
 'winn dixie',
 'app boost',
 'merchant issued',
 'issued target',
 'buc ee',
 'nails spa',
 'raising cane',
 'sports bar',
 'farmers market',
 'trader joe',
 'stewarts shop',
 'smart final',
 'sale debitl

In [30]:
top_3grams

[('wm superc wal', 655),
 ('mobile sign based', 118),
 ('dollar ge dg', 99),
 ('bath body works', 90),
 ('market 432 329', 85),
 ('market 432 32', 84),
 ('klover app boost', 71),
 ('merchant issued target', 71),
 ('fresh market coffe', 56),
 ('wal wal sams', 54),
 ('routs farmers market', 50),
 ('wal wal sto', 50),
 ('float corp payments', 48),
 ('sports bar jacksonville', 46),
 ('rappi restaurantes col', 45),
 ('ref auth purchdate', 41),
 ('danfoss cafe ames', 38),
 ('desc entry descr', 37),
 ('trace eed ind', 37),
 ('quick chek food', 37),
 ('orig desc entry', 36),
 ('disney plus burbank', 36),
 ('stewarts shop 329', 36),
 ('harbor freight tools', 35),
 ('fred meye fred', 35),
 ('adobe acropro subs', 33),
 ('extra daily spend', 33),
 ('spankys sports bar', 32),
 ('youtube tv helppay', 30),
 ('mission lane vis', 30),
 ('canteen vend2 hunt', 29),
 ('rrn cash app', 29),
 ('dutchbrosll grants pass', 29),
 ('pizza hut https', 28),
 ('mx nu peso', 28),
 ('buffalo wild wings', 26),
 ('lane 

In [31]:
# Use 1 grams to find prefixes