# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main 09f7ac6] regex
 1 file changed, 11561 insertions(+), 4762 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in.sample(100_000).copy)_
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     AMAZON.COM*F05SE9HY3 SEATTLE WACard XXXX/Withd...
1                                  Amazon.com*TB40C2CW3
2                 BLUESKY #903         GRENADA       MS
3     BOLTON ORCHARD 01/12 #XXXXXXXXX PURCHASE BOLTO...
4     Brigit-com PROTECTION XXXXXX XXXXC539CXXXXDC N...
5     CHECKCARD XXXX TST* SUSHI OBSESS VALLEJO CA XX...
6     DBT CRD XXXX 09/25/22 XXXXXXX BOJANGLES XXXX F...
7     Debit Card Debit ARBYS GREENEATSTREET XXXXXXXX...
8                     KLOVER APP BOOST Chicago IL 09/09
9                         LITTLE CAESARS #XXXX CA 05/13
10    MCDONALD'S FXXXX TRACY CA                    0...
11    POS Debit - Visa Check Card XXXX - FOOD LION #...
12    POS Debit - Visa Check Card XXXX - GX MARKET A...
13    PURCHASE AUTHORIZED ON 01/08 7-ELEVEN XXXXX ME...
14    PURCHASE AUTHORIZED ON 01/08 MCDONALD'S MXXXX ...
15    PURCHASE AUTHORIZED ON 01/31 LYFT *1 RIDE 01- ...
16    PURCHASE AUTHORIZED ON 03/26 CIRCLE K # XXXXX ...
17    PURCHASE AUTHORIZED ON 04/07 LITTLE CAESAR

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
# Create list of high-frequency noise words from 1-gram analysis
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "VENDING", "SUPERCENTER", "SUPERC", "STORE", "STORES", "RESTAURANT", 
    "PROTECTION", "PARKING", "GRILL", "MARKET", "LIQUOR", "LIQUORS", "GROCERY", 
    "FOOD", "FOODS", "DIGITAL", "DIGIT", "DELI", "COFFEE", "CITY", "CENTER", 
    "CAFE", "BUSINESS", "BEAUTY", "BAR", "STREET"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) Authorization Headers ===
    (r"\b(?:(?:RECURRING\s+)?PAYMENT|(?:P?URCHASE)?)\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*RECURRING\b", " "),
    (r"\bDEBIT\s+PURCHASE\s+-(?:VISA|MASTERCARD|AMEX|DISCOVER)\b", " "),

    # === 2) Card & Masked Number Boilerplate ===
    # Specific card types
    (r"\b(?:(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+)?CHECK\s*CARD\b(?:\s*X+)?", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "), 
    
    # Masked numbers (X{4,})
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bCK\s*X+\b", " "),
    (r"\bCDX{4,}\b", " "),
    (r"\b[SP]X{6,}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "), # State + mask
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "), # State + mask
    (r"X{4,}", " "), # Aggressive X removal (needed for XXXXTST)

    # Misc card/auth keywords
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), 
    
    # === 3) POS (Point of Sale) Boilerplate ===
    (r"\bPOS\s+(?:PUR-|PURCHASE|WITHDRAWAL|DEBIT|SIGNATURE|SIG)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"^\s*REC\s+POS\b", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    
    # === 4) Dates, Times, & Misc Numbers ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "), # Catch-all ref/phone
    (r"\b\d{5}\b", " "), # Remove standalone 5-digit (zip codes)
    (r"\b\d{2,4}\b", " "), # Remove other standalone 2-4 digit numbers
    
    # === 5) Transaction/Transfer Boilerplate & Tails ===
    (r"\b(?:(?:INST|PAYPAL)\s+XFER|RETRY\s+PYMT)(?:\s+ID)?\b", " "),
    (r"\b(?:XFER|WEB)\s+ID\b.*$", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*$", " "), 
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*$", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*$", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),
    (r"^\.COM\s+BILL\b.*", " "),

    # === 6) Business/Commerce Noise (from 1-grams) ===
    (NOISE_WORDS_REGEX, " "),

    # === 7) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 8) State Abbreviations (Run *after* other rules) ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 9) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
]

In [8]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # === PAYPAL RULES ===
    # Rules for 'PAYPAL DES:...' format
    re.compile(r"^PAYPAL\s+DES:.*?:(.*?)(?:\s+INDN:.*)?$"),
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),
    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^TARGET\b(.*)"), # Special case for TARGET, might capture store #
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7-ELEVEN\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7\s+11\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # Specific DDBR rule must come *before* the general DD rule
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+(?: [A-Z\s0-9'.-]+)*?)\s+[A-Z]{5,}.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+(?: [A-Z\s0-9'.-]+)*).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # This generic PAYPAL rule must come *after* specific PAYPAL rules
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    # === High-Frequency Merchant Prefixes (from 1-grams) ===
    re.compile(r"^MKTP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^AMZN\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^APPLE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^GOOGLE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^MICROSOFT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^WAL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^WM\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^(SAMS\s*\*?CLUB)\b.*"), # Specific "SAMS CLUB"
    re.compile(r"^SAMS\s*\*?\s*([A-Z\s0-9'.-]+).*"), # Generic "SAMS"
    re.compile(r"^UBER\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^LYFT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^(CASH\s+APP)\b.*"), # Specific "CASH APP"
    re.compile(r"^AFTERPAY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^BRIGIT\s*\*?\s*([A-Z\s0G'.-]+).*"),
    re.compile(r"^EBAY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ETSY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^HELPPAY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^INSTACART\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^STARBUCKS\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SUBWAY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^VSA\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [9]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 21.9 s, sys: 204 ms, total: 22.1 s
Wall time: 22.1 s


In [10]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 4.39 s, sys: 8.3 ms, total: 4.39 s
Wall time: 4.39 s


In [11]:
df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [12]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [13]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
225563,POS Card purchase WM SUPERCENTER #XXXX XXXX WA...,WM # WAL-MART SUPER SAINT CLOUD,
287703,PURCHASE AUTHORIZED ON 03/18 CASH APP*PETRI TI...,CASH APP*PETRI TIL,CASH APP
96095,CHINA EXPRESS CASC 11-23 ATLANTAGA XXXX DEBIT ...,CHINA EXPRESS CASC ATLANTAGA,CHINA EXPRESS CASC ATLANTAGA
133788,"DOLLAR TR XXXX STATE R HUDSON, FL, USA","DOLLAR TR STATE R HUDSON, ,","DOLLAR TR STATE R HUDSON, ,"
345848,PURCHASE AUTHORIZED ON 08/17 MCDONALD'S FXXXXX...,MCDONALD'S F HEMET,MCDONALD'S F HEMET
397611,PURCHASE AUTHORIZED ON 12/26 AMZN Mktp US*NN1S...,AMZN MKTP US*NN1ST AMZN.COM BILL,MKTP US
282512,PURCHASE AUTHORIZED ON 03/04 SQ *PARA TACOS EL...,SQ *PARA TACOS EL LOS ANGELES,PARA TACOS EL LOS ANGELES
308407,PURCHASE AUTHORIZED ON 05/10 7-ELEVEN PORT RIC...,7-ELEVEN PORT RICHEY,PORT RICHEY
258196,PUBLIX SUPER MAR 382 XXXXXX,PUBLIX SUPER MAR,PUBLIX SUPER MAR
411115,Point Of Sale Withdrawal THAI YUMMY PORTLAND ORUS,THAI YUMMY PORTLAND,THAI YUMMY PORTLAND


In [14]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'TARGET', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [15]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

-MART
O
ROSS
XXX
TROY
LION
WINCO
HTTPSINSTACAR
WAL-
ORLANDO
XXX-XXX
SUBWAY
EXPRESS
RICHMOND
SALLY
MIAMI
FAIRWAY
FREETIME
CHEWY.COM
TARGET.COM
PETCO
USPS
WINN-DIXIE
SHIPT
.COM
BASKIN
SLICE
RALPHS
BETMGM
MEIJER
GOFAN
IHOP
INTUIT
STATERBRO
STOCKTON
-COM
EATS
STEAK-N-SHAKE
DILLONS
*STARBUCKS
NORTHGATE
AYRIX
SKILLZ
VANS
DEPOT
ABC
BEAN
TOLLWAY-AUTOREPLEN
MART.COM
FRG
PRICELN
.KOHLS.COM
RIOT
GAMESTOP
POPEYES
MARINA
BANFIELD-PET
MCW
OCULUS
SMITHS
SAVEMART
RAINBOW
FOODMAXX
HOUSTON
VERIZONWRLSS
BASHAS''
CANVA
GREENS
CASH
NYTIMES
CLAIRE'S
MURRIETA
HARRISONBURG
SEZZLE
CENTS-ONLY
BELL''S
RANCH
GOODWILL
PHOENIX
MANSFIELD
BORO
THE
FOOD4LESS
BELK
GNC
TRIP
QFC
FRYS-MKTPLACE
STAPLES
MESA
L
TAMPA
FIV
A
V
MIRAMAR
AMZ
PARKMOBILE
TILLYS
IBI
UPS
GIANT
DROPBOX
HELLOFRESH
OTT
*MICROSOFT
BLUESKY
*EBAY
*UBER
FAYETEVILLE
STARBUCKS
POTBELLY
SOUTHWES
US
STORAG
MART
SHOPIFY
JACK'S
NORTON
ALIEXPRESS
TOWN
EBAY
WEGMANS
QUADPAY
LUCKY
ROSES
HLLFRSH
CHECKERS
VISALIA
BAKERSFIELD
EVI
ETT
ABCMOUSE.COM
CRYPTO.COM
STO
ENMARKET

In [16]:
df[df['memo_post'] == '0']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
68385,CHECKCARD XXXX AMZN DIGITAL*MSXXXXXX0 WA XXXXX...,AMZN * 0,0
453206,SUBWAY 0 XXXX,SUBWAY 0,0
460429,TARGET 0 XXXX,TARGET 0,0
460431,TARGET 0 XXXX XXXXX,TARGET 0,0
460548,TARGET DEBIT CRD ACH TRAN XXXXXX 470 0,TARGET 0,0
460550,TARGET DEBIT CRD ACH TRAN XXXXXX 642 0,TARGET 0,0


In [17]:
df[df['memo_pre'].str.contains('OPC')].sample(10)

Unnamed: 0,memo,memo_pre,memo_post
146098,Debit Purchase -visa Card XXXXfashionnova.com ...,FASHIONNOVA.COM FNOVA.MYSHOPCA,FASHIONNOVA.COM FNOVA.MYSHOPCA
393523,PURCHASE AUTHORIZED ON 12/16 GRUBHUBYOPCITYRES...,GRUBHUBYOPCITYREST GRUBHUB.COM,GRUBHUBYOPCITYREST GRUBHUB.COM
214659,OPC*CONN APPLIANCESINC TX XXXXX Debit Card Pur...,OPC*CONN APPLIANCESINC :13P #,OPC
175565,HOPCAT - BELTLINE GRAND RAPIDS MI 01/21,HOPCAT - BELTLINE GRAND RAPIDS,HOPCAT - BELTLINE GRAND RAPIDS
224360,POPCORN WORLD MICHIG MICHIGAN CITY IN 0...,POPCORN WORLD MICHIG MICHIGAN,POPCORN WORLD MICHIG MICHIGAN
214648,OPC*CONN APPLIANCESIN XXX-XXX-XXXX TX 0...,OPC*CONN APPLIANCESIN XXX-XXX,OPC
214645,OPC*CONN APPLIANCESIN XXX-XXX-XXXX TX 0...,OPC*CONN APPLIANCESIN XXX-XXX,OPC
144907,Debit Purchase -visa 10/02 Card XXXXsq *popcon...,SQ *POPCON SHOP ONALASKA,POPCON SHOP ONALASKA
503425,Withdrawal DEBIT CHIP / DOUBLE GOOD POPCORN HT...,DOUBLE GOOD POPCORN .DOUB,DOUBLE GOOD POPCORN .DOUB
151194,"Double Good Popcorn, Httpswww.Doub","DOUBLE GOOD POPCORN, .DOUB","DOUBLE GOOD POPCORN, .DOUB"


In [18]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

222419                                      PIN AMAZON.COM
17426                                           AMAZON.COM
14409                                           AMAZON.COM
482383                                   .COM BILL XXX-XXX
327781                                          CCBILL.COM
16724                                           AMAZON.COM
348                 #3PA4DDWDGKL5 POS AMAZON.COM SEATTLE #
384705                                          AMAZON.COM
302405                                       HTTPSINSTACAR
38120                                           AMAZON.COM
51455                                             -COM D D
249541    POS RECURRING DEBIT - DDA CRD APPLE.COM BILL - -
102657                                  CRD PUR AMAZON.COM
13588                                    AMAZON.COM SIG RF
422302                       HELP.HBOMAX.COM HTTPSHBOMAX.C
34589                                           AMAZON.COM
177552                             INSTACART HTTPSINSTAC

In [19]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
110,#00 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,# BP APPLE COM BILL CUPERTINO #,# BP APPLE COM BILL CUPERTINO #
135,#04 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,# BP APPLE COM BILL CUPERTINO #,# BP APPLE COM BILL CUPERTINO #
150,#07 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,# BP APPLE COM BILL CUPERTINO #,# BP APPLE COM BILL CUPERTINO #
179,#12 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,# BP APPLE COM BILL CUPERTINO #,# BP APPLE COM BILL CUPERTINO #
188,#14 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,# BP APPLE COM BILL CUPERTINO #,# BP APPLE COM BILL CUPERTINO #
...,...,...,...
526991,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 1 6,.COM BILL ONE APPLE PARK WAY 1 6
526992,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 8 2,.COM BILL ONE APPLE PARK WAY 8 2
526993,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 2 6,.COM BILL ONE APPLE PARK WAY 2 6
526994,debit card APPLE.COM/BILL ONE APPLE PARK XXXXX...,APPLE.COM BILL ONE APPLE PARK C 2 4,.COM BILL ONE APPLE PARK C 2 4


In [20]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
1878,03-XXXXXXXXXX TLF*FLOWERSHOPPING COM,TLF*FLOWERSHOPPING COM,TLF
2212,13FLDENVER* 13THFL HTTPSWWW.13TH CO,13FLDENVER* 13THFL .13TH,13FLDENVER* 13THFL .13TH
2331,1PASSWORD* TRIAL OVER TORONTO ON 01/07,1PASSWORD* TRIAL OVER TORONTO ON,1PASSWORD* TRIAL OVER TORONTO ON
3607,2CHECKO*KILOHEA Alpharetta GA USA,2CHECKO*KILOHEA ALPHARETTA,2CHECKO*KILOHEA ALPHARETTA
3608,2CO.COM*slideupli XXXXXXXXXX 04/03,2CO.COM*SLIDEUPLI,2CO.COM*SLIDEUPLI
...,...,...,...
528725,www.Playgr* Bidiboo.Co,.PLAYGR* BIDIBOO.CO,.PLAYGR
528726,www.Playgr* Littlemiss,.PLAYGR* LITTLEMISS,.PLAYGR
528732,www.Styles* Luv N Hair,.STYLES* LUV N HAIR,.STYLES
528733,www.Stylese* West Brim,.STYLESE* WEST BRIM,.STYLESE


In [21]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

7656                      [9FOLD,  VICTORI NEW YORK :46P #]
32060                      [AMAZON , 2X1PH7L AMZN.COM BILL]
32280                     [AMAZON , PF IV3 AMZN.COM BILLWA]
66911                        [AMAZON.COM,  AMZN.COM BILLWA]
13685            [AMAZON.COM, 0H62F1N93 AM AMZN.COM BILL #]
34504                 [AMAZON.COM, 1A6NM22E1 AMZN.COM BILL]
145497                    [AMAZON.COM, 1F5DFAMZN.COM BILWA]
34582                 [AMAZON.COM, 1F5HD4PD2 AMZN.COM BILL]
34640                     [AMAZON.COM, 1H5ZZ AMZN.COM BILL]
34777                 [AMAZON.COM, 1K4V355K1 AMZN.COM BILL]
225301                      [AMAZON.COM, 1Q3YP4WC0 SEATTLE]
14467                       [AMAZON.COM, 255XV1FV1 SEATTLE]
14627                  [AMAZON.COM, 2C2DQ7RAMZN.COM BILL #]
64976                  [AMAZON.COM, 2G0HN9 AMZN.COM BILLWA]
35923             [AMAZON.COM, 2J6SK5DH3 AMZN.COM BILL WAC]
65141            [AMAZON.COM, 2P9L93AI1 AM AMZN.COM BILLWA]
36170                 [AMAZON.COM, 2Y8L9

# Phase 2: Extract & Analyze N-Grams

In [22]:
df_p2 = pd.read_csv("memos_P1.csv")

In [23]:
print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

1           #
2           #
3           #
4           #
5           #
6           #
7           #
8           #
9           #
10          #
11          #
12          #
13          #
14          #
15          #
16          #
17          #
18          #
19          #
20          #
21          #
22          #
23          #
24          #
25          #
26          #
27          #
28          #
29          #
30          #
31          #
32          #
33          #
34          #
103         #
290        #3
1711        *
1780         
1781         
1878      TLF
2122       DB
2292      MKT
2296       TH
2310      WEB
2311      WEB
2312      WEB
2313      WEB
2416        B
2908        C
2909        C
2979      CRD
3322        C
3323        C
3324        C
3325        C
3364      LLC
3375         
3389      CRD
3776         
3777         
3820         
3821         
3840         
3842         
3843         
3847         
4013        C
4043       FP
4047        J
4328         
4329         
4330  

In [24]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [None]:
%%time
corpus = df_p2['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
print(f"--- N-gram Analysis Complete ---")

Analyzing 528766 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...


In [None]:
top_1grams.sort(reverse=True
    
)
top_1grams

In [None]:
top_2grams

In [None]:
top_3grams

In [None]:
# Use 1 grams to find prefixes