# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main ecb0cb2] regex
 1 file changed, 5794 insertions(+), 5947 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df = pd.read_csv("memos.csv")
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     CHECKCARD XXXX AMZN Mktp US*1O6C Amzn.com/bill...
1     CHECKCARD XXXX HOOTERS OF ROHNERT PARK ROHNERT...
2     CHECKCARD XXXX SALTYS FRESH MIX COTTLEVILLE MO...
3              DUANNE`S LIQUOR MARKET LAKEWOOD WA 11/18
4     FIVE BELO XXXX 10/12 #XXXXXXXXX PURCHASE FIVE ...
5     POS PURCHASE / Non-PINAmazon.com*KC11G8GJ3 Amz...
6     PURCHASE AUTHORIZED ON 01/01 AMAZON.COM*H55U42...
7     PURCHASE AUTHORIZED ON 02/05 FINGERHUT PAYMENT...
8     PURCHASE AUTHORIZED ON 02/25 INSTACASH REPAYME...
9     PURCHASE AUTHORIZED ON 03/10 BUFFET CITY BROOK...
10    PURCHASE AUTHORIZED ON 05/06 CHIPOTLE XXXX MIA...
11    PURCHASE AUTHORIZED ON 08/03 CINNABON #XXXXXX ...
12    PURCHASE AUTHORIZED ON 08/12 CPG 88 WAVERLY WA...
13    PURCHASE AUTHORIZED ON 08/17 CASTLEBRANCH APP ...
14    PURCHASE AUTHORIZED ON 09/24 SHOPRITE HOWELL S...
15    PURCHASE AUTHORIZED ON 10/13 AMAZON.COM*274FL2...
16    PURCHASE AUTHORIZED ON 11/18 VICTORIA'S SECRET...
17    PURCHASE XXXX DD DOORDASH GOOGIEGRI CA XXX

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
# Create list of high-frequency noise words from 1-gram analysis
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "VENDING", "SUPERCENTER", "SUPERC", "STORE", "STORES", "RESTAURANT", 
    "PROTECTION", "PARKING", "GRILL", "MARKET", "LIQUOR", "LIQUORS", "GROCERY", 
    "FOOD", "FOODS", "DIGITAL", "DIGIT", "DELI", "COFFEE", "CITY", "CENTER", 
    "CAFE", "BUSINESS", "BEAUTY", "BAR", "STREET"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) Authorization Headers ===
    (r"\b(?:(?:RECURRING\s+)?PAYMENT|(?:P?URCHASE)?)\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*RECURRING\b", " "),
    (r"\bDEBIT\s+PURCHASE\s+-(?:VISA|MASTERCARD|AMEX|DISCOVER)\b", " "),

    # === 2) Card & Masked Number Boilerplate ===
    # Specific card types
    (r"\b(?:(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+)?CHECK\s*CARD\b(?:\s*X+)?", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "), 
    
    # Masked numbers (X{4,})
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bCK\s*X+\b", " "),
    (r"\bCDX{4,}\b", " "),
    (r"\b[SP]X{6,}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "), # State + mask
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "), # State + mask
    (r"X{4,}", " "), # Aggressive X removal (needed for XXXXTST)

    # Misc card/auth keywords
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), 
    
    # === 3) POS (Point of Sale) Boilerplate ===
    (r"\bPOS\s+(?:PUR-|PURCHASE|WITHDRAWAL|DEBIT|SIGNATURE|SIG)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"^\s*REC\s+POS\b", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    
    # === 4) Dates, Times, & Misc Numbers ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "), # Catch-all ref/phone
    (r"\b\d{5}\b", " "), # Remove standalone 5-digit (zip codes)
    (r"\b\d{2,4}\b", " "), # Remove other standalone 2-4 digit numbers
    
    # === 5) Transaction/Transfer Boilerplate & Tails ===
    (r"\b(?:(?:INST|PAYPAL)\s+XFER|RETRY\s+PYMT)(?:\s+ID)?\b", " "),
    (r"\b(?:XFER|WEB)\s+ID\b.*$", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*$", " "), 
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*$", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*$", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),
    (r"^\.COM\s+BILL\b.*", " "),

    # === 6) Business/Commerce Noise (from 1-grams) ===
    (NOISE_WORDS_REGEX, " "),

    # === 7) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 8) State Abbreviations (Run *after* other rules) ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 9) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
]

In [75]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # === PAYPAL RULES ===
    # Rules for 'PAYPAL DES:...' format
    re.compile(r"^PAYPAL\s+DES:.*?:(.*?)(?:\s+INDN:.*)?$"),
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),
    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^TARGET\b(.*)"), # Special case for TARGET, might capture store #
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7-ELEVEN\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7\s+11\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"), # Corrected

    # Specific DDBR rule must come *before* the general DD rule
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    # Specific: (POS) TST* MERCHANT [TWO WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Specific: (POS) TST* MERCHANT [ONE WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Fallback: (POS) TST* MERCHANT...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # This generic PAYPAL rule must come *after* specific PAYPAL rules
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OPC\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [9]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 22.4 s, sys: 178 ms, total: 22.6 s
Wall time: 22.6 s


In [76]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 3.61 s, sys: 91 µs, total: 3.61 s
Wall time: 3.61 s


In [11]:
df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [12]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [31]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
9980,AFTERPAY 185-XXXXXXXX CA 1...,AFTERPAY 185,AFTERPAY 185
92234,CHECKCARD XXXX USA*CANTEEN OF COAST VE OXNARD ...,USA*CANTEEN OF COAST VE OXNARD,CANTEEN OF COAST VE OXNARD
333526,PURCHASE AUTHORIZED ON 07/16 HAMPTON INNS WEST...,HAMPTON INNS WEST PALM BCH S,HAMPTON INNS WEST PALM BCH S
379783,PURCHASE AUTHORIZED ON 11/11 HANSEL & GRETEL D...,HANSEL & GRETEL DULUTH DULUTH P,HANSEL & GRETEL DULUTH DULUTH P
335510,PURCHASE AUTHORIZED ON 07/21 SPORT CLIPS - VA2...,SPORT CLIPS - VA20 RICHMOND S,ORT CLIPS - VA20 RICHMOND S
224952,POS ATM DEBIT DBT CRD XXXX 12/01/21 08 CRACKER...,POS ATM DEBIT DBT CRD 08 CRACKER BARREL # NASH...,POS ATM DEBIT DBT CRD 08 CRACKER BARREL
256576,PRICE CHOPPER #325 GRAIN VALLEY MO 11/24,PRICE CHOPPER #325 GRAIN VALLEY,PRICE CHOPPER
249114,POS Purchase TIPSY ELVES XXXXXXXXXX CA XXXX,TIPSY ELVES,TIPSY ELVES
497398,Wal-Mart Super C XXXX WAL-SAMS ROSEBURGOR ...,WAL-MART SUPER C WAL-SAMS ROSEBURGOR C# * POS DEB,WAL-MART SUPER C WAL-SAMS ROSEBURGOR C
512381,Withdrawal at CAROLINA'S MEXICAN FOOD 0,WITHDRAWAL AT CAROLINA'S MEXICAN FOOD 0,WITHDRAWAL AT CAROLINA'S MEXICAN FOOD 0


In [14]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'TARGET', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [77]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

AMZNFREETIME
WINN-DIXIE
SHIPT
BASKIN
RALPHS
SLICE
PETCO
BETMGM
GOFAN
CHEWY.COM
IHOP
INTUIT
STEAK-N-SHAKE
DILLONS
STATERBRO
*STARBUCKS
SKILLZ
VANS
FRG
TOLLWAY-AUTOREPLEN
PRICELN
WWW.KOHLS.COM
RIOT
P
BANFIELD-PET
MCW
STORE
MARINA
SAVEMART
GAMESTOP
FOODMAXX
OCULUS
BASHAS''
VERIZONWRLSS
RAINBOW
CANVA
WALGREENS
CLAIRE'S
SEZZLE
ABC
GNC
FOOD4LESS
QFC
BELK
POPEYES
FRYS-MKTPLACE
STAPLES
SUBWAY
NYTIMES
FIV
AMZ
*EBAY
L
V
IBI
TILLYS
SHOPIFY
*MICROSOFT
UPS
OTT
BLUESKY
*UBER
DROPBOX
POTBELLY
GOODWILL
JACK'S
QUADPAY
WEGMANS
EBAY
LUCKY
CHECKERS
EVI
ETT
MEIJER
ABCMOUSE.COM
RVT
NORTON
ENMARKET
FBPAY
EA
JOURNEYS
CRYPTO.COM
TLG
HOME
CRT
HLLFRSH
PARKMOBILE
NORDSTROM
GERALD
MESA
SEDANOS
REI
FH
NEWSSTAND
LJS
APPLE.COM
ZTL
EZPASS
STOCKTON
EVERYPLATE
PAR
FRED-MEYER
*STEAM
RGP
PACSUN
M
E-Z
TRTHFDR
DRI
TACOMA
UBR
TLF
PCH
EPC
MOE'S
BUCKLE
EXPRESS
PEET'S
CKO
ECS
BOXYCHARM
FACEBK
ARBY'S
GOFNDME
SOUTHWES
FAMOUSFOOTWEAR
TOMMY'S
PAM
PAVILIONS
LEGALSHIELD
VONS.COM
LYNWOOD
OPS
GLOSS
AF
OFFICE
VOLA
CDSR
COLDSTONE
WWP
ETS

In [37]:
df[df['memo_post'] == 'OPC']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post


In [49]:
df[df['memo_pre'].str.contains('OPC')].sample(10)

Unnamed: 0,memo,memo_pre,memo_post
142409,Debit Card POPCORN TAVERN,POPCORN TAVERN,POPCORN TAVERN
336777,PURCHASE AUTHORIZED ON 07/24 SP WWW.INKEDSHOP....,SP WWW.INKEDSHOP.C WWWINKEDSHOPC S,WWW.INKEDSHOP.C WWWINKEDSHOPC S
490131,WDW POPCORN CARTS LAKE BUENA VI FL 09/27,WDW POPCORN CARTS LAKE BUENA VI,WDW POPCORN CARTS LAKE BUENA VI
224571,POPPIN POPCORN XXXXXX,POPPIN POPCORN,POPPIN POPCORN
44808,BIOPHARMCATALYST HTTPSWWW.BIOPC,BIOPHARMCATALYST HTTPSWWW.BIOPC,BIOPHARMCATALYST HTTPSWWW.BIOPC
214648,OPC*CONN APPLIANCESIN XXX-XXX-XXXX TX 0...,OPC*CONN APPLIANCESIN XXX-XXX,CONN APPLIANCESIN XXX-XXX
216380,Opc*Leon Vlly Phototkt,OPC*LEON VLLY PHOTOTKT,LEON VLLY PHOTOTKT
281963,PURCHASE AUTHORIZED ON 03/03 DOUBLE GOOD POPCO...,DOUBLE GOOD POPCOR HTTPSWWW.DOUB S,DOUBLE GOOD POPCOR HTTPSWWW.DOUB S
518049,XXXXSP * WWW.INKED WWWINKEDSHOPC,SP * WWW.INKED WWWINKEDSHOPC,WWW.INKED WWWINKEDSHOPC
175568,HOPCAT EAST LANSING XXXXXXXXX,HOPCAT EAST LANSING,HOPCAT EAST LANSING


In [57]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

234367                                    LIFE STORAGE ECOM
422804                                          GODADDY.COM
149787    DEBIT: SIGNATURE PURCHASE FROM AMAZON.COM*5S5J...
517145      VSA RECUR GOODRX GOLD HTTPSGOLD.GOO ( 03:37:42)
915                     # AMAZON.COM*KF8OC87Y3 SEATTLE 15 #
431271                    SCAVENGERHUNT.COM HTTPSWWW.LETS S
93600                              WWW.BROWNELLSINC.COM XXX
334148                          UNIVERSITY PLACE B TACOMA S
290135                        FORMSWIFT.COM CHAR XXX-XXX- S
175303                                        HOMEDEPOT.COM
496736                                   WWW.HOTTOPIC.COM C
392187                                           AMAZON.COM
51688     BRIGIT.COM DES:PROTECTION :B B881DB74B3 INDN:E...
205748                        MORNINGSAVE.COM HTTPSMERCATAL
65051                                            AMAZON.COM
37285                                            AMAZON.COM
349410                            INSTAC

In [19]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
110,#00 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#00 BP APPLE COM BILL CUPERTINO 10 #,#00 BP APPLE COM BILL CUPERTINO 10 #
135,#04 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#04 BP APPLE COM BILL CUPERTINO 10 #,#04 BP APPLE COM BILL CUPERTINO 10 #
150,#07 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#07 BP APPLE COM BILL CUPERTINO 10 #,#07 BP APPLE COM BILL CUPERTINO 10 #
179,#12 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#12 BP APPLE COM BILL CUPERTINO 10 #,#12 BP APPLE COM BILL CUPERTINO 10 #
188,#14 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#14 BP APPLE COM BILL CUPERTINO 10 #,#14 BP APPLE COM BILL CUPERTINO 10 #
...,...,...,...
526991,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 1 6,APPLE.COM BILL ONE APPLE PARK WAY 1 6
526992,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 8 2,APPLE.COM BILL ONE APPLE PARK WAY 8 2
526993,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 2 6,APPLE.COM BILL ONE APPLE PARK WAY 2 6
526994,debit card APPLE.COM/BILL ONE APPLE PARK XXXXX...,APPLE.COM BILL ONE APPLE PARK C 2 4,APPLE.COM BILL ONE APPLE PARK C 2 4


In [78]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
2212,13FLDENVER* 13THFL HTTPSWWW.13TH CO,13FLDENVER* 13THFL HTTPSWWW.13TH,13FLDENVER* 13THFL HTTPSWWW.13TH
2331,1PASSWORD* TRIAL OVER TORONTO ON 01/07,1PASSWORD* TRIAL OVER TORONTO ON,1PASSWORD* TRIAL OVER TORONTO ON
3607,2CHECKO*KILOHEA Alpharetta GA USA,2CHECKO*KILOHEA ALPHARETTA,2CHECKO*KILOHEA ALPHARETTA
3608,2CO.COM*slideupli XXXXXXXXXX 04/03,2CO.COM*SLIDEUPLI,2CO.COM*SLIDEUPLI
3609,2COCOM*BITDEFENDER.COM,2COCOM*BITDEFENDER.COM,2COCOM*BITDEFENDER.COM
...,...,...,...
528725,www.Playgr* Bidiboo.Co,WWW.PLAYGR* BIDIBOO.CO,WWW.PLAYGR
528726,www.Playgr* Littlemiss,WWW.PLAYGR* LITTLEMISS,WWW.PLAYGR
528732,www.Styles* Luv N Hair,WWW.STYLES* LUV N HAIR,WWW.STYLES
528733,www.Stylese* West Brim,WWW.STYLESE* WEST BRIM,WWW.STYLESE


In [73]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

66873                [AMAZON.COM,  L4 AMZN.COM BILLWA CKCD]
363303                [AMAZON.COM, 144IP4M AMZN.COM BILL S]
34381                 [AMAZON.COM, 145KA9QO0 AMZN.COM BILL]
71800                  [AMAZON.COM, 163J45 AMZN.COM BILLWA]
14194                       [AMAZON.COM, 1R2HO7IC0 SEATTLE]
14259                             [AMAZON.COM, 1U8JG4I02 A]
324815                [AMAZON.COM, 210TM2Q AMZN.COM BILL S]
321779                [AMAZON.COM, 216OV8O AMZN.COM BILL S]
14653     [AMAZON.COM, 2C8KC9MT1 SEATTLE WACARD WITHDRAW...
14672                               [AMAZON.COM, 2D0TY1SV2]
35853                 [AMAZON.COM, 2G52N2QK2 AMZN.COM BILL]
319437                [AMAZON.COM, 2X6ZG5P AMZN.COM BILL S]
36620                 [AMAZON.COM, 9J1B22IF3 AMZN.COM BILL]
15540                               [AMAZON.COM, A370M8I33]
273170                [AMAZON.COM, CA0L62B AMZN.COM BILL S]
65670            [AMAZON.COM, CF8Q39W53 AM AMZN.COM BILLWA]
15747                                   

# Phase 2: Extract & Analyze N-Grams

In [79]:
df_p2 = pd.read_csv("memos_P1.csv")

In [93]:
print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

103         #
290        #3
1711        *
1780         
1781         
2122       DB
2310      183
2311      183
2312      183
2313      183
4932       56
4996      621
5088         
5089         
5090         
5092         
5093         
5094         
5095         
5096         
5097         
5098         
5099         
5100         
5101         
5102         
5103         
5104         
5105         
5106         
5107         
5108         
5109         
5110         
5111         
5112         
5113         
5114         
5115         
5116         
5117         
5118         
5119         
5120         
5121         
5122         
5123         
5124         
5125         
5126         
5127         
5128         
5129         
5130         
5131         
5132         
5133         
5134         
5135         
5136         
5137         
5138         
5139         
5140         
5141         
5142         
5143         
5144         
5145         
5146         
5147         
5155  

In [80]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [81]:
%%time
corpus = df_p2['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
print(f"--- N-gram Analysis Complete ---")

Analyzing 528766 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 11.6 s, sys: 100 ms, total: 11.7 s
Wall time: 11.7 s


In [88]:
top_1grams.sort(reverse=True
    
)
top_1grams

[('york', 2058),
 ('xxx', 48560),
 ('www', 2876),
 ('wm', 5829),
 ('withdrawal', 15211),
 ('wine', 1775),
 ('whse', 1483),
 ('west', 2368),
 ('wendy', 1330),
 ('web', 2761),
 ('walmart', 1754),
 ('wal', 13802),
 ('vsa', 1921),
 ('vons', 1051),
 ('vending', 2700),
 ('vegas', 1562),
 ('valley', 2733),
 ('usps', 1933),
 ('uber', 4783),
 ('tst', 1095),
 ('troy', 2605),
 ('transaction', 3139),
 ('tr', 1943),
 ('target', 2931),
 ('tampa', 1292),
 ('taco', 4734),
 ('supercenter', 2444),
 ('superc', 3285),
 ('super', 6126),
 ('sup', 2977),
 ('subway', 2621),
 ('street', 1185),
 ('stores', 1886),
 ('store', 6737),
 ('stop', 1348),
 ('starbucks', 3874),
 ('st', 4131),
 ('spring', 1493),
 ('spa', 1073),
 ('south', 1041),
 ('sonic', 1394),
 ('smoke', 1422),
 ('signature', 2784),
 ('sig', 3524),
 ('shoprite', 1249),
 ('shop', 3300),
 ('seattle', 2413),
 ('santa', 2338),
 ('san', 7968),
 ('samsclub', 1205),
 ('sams', 1627),
 ('saint', 1172),
 ('safeway', 1299),
 ('ross', 1619),
 ('river', 1054),
 ('

In [89]:
top_2grams

[('amazon com', 24676),
 ('xxx xxx', 19204),
 ('amzn mktp', 18956),
 ('cash app', 18722),
 ('wal mart', 11847),
 ('apple com', 6329),
 ('dbt crd', 6033),
 ('com xxx', 4927),
 ('amazon prime', 3836),
 ('mart super', 3421),
 ('wm superc', 3278),
 ('superc wal', 3277),
 ('dollar general', 3050),
 ('mobile purchase', 3008),
 ('pos transaction', 2931),
 ('mart sup', 2774),
 ('san diego', 2729),
 ('amzn digital', 2527),
 ('starbucks store', 2486),
 ('taco bell', 2426),
 ('wm supercenter', 2411),
 ('super center', 2363),
 ('chick fil', 2348),
 ('365 market', 2205),
 ('help uber', 1839),
 ('dollar tr', 1835),
 ('burger king', 1827),
 ('withdrawal debit', 1824),
 ('pos sig', 1807),
 ('vsa pur', 1793),
 ('uber com', 1784),
 ('publix super', 1775),
 ('market 888', 1733),
 ('new york', 1687),
 ('brigit com', 1590),
 ('pos deb', 1557),
 ('little caesars', 1534),
 ('usps po', 1505),
 ('home depot', 1484),
 ('las vegas', 1477),
 ('costco whse', 1463),
 ('debit signature', 1460),
 ('signature purchase

In [None]:
top_3grams

In [None]:
# Use 1 grams to find prefixes