# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main 7f7028e] regex
 1 file changed, 761 insertions(+), 14357 deletions(-)
 rewrite Week 2/Haris_Saif.ipynb (94%)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in#.sample(100_000).copy()
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0      AMAZON.COM*213UP14IAMZN.COM/BILL WAUS Card #XXXX
1     AMZN Mktp US*HB8ZA 11-10 Amzn.com/bill WA XXXX...
2                               Cash App*Jeffrey Ward S
3     DBT CRD XXXX DSNF6J5K AMAZON.COM1Z9NH7G62 A AM...
4     DEBIT CARD DEBIT / auth #XXXX 08-06-XXXX DOLLA...
5     DEBIT CARD PURCHASE XXXXXXXXX XXXX GREAT CLIPS...
6               EL CAPITAN SEAFOOD & ME HOBART IN 04/30
7                                      HIVE BLOOMINGTON
8                         POS Debit #2 LUCILLE'S-BREA /
9     PURCHASE AUTHORIZED ON 01/22 SAN DIEGO MARKET ...
10    PURCHASE AUTHORIZED ON 02/04 FOOD LION #XXXX R...
11    PURCHASE AUTHORIZED ON 02/12 Subway XXXXX Char...
12    PURCHASE AUTHORIZED ON 05/07 TST* CASA DE REYE...
13    PURCHASE AUTHORIZED ON 06/04 CASH APP*JEFFERY ...
14    PURCHASE AUTHORIZED ON 06/15 MCDONALD'S FXXXX ...
15    PURCHASE AUTHORIZED ON 07/31 MCW#23-WALLISVILL...
16    PURCHASE AUTHORIZED ON 08/14 LYFT 1 RIDE 08-1 ...
17    PURCHASE AUTHORIZED ON 12/15 KING SOOP XXX

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [60]:
# From 1-gram 
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "CRD", "PUR", "PIN", "SIG", "LLC", "SIGNATURE", "WEB", "PAYMENT",
    "ACH", "DEB", "INTL", "RECURRING", "DIGIT", "ONLINE", "PROTECTION", "VSA",
    "800", "888", "WITHDRAWAL", "INDN", "STORE", "STORES", "RESTAURANT", "SUPER",
    "MARKET", "MART", "FOOD", "FOODS", "CAFE", "SHOP", "BAR", "GRILL", "HOUSE",
    "CLUB", "COFFEE", "WINE", "LIQUOR", "LIQUORS", "BEAUTY", "DELI", "SMOKE",
    "GROCERY", "NAILS", "STOP", "BUSINESS", "PARKING", "SPA", "PET", "GARDEN"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [61]:
REGEX_PRE = [ 
    # === 0) Normalize spaces first === 
    (r"\u00A0", " "), # Replace non-breaking space with regular space 
    (r"\s{2,}", " "), # Collapse multiple spaces into one 

    # === 1) “Authorized / Recurring” headers === 
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "), 
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "), 
    (r"\bRECURRING\s+PYMT\b", " "), 

    # === 2) Card & mask boilerplate === 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "), 
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "), 
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "), 
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "), 
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "), 
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "), 
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "), 
    (r"\bF[XF]{4,}\b", " "), # Handle FXXXXX, FXXXX
    (r"X{4,}", " "), # Remove generic masked numbers 
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "), 
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords 
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start 
    (r"^\s*REC\s+POS\b", " "), 
    (r"^\s*RECURRING\b", " "), 

    # === 2.5) Prefix Normalization === 
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM" 
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR 

    # === 3) State + mask tails === 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "), 
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "), 

    # === 4) Dates/times === 
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025 
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM 
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM 

    # === 5) Merchant-terminal boilerplate === 
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "), 
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "), 
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "), 
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID... 
     
    # === 6) Misc tails === 
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "), 
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "), 
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "), 
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "), 
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "), 
    (r"\bBILL\s+PAYMENT\b", " "), 
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "), 
    (r"-\s*MEMO=", " "), 
    (r"(?:USA|US)$", " "), # Remove USA or US at the end 
    (r"\s+FSP$", " "), 
    (r"\bL\d{3}\b", " "), # Handle L340

    # === 7) Phone numbers === 
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212 
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212 
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212 
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "), 

    # === 8) URLs/domains === 
    (r"^\.COM\s+BILL\b.*", " "), 
    (r"\.COM$", " "), # Remove .COM at end of string

    # === 9) State abbreviations === 
    (STATE_REGEX, " "), # Remove standalone state codes 
    
    # === 10) Noise Words (from 1-grams) ===
    (NOISE_WORDS_REGEX, " "),

    # === 11) Final Tidy (Punctuation) === 
    (r"[|%_=;\\/]+", " "), # Remove misc separators 
    (r"[-]{2,}", " "), # Collapse multiple hyphens
    (r"^\s*-\s*|\s*-\s*$", " "), # Remove leading/trailing hyphens
]

In [47]:
# In[9]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADDY
    re.compile(r".*(GODADDY\.COM|GODADDY)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # === NEW PAYPAL RULES ===
    # Handle 'PAYPAL *... MERCHANT' (from INST XFER)
    re.compile(r"^PAYPAL\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    # Handle 'PAYPAL : MERCHANT' (from INST XFER / ID:)
    re.compile(r"^PAYPAL\s*:\s*([A-Z\s0-9'.-]+).*"),

    # === NEW: High-Frequency Full Merchant Names (from 1-grams) ===
    re.compile(r"^(AMZN(?:\s*MKTP)?)\b.*"),
    re.compile(r"^(AMAZON(?:\.COM|\s+PRIME)?)\b.*"),
    re.compile(r"^(APPLE(?:\.COM)?)\b.*"),
    re.compile(r"^(CASH\s+APP)\b.*"),
    re.compile(r"^(WALMART(?:\s*SUPERCENTER)?|WM\s*SUPERCENTER|WAL-MART)\b.*"),
    re.compile(r"^(SAMS\s*CLUB)\b.*"),
    re.compile(r"^(DOORDASH)\b.*"),
    re.compile(r"^(MCDONALD'?S)\b.*"),
    re.compile(r"^(TARGET)\b.*"),
    re.compile(r"^(GOOGLE)\b.*"),
    re.compile(r"^(UBER(?:\s+EATS)?)\b.*"),
    re.compile(r"^(TACO\s+BELL)\b.*"),
    re.compile(r"^(STARBUCKS)\b.*"),
    re.compile(r"^(BURGER\s+KING)\b.*"),
    re.compile(r"^(PUBLIX)\b.*"),
    re.compile(r"^(CIRCLE\s+K)\b.*"),
    re.compile(r"^(AFTERPAY)\b.*"),
    re.compile(r"^(CHICK-FIL-A)\b.*"),
    re.compile(r"^(SUBWAY)\b.*"),
    re.compile(r"^(KROGER)\b.*"),
    re.compile(r"^(HOME\s+DEPOT)\b.*"),
    re.compile(r"^(DUNKIN)\b.*"),
    re.compile(r"^(LITTLE\s+CAESARS)\b.*"),
    re.compile(r"^(DOLLAR\s+GENERAL)\b.*"),
    re.compile(r"^(DOLLAR\s+TREE)\b.*"),
    re.compile(r"^(FAMILY\s+DOLLAR)\b.*"),
    re.compile(r"^(USPS)\b.*"),
    re.compile(r"^(BRIGIT)\b.*"),
    re.compile(r"^(INSTACART)\b.*"),
    re.compile(r"^(FRYS)\b.*"),
    re.compile(r"^(ROSS)\b.*"),
    re.compile(r"^(MICROSOFT)\b.*"),
    re.compile(r"^(ALDI)\b.*"),
    re.compile(r"^(COSTCO)\b.*"),
    re.compile(r"^(KFC)\b.*"),
    re.compile(r"^(SONIC)\b.*"),
    re.compile(r"^(LYFT)\b.*"),
    re.compile(r"^(HELPPAY)\b.*"),
    re.compile(r"^(DAIRY\s+QUEEN)\b.*"),
    re.compile(r"^(SAFEWAY)\b.*"),
    re.compile(r"^(WENDY'?S)\b.*"),
    re.compile(r"^(ETSY)\b.*"),
    re.compile(r"^(SHOPRITE)\b.*"),
    re.compile(r"^(LOWE'?S)\b.*"),
    re.compile(r"^(CHIPOTLE)\b.*"),
    re.compile(r"^(VONS)\b.*"),
    re.compile(r"^(FOOD\s+LION)\b.*"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === 7-ELEVEN RULES ===
    re.compile(r"^(7(?:-ELEVEN|\s+11))\s*\*?#?.*"),
    
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # === UPDATED RULE ORDER ===
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # Generic PAYPAL rule, catches PAYPAL*MERCHANT
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"), 

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\

## 3. Apply Regex

In [10]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)

memos = memos.str.replace(NOISE_WORDS_REGEX, " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 25 s, sys: 171 ms, total: 25.2 s
Wall time: 25.2 s


In [48]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 3.55 s, sys: 4.05 ms, total: 3.56 s
Wall time: 3.56 s


In [12]:
# df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [13]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [54]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
37232,Amazon.com*H25AB5J12 Amzn.com/bill WA 11/08,AMAZON.COM*H25AB5J12 AMZN.COM BILL,AMAZON.COM
17414,AMAZON.COM*ZB3KNXXXX SEATTLE WA Card XXXX,AMAZON.COM*ZB3KNXXXX SEATTLE,AMAZON.COM
229732,POS Debit - Visa Check Card XXXX - AMZN MKTP U...,AMZN MKTP US*KM2IE AMZN.COM B,AMZN MKTP US
137205,DOORDASH*MUNCHIE MAGIC WWW.DOORDASH.,DOORDASH*MUNCHIE MAGIC .DOORDASH.,DOORDASH
527948,eBay O*08-XXXXX-XXXXX CA 05/18,EBAY O*08,EBAY O
77972,CHECKCARD XXXX FOSTERS FREEZE ROSAMOND CA XXXX...,FOSTERS FREEZE ROSAMOND,FOSTERS FREEZE ROSAMOND
194031,Lodge At Mc Carran Ra,LODGE AT MC CARRAN RA,LODGE AT MC CARRAN RA
192877,LUSH OLD ORCHARD (681) SKOKIE IL 09/12,LUSH OLD ORCHARD (681) SKOKIE,LUSH OLD ORCHARD (681) SKOKIE
257002,PROSPER MARKETPL DES:PWIIT ID:XXXXXXXX INDN:Ch...,PROSPER MARKETPL DES:PWIIT : INDN:CHIPING HWAN...,PROSPER MARKETPL DES:PWIIT : INDN:CHIPING HWAN...
337205,PURCHASE AUTHORIZED ON 07/25 VONS #XXXX WALNUT...,VONS # WALNUT,VONS


In [16]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [55]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

TARGET.COM
XXXXAMAZON.COM
AMZNFREETIME
CHEWY.COM
WINN-DIXIE
DES
SHIPT
BASKIN
SLICE
RALPHS
BETMGM
PETCO
GOFAN
XXXXTST
IHOP
INTUIT
STEAK-N-SHAKE
STATERBRO
DILLONS
*STARBUCKS
AYRIX
SKILLZ
VANS
TOLLWAY-AUTOREPLEN
FRG
PRICELN
RIOT
BANFIELD-PET
STORE
MCW
XXXXGOOGLE
GAMESTOP
.KOHLS.COM
SAVEMART
OCULUS
MARINA
FOODMAXX
BASHAS''
CANVA
RAINBOW
NYTIMES
SEZZLE
XXXXSQ
WALGREENS
CLAIRE'S
VERIZONWRLSS
ABC
GNC
POPEYES
SUBWAY
FRYS-MKTPLACE
BELK
FOOD4LESS
FIV
STAPLES
V
*EBAY
AMZ
*MICROSOFT
SHOPIFY
IBI
TILLYS
UPS
QFC
OTT
BLUESKY
*UBER
HELLOFRESH
GOODWILL
DROPBOX
L
JACK'S
POTBELLY
QUADPAY
WEGMANS
SOUTHWES
LUCKY
EVI
NORTON
ETT
CHECKERS
RVT
CRYPTO.COM
ENMARKET
PARKMOBILE
ABCMOUSE.COM
FBPAY
EA
JOURNEYS
TLG
OPC
CRT
HOME
PRESSNET
GERALD
NORDSTROM
SEDANOS
REI
VOLA
ANCESTRY.COM
MEIJER
NEWSSTAND
LJS
EBAY
FH
EZPASS
*STEAM
HLLFRSH
EVERYPLATE
ZTL
FIVERR
RGP
TRTHFDR
XXXXWAL-MART
PAR
E-Z
DRI
PACSUN
NIKE.COM
UBR
MOE'S
EPC
M
PCH
TLF
UNITED
EXPRESS
FRED-MEYER
GOFNDME
APPLE.COM
BUCKLE
PEET'S
ETSY.COM
ECS
BOXYCHARM
FACEBK
H

In [18]:
df[df['memo_post'] == '']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
74774,CHECKCARD XXXX CK XXXXXXX,,
82514,CHECKCARD XXXX MCC NC XXXXXXXXXXXXXXXXXXXXXXX,,
88157,"CHECKCARD XXXX SQ *""IT'S ALL PEACHY"" F Centenn...","SQ *""IT'S ALL PEACHY"" F CENTENNIAL",
93788,CHECKCARD XXXX XXXX XXXXXXXXXX TX XXXXXXXXXXXX...,,
93843,CHECKCARD XXXX XXXXXXXXXXXX,,
93846,CHECKCARD XXXX XXXXXXXXXXXXXX,,
98408,CK XXXXXXX,,
104841,Card,,
112295,Ck XXXXXXX,,
112576,Co,,


In [19]:
df[df['memo_post'] == '']

Unnamed: 0,memo,memo_pre,memo_post
74774,CHECKCARD XXXX CK XXXXXXX,,
82514,CHECKCARD XXXX MCC NC XXXXXXXXXXXXXXXXXXXXXXX,,
88157,"CHECKCARD XXXX SQ *""IT'S ALL PEACHY"" F Centenn...","SQ *""IT'S ALL PEACHY"" F CENTENNIAL",
93788,CHECKCARD XXXX XXXX XXXXXXXXXX TX XXXXXXXXXXXX...,,
93843,CHECKCARD XXXX XXXXXXXXXXXX,,
93846,CHECKCARD XXXX XXXXXXXXXXXXXX,,
98408,CK XXXXXXX,,
104841,Card,,
112295,Ck XXXXXXX,,
112576,Co,,


In [20]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
2212,13FLDENVER* 13THFL HTTPSWWW.13TH CO,13FLDENVER* 13THFL .13TH,13FLDENVER* 13THFL .13TH
2331,1PASSWORD* TRIAL OVER TORONTO ON 01/07,1PASSWORD* TRIAL OVER TORONTO ON,1PASSWORD* TRIAL OVER TORONTO ON
3607,2CHECKO*KILOHEA Alpharetta GA USA,2CHECKO*KILOHEA ALPHARETTA,2CHECKO*KILOHEA ALPHARETTA
3608,2CO.COM*slideupli XXXXXXXXXX 04/03,2CO.COM*SLIDEUPLI,2CO.COM*SLIDEUPLI
3609,2COCOM*BITDEFENDER.COM,2COCOM*BITDEFENDER.COM,2COCOM*BITDEFENDER.COM
...,...,...,...
528725,www.Playgr* Bidiboo.Co,.PLAYGR* BIDIBOO.CO,.PLAYGR
528726,www.Playgr* Littlemiss,.PLAYGR* LITTLEMISS,.PLAYGR
528732,www.Styles* Luv N Hair,.STYLES* LUV N HAIR,.STYLES
528733,www.Stylese* West Brim,.STYLESE* WEST BRIM,.STYLESE


In [21]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

430263                                 [ADOBE , ACROPRO SU]
404087                 [AMAZON.COM, 1R1ZI0 AMZN.COM BILLWA]
521017             [AMAZON.COM, 215UY SEATTLE 2Y12PUJYA2LZ]
14838             [AMAZON.COM, 2G4IP6BV1 A AMZN.COM BILLWA]
71904               [AMAZON.COM, 2P7VX2YB2 AMZN.COM BILLWA]
228593                     [AMAZON.COM, 3G2OU9L AMZN.COM B]
36427                 [AMAZON.COM, 6E11YXXXX AMZN.COM BILL]
65442                  [AMAZON.COM, 6T3R82 AMZN.COM BILLWA]
269683                  [AMAZON.COM, 706VR6E AMZN.COM BILL]
65463                                     [AMAZON.COM, 784]
394316                  [AMAZON.COM, 8A5EA9C AMZN.COM BILL]
15484                               [AMAZON.COM, 942P027Z3]
36669                 [AMAZON.COM, AA37T9FA3 AMZN.COM BILL]
404171                 [AMAZON.COM, BM6EI4 AMZN.COM BILLWA]
37311                 [AMAZON.COM, H75WK03H2 AMZN.COM BILL]
383921                  [AMAZON.COM, HW1MU9G AMZN.COM BILL]
16303              [AMAZON.COM, K69NK0BI

# Phase 2: Extract & Analyze N-Grams

In [23]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [40]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [56]:
%%time
corpus = df['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=100)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=100)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=100)
print(f"--- N-gram Analysis Complete ---")

Analyzing 528766 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 12.2 s, sys: 26.9 ms, total: 12.2 s
Wall time: 12.2 s


In [57]:
top_1grams
ngrams_1 = []
for ngram, value in top_1grams:
    if ngram.upper() not in NOISE_WORDS:
        ngrams_1 += [ngram]

In [58]:
top_1grams

[('com', 51503),
 ('xxx', 47867),
 ('amazon', 32109),
 ('cash', 22770),
 ('amzn', 22605),
 ('app', 19387),
 ('mktp', 19131),
 ('withdrawal', 15202),
 ('mart', 14900),
 ('wal', 13668),
 ('pos', 12899),
 ('doordash', 12078),
 ('purchase', 11134),
 ('market', 8930),
 ('mcdonald', 8750),
 ('debit', 8339),
 ('apple', 8114),
 ('dollar', 7971),
 ('san', 7681),
 ('food', 7207),
 ('crd', 6799),
 ('store', 6724),
 ('target', 6654),
 ('google', 6566),
 ('pur', 6115),
 ('super', 6114),
 ('wm', 5723),
 ('city', 5173),
 ('pizza', 5148),
 ('uber', 4773),
 ('taco', 4672),
 ('pin', 4554),
 ('fxxxxx', 4406),
 ('prime', 4078),
 ('cafe', 3890),
 ('00', 3865),
 ('king', 3865),
 ('st', 3854),
 ('starbucks', 3846),
 ('mobile', 3807),
 ('center', 3590),
 ('sig', 3524),
 ('new', 3509),
 ('burger', 3458),
 ('el', 3451),
 ('llc', 3387),
 ('fort', 3329),
 ('shop', 3301),
 ('publix', 3299),
 ('superc', 3285),
 ('circle', 3280),
 ('miami', 3232),
 ('general', 3179),
 ('liquor', 3080),
 ('afterpay', 3009),
 ('sup', 

In [59]:
ngrams_1

['com',
 'xxx',
 'amazon',
 'cash',
 'amzn',
 'app',
 'mktp',
 'withdrawal',
 'mart',
 'wal',
 'pos',
 'doordash',
 'purchase',
 'market',
 'mcdonald',
 'debit',
 'apple',
 'dollar',
 'san',
 'food',
 'crd',
 'store',
 'target',
 'google',
 'pur',
 'super',
 'wm',
 'city',
 'pizza',
 'uber',
 'taco',
 'pin',
 'fxxxxx',
 'prime',
 'cafe',
 '00',
 'king',
 'st',
 'starbucks',
 'mobile',
 'center',
 'sig',
 'new',
 'burger',
 'el',
 'llc',
 'fort',
 'shop',
 'publix',
 'superc',
 'circle',
 'miami',
 'general',
 'liquor',
 'afterpay',
 'sup',
 '15',
 'bar',
 '20',
 'signature',
 'fxxxx',
 'web',
 'valley',
 'bell',
 'vending',
 'diego',
 'ebay',
 'digital',
 'express',
 'troy',
 'chick',
 'subway',
 'phoenix',
 '16',
 'kroger',
 'payment',
 'lake',
 'help',
 'home',
 'supercenter',
 'seattle',
 'houston',
 'house',
 'fil',
 'west',
 'dunkin',
 'des',
 'santa',
 '365',
 'little',
 'grill',
 'family',
 '10',
 'beach',
 'park',
 'club',
 'york',
 'depot',
 'restaurant',
 'orlando']

In [28]:
top_2grams

[('amazon com', 4656),
 ('xxx xxx', 4275),
 ('cash app', 3442),
 ('com xxx', 949),
 ('wal mart', 898),
 ('amazon prime', 729),
 ('mart super', 669),
 ('mobile purchase', 605),
 ('withdrawal debit', 568),
 ('dollar general', 561),
 ('mart sup', 515),
 ('san diego', 474),
 ('chick fil', 472),
 ('taco bell', 464),
 ('apple com', 413),
 ('dollar tr', 380),
 ('help uber', 379),
 ('uber com', 365),
 ('publix super', 343),
 ('new york', 333),
 ('burger king', 332),
 ('home depot', 313),
 ('little caesars', 296),
 ('debit signature', 293),
 ('signature purchase', 293),
 ('pos deb', 289),
 ('family dollar', 279),
 ('las vegas', 275),
 ('costco whse', 262),
 ('pos pur', 244),
 ('super mar', 242),
 ('sonic drive', 230),
 ('mart com', 225),
 ('amzn com', 219),
 ('amzn mktp', 205),
 ('sams club', 202),
 ('los angeles', 202),
 ('non pin', 193),
 ('hunt valley', 192),
 ('san antonio', 187),
 ('trip help', 186),
 ('stop shop', 184),
 ('signature debit', 181),
 ('eats help', 174),
 ('fort myers', 171),

In [29]:
top_3grams

[('com xxx xxx', 805),
 ('wal mart sup', 515),
 ('help uber com', 356),
 ('debit signature purchase', 293),
 ('publix super mar', 242),
 ('wal mart super', 172),
 ('trip help uber', 169),
 ('eats help uber', 167),
 ('apple com xxx', 162),
 ('mobile purchase sign', 159),
 ('purchase sign based', 159),
 ('amazon com seattle', 155),
 ('withdrawal signature debit', 119),
 ('nayax hunt valley', 115),
 ('bath body works', 94),
 ('purchase amazon com', 93),
 ('debit pin purchase', 92),
 ('com aa xxx', 88),
 ('aa xxx xxx', 88),
 ('xxx xxx troy', 87),
 ('withdrawal amazon com', 85),
 ('pur amazon com', 81),
 ('mart com aa', 80),
 ('pos amazon com', 78),
 ('salt lake cit', 77),
 ('info target om', 76),
 ('domino xxx xxx', 75),
 ('point sale debitl340', 73),
 ('purchase cash app', 72),
 ('help hbomax com', 72),
 ('klover app boost', 70),
 ('merchant issued payment', 68),
 ('issued payment target', 67),
 ('payment target target', 67),
 ('ppd info target', 66),
 ('pin amazon com', 65),
 ('fresh cof

In [30]:
# Use 1 grams to find prefixes