# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main 02506f5] regex
 1 file changed, 8878 insertions(+), 57 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in.sample(100_000).copy()
row_count = df.size
row_count

100000

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     7-ELEVEN FORT WORTH TX CARD: XXXXXXX 04/29/XXX...
1     CHECKCARD XXXX CURIOSITYSTREAM SILVER SPRINGMD...
2     CHECKCARD XXXX STORM NORM VEND INC POUGHKEEPSI...
3     DDA PURCHASE *XXXX XXXXXXX AUBUCHON 41 CONCORD...
4     DEBIT CARD PURCHASE XXXXXXXXX CHATURBILL XXXXX...
5                    FAT BURRITO RANCHO CUCAMO CA 01/14
6     GOOGLE*PLAYRIX GAMES SUPPORT.GOOGL CA        0...
7                     ICHIYUMMY INC CAPE CORAL FL 07/14
8                      KUNG FU TEA - CHAR 04-01 CHARLOT
9     MARKET@WORK XXXXXXXXXX RENTON WA             0...
10                                          Mailunyijia
11    PURCHASE AUTHORIZED ON 02/08 SANDSTON BISTRO &...
12    PURCHASE AUTHORIZED ON 05/05 TUTOR TIME XXX-XX...
13    PURCHASE AUTHORIZED ON 08/28 BAHAMA BREEZE XXX...
14    PURCHASE AUTHORIZED ON 11/22 TJ MAXX # XXXX CL...
15    Point Of Sale Withdrawal CAROLINAS DINER101 HI...
16    Point Of Sale Withdrawal JOES / KANSAS CITY BB...
17    SQ *DJS BEAUTY SUYPPLY Bronx NY           

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
# From 1-gram
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "VENDING", "SUPERCENTER", "SUPERC", "STORE", "STORES", "RESTAURANT", 
    "PROTECTION", "PARKING", "GRILL", "MARKET", "LIQUOR", "LIQUORS", "GROCERY", 
    "FOOD", "FOODS", "DIGITAL", "DIGIT", "DELI", "COFFEE", "CITY", "CENTER", 
    "CAFE", "BUSINESS", "BEAUTY", "BAR", "STREET"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [8]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), # Remove generic masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
]

In [9]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # === NEW PAYPAL RULES ===
    # Handle 'PAYPAL *... MERCHANT' (from INST XFER)
    re.compile(r"^PAYPAL\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    # Handle 'PAYPAL : MERCHANT' (from INST XFER / ID:)
    re.compile(r"^PAYPAL\s*:\s*([A-Z\s0-9'.-]+).*"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === 7-ELEVEN RULES ===
    # These now capture '7-ELEVEN' or '7 11' as the merchant
    re.compile(r"^(7-ELEVEN)\s*\*?#?.*"),
    re.compile(r"^(7\s+11)\s*\*?#?.*"),
    
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # === UPDATED RULE ORDER ===
    # Specific DDBR rule must come *before* the general DD rule
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    # re.compile(r"^DD\s*BR\s*\*?#?\s*([A-Z\s0-9'.-]+).*"), # Removed, redundant
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    # Specific: (POS) TST* MERCHANT [TWO WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Specific: (POS) TST* MERCHANT [ONE WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Fallback: (POS) TST* MERCHANT...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # Generic PAYPAL rule, catches PAYPAL*MERCHANT
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"), 

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [10]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 4.45 s, sys: 39.3 ms, total: 4.49 s
Wall time: 4.49 s


In [11]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 711 ms, sys: 4.25 ms, total: 716 ms
Wall time: 714 ms


In [12]:
# df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [13]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [14]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
63160,CHECKCARD XXXX 22 KLICKS BAR AND HOPE MILLS NC...,22 KLICKS BAR AND HOPE MILLS,22 KLICKS BAR AND HOPE MILLS
23776,AMZN Mktp US*H23QX77 Amzn.com/bill WA 1...,AMZN MKTP US*H23QX77 AMZN.COM BILL,AMZN MKTP US
20812,AMZN Mktp US*1A3ZZ72 Amzn.com/bill WA 0...,AMZN MKTP US*1A3ZZ72 AMZN.COM BILL,AMZN MKTP US
395288,PURCHASE AUTHORIZED ON 12/20 BURLINGTON STORES...,BURLINGTON STORES 266 ATLANTA P,BURLINGTON STORES 266 ATLANTA P
152347,EFT POS ANC*ANCEST RF#XXXXXX 032,EFT POS ANC*ANCEST RF# 032,EFT POS ANC
214011,OLIVER'S MARKE 05/01 #XXXXXXXXX PURCHASE OLIVE...,OLIVER'S MARKE # PURCHASE OLIVER'S MARKET #,OLIVER'S MARKE
214708,OPS*ORCHARD MESA MESA AZ 0...,OPS*ORCHARD MESA MESA,OPS
321654,PURCHASE AUTHORIZED ON 06/14 PUBLIX SUPER MAR ...,PUBLIX SUPER MAR S L RICHMOND P,PUBLIX SUPER MAR S L RICHMOND P
514594,XXXX PUR KROGER 714 CARBONDALE IL (07/19/22 21...,PUR KROGER 714 CARBONDALE ( 21:14:13),PUR KROGER 714 CARBONDALE ( 21:14:13)
489754,WARRENS COLLISION CENT BRAIDWOOD IL 02/01,WARRENS COLLISION CENT BRAIDWOOD,WARRENS COLLISION CENT BRAIDWOOD


In [15]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [16]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

TARGET.COM
BETMGM
SHIPT
DES
GOFAN
CHEWY.COM
AMZNFREETIME
WINN-DIXIE
BASKIN
RALPHS
STEAK-N-SHAKE
INTUIT
SAVEMART
SLICE
DILLONS
STATERBRO
TOLLWAY-AUTOREPLEN
VANS
PRICELN
MCW
P
FRYS-MKTPLACE
QFC
*EBAY
FOODMAXX
SKILLZ
IHOP
PETCO
BASHAS''
VERIZONWRLSS
*STARBUCKS
WALGREENS
BANFIELD-PET
GAMESTOP
*MICROSOFT
POPEYES
NEWSSTAND
POTBELLY
OTT
RAINBOW
ABC
OCULUS
*UBER
UPS
EVI
IBI
CLAIRE'S
CANVA
GOODWILL
SHOPIFY
NORTON
FOOD4LESS
LJS
L
WEGMANS
ETT
AMZ
RIOT
SEZZLE
TILLYS
NYTIMES
GNC
FRG
STAPLES
GERALD
ABCMOUSE.COM
HOME
MARINA
STORE
JACK'S
FIV
JOURNEYS
GOFNDME
PCH
CHECKERS
WWW.KOHLS.COM
HLLFRSH
OPC
BELK
FBPAY
RVT
EZPASS
DROPBOX
DRI
CRT
ENMARKET
ECS
UBR
ETSY.COM
REI
NORDSTROM
MOE'S
BOXYCHARM
PAY
SUBWAY
PACSUN
*SHEIN
DOLLARTRE
GLOSS
EBAY
LUCKY
EVERYPLATE
CRUMBL
EA
PAR
BLUESKY
PEET'S
BUCKLE
AJ'S
DOTERRA
TLF
OPS
*EPIC
CDSR
TOMMY'S
M
*PLAYSTATIO
RGP
MEIJER
RONAN
AGI
GIV
CKO
ZTL
G
DECKERS
WISH
BIRD
*DOMINO'S
ROBLOX
RPS
APPLE.COM
FRED-MEYER
EPC
FLANIGANS
LEGALSHIELD
ANCESTRY.COM
COSMOPROF
TLG
WINRED
EXPRESS
VO

In [17]:
df[df['memo_post'] == '']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
178224,ILXXXX,,
512862,XXXX,,
479942,Usa,,
445959,SP & G SHOOTING RANGEXXXXVIRGINIA BEACVA,SP & G SHOOTING RANGE VIRGINIA BEACVA,
88157,"CHECKCARD XXXX SQ *""IT'S ALL PEACHY"" F Centenn...","SQ *""IT'S ALL PEACHY"" F CENTENNIAL",
232980,POS Debit - Visa Check Card XXXX - FLXXXX,,
523798,XXXXXXXXX,,
523552,XXXXXXX,,
523860,XXXXXXXXXXXX,,
523840,XXXXXXXXXXX XXXXXX,,


In [18]:
df[df['memo_pre'].str.contains('ELEVEN')].sample(10)

Unnamed: 0,memo,memo_pre,memo_post
293609,PURCHASE AUTHORIZED ON 04/02 7-ELEVEN MANSFIEL...,7-ELEVEN MANSFIELD P,7-ELEVEN
322096,PURCHASE AUTHORIZED ON 06/16 7-ELEVEN NORTH RI...,7-ELEVEN NORTH RICHLAN P,7-ELEVEN
6663,7-ELEVEN SAN MARCOS CA 0...,7-ELEVEN SAN MARCOS,7-ELEVEN
522405,XXXXXX PURCHASE-PIN 10/03 09:01 7-ELEVEN MURRI...,PIN 09:01 7-ELEVEN MURRIETA 00MTH801,PIN 09:01 7-ELEVEN MURRIETA 00MTH801
390455,PURCHASE AUTHORIZED ON 12/09 7-ELEVEN XXXXX PL...,7-ELEVEN PLANO P,7-ELEVEN
63348,CHECKCARD XXXX 7-ELEVEN XXXXX SARASOTA FL CKCD...,7-ELEVEN SARASOTA CKCD,7-ELEVEN
6196,7-ELEVEN HUNTINGTON ST NY 0...,7-ELEVEN HUNTINGTON ST,7-ELEVEN
296010,PURCHASE AUTHORIZED ON 04/08 7-ELEVEN FORT MYE...,7-ELEVEN FORT MYERS P,7-ELEVEN
305277,PURCHASE AUTHORIZED ON 05/02 7-ELEVEN WEST PAL...,7-ELEVEN WEST PALM BEA P,7-ELEVEN
349202,PURCHASE AUTHORIZED ON 08/26 7-ELEVEN PEMBROKE...,7-ELEVEN PEMBROKE PINE P,7-ELEVEN


In [19]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

291266                                       AMAZON.COM
393783                                       AMAZON.COM
374565                                       AMAZON.COM
27951                           APPLE.COM BILL 04:40P #
227131                             16:19 AMAZON COM #31
228511                                       AMAZON.COM
75528           DAILY HARVEST HTTPSWWW.DAILNY RECURRING
500759                            WITHDRAWAL AMAZON.COM
38184                                        AMAZON.COM
174420                    HELP.HBOMAX.COM HTTPSHBOMAX.C
66518                                        AMAZON.COM
79955                                 IHERB IHERB.COM H
482703                                       CCBILL.COM
496941                                       WWW.STYLES
110203                                       AMAZON.COM
517025       VSA PUR WWW.KOHLS.COM XXX-XXX- ( 06:37:09)
520628    00:26 DOORDASH*CHRONIC WWW.DOORDASH. APGEHDCX
228817                                       AMA

In [20]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
518362,XXXXX POS SIGNATURE APPLE.COM/BILL CA INA000 X...,APPLE.COM BILL INA000,APPLE.COM BILL INA000
291625,PURCHASE AUTHORIZED ON 03/28 APPLE.COM/BILL XX...,APPLE.COM BILL XXX-XXX- S,APPLE.COM BILL XXX-XXX- S
244051,POS PURCHASE / MERCHANT PURCHASE TERMINAL XXXX...,PP*APPLE.COM BILL,APPLE.COM BILL
501319,Withdrawal CONSUMER DEBIT / APPLE.COM/BILL XXX...,WITHDRAWAL CONSUMER DEBIT APPLE.COM BILL XXX-X...,WITHDRAWAL CONSUMER DEBIT APPLE.COM BILL XXX-X...
27333,APPLE.COM/BILL 08-27 XXX-XXX-XXXX CA XXXX DEBI...,APPLE.COM BILL XXX-XXX,APPLE.COM BILL XXX-XXX
...,...,...,...
248691,POS Purchase Non-PIN APPLE.COM/BILL CA XXXXXX ...,NON-PIN APPLE.COM BILL ***** 23:09,NON-PIN APPLE.COM BILL
27195,APPLE.COM/BILL 06-08 XXX-XXX-XXXX CA XXXX DEBI...,APPLE.COM BILL XXX-XXX,APPLE.COM BILL XXX-XXX
28545,APPLE.COM/BILL XXX-XXX-XXXX CADate 03/12/22 0 ...,APPLE.COM BILL XXX-XXX- CADATE 0 0 RECURRING W...,APPLE.COM BILL XXX-XXX- CADATE 0 0 RECURRING W...
279831,PURCHASE AUTHORIZED ON 02/25 PP*APPLE.COM/BILL...,PP*APPLE.COM BILL XXX-XXX- S,APPLE.COM BILL XXX-XXX- S


In [21]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
34559,Amazon.com*1F0P04VM0 Amzn.com/bill WA 0...,AMAZON.COM*1F0P04VM0 AMZN.COM BILL,AMAZON.COM
206097,"MSFT * EXXXXJJ0BO,E070 MSBILL.INFO, WA, USA","MSFT * E JJ0BO,E070 MSBILL.INFO, ,",MSFT
188504,L2G*CITYOFSPOKANEFEES WA Date 08/20/21 XXXXXXXXXX,L2G*CITYOFSPOKANEFEES,L2G
208029,Microsoft*Ultimate 1 M XXX-XXXXXXX WA 0...,MICROSOFT*ULTIMATE 1 M XXX,MICROSOFT
12403,ALLEGNT*A BPLRFV,ALLEGNT*A BPLRFV,ALLEGNT
...,...,...,...
66212,CHECKCARD XXXX AMAZON.COM*JO9XZ4 AMZN.COM/BILL...,AMAZON.COM*JO9XZ4 AMZN.COM BILLWA,AMAZON.COM
279831,PURCHASE AUTHORIZED ON 02/25 PP*APPLE.COM/BILL...,PP*APPLE.COM BILL XXX-XXX- S,APPLE.COM BILL XXX-XXX- S
99359,CMSVEND*CRICKLE ROCHESTER NY USA,CMSVEND*CRICKLE ROCHESTER,CMSVEND
469953,TST* CHEMISTRY TAPAS & TOVIRGINIA BEACVA,TST* CHEMISTRY TAPAS & TOVIRGINIA BEACVA,CHEMISTRY TAPAS


In [22]:
df[df['memo_post'] == '']

Unnamed: 0,memo,memo_pre,memo_post
178224,ILXXXX,,
512862,XXXX,,
479942,Usa,,
445959,SP & G SHOOTING RANGEXXXXVIRGINIA BEACVA,SP & G SHOOTING RANGE VIRGINIA BEACVA,
88157,"CHECKCARD XXXX SQ *""IT'S ALL PEACHY"" F Centenn...","SQ *""IT'S ALL PEACHY"" F CENTENNIAL",
232980,POS Debit - Visa Check Card XXXX - FLXXXX,,
523798,XXXXXXXXX,,
523552,XXXXXXX,,
523860,XXXXXXXXXXXX,,
523840,XXXXXXXXXXX XXXXXX,,


In [23]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

426235                           [ADOBE ,  ADOBE.LY ENUS S]
66934                 [AMAZON.COM,  U83 AM AMZN.COM BILLWA]
64023            [AMAZON.COM, 115PQ1AK3 AM AMZN.COM BILLWA]
297256                [AMAZON.COM, 1A9IX59 AMZN.COM BILL S]
228317                     [AMAZON.COM, 1C3SC5E AMZN.COM B]
408904          [AMAZON.COM, 1H DV2 AMAZON.COM SEATTLE WAU]
64322            [AMAZON.COM, 1N2JI9GO1 AM AMZN.COM BILLWA]
14272           [AMAZON.COM, 1V3IY6FJ3 AMZNAMZN.COM BILLWA]
35253                   [AMAZON.COM, 1V6I89EJ0 AMZN.COM BI]
35297                 [AMAZON.COM, 1W4UQ51E0 AMZN.COM BILL]
14314                               [AMAZON.COM, 1X27S9A81]
35358                 [AMAZON.COM, 1X9KE0GH3 AMZN.COM BILL]
64592            [AMAZON.COM, 216IL60D2 AM AMZN.COM BILLWA]
35578                               [AMAZON.COM, 286BK5IY1]
35592                 [AMAZON.COM, 291G35RQ2 AMZN.COM BILL]
346851                [AMAZON.COM, 2D9IG09 AMZN.COM BILL S]
35908                    [AMAZON.COM, 2G

# Phase 2: Extract & Analyze N-Grams

In [24]:
df_p2 = pd.read_csv("memos_P1.csv")

In [25]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [26]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [27]:
%%time
corpus = df_p2['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
print(f"--- N-gram Analysis Complete ---")

Analyzing 100000 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 2.05 s, sys: 1.15 ms, total: 2.05 s
Wall time: 2.05 s


In [28]:
top_1grams.sort(reverse=True)
top_1grams

[('york', 396),
 ('xxx', 10544),
 ('worth', 163),
 ('wm', 165),
 ('withdrawal', 2859),
 ('wine', 303),
 ('whse', 267),
 ('west', 436),
 ('wendys', 179),
 ('wendy', 245),
 ('web', 448),
 ('wayflyer', 159),
 ('wal', 1119),
 ('vons', 205),
 ('visalia', 185),
 ('vegas', 285),
 ('valley', 510),
 ('usps', 377),
 ('uber', 486),
 ('tst', 234),
 ('troy', 476),
 ('trip', 234),
 ('tree', 214),
 ('tr', 391),
 ('tobacco', 177),
 ('time', 186),
 ('temecula', 168),
 ('target', 546),
 ('tampa', 249),
 ('taco', 912),
 ('sushi', 181),
 ('super', 1203),
 ('sup', 550),
 ('stop', 268),
 ('st', 777),
 ('sq', 160),
 ('spring', 277),
 ('sports', 210),
 ('spa', 200),
 ('south', 180),
 ('sonic', 283),
 ('snack', 168),
 ('smoke', 250),
 ('signature', 541),
 ('sign', 160),
 ('sig', 316),
 ('shoprite', 242),
 ('shop', 642),
 ('seattle', 429),
 ('santa', 409),
 ('san', 1533),
 ('samsclub', 244),
 ('sams', 279),
 ('saint', 223),
 ('safeway', 245),
 ('s1', 196),
 ('royal', 163),
 ('ross', 320),
 ('river', 200),
 ('ri

In [29]:
top_2grams

[('amazon com', 4656),
 ('xxx xxx', 4275),
 ('cash app', 3442),
 ('com xxx', 949),
 ('wal mart', 898),
 ('amazon prime', 729),
 ('mart super', 669),
 ('mobile purchase', 605),
 ('withdrawal debit', 568),
 ('dollar general', 561),
 ('mart sup', 515),
 ('san diego', 474),
 ('chick fil', 472),
 ('taco bell', 464),
 ('apple com', 413),
 ('dollar tr', 380),
 ('help uber', 379),
 ('uber com', 365),
 ('publix super', 343),
 ('new york', 333),
 ('burger king', 332),
 ('home depot', 313),
 ('little caesars', 296),
 ('debit signature', 293),
 ('signature purchase', 293),
 ('pos deb', 289),
 ('family dollar', 279),
 ('las vegas', 275),
 ('costco whse', 262),
 ('pos pur', 244),
 ('super mar', 242),
 ('sonic drive', 230),
 ('mart com', 225),
 ('amzn com', 219),
 ('amzn mktp', 205),
 ('sams club', 202),
 ('los angeles', 202),
 ('non pin', 193),
 ('hunt valley', 192),
 ('san antonio', 187),
 ('trip help', 186),
 ('stop shop', 184),
 ('signature debit', 181),
 ('eats help', 174),
 ('fort myers', 171),

In [30]:
top_3grams

[('com xxx xxx', 805),
 ('wal mart sup', 515),
 ('help uber com', 356),
 ('debit signature purchase', 293),
 ('publix super mar', 242),
 ('wal mart super', 172),
 ('trip help uber', 169),
 ('eats help uber', 167),
 ('apple com xxx', 162),
 ('mobile purchase sign', 159),
 ('purchase sign based', 159),
 ('amazon com seattle', 155),
 ('withdrawal signature debit', 119),
 ('nayax hunt valley', 115),
 ('bath body works', 94),
 ('purchase amazon com', 93),
 ('debit pin purchase', 92),
 ('com aa xxx', 88),
 ('aa xxx xxx', 88),
 ('xxx xxx troy', 87),
 ('withdrawal amazon com', 85),
 ('pur amazon com', 81),
 ('mart com aa', 80),
 ('pos amazon com', 78),
 ('salt lake cit', 77),
 ('info target om', 76),
 ('domino xxx xxx', 75),
 ('point sale debitl340', 73),
 ('purchase cash app', 72),
 ('help hbomax com', 72),
 ('klover app boost', 70),
 ('merchant issued payment', 68),
 ('issued payment target', 67),
 ('payment target target', 67),
 ('ppd info target', 66),
 ('pin amazon com', 65),
 ('fresh cof

In [31]:
# Use 1 grams to find prefixes