# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main 7184c0d] regex
 1 file changed, 18569 insertions(+), 8268 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in.sample(100_000).copy()
row_count = df.size
row_count

100000

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     CHECK CARD PURCHASE MERCHANT PURCHASE TERMINAL...
1     DBT CRD XXXX DSUGAHZ5 WWW.MICHIGAN XXXXXXXXX X...
2                   DD/BR #XXXXXX Q35 HODGKINS IL 08/01
3     DDA PUR 2L1D60H5RSI3 AMAZON.COM SEATTLE WA AMA...
4     Debit Purchase -visa Card XXXXthanh Huong Sand...
5                                           El Mariachi
6                                       MARIOS FAMOUS P
7     PENN JERSEY PEN ARGYL 03-24-22 PEN ARGYL PA XX...
8                                 PORTMAN S FARM MARKET
9     POS Debit - Visa Check Card XXXX - AMZN MKTP U...
10    POS Debit - Visa Check Card XXXX - CASH APP*KE...
11    POS Debit - Visa Check Card XXXX - HABIT VACAV...
12    POS Debit - Visa Check Card XXXX - PIZZA HUT X...
13             POS Withdrawal Day's Jewelers (Aug XXXXX
14    PURCHASE AUTHORIZED ON 02/07 PUBLIX SUPER MAR ...
15    PURCHASE AUTHORIZED ON 02/19 AMZN Mktp US*1B6I...
16    PURCHASE AUTHORIZED ON 04/07 HERMANOS TAQUERIA...
17    PURCHASE AUTHORIZED ON 04/25 AMZN Mktp US*

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [31]:
# From 1-gram
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "VENDING", "SUPERCENTER", "SUPERC", "STORE", "STORES", "RESTAURANT", 
    "PROTECTION", "PARKING", "GRILL", "MARKET", "LIQUOR", "LIQUORS", "GROCERY", 
    "FOOD", "FOODS", "DIGITAL", "DIGIT", "DELI", "COFFEE", "CITY", "CENTER", 
    "CAFE", "BUSINESS", "BEAUTY", "BAR", "STREET"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [32]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), # Remove generic masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...

    # === 6) Misc tails ===
    # === UPDATED RULE ===
    (r"\b(?:INST\s+XFER|RETRY\s+PYMT)\s+(?:ID)?\b", " "), # Handles PAYPAL DES:INST XFER ID...
    (r"\bPAYPAL\s+XFER\b", " "), # Handles PAYPAL XFER
    
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    
    (NOISE_WORDS_REGEX, " "),
    

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
]

In [33]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # === PAYPAL RULES ===
    # Rules for 'PAYPAL DES:...' format
    re.compile(r"^PAYPAL\s+DES:.*?:(.*?)(?:\s+INDN:.*)?$"),
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),
    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^TARGET\b(.*)"), # Special case for TARGET, might capture store #
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7-ELEVEN\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7\s+11\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"), # Corrected

    # Specific DDBR rule must come *before* the general DD rule
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    # Specific: (POS) TST* MERCHANT [TWO WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Specific: (POS) TST* MERCHANT [ONE WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Fallback: (POS) TST* MERCHANT...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # This generic PAYPAL rule must come *after* specific PAYPAL rules
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OPC\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [34]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 4.64 s, sys: 24.9 ms, total: 4.67 s
Wall time: 4.66 s


In [35]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 622 ms, sys: 4.02 ms, total: 626 ms
Wall time: 623 ms


In [11]:
# df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [12]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [36]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
228170,POS Debit - Visa Check Card XXXX - AMAZON PRIM...,AMAZON PRIME*J78PR AMZN.COM B,AMAZON PRIME
322913,PURCHASE AUTHORIZED ON 06/18 AMZN Digital*MS7O...,AMZN *MS7OS S,AMZN
136615,DOORDASH*EUREKA!,DOORDASH*EUREKA!,DOORDASH
353494,PURCHASE AUTHORIZED ON 09/05 FLORIDA NAILS & S...,FLORIDA NAILS & SPA TAMPA P,FLORIDA NAILS & SPA TAMPA P
300108,PURCHASE AUTHORIZED ON 04/18 HAPPINESS NAILS B...,HAPPINESS NAILS BAKERSFIELD S,HAPPINESS NAILS BAKERSFIELD S
187263,KROGER #6 XXXX CENTER ESSEXVILLE MI 0...,KROGER #6 ESSEXVILLE,KROGER
448171,SPROUTS FARMERS MKT#26 SAN JOSE CA XXXXXX 08/12,SPROUTS FARMERS MKT#26 SAN JOSE,ROUTS FARMERS MKT
355687,PURCHASE AUTHORIZED ON 09/10 UBER *EATS HELP.U...,UBER *EATS HELP.UB XXX- S,UBER
498409,Windsor Fashions,WINDSOR FASHIONS,WINDSOR FASHIONS
505885,Withdrawal Debit Card / STUDY.COM CA Date 07/2...,WITHDRAWAL STUDY.COM,WITHDRAWAL STUDY.COM


In [14]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'TARGET', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [37]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

AMZN
WM
ROSS
LION
WINCO
EXPRESS
AMZNFREETIME
CHEWY.COM
SALLY
WINN-DIXIE
RALPHS
GOFAN
FAIRWAY
SHIPT
SLICE
NORTHGATE
MEIJER
SKILLZ
IHOP
RIOT
P
STEAK-N-SHAKE
RAINBOW
ABC
BASKIN
BETMGM
GNC
INTUIT
STATERBRO
TILLYS
PETCO
VANS
MARINA
CANVA
VERIZONWRLSS
BASHAS''
BANFIELD-PET
TOLLWAY-AUTOREPLEN
BELL''S
DEPOT
MCW
BEAN
SMITHS
SEZZLE
DILLONS
FOOD4LESS
POPEYES
FRYS-MKTPLACE
FOODMAXX
EBAY
GAMESTOP
JOURNEYS
FRG
GOODWILL
FIV
OCULUS
*STARBUCKS
CLAIRE'S
CRD
*MICROSOFT
QFC
UPS
STOCKTON
BORO
NORDSTROM
PRICELN
BELK
FBPAY
*UBER
QUADPAY
SUBWAY
BLUESKY
.KOHLS.COM
KEY
NYTIMES
EA
STAPLES
DROPBOX
ETT
COSMOPROF
IBI
CRT
LJS
ABCMOUSE.COM
CRYPTO.COM
MESA
CUB
SAVEMART
BUCKLE
TOWN
TRTHFDR
LEGALSHIELD
PACSUN
GERALD
M
EVI
AMZ
OPS
TACOMA
*EBAY
NORTON
LUCKY
CHECKERS
GLOSS
AF
OTT
WALGREENS
FACEBK
RGP
FRED-MEYER
HOMEDEPOT.COM
FH
V
DRI
ROBLOX
NEWSSTAND
SEDANOS
ROSES
PAR
SMITH'S
CDSR
SHOPIFY
POSHMARK
TOMMY'S
HSN
SIMPLISAFE
E-Z
VONS.COM
NIKE.COM
PAM
JEANNETTE
CKO
HLLFRSH
REI
EZPASS
SAMSCLUB.COM
*STEAM
PORTLAND
PARKMOBILE
DAVIS

In [46]:
df[df['memo_post'] == 'TARGET']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post


In [17]:
df[df['memo_pre'].str.contains('OPC')].sample(10)

Unnamed: 0,memo,memo_pre,memo_post
236551,POS Debit - Visa Check Card XXXX - SP * LOVEPO...,SP * LOVEPOPCARDS. .LOVEPO,LOVEPOPCARDS. .LOVEPO
320808,PURCHASE AUTHORIZED ON 06/12 DOC POPCORN Tempe...,DOC POPCORN TEMPE,DOC POPCORN TEMPE
165131,GARRETT POPCORN SHOPS,GARRETT POPCORN SHOPS,GARRETT POPCORN SHOPS
151194,"Double Good Popcorn, Httpswww.Doub","DOUBLE GOOD POPCORN, .DOUB","DOUBLE GOOD POPCORN, .DOUB"
214652,OPC*CONN APPLIANCESIN XXX-XXX-XXXX TX 1...,OPC*CONN APPLIANCESIN XXX-XXX,OPC
118531,DBT CRD XXXX 10/03/22 XXXXXXX WDW POPCORN CART...,CRD WDW POPCORN CARTS LAKE BUENA VI C#**,CRD WDW POPCORN CARTS LAKE BUENA VI C
214641,OPC*CONN APPLIANCESIN XXX-XXX-XXXX TX 0...,OPC*CONN APPLIANCESIN XXX-XXX,OPC
214654,OPC*CONN APPLIANCESIN XXX-XXX-XXXX TX 1...,OPC*CONN APPLIANCESIN XXX-XXX,OPC
214657,OPC*CONN APPLIANCESINC,OPC*CONN APPLIANCESINC,OPC
201509,METROPCS MOBIL,METROPCS MOBIL,METROPCS MOBIL


In [18]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

482964           A PUR AMZN MKTP US HT1AX8KB2 AMZN COM BILL
16493                                            AMAZON.COM
267281                                           AMAZON.COM
125575                                        AT AMAZON.COM
94294                      GOTINDER.COM HELP XXX- RECURRING
16737                                            AMAZON.COM
79636                                 HOMEDEPOT.COM XXX-XXX
141728                                           AMAZON.COM
174421                        HELP.HBOMAX.COM HTTPSHBOMAX.C
47038                              BROOKLYN BAGELS GOSQ.COM
485343                                             VONS.COM
39265                                            AMAZON.COM
441167                                           SEGPAY.COM
528735                                  .TIME4 LEARNING.COM
420300                          HELP.HBOMAX.COM HTTPSHBOMAX
87776                      FIVE MARYS FARMS HTTPSM5FIVEMACA
384708                                  

In [19]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
244078,POS PURCHASE / MERCHANT PURCHASE TERMINAL XXXX...,PP*APPLE.COM BILL,APPLE.COM BILL
116390,DBT CRD XXXX DJI55YME APPLE.COM/BILL XXX-XXX-X...,CRD DJI55YME APPLE.COM BILL XXX-XXX- C#,CRD DJI55YME APPLE.COM BILL XXX-XXX- C
255742,PP*APPLE.COM/BILL XXX-XXX-XXXX CA 0...,PP*APPLE.COM BILL XXX-XXX,APPLE.COM BILL XXX-XXX
253658,POS Withdrawal (FIS) APPLE.COM/BILL APPLE.COM/...,(FIS) APPLE.COM BILL APPLE.COM BILL ( ),(FIS) APPLE.COM BILL APPLE.COM BILL ( )
115991,DBT CRD XXXX 54 APPLE.COM/BILL 866-712-XXXXCA ...,CRD APPLE.COM BILL - - #,CRD APPLE.COM BILL - -
...,...,...,...
505107,Withdrawal Debit Card / APPLE.COM/BILL CA Date...,WITHDRAWAL APPLE.COM BILL,WITHDRAWAL APPLE.COM BILL
105609,Card purchase APPLE.COM/BILL CA 11-18-XXXX,PURCHASE APPLE.COM BILL,PURCHASE APPLE.COM BILL
244013,POS PURCHASE / MERCHANT PURCHASE TERMINAL XXXX...,PP*APPLE.COM BILL,APPLE.COM BILL
412139,Point of Sale Debit L340 DATE 05-11 APPLE COM/...,L340 APPLE COM BILL XXX-XXX,L340 APPLE COM BILL XXX-XXX


In [20]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
37687,Amazon.com*HT9HF4LA1 Amzn.com/bill WA 1...,AMAZON.COM*HT9HF4LA1 AMZN.COM BILL,AMAZON.COM
66306,CHECKCARD XXXX AMAZON.COM*LW76D0EZ3 AM AMZN.CO...,AMAZON.COM*LW76D0EZ3 AM AMZN.COM BILLWA,AMAZON.COM
244078,POS PURCHASE / MERCHANT PURCHASE TERMINAL XXXX...,PP*APPLE.COM BILL,APPLE.COM BILL
390809,PURCHASE AUTHORIZED ON 12/09 TST* LITTLE HAVAN...,TST* LITTLE HAVANA NORTH MIAMI,LITTLE HAVANA NORTH
317494,PURCHASE AUTHORIZED ON 06/03 SQ *CAESAR'S DELI...,SQ *CAESAR'S BAKERSFIELD,CAESAR'S BAKERSFIELD
...,...,...,...
38453,Amazon.com*MV0NE6HJ0 Amzn.com/bill WA Order Nu...,AMAZON.COM*MV0NE6HJ0 AMZN.COM BILL ORDER NUMBE...,AMAZON.COM
124351,DEBIT CARD DEBIT / auth #XXXXXX 04-13-XXXX RVT...,RVT*WYLIE BULLD,RVT
425982,RECURRING PAYMENT AUTHORIZED ON 07/04 GOOGLE *...,GOOGLE *FUNJOY XXX-XXX,FUNJOY XXX-XXX
65877,CHECKCARD XXXX AMAZON.COM*H24JY7BQ0 AM AMZN.CO...,AMAZON.COM*H24JY7BQ0 AM AMZN.COM BILLWA,AMAZON.COM


In [21]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

3612                            [2COCOM,  STUDI ALPHARETTA]
265925                          [AMAZON ,  G AMZN.COM BILL]
371369                       [AMAZON , 2Y3QW AMZN.COM BILL]
32135                       [AMAZON , BW L63 AMZN.COM BILL]
32171                      [AMAZON , GY1DO66 AMZN.COM BILL]
302267                       [AMAZON , QJ27Q AMZN.COM BILL]
527007             [AMAZON , W2 TERRY AVE N AMZN.COM B 8 6]
39380                      [AMAZON.COM,  LJ1 AMZN.COM BILL]
13871                   [AMAZON.COM, 1F98D75 AMZN.COM BILL]
404030              [AMAZON.COM, 1H4VO72F2 AMZN.COM BILLWA]
278560                  [AMAZON.COM, 1I3BM5B AMZN.COM BILL]
34740                      [AMAZON.COM, 1I9Y AMZN.COM BILL]
228451                     [AMAZON.COM, 1X5Y60Z AMZN.COM B]
228463                     [AMAZON.COM, 1Z3P06H AMZN.COM B]
15030                 [AMAZON.COM, 2R8T14JI0 SEATT LE JX6L]
391324                  [AMAZON.COM, 395K270 AMZN.COM BILL]
71914                  [AMAZON.COM, 479X

# Phase 2: Extract & Analyze N-Grams

In [22]:
df_p2 = pd.read_csv("memos_P1.csv")

In [23]:
print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

199936    MCW
523533       
94267       O
91723     THE
427615    XXX
415793    ABC
504092    QVC
366205       
31288     XXX
427952      M
428075    CRT
210233    NIC
354884    AWL
496123       
154214    EVI
5363         
157615    FDH
504141    QVC
265016    IBI
9640       AF
331275      O
521914      S
397402       
124201       
26312      AP
418226    MR.
495650       
77813     FIV
424585     NS
166476    GNC
206623    MYP
7997      ABC
7969       AB
280737       
418747    QFC
527768      O
520028      A
418466      O
244969    ABC
99289     CMS
467307    TLF
425163    XXX
495364       
495274       
286463      O
151907      T
199965    MCW
255225    POS
366114     ST
118166    CRD
10340     XXX
419151    QVC
439265       
79902     IBI
94281       O
365604    NIC
177391    IBI
42537     BCY
151539     EA
495244       
426823    STO
206088    MSB
12232     ALG
192949      L
424300    XXX
427662    XXX
372210    YPS
176017    HSN
460543       
142925    RPS
355282      O
335595

In [24]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [25]:
%%time
corpus = df_p2['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
print(f"--- N-gram Analysis Complete ---")

Analyzing 100000 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 2.06 s, sys: 8.79 ms, total: 2.07 s
Wall time: 2.07 s


In [26]:
top_1grams.sort(reverse=True
    
)
top_1grams

[('york', 396),
 ('xxx', 10544),
 ('worth', 163),
 ('wm', 165),
 ('withdrawal', 2859),
 ('wine', 303),
 ('whse', 267),
 ('west', 436),
 ('wendys', 179),
 ('wendy', 245),
 ('web', 448),
 ('wayflyer', 159),
 ('wal', 1119),
 ('vons', 205),
 ('visalia', 185),
 ('vegas', 285),
 ('valley', 510),
 ('usps', 377),
 ('uber', 486),
 ('tst', 234),
 ('troy', 476),
 ('trip', 234),
 ('tree', 214),
 ('tr', 391),
 ('tobacco', 177),
 ('time', 186),
 ('temecula', 168),
 ('target', 546),
 ('tampa', 249),
 ('taco', 912),
 ('sushi', 181),
 ('super', 1203),
 ('sup', 550),
 ('stop', 268),
 ('st', 777),
 ('sq', 160),
 ('spring', 277),
 ('sports', 210),
 ('spa', 200),
 ('south', 180),
 ('sonic', 283),
 ('snack', 168),
 ('smoke', 250),
 ('signature', 541),
 ('sign', 160),
 ('sig', 316),
 ('shoprite', 242),
 ('shop', 642),
 ('seattle', 429),
 ('santa', 409),
 ('san', 1533),
 ('samsclub', 244),
 ('sams', 279),
 ('saint', 223),
 ('safeway', 245),
 ('s1', 196),
 ('royal', 163),
 ('ross', 320),
 ('river', 200),
 ('ri

In [27]:
top_2grams

[('amazon com', 4656),
 ('xxx xxx', 4275),
 ('cash app', 3442),
 ('com xxx', 949),
 ('wal mart', 898),
 ('amazon prime', 729),
 ('mart super', 669),
 ('mobile purchase', 605),
 ('withdrawal debit', 568),
 ('dollar general', 561),
 ('mart sup', 515),
 ('san diego', 474),
 ('chick fil', 472),
 ('taco bell', 464),
 ('apple com', 413),
 ('dollar tr', 380),
 ('help uber', 379),
 ('uber com', 365),
 ('publix super', 343),
 ('new york', 333),
 ('burger king', 332),
 ('home depot', 313),
 ('little caesars', 296),
 ('debit signature', 293),
 ('signature purchase', 293),
 ('pos deb', 289),
 ('family dollar', 279),
 ('las vegas', 275),
 ('costco whse', 262),
 ('pos pur', 244),
 ('super mar', 242),
 ('sonic drive', 230),
 ('mart com', 225),
 ('amzn com', 219),
 ('amzn mktp', 205),
 ('sams club', 202),
 ('los angeles', 202),
 ('non pin', 193),
 ('hunt valley', 192),
 ('san antonio', 187),
 ('trip help', 186),
 ('stop shop', 184),
 ('signature debit', 181),
 ('eats help', 174),
 ('fort myers', 171),

In [28]:
top_3grams

[('com xxx xxx', 805),
 ('wal mart sup', 515),
 ('help uber com', 356),
 ('debit signature purchase', 293),
 ('publix super mar', 242),
 ('wal mart super', 172),
 ('trip help uber', 169),
 ('eats help uber', 167),
 ('apple com xxx', 162),
 ('mobile purchase sign', 159),
 ('purchase sign based', 159),
 ('amazon com seattle', 155),
 ('withdrawal signature debit', 119),
 ('nayax hunt valley', 115),
 ('bath body works', 94),
 ('purchase amazon com', 93),
 ('debit pin purchase', 92),
 ('com aa xxx', 88),
 ('aa xxx xxx', 88),
 ('xxx xxx troy', 87),
 ('withdrawal amazon com', 85),
 ('pur amazon com', 81),
 ('mart com aa', 80),
 ('pos amazon com', 78),
 ('salt lake cit', 77),
 ('info target om', 76),
 ('domino xxx xxx', 75),
 ('point sale debitl340', 73),
 ('purchase cash app', 72),
 ('help hbomax com', 72),
 ('klover app boost', 70),
 ('merchant issued payment', 68),
 ('issued payment target', 67),
 ('payment target target', 67),
 ('ppd info target', 66),
 ('pin amazon com', 65),
 ('fresh cof

In [29]:
# Use 1 grams to find prefixes