# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

On branch main
Your branch is ahead of 'origin/main' by 11 commits.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   Kyle_Choi.ipynb
	deleted:    memos_cleaned.csv

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	memos_P1.csv
	sample.csv

no changes added to commit (use "git add" and/or "git commit -a")


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=1)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in.sample(100_000).copy()
row_count = df.size
row_count

100000

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     229 DBT CRD XXXX 12/30/21 XXXXXXXXXXXX GREAT C...
1                        CASH APP*FRANK COHE XXXXXXXXXX
2      Card purchase BOJANGLES XXXX GREER SC 08-26-XXXX
3     DDA PURCHASE *XXXX XXXXXXXX   QUICK CHEK FOOD ...
4     DOLLAR GENERAL # DG 14 DACULA GA     XXXXXX  0...
5                                    DOLLAR TR XXXX SIB
6       GOOGLE APPS_COMME US002D2GYG WEB ID: FXXXXXXXXX
7                               MARIETTA DINER GA 02/17
8     PURCHASE AUTHORIZED ON 03/16 AMZN Mktp US*R27T...
9     PURCHASE AUTHORIZED ON 06/26 UBER* EATS HTTPSW...
10    PURCHASE AUTHORIZED ON 07/03 VONS #XXXX PALM S...
11    PURCHASE AUTHORIZED ON 07/23 EXXPRESS MART # P...
12    PURCHASE AUTHORIZED ON 08/05 MCDONALD'S FXXXX ...
13    PURCHASE AUTHORIZED ON 08/30 SQ *FAMOUS FAMIGL...
14    PURCHASE AUTHORIZED ON 12/15 WORLDREMIT XXX-XX...
15    Point of Sale Debit L340 DATE 09-23 SOUTHEASTP...
16    RCH TRANSPENINSULAR ROSARITO BCN             1...
17    ROSS STORES #125 SAN FRANCISCO CA         

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
# From 1-gram
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "VENDING", "SUPERCENTER", "SUPERC", "STORE", "STORES", "RESTAURANT", 
    "PROTECTION", "PARKING", "GRILL", "MARKET", "LIQUOR", "LIQUORS", "GROCERY", 
    "FOOD", "FOODS", "DIGITAL", "DIGIT", "DELI", "COFFEE", "CITY", "CENTER", 
    "CAFE", "BUSINESS", "BEAUTY", "BAR", "STREET"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [8]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), # Remove generic masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails ===
    # === UPDATED RULE ===
    (r"\b(?:INST\s+XFER|RETRY\s+PYMT)\s+(?:ID)?\b", " "), # Handles PAYPAL DES:INST XFER ID...
    (r"\bPAYPAL\s+XFER\b", " "), # Handles PAYPAL XFER
    
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
]

In [9]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # === PAYPAL RULES ===
    # Rules for 'PAYPAL DES:...' format
    re.compile(r"^PAYPAL\s+DES:.*?:(.*?)(?:\s+INDN:.*)?$"),
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),
    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^TARGET\b(.*)"), # Special case for TARGET, might capture store #
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7-ELEVEN\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7\s+11\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"), # Corrected

    # Specific DDBR rule must come *before* the general DD rule
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    # Specific: (POS) TST* MERCHANT [TWO WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Specific: (POS) TST* MERCHANT [ONE WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Fallback: (POS) TST* MERCHANT...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # This generic PAYPAL rule must come *after* specific PAYPAL rules
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OPC\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [10]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 4.48 s, sys: 36.5 ms, total: 4.52 s
Wall time: 4.52 s


In [11]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 694 ms, sys: 376 µs, total: 694 ms
Wall time: 692 ms


In [12]:
# df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [13]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [35]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
228547,POS Debit - Visa Check Card XXXX - AMAZON.COM*...,AMAZON.COM*2L8SQ1J AMZN.COM B,AMAZON.COM
404400,PURCHASE XXXX BHAKTI MARGA AMER ELMIRA NY XXXX...,BHAKTI MARGA AMER ELMIRA RECURRING,BHAKTI MARGA AMER ELMIRA RECURRING
325204,PURCHASE AUTHORIZED ON 06/24 CASH APP*BRIANCA*...,CASH APP*BRIANCA*A S,CASH APP
145969,Debit Purchase -visa Card XXXXdelta Air XXXXat...,DELTA AIR ATLANTA,DELTA AIR ATLANTA
122098,DD DOORDASH TGIFRIDAY XXXXXXXXXX CA XXXXX,DD DOORDASH TGIFRIDAY,DOORDASH TGIFRIDAY
243855,POS PURCHASE / MERCHANT PURCHASE TERMINAL XXXX...,MURPHY ATWALMAR T CHESTER,MURPHY ATWALMAR T CHESTER
471358,TST* NO.38 TENA DENVER CO USA,TST* NO.38 TENA DENVER,NO.38 TENA DENVER
418687,QALYTUDE CHICAGO IL,QALYTUDE CHICAGO,QALYTUDE CHICAGO
349526,PURCHASE AUTHORIZED ON 08/26 SMITH'S F 689 NOR...,SMITH'S F 689 NORTH RE SARATOGA SPRI P,SMITH'S F 689 NORTH RE SARATOGA SPRI P
515534,XXXX VSA PUR AFTERPAY CA (12/04/21 16:45:23),VSA PUR AFTERPAY ( 16:45:23),VSA PUR AFTERPAY ( 16:45:23)


In [39]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [40]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

AMZNFREETIME
WINN-DIXIE
PETCO
GOFAN
SLICE
IHOP
BETMGM
INTUIT
DILLONS
SHIPT
BASKIN
FRG
RALPHS
RIOT
VANS
STAPLES
SKILLZ
AMZ
*STARBUCKS
TOLLWAY-AUTOREPLEN
FOODMAXX
GAMESTOP
DROPBOX
POPEYES
CHEWY.COM
STEAK-N-SHAKE
BELK
FOOD4LESS
SEZZLE
BASHAS''
WALGREENS
P
BANFIELD-PET
VERIZONWRLSS
ETT
OCULUS
STATERBRO
SAVEMART
STORE
UPS
POTBELLY
WWW.KOHLS.COM
QFC
SUBWAY
PRICELN
BLUESKY
FRYS-MKTPLACE
MARINA
IBI
SHOPIFY
NYTIMES
GNC
APPLE.COM
MCW
*UBER
CLAIRE'S
RAINBOW
LUCKY
CRT
PARKMOBILE
*MICROSOFT
CANVA
L
CRYPTO.COM
*EBAY
FIV
TLG
ENMARKET
V
*STEAM
ABC
OPS
FAMOUSFOOTWEAR
OTT
TILLYS
DENNY''S
GOODWILL
EIG
FBPAY
GERALD
UBR
TRTHFDR
CHECKERS
ABCMOUSE.COM
FRED-MEYER
CRUMBL
SEDANOS
EA
NORTON
LIM
MESA
GRUBHUB
WASHINGTON
JACK'S
TACOMA
EXPRESS
QUADPAY
LYNWOOD
WEGMANS
EBAY
PAY
MEIJER
COLDSTONE
DOLLARTRE
ROBLOX
DOLLARSHAVECLUBUS
PAR
RVT
OFFICE
EPC
CKO
OUTBACK
MGM
TCB
GRID
E-Z
CINNABON
PAM
M
*PLAYSTATIO
TLF
LJS
TOMMY'S
MARIANOS
MURRIETA
JOURNEYS
REI
PAVILIONS
MOE'S
NEWSSTAND
ARBY'S
FS
ZTL
SL.NORD
BIRD
K
EZPASS
DRI
RONA

In [37]:
df[df['memo_post'] == 'TARGET']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post


In [41]:
df[df['memo_pre'].str.contains('TARGET')].sample(10)

Unnamed: 0,memo,memo_pre,memo_post
310201,PURCHASE AUTHORIZED ON 05/14 TARGET XXXX SAINT...,TARGET SAINT PAUL S,SAINT PAUL S
204688,MOBILE PURCHASE XXXX TARGET 000 TUCKER GA XXXX...,MOBILE PURCHASE TARGET 000 TUCKER,MOBILE PURCHASE TARGET 000 TUCKER
461745,TARGET T- XXXX E Hill Broken Arrow OK 0...,TARGET T- E HILL BROKEN ARROW,T- E HILL BROKEN ARROW
127528,DEBIT CARD PURCHASE POSXXXX TARGET T-XXXX Waxa...,POS TARGET T- WAXAHACHIE T B5F6JVN (CASH),POS TARGET T- WAXAHACHIE T B5F6JVN (CASH)
520185,XXXXXX POS DDA W/D 03/06 15:22 TARGET T-XXXX H...,POS DDA W D 15:22 TARGET T- HUNTINGTON BE,POS DDA W D 15:22 TARGET T- HUNTINGTON BE
461701,TARGET T- XXXX Camino San Diego CA 0...,TARGET T- CAMINO SAN DIEGO,T- CAMINO SAN DIEGO
460356,TARGET XXXXXXXX PLANO TX 0...,TARGET PLANO,PLANO
461946,TARGET T-XXXX 03/02 #XXXXXXXXX PURCHASE XXXX S...,TARGET T- # PURCHASE S POWER RD MESA,T- # PURCHASE S POWER RD MESA
127565,DEBIT CARD PURCHASE POSXXXX TARGET T-XXXX Waxa...,POS TARGET T- WAXAHACHIE T JDF6JW0 (CASH),POS TARGET T- WAXAHACHIE T JDF6JW0 (CASH)
461217,TARGET MARKETING XXX-XXX-XXXX MD 09/29/21 Ca...,TARGET MARKETING XXX-XXX- 16 #,MARKETING XXX-XXX- 16 #


In [19]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

290227                            APPLE.COM BILL XXX-XXX- S
310432                           GRUBHUBARBYS GRUBHUB.COM S
176814                                 HEWOULDLOVEFIRST.COM
178305                        IMPERFECT FOODS HTTPSWWW.IMPE
258588                      PURA SCENTS, INC. HTTPSWWW.PURA
222975                               PIZZA HUT HTTPS: IPCHA
517844                                           AMAZON.COM
174760                        HIGHLAND PARK WINE CLOVER.COM
334946                            APPLE.COM BILL XXX-XXX- S
13975                                            AMAZON.COM
501101                                WITHDRAWAL AMAZON.COM
151477                                           E-FILE.COM
116863    DBT CRD DSJNZHWN AMAZON PRIMENV5WS05L3 AMZN.CO...
179974                                          INTLSONGCOM
273824                            APPLE.COM BILL XXX-XXX- S
125365                        APPLE.COM BILL MQ2Z6EV (CASH)
34221                    AMAZON.COM DH5F

In [20]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
427969,RECURRING PAYMENT AUTHORIZED ON 09/09 APPLE.CO...,APPLE.COM BILL XXX-XXX- S,APPLE.COM BILL XXX-XXX- S
429204,RECURRING PAYMENT AUTHORIZED ON 10/17 APPLE.CO...,APPLE.COM BILL S,APPLE.COM BILL S
504777,Withdrawal Debit APPLE.COM/BILL XXX-XXX-XXXX C...,WITHDRAWAL DEBIT APPLE.COM BILL XXX-XXX- 138 20 #,WITHDRAWAL DEBIT APPLE.COM BILL XXX-XXX- 138 20
27704,APPLE.COM/BILL CA 07/07,APPLE.COM BILL,APPLE.COM BILL
28698,APPLE.COM/US XXXXX,APPLE.COM US,APPLE.COM US
...,...,...,...
27992,APPLE.COM/BILL CA XXXXX Debit Card Purchase 04...,APPLE.COM BILL 06:34P #,APPLE.COM BILL 06:34P #
518362,XXXXX POS SIGNATURE APPLE.COM/BILL CA INA000 X...,APPLE.COM BILL INA000,APPLE.COM BILL INA000
397979,PURCHASE AUTHORIZED ON 12/27 APPLE.COM/BILL CA...,APPLE.COM BILL S,APPLE.COM BILL S
425293,RECURRING PAYMENT AUTHORIZED ON 06/11 APPLE.CO...,APPLE.COM BILL XXX-XXX- S,APPLE.COM BILL XXX-XXX- S


In [21]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
123608,DDV *DiscoveryPlus XXX-XXXXXXX TN 0...,DDV *DISCOVERYPLUS XXX,V
282422,PURCHASE AUTHORIZED ON 03/04 LYFT *RIDE FRI 8 ...,LYFT *RIDE FRI 8 LYFT.COM S,LYFT
423778,RECURRING PAYMENT AUTHORIZED ON 04/19 Microsof...,MICROSOFT*REALMS P XXX- S,MICROSOFT
367492,PURCHASE AUTHORIZED ON 10/11 AMAZON.COM*278QF8...,AMAZON.COM*278QF85 AMZN.COM BILL S,AMAZON.COM
38574,Amazon.com*NU16 Amzn.com/bill WA USA,AMAZON.COM*NU16 AMZN.COM BILL,AMAZON.COM
...,...,...,...
168159,GOOGLE *SVCS49b117b9-d G.CO/WALLETH#CA,GOOGLE *SVCS49B117B9-D G.CO WALLETH#,GOOGLE
338637,PURCHASE AUTHORIZED ON 07/29 TST* Rapture Char...,TST* RAPTURE CHARLOTTESVIL S,RAPTURE CHARLOTTESVIL S
331322,PURCHASE AUTHORIZED ON 07/10 Amazon.com*298X54...,AMAZON.COM*298X54K AMZN.COM BILL S,AMAZON.COM
14658,AMAZON.COM*2C9KU6I13 SEATT LE WAXXX...,AMAZON.COM*2C9KU6I13 SEATT LE SHJWS935,AMAZON.COM


In [22]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

34324                     [AMAZON.COM, 132YO AMZN.COM BILL]
34691                 [AMAZON.COM, 1I2H869V2 AMZN.COM BILL]
34847                      [AMAZON.COM, 1L6B AMZN.COM BILL]
312802                [AMAZON.COM, 1X14S50 AMZN.COM BILL S]
381203                [AMAZON.COM, 203NG7Z AMZN.COM BILL S]
408926    [AMAZON.COM, 2C4N AMAZON.COM SEATTLE WAUS ## A...
65143            [AMAZON.COM, 2P9U836X0 AM AMZN.COM BILLWA]
245106                     [AMAZON.COM, 2R22S4WS0 AMZN.COM]
245110                   [AMAZON.COM, 3E4BE5V33 A AMZN.COM]
36347                    [AMAZON.COM, 5B K83 AMZN.COM BILL]
15285                               [AMAZON.COM, 5H3U01HB3]
36618                 [AMAZON.COM, 9H2YD9OO3 AMZN.COM BILL]
15834             [AMAZON.COM, F87SD8I63 A AMZN.COM BILLWA]
404204                 [AMAZON.COM, GG2WP8 AMZN.COM BILLWA]
65856            [AMAZON.COM, H06KG9J02 AM AMZN.COM BILLWA]
37350                 [AMAZON.COM, H83B152H1 AMZN.COM BILL]
332638                [AMAZON.COM, HH8KB

# Phase 2: Extract & Analyze N-Grams

In [23]:
df_p2 = pd.read_csv("memos_P1.csv")

In [24]:
print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

123608      V
183264    JEM
427119    CRT
327185    RGP
383259    FRG
373997    RGP
123587      L
419176    QVC
5457         
235900    RDA
17923     AMZ
111315    RGP
155015    EOC
5098         
251881    BNL
5609         
161384    FRG
275228     IT
78329     GDP
423894      K
441462     SE
355993    OPS
354591    FRG
5518         
5393         
485401    ABC
311763     LL
504057    QVC
62388     CFK
166469    GNC
78068     FRG
280981    OTT
98866     CKO
94116     ZTL
81029      KJ
217932    PAR
153678    EPC
358818    BCS
206088    MSB
92777     WAL
526270    ZTL
5696         
429705     HP
5534         
113134    CRT
5139         
142288     MW
429905    STK
526274    ZTL
45431     BLN
104478      C
191346    LJS
80661     JPP
414830    PAR
199921    MCP
377505    FIV
161382    FRG
166496    GNC
46557      BP
80722     K12
209679    NBS
45694      BM
6927         
96613     CHK
151545     EA
210236    NIC
422380     SR
240318    POS
512760     X.
192761    LUG
75308     CRT
98868 

In [25]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [26]:
%%time
corpus = df_p2['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
print(f"--- N-gram Analysis Complete ---")

Analyzing 100000 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 2.11 s, sys: 16.3 ms, total: 2.12 s
Wall time: 2.12 s


In [27]:
top_1grams.sort(reverse=True
    
)
top_1grams

[('york', 396),
 ('xxx', 10544),
 ('worth', 163),
 ('wm', 165),
 ('withdrawal', 2859),
 ('wine', 303),
 ('whse', 267),
 ('west', 436),
 ('wendys', 179),
 ('wendy', 245),
 ('web', 448),
 ('wayflyer', 159),
 ('wal', 1119),
 ('vons', 205),
 ('visalia', 185),
 ('vegas', 285),
 ('valley', 510),
 ('usps', 377),
 ('uber', 486),
 ('tst', 234),
 ('troy', 476),
 ('trip', 234),
 ('tree', 214),
 ('tr', 391),
 ('tobacco', 177),
 ('time', 186),
 ('temecula', 168),
 ('target', 546),
 ('tampa', 249),
 ('taco', 912),
 ('sushi', 181),
 ('super', 1203),
 ('sup', 550),
 ('stop', 268),
 ('st', 777),
 ('sq', 160),
 ('spring', 277),
 ('sports', 210),
 ('spa', 200),
 ('south', 180),
 ('sonic', 283),
 ('snack', 168),
 ('smoke', 250),
 ('signature', 541),
 ('sign', 160),
 ('sig', 316),
 ('shoprite', 242),
 ('shop', 642),
 ('seattle', 429),
 ('santa', 409),
 ('san', 1533),
 ('samsclub', 244),
 ('sams', 279),
 ('saint', 223),
 ('safeway', 245),
 ('s1', 196),
 ('royal', 163),
 ('ross', 320),
 ('river', 200),
 ('ri

In [28]:
top_2grams

[('amazon com', 4656),
 ('xxx xxx', 4275),
 ('cash app', 3442),
 ('com xxx', 949),
 ('wal mart', 898),
 ('amazon prime', 729),
 ('mart super', 669),
 ('mobile purchase', 605),
 ('withdrawal debit', 568),
 ('dollar general', 561),
 ('mart sup', 515),
 ('san diego', 474),
 ('chick fil', 472),
 ('taco bell', 464),
 ('apple com', 413),
 ('dollar tr', 380),
 ('help uber', 379),
 ('uber com', 365),
 ('publix super', 343),
 ('new york', 333),
 ('burger king', 332),
 ('home depot', 313),
 ('little caesars', 296),
 ('debit signature', 293),
 ('signature purchase', 293),
 ('pos deb', 289),
 ('family dollar', 279),
 ('las vegas', 275),
 ('costco whse', 262),
 ('pos pur', 244),
 ('super mar', 242),
 ('sonic drive', 230),
 ('mart com', 225),
 ('amzn com', 219),
 ('amzn mktp', 205),
 ('sams club', 202),
 ('los angeles', 202),
 ('non pin', 193),
 ('hunt valley', 192),
 ('san antonio', 187),
 ('trip help', 186),
 ('stop shop', 184),
 ('signature debit', 181),
 ('eats help', 174),
 ('fort myers', 171),

In [29]:
top_3grams

[('com xxx xxx', 805),
 ('wal mart sup', 515),
 ('help uber com', 356),
 ('debit signature purchase', 293),
 ('publix super mar', 242),
 ('wal mart super', 172),
 ('trip help uber', 169),
 ('eats help uber', 167),
 ('apple com xxx', 162),
 ('mobile purchase sign', 159),
 ('purchase sign based', 159),
 ('amazon com seattle', 155),
 ('withdrawal signature debit', 119),
 ('nayax hunt valley', 115),
 ('bath body works', 94),
 ('purchase amazon com', 93),
 ('debit pin purchase', 92),
 ('com aa xxx', 88),
 ('aa xxx xxx', 88),
 ('xxx xxx troy', 87),
 ('withdrawal amazon com', 85),
 ('pur amazon com', 81),
 ('mart com aa', 80),
 ('pos amazon com', 78),
 ('salt lake cit', 77),
 ('info target om', 76),
 ('domino xxx xxx', 75),
 ('point sale debitl340', 73),
 ('purchase cash app', 72),
 ('help hbomax com', 72),
 ('klover app boost', 70),
 ('merchant issued payment', 68),
 ('issued payment target', 67),
 ('payment target target', 67),
 ('ppd info target', 66),
 ('pin amazon com', 65),
 ('fresh cof

In [30]:
# Use 1 grams to find prefixes