# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

On branch main
Your branch is ahead of 'origin/main' by 12 commits.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   Kyle_Choi.ipynb
	deleted:    memos_cleaned.csv

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	memos_P1.csv
	sample.csv

no changes added to commit (use "git add" and/or "git commit -a")


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=1)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in.sample(100_000).copy()
row_count = df.size
row_count

100000

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     CASH APP*NICHOLAS K XXXXXXXXXX CA XXXXX Debit ...
1                            CHECKCARD XXXX MINITS #115
2     CMSVEND*CV CHICAGO ELMHURST IL               0...
3         Card purchase WENDY'S #54 GREER SC 06-20-XXXX
4                                  Cotton Patch Cafe So
5     DBT CRD XXXX 02/11/21 XXXXXXXX IN N OUT BURGER...
6               DOLLARTRE XXXX W MADIS CHICAGO IL 12/21
7     ERENTERPLAN.COM PS://WWWER CARD: XXXXXXX 09/04...
8                  Etsy.com - BattleBornWood718-XXXXXXX
9                                  Hollys Beauty Supply
10    MCDONALD'S FXXXXX LEHI UT                    0...
11                              Mnrd Columbs Ne 340 Eas
12    POS Debit - Visa Check Card XXXX - AMZN MKTP U...
13    POS PURCHASE AFTERPAY 185-XXXXXXXX CA 77TC3H *...
14    PURCHASE AUTHORIZED ON 01/11 IDENTITYCLUB XXXX...
15    PURCHASE AUTHORIZED ON 03/08 STARBUCKS STORE 0...
16    PURCHASE AUTHORIZED ON 05/05 MASABI_RTD XXX-XX...
17    PURCHASE AUTHORIZED ON 06/13 AMAZON.COM*Y9

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
# From 1-gram
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "VENDING", "SUPERCENTER", "SUPERC", "STORE", "STORES", "RESTAURANT", 
    "PROTECTION", "PARKING", "GRILL", "MARKET", "LIQUOR", "LIQUORS", "GROCERY", 
    "FOOD", "FOODS", "DIGITAL", "DIGIT", "DELI", "COFFEE", "CITY", "CENTER", 
    "CAFE", "BUSINESS", "BEAUTY", "BAR", "STREET"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [8]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), # Remove generic masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails ===
    # === UPDATED RULE ===
    (r"\b(?:INST\s+XFER|RETRY\s+PYMT)\s+(?:ID)?\b", " "), # Handles PAYPAL DES:INST XFER ID...
    (r"\bPAYPAL\s+XFER\b", " "), # Handles PAYPAL XFER
    
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
]

In [9]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # === PAYPAL RULES ===
    # Rules for 'PAYPAL DES:...' format
    re.compile(r"^PAYPAL\s+DES:.*?:(.*?)(?:\s+INDN:.*)?$"),
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),
    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^TARGET\b(.*)"), # Special case for TARGET, might capture store #
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7-ELEVEN\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7\s+11\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"), # Corrected

    # Specific DDBR rule must come *before* the general DD rule
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    # Specific: (POS) TST* MERCHANT [TWO WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Specific: (POS) TST* MERCHANT [ONE WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Fallback: (POS) TST* MERCHANT...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # This generic PAYPAL rule must come *after* specific PAYPAL rules
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OPC\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [10]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 4.58 s, sys: 38.3 ms, total: 4.62 s
Wall time: 4.62 s


In [11]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 682 ms, sys: 4.47 ms, total: 687 ms
Wall time: 684 ms


In [12]:
# df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [13]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [33]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
483784,VISA DDA PUR XXXXXX AMAZON COM 163YW1BX2 AMZN ...,DDA PUR AMAZON COM 163YW1BX2 AMZN COM BILL *,A PUR AMAZON COM 163YW1BX2 AMZN COM BILL
341512,PURCHASE AUTHORIZED ON 08/06 BUC-EE'S #43 XXXX...,BUC-EE'S #43 BUC- LEEDS P,BUC-EE'S
75199,CHECKCARD XXXX COUNTRY SKILLIT LAKE CITY FL XX...,COUNTRY SKILLIT LAKE CITY,COUNTRY SKILLIT LAKE CITY
381341,PURCHASE AUTHORIZED ON 11/15 FIV*STEVES PIZZA ...,FIV*STEVES PIZZA FOLSOM S,FIV
183144,Jackpot Party Store,JACKPOT PARTY STORE,JACKPOT PARTY STORE
182360,JOES TOBACCO OUTLET EL CAJON CA 0...,JOES TOBACCO OUTLET EL CAJON,JOES TOBACCO OUTLET EL CAJON
262434,PURCHASE AUTHORIZED ON 01/08 KING SOOP XXXXX E...,KING SOOP E. MAR DENVER P,KING SOOP E. MAR DENVER P
124180,DEBIT CARD DEBIT / auth #XXXXXX 02-12-XXXX LEM...,LEMI APPS MOUNTAIN VIEW EFF. :,LEMI APPS MOUNTAIN VIEW EFF. :
309726,PURCHASE AUTHORIZED ON 05/13 PARADISE CLEANERS...,PARADISE CLEANERS SAN DIEGO S,PARADISE CLEANERS SAN DIEGO S
443999,SLIM CHICKENS XXXXX CENTENNIAL CO 01/02,SLIM CHICKENS CENTENNIAL,SLIM CHICKENS CENTENNIAL


In [15]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [34]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

BASKIN
WINN-DIXIE
SHIPT
BETMGM
AMZNFREETIME
STATERBRO
IHOP
INTUIT
STEAK-N-SHAKE
PETCO
SKILLZ
SAVEMART
SLICE
BASHAS''
GAMESTOP
WWW.KOHLS.COM
DILLONS
CLAIRE'S
RALPHS
CHEWY.COM
TOLLWAY-AUTOREPLEN
VANS
GOFAN
FOODMAXX
P
AMZ
STORE
GNC
MCW
FRG
VERIZONWRLSS
BANFIELD-PET
*STARBUCKS
OTT
FRYS-MKTPLACE
UPS
FOOD4LESS
SHOPIFY
CANVA
MARINA
GOODWILL
BELK
OCULUS
FIV
SUBWAY
PRICELN
HOME
JACK'S
RAINBOW
WALGREENS
IBI
POPEYES
ABC
ABCMOUSE.COM
WEGMANS
PARKMOBILE
APPLE.COM
STAPLES
TILLYS
NYTIMES
QFC
*UBER
V
*EBAY
OFFICE
FRED-MEYER
SEZZLE
BLUESKY
MEIJER
POSHMARK
RIOT
EBAY
ENMARKET
ZTL
CRT
STOCKTON
DROPBOX
QUADPAY
COLDSTONE
PACSUN
NEWSSTAND
HLLFRSH
M
LUCKY
TAMPA
CRYPTO.COM
EVI
POTBELLY
VONS.COM
EA
G
EPC
BUCKLE
RVT
*MICROSOFT
EVERYPLATE
PAR
PEET'S
ARBY'S
NORDSTROM
TLG
RUBIO'S
FH
DOTERRA
GERALD
L
BOXYCHARM
AF
MOE'S
NORTON
EXPRESS
SHEIN
*STEAM
PAM
PAY
DENNY''S
FBPAY
ECS
CLKBANK
TRTHFDR
REI
SHOPNGO
PCH
CHECKERS
JOURNEYS
UBR
OXNARD
GLOSS
ANCESTRY.COM
MESA
EZPASS
FACEBK
ETT
GIV
CMS
GOFNDME
PMT
WWP
ULTA.COM
PACOIMA
S

In [35]:
df[df['memo_post'] == 'TARGET']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
460552,TARGET DEBIT CRD ACH TRAN XXXXXX TARGET,TARGET TARGET,TARGET


In [36]:
df[df['memo_pre'].str.contains('TARGET')].sample(10)

Unnamed: 0,memo,memo_pre,memo_post
313748,PURCHASE AUTHORIZED ON 05/24 TARGET T- XXXX Fr...,TARGET T- FRUITVI SARASOTA P,T- FRUITVI SARASOTA P
473833,Target T- 430 Blue Rav (Spent),TARGET T- 430 BLUE RAV (SPENT),T- 430 BLUE RAV (SPENT)
461900,TARGET T- XXXXX FM 297 Magnolia TX 0...,TARGET T- FM 297 MAGNOLIA,T- FM 297 MAGNOLIA
462275,TARGET XXXXXXXX GREELEY CO 07/11,TARGET GREELEY,GREELEY
460747,TARGET DEBIT CRD ACH TRAN XXXXXX XXXXXXXXXXXXX...,TARGET 642 TARGET.COM,642 TARGET.COM
462180,TARGET XXXXXXXX BROOKLYN NY XXXXX Debit Card P...,TARGET BROOKLYN 09:38A #,BROOKLYN 09:38A #
348391,PURCHASE AUTHORIZED ON 08/23 TARGET XXXX PHOEN...,TARGET PHOENIX S,PHOENIX S
462156,TARGET XXXXXXXX BROOKLYN NY 01/09,TARGET BROOKLYN,BROOKLYN
461327,TARGET T- 128 Bailey F Monroe NY 0...,TARGET T- 128 BAILEY F MONROE,T- 128 BAILEY F MONROE
127594,DEBIT CARD PURCHASE POSXXXX TARGET T-XXXX Waxa...,POS TARGET T- WAXAHACHIE T PFF6JVM (CASH),POS TARGET T- WAXAHACHIE T PFF6JVM (CASH)


In [19]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

10752                    AIRDNA MARKET DATA HTTPSWWW.AIRD
503662                      HELP.HBOMAX.COM HTTPSHBOMAX.C
244962                              AARONS EZPAY HTTPS: W
476173                            UBER TRIP HELP.UBER.COM
222403                                     PIN AMAZON.COM
476162                            UBER TRIP HELP.UBER.COM
16073                                          AMAZON.COM
278237                 AMAZON SELLER REPA AMZN.COM BILL S
528719                                         WWW.MYSOBO
510270                        WITHDRAWAL PFCU NIKE.COM 30
321127                                         AMAZON.COM
334883                                WISH.COM XXX-XXX- S
306016                 WWW.PAYSTUBSNOW.CO WWW.PAYSTUBSN S
421805                    SWORD AND SCALE HTTPSSWORDAND S
249327               WITH PIN LITTLE ROCK AFB COMM LR AFB
297670                                         AMAZON.COM
90138                    THE KNOWLEDGE TREE GOSQ.COM CKCD
36524         

In [20]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
246012,POS PURCHASE NON-PIN APPLE.COM/BILL CA XXXXXX ...,NON-PIN APPLE.COM BILL ***** 10:16,NON-PIN APPLE.COM BILL
378209,PURCHASE AUTHORIZED ON 11/07 APPLE.COM/BILL CA...,APPLE.COM BILL S,APPLE.COM BILL S
114024,"DB DEBIT - 10-23-XXXX, APPLE.COM/BILL CA,AUTH#...","DB DEBIT - - , APPLE.COM BILL ,","DB DEBIT - - , APPLE.COM BILL ,"
249560,POS Recurring Debit - DDA DBT CRD XXXX 11/29/2...,POS RECURRING DEBIT - DDA DBT CRD APPLE.COM BI...,POS RECURRING DEBIT - DDA DBT CRD APPLE.COM BI...
152358,EFT POS APPLE.COM/ RF#XXXXXX 072,EFT POS APPLE.COM RF# 072,EFT POS APPLE.COM RF
...,...,...,...
418491,Purchase: XXXXXXXX APPLE.COM/BILL CA Card: ***...,: APPLE.COM BILL : ****,: APPLE.COM BILL : ****
425592,RECURRING PAYMENT AUTHORIZED ON 06/21 APPLE.CO...,APPLE.COM BILL S,APPLE.COM BILL S
28270,APPLE.COM/BILL XXX-XXX-XXXX CA 0...,APPLE.COM BILL XXX-XXX,APPLE.COM BILL XXX-XXX
301021,PURCHASE AUTHORIZED ON 04/21 APPLE.COM/BILL XX...,APPLE.COM BILL XXX-XXX- S,APPLE.COM BILL XXX-XXX- S


In [21]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
99494,CMSVEND*CV CHICAGO ELMHURST IL 0...,CMSVEND*CV CHICAGO ELMHURST,CMSVEND
223130,PL*HOAPAYMENT WEB PMTS Q68JD2 WEB ID: XXXXXXXXXX,PL*HOAPAYMENT WEB PMTS Q68JD2,HOAPAYMENT WEB PMTS Q68JD2
322778,PURCHASE AUTHORIZED ON 06/17 SP * FLUTTERHABIT...,SP * FLUTTERHABIT HTTPSFLUTTERH S,FLUTTERHABIT HTTPSFLUTTERH S
146961,Debit Purchase -visa Card XXXXsq *the Cave Dsm...,SQ *THE CAVE DSMDES MOINES,THE CAVE DSMDES MOINES
120517,"DD *DOORDASH WENDYS SAN FRANCISCO, CA, USA","DD *DOORDASH WENDYS SAN FRANCISCO, ,",DOORDASH WENDYS SAN FRANCISCO
...,...,...,...
14586,AMAZON.COM*295ZV5RJ1 SEATTLE WACard XXXX/Withd...,AMAZON.COM*295ZV5RJ1 SEATTLE WACARD WITHDRAWAL #,AMAZON.COM
211297,NOR*NORTHERN TOOL MN 02/22,NOR*NORTHERN TOOL,NORTHERN TOOL
91501,CHECKCARD XXXX TST* Petterinos Chicago IL XXXX...,TST* PETTERINOS CHICAGO CKCD,PETTERINOS CHICAGO CKCD
37640,Amazon.com*HS68DXXXX Amzn.com/bill WA 04/01,AMAZON.COM*HS68D AMZN.COM BILL,AMAZON.COM


In [22]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

426375                                   [ABCMOUSE.COM,  S]
34266                 [AMAZON.COM, 0H1JR7YZ3 AMZN.COM BILL]
267633                [AMAZON.COM, 0J4CQ4I AMZN.COM BILL S]
34306                 [AMAZON.COM, 100Y886V3 AMZN.COM BILL]
64023            [AMAZON.COM, 115PQ1AK3 AM AMZN.COM BILLWA]
304888                [AMAZON.COM, 131HS6P AMZN.COM BILL S]
404010                 [AMAZON.COM, 148HT1 AMZN.COM BILLWA]
64247                        [AMAZON.COM, 1K2 SEATTLE CKCD]
64426               [AMAZON.COM, 1R UL0 AM AMZN.COM BILLWA]
35174                 [AMAZON.COM, 1U07N54Q1 AMZN.COM BILL]
35250                   [AMAZON.COM, 1V5OE65 AMZN.COM BILL]
104880                              [AMAZON.COM, 1W02Q8JA1]
14367                          [AMAZON.COM, 206B09AO0 AMZN]
35568                 [AMAZON.COM, 282PX80T2 AMZN.COM BILL]
14638           [AMAZON.COM, 2C4DW21F0 AMZNAMZN.COM BILLWA]
64823            [AMAZON.COM, 2C8JB80C2 AM AMZN.COM BILLWA]
14761           [AMAZON.COM, 2E2OX6U72 A

# Phase 2: Extract & Analyze N-Grams

In [23]:
df_p2 = pd.read_csv("memos_P1.csv")

In [24]:
print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

243728    GIV
180433    IBI
406870      P
259215    UBR
41316     BAR
17935     AMZ
5597         
397976    ANN
79220     HAD
417978    RED
429955      M
77682     FGM
222147    PIN
428514    AGI
429451     HP
526262    ZTL
5484         
86604     RVT
131934    DNH
9634       AF
406855      P
406772    PWP
366002    LIM
237039    SVK
206623    MYP
138267    DRI
426694    OTT
151535     EA
429952     GM
526613    ZOX
5691         
526259    ZTL
436866     RH
473254    TVY
78211     FYF
484564    VIT
163548    FYF
447297    ECS
504111    QVC
154006    ETT
81898      LS
191352    LJS
175943     HP
462802    TBE
428295    QVC
175931     HP
279352    EPC
406865      P
201586    MGM
410046    E-Z
84888     PAY
72820     BLS
90553     TLF
138261    DRI
391296    WWP
504127    QVC
473255    TVY
50341     BKG
336865    WSS
5277         
201584    MGM
386632    FIV
177863    IGA
138283    DRI
5488         
61708      CC
94160     DOX
175944     HP
258593     BH
218945    *KA
219737    BUY
215246

In [25]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [26]:
%%time
# corpus = df_p2['memo_post'].fillna('')
# print(f"Analyzing {len(corpus)} cleaned memos...")
# # Get the top 200 of each n-gram type
# top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
# top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
# top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
# print(f"--- N-gram Analysis Complete ---")

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.34 µs


In [27]:
top_1grams.sort(reverse=True
    
)
top_1grams

NameError: name 'top_1grams' is not defined

In [None]:
top_2grams

In [None]:
top_3grams

In [None]:
# Use 1 grams to find prefixes