# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main 33affff] regex
 1 file changed, 3184 insertions(+), 3232 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in.sample(100_000).copy()
row_count = df.size
row_count

100000

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0         APPLE CASH 04-29 CAXXXX DEBIT CARD MISC DEBIT
1     DBT PURCHASE ON 05/20 @ 18:10 / IN *THERE YA G...
2     DUNKIN #XXXXXX MIAMI FL                      0...
3     FAIRWAY LIQUOR 11/04 #XXXXXXXXX PURCHASE 340 W...
4     KROGER #5 XXXX 02/10 #XXXXXXXXX PURCHASE KROGE...
5     MACYS : RETRY PYMT ID: CITIAUTFDR CO: MACYS NA...
6       POS Debit - Visa Check Card XXXX - STANLEY MART
7     PURCHASE AUTHORIZED ON 01/18 CANTEEN VENDING R...
8     PURCHASE AUTHORIZED ON 02/06 Wal-Mart Super Ce...
9     PURCHASE AUTHORIZED ON 03/13 TOMS FAMOUS FAMIL...
10    PURCHASE AUTHORIZED ON 05/24 FAREWAY STORES IN...
11    PURCHASE AUTHORIZED ON 07/25 FLEXJOBS.COM CA S...
12    PURCHASE AUTHORIZED ON 08/10 TWISTER TACO Y BU...
13    PURCHASE AUTHORIZED ON 09/22 WINE AND SPIRITS ...
14                      PURCHASE VIP TANNING STORE # 12
15    PURCHASE XXXX DD DOORDASH DENNY XXXXXXXXXX CA ...
16    Point of Sale Debit L340 DATE 11-08 Amazon com...
17    RECURRING PAYMENT AUTHORIZED ON 01/27 GOOG

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
# From 1-gram
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [33]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"\bX{4,}\b", " "), # PRECISE: Remove standalone masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times (INCLUSIVE) ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{4}-\d{2}-\d{2}\b", " "), # YYYY-MM-DD
    (r"\b\d{1,2}-(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-\d{2,4}\b", " "), # DD-MMM-YYYY
    (r"\b(?:19|20)\d{2}(?:0[1-9]|1[0-2])(?:0[1-9]|[12]\d|3[01])\b", " "), # YYYYMMDD
    (r"\b(?:0[1-9]|1[0-2])(?:0[1-9]|[12]\d|3[01])\d{2}\b", " "), # MMDDYY
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails (INCLUSIVE) ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\bID\b", " "), # Remove standalone 'ID' (from 'ID: DSW')
    (r"\b(?:REF|TERM|TRN|INV|ACCT|TID|MID)\s*#?[\d\w-]+\b", " "), # REF 123, TERM 001, etc.
    (r"\bAUTH\s+CODE\s*[\d\w-]+\b", " "), # AUTH CODE 0123
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers (INCLUSIVE) ===
    (r"\b1[\s.-]\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}\b", " "), # 1-800-555-1212
    (r"\b\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}\b", " "), # (800) 555-1212, 800.555.1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\bXXX-XXX-XXXX\b", " "), # Masked phone

    # === 8) URLs/domains (INCLUSIVE) ===
    (r"^\.COM\s+BILL\b.*", " "),
    (r"\s+\.(?:COM|NET|ORG|GOV|EDU|IO|CO)\b", " "), # Remove trailing .COM, .NET etc.

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
    (r"^\s*-\s+", " ") # Remove leading hyphens
]

In [9]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # === NEW PAYPAL RULES ===
    # Handle 'PAYPAL *... MERCHANT' (from INST XFER)
    re.compile(r"^PAYPAL\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    # Handle 'PAYPAL : MERCHANT' (from INST XFER / ID:)
    re.compile(r"^PAYPAL\s*:\s*([A-Z\s0-9'.-]+).*"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === 7-ELEVEN RULES ===
    # These now capture '7-ELEVEN' or '7 11' as the merchant
    re.compile(r"^(7-ELEVEN)\s*\*?#?.*"),
    re.compile(r"^(7\s+11)\s*\*?#?.*"),
    
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # === UPDATED RULE ORDER ===
    # Specific DDBR rule must come *before* the general DD rule
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    # re.compile(r"^DD\s*BR\s*\*?#?\s*([A-Z\s0-9'.-]+).*"), # Removed, redundant
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    # Specific: (POS) TST* MERCHANT [TWO WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Specific: (POS) TST* MERCHANT [ONE WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Fallback: (POS) TST* MERCHANT...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # Generic PAYPAL rule, catches PAYPAL*MERCHANT
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"), 

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [10]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)

memos = memos.str.replace(NOISE_WORDS_REGEX, " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 4.53 s, sys: 33.8 ms, total: 4.56 s
Wall time: 4.56 s


In [11]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 695 ms, sys: 8.17 ms, total: 703 ms
Wall time: 701 ms


In [12]:
# df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [13]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [14]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
111857,Check Card: TST* NOTHING BUNDT CAKES MENIFEE C...,: TST* NOTHING BUNDT CAKES MENIFEE,: TST* NOTHING BUNDT CAKES MENIFEE
170495,GUSTO NET XXXXXX 6semjrprp0a,GUSTO NET 6SEMJRPRP0A,GUSTO NET 6SEMJRPRP0A
179661,INSTACART*159 11-14 WWW.PUBLIX.CO CA XXXX DEBI...,INSTACART*159 .PUBLIX.CO,INSTACART
190519,LIQUOR PLANET 05/29 #XXXXXXXXX PURCHASE LIQUOR...,LIQUOR PLANET # PURCHASE LIQUOR PLANET MURFREE...,LIQUOR PLANET
256492,PRESIDENTE SUPE N BAY VLG FL 10/07,PRESIDENTE SUPE N BAY VLG,PRESIDENTE SUPE N BAY VLG
334781,PURCHASE AUTHORIZED ON 07/19 PUBLIX SUPER MAR ...,PUBLIX SUPER MAR 950 E OAKLAND PARK P,PUBLIX SUPER MAR 950 E OAKLAND PARK P
433238,RIVER CITY MARKET CHICAGO IL 0...,RIVER CITY MARKET CHICAGO,RIVER CITY MARKET CHICAGO
272155,PURCHASE AUTHORIZED ON 02/04 SILVER DOG BED & ...,SILVER DOG BED & B WEST SAINT S,SILVER DOG BED & B WEST SAINT S
509712,Withdrawal Fee / SHOPRITE EDDYSTONE S1 EDDYSTO...,WITHDRAWAL FEE SHOPRITE EDDYSTONE S1 EDDYSTONE...,WITHDRAWAL FEE SHOPRITE EDDYSTONE S1 EDDYSTONE 30
507797,Withdrawal Debit Card Signature Debit/SAMSCLUB...,WITHDRAWAL SIGNATURE DEBIT SAMSCLUB # GAINESVI...,WITHDRAWAL SIGNATURE DEBIT SAMSCLUB


In [15]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [16]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

TARGET.COM
WINN-DIXIE
AMZNFREETIME
SLICE
BASKIN
DES
VANS
PETCO
SHIPT
TOLLWAY-AUTOREPLEN
GOFAN
RALPHS
IHOP
CHEWY.COM
STATERBRO
BETMGM
CANVA
DILLONS
SKILLZ
STEAK-N-SHAKE
*STARBUCKS
GAMESTOP
STAPLES
SAVEMART
.KOHLS.COM
PRICELN
FRYS-MKTPLACE
BANFIELD-PET
ABC
GNC
INTUIT
WALGREENS
BASHAS''
SUBWAY
SEZZLE
V
FRG
ETT
BELK
MCW
RIOT
JOURNEYS
VERIZONWRLSS
QFC
P
RAINBOW
POTBELLY
DROPBOX
OCULUS
FIV
MARINA
GERALD
SHOPIFY
CHECKERS
FACEBK
NORTON
IBI
EVERYPLATE
FOOD4LESS
TILLYS
MOE'S
TOMMY'S
PARKMOBILE
PACSUN
STORE
ECS
QUADPAY
GOODWILL
CLAIRE'S
TLG
L
ENMARKET
OTT
ZTL
EZPASS
NYTIMES
FBPAY
EA
FOODMAXX
*MICROSOFT
NEWSSTAND
*EBAY
ROBLOX
EVI
ABCMOUSE.COM
HOME
AMZ
E-Z
*EPIC
OPC
MEIJER
TRTHFDR
VOLA
POPEYES
WEGMANS
UPS
LEGALSHIELD
PAR
TARGET
AIRBNB
NIKE.COM
EPC
RVT
*UBER
CDSR
EBAY
KFC
SIMPLISAFE
CKO
BLIZ
*LYFT
JACK'S
LUCKY
RONAN
GRUBHUB
CRT
LIM
EIG
SEDANOS
CMS
HLLFRSH
ANCESTRY.COM
HSN
M
FLANIGANS
PAM
BLUESKY
SEGPAY.COM
G
GPI
RANDALLS
PCH
GIV
UNITED
ANTIN
BOXYCHARM
HANNAFORD
OFFICE
PAY
LZC
SMITHS-FO
VONS.COM
MSB


In [17]:
df[df['memo_post'] == '']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
208413,Mo,,
524215,XXXXXXXXXXXXXXX,,
232980,POS Debit - Visa Check Card XXXX - FLXXXX,,
512862,XXXX,,
98408,CK XXXXXXX,,
518121,XXXXX,,
159148,FLXXXX,,
254398,POS Withdrawal ACE #XXXX ECHO VALLEY CLEVELAN...,ACE # ECHO VALLEY CLEVELAND,
523779,XXXXXXXX XXXXXXXXXX,,


In [18]:
df[df['memo_pre'].str.contains('ELEVEN')].sample(10)

Unnamed: 0,memo,memo_pre,memo_post
5045,7 ELEVEN XXXXX PROVO UT XXXXXX 1...,7 ELEVEN PROVO,7 ELEVEN PROVO
6932,7-ELEVEN XXXXX 09/07 #XXXXXXXXX PURCHASE 7-ELE...,7-ELEVEN # PURCHASE 7-ELEVEN WEBSTER,7-ELEVEN
330017,PURCHASE AUTHORIZED ON 07/07 7-ELEVEN MIRAMAR ...,7-ELEVEN MIRAMAR P,7-ELEVEN
169944,GRUBHUB7ELEVEN GRUBHUB.COM NY 1...,GRUBHUB7ELEVEN GRUBHUB.COM,GRUBHUB7ELEVEN GRUBHUB.COM
389327,PURCHASE AUTHORIZED ON 12/06 7-ELEVEN PEMBROKE...,7-ELEVEN PEMBROKE PINE P,7-ELEVEN
299592,PURCHASE AUTHORIZED ON 04/17 7-ELEVEN Lehigh A...,7-ELEVEN LEHIGH ACRES P,7-ELEVEN
522354,XXXXXX PURCHASE-PIN 09/05 18:02 7-ELEVEN MURRI...,PIN 18:02 7-ELEVEN MURRIETA 00MTH801,PIN 18:02 7-ELEVEN MURRIETA 00MTH801
202965,MOBILE PURCHASE XXXX 7-ELEVEN XXXXX WESTFIELD ...,MOBILE PURCHASE 7-ELEVEN WESTFIELD,MOBILE PURCHASE 7-ELEVEN WESTFIELD
308404,PURCHASE AUTHORIZED ON 05/10 7-ELEVEN MIAMI FL...,7-ELEVEN MIAMI P,7-ELEVEN
6011,7-ELEVEN Brentwood CA 0...,7-ELEVEN BRENTWOOD,7-ELEVEN


In [19]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

16684                                          AMAZON.COM
34396                                          AMAZON.COM
110210                                         AMAZON.COM
385786                                   APPLE.COM BILL S
404044                                         AMAZON.COM
332022                       COME BACK SHACK CHARLESTON S
116755                      CRD DSGAUDEQ WALMART.COM AA C
278585                                      ALIBABA.COM S
245425                             ETSY.COM - JONAT 7 18-
140252               DAVE.COM P2P SINIUA SAMARITA ACCOUNT
65325                                          AMAZON.COM
417939                        DEMI LOVATO HTTPSDEMILOVAPA
83555                                          MYFICO.COM
318054                                         AMAZON.COM
50252                               WITHDRAWAL AMAZON.COM
9679                       AFFIRM.COM PAYMENTS AFFIRM.COM
462639                                         TARGET.COM
476330        

In [20]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
125362,DEBIT CARD PURCHASE APPLE.COM/BILL CAXXXXXXXXX...,APPLE.COM BILL L835D3J (CASH),APPLE.COM BILL L835D3J (CASH)
114684,DBT CRD XXXX 04/26/21 XXXXXXXX APPLE.COM/BILL ...,CRD APPLE.COM BILL APPLE.COM BILL XXX-XXX- C#,CRD APPLE.COM BILL APPLE.COM BILL XXX-XXX- C
421724,RECURRING PAYMENT AUTHORIZED ON 02/14 APPLE.CO...,APPLE.COM BILL S,APPLE.COM BILL S
334040,PURCHASE AUTHORIZED ON 07/17 PP*APPLE.COM/BILL...,PP*APPLE.COM BILL XXX-XXX- S,APPLE.COM BILL XXX-XXX- S
27016,APPLE.COM/BILL 02-24 XXX-XXX-XXXX CA XXXX DEBI...,APPLE.COM BILL XXX-XXX,APPLE.COM BILL XXX-XXX
...,...,...,...
429053,RECURRING PAYMENT AUTHORIZED ON 10/12 APPLE.CO...,APPLE.COM BILL S,APPLE.COM BILL S
436193,Recurring Debit Purchase Card XXXXapple.com/bi...,APPLE.COM BILL 866-712,APPLE.COM BILL 866-712
418526,Purchase: XXXXXXXX APPLE.COM/BILL CA Card: ***...,: APPLE.COM BILL : ****,: APPLE.COM BILL : ****
348493,PURCHASE AUTHORIZED ON 08/24 APPLE.COM/BILL XX...,APPLE.COM BILL XXX-XXX- S,APPLE.COM BILL XXX-XXX- S


In [21]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
142040,Debit Card IC* INSTACART*SUBSCRIP HTTPSINSTACA...,IC* INSTACART*SUBSCRIP HTTPSINSTACARCA,INSTACART
83944,CHECKCARD XXXX NIC*- FL DL T04 MIAMI FL XXXXXX...,NIC*- DL T04 MIAMI,NIC
293881,PURCHASE AUTHORIZED ON 04/02 LYFT *RIDE SAT 6 ...,LYFT *RIDE SAT 6 LYFT.COM S,LYFT
284761,PURCHASE AUTHORIZED ON 03/10 TST* JACKS URBAN ...,TST* JACKS URBAN E XXX-XXX- S,JACKS URBAN E XXX-XXX- S
177571,IC* INSTACART HTTPSINSTACAR CA Card XXXX,IC* INSTACART HTTPSINSTACAR,INSTACART HTTPSINSTACAR
...,...,...,...
15092,AMAZON.COM*2X55K3P62 SEATT LE WAXXX...,AMAZON.COM*2X55K3P62 SEATT LE WEQWR2P,AMAZON.COM
72958,CHECKCARD XXXX BOOKOFMONTH *1 XXXXXXXXXX NY XX...,BOOKOFMONTH *1 RECURRING,BOOKOFMONTH
36961,Amazon.com*EE5PX1CP3 Amzn.com/bill WA 03/11,AMAZON.COM*EE5PX1CP3 AMZN.COM BILL,AMAZON.COM
66737,CHECKCARD XXXX AMAZON.COM*W00ZQ5T03 AM AMZN.CO...,AMAZON.COM*W00ZQ5T03 AM AMZN.COM BILLWA,AMAZON.COM


In [22]:
df[df['memo_post'] == '']

Unnamed: 0,memo,memo_pre,memo_post
208413,Mo,,
524215,XXXXXXXXXXXXXXX,,
232980,POS Debit - Visa Check Card XXXX - FLXXXX,,
512862,XXXX,,
98408,CK XXXXXXX,,
518121,XXXXX,,
159148,FLXXXX,,
254398,POS Withdrawal ACE #XXXX ECHO VALLEY CLEVELAN...,ACE # ECHO VALLEY CLEVELAND,
523779,XXXXXXXX XXXXXXXXXX,,


In [23]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

8911                                  [ACI, FINRA FORM U10]
34239                 [AMAZON.COM, 011FG1K43 AMZN.COM BILL]
34262                 [AMAZON.COM, 0E8DE0Q33 AMZN.COM BILL]
34458                 [AMAZON.COM, 1A12Z3F82 AMZN.COM BILL]
13867         [AMAZON.COM, 1F7IC9ZP2 AMAZON.COM SEATTLE, ,]
292760                [AMAZON.COM, 1H52K7C AMZN.COM BILL S]
356195                [AMAZON.COM, 1M43G84 AMZN.COM BILL S]
34926                 [AMAZON.COM, 1M9TC4VD0 AMZN.COM BILL]
228390                        [AMAZON.COM, 1O U AMZN.COM B]
35067                 [AMAZON.COM, 1Q51J2V81 AMZN.COM BILL]
14195         [AMAZON.COM, 1R2IS79X1 AMAZON.COM SEATTLE, ,]
64424            [AMAZON.COM, 1R9ZT3FN2 AM AMZN.COM BILLWA]
35371                 [AMAZON.COM, 1Z1K53O52 AMZN.COM BILL]
404118                 [AMAZON.COM, 1Z6WP3 AMZN.COM BILLWA]
35599                 [AMAZON.COM, 293TW7Z63 AMZN.COM BILL]
35759                    [AMAZON.COM, 2D YS2 AMZN.COM BILL]
35715                 [AMAZON.COM, 2D4CM

# Phase 2: Extract & Analyze N-Grams

In [24]:
df_p2 = pd.read_csv("memos_P1.csv")

In [25]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [26]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [27]:
%%time
corpus = df_p2['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
print(f"--- N-gram Analysis Complete ---")

Analyzing 100000 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 2.05 s, sys: 12.6 ms, total: 2.07 s
Wall time: 2.06 s


In [32]:
top_1grams.sort(key=lambda x: x[0])
top_1 = [ngram for ngram, count if ngram not in NOISE_WORDS]

SyntaxError: expected 'else' after 'if' expression (3434167287.py, line 2)

In [29]:
top_2grams

[('amazon com', 4656),
 ('xxx xxx', 4275),
 ('cash app', 3442),
 ('com xxx', 949),
 ('wal mart', 898),
 ('amazon prime', 729),
 ('mart super', 669),
 ('mobile purchase', 605),
 ('withdrawal debit', 568),
 ('dollar general', 561),
 ('mart sup', 515),
 ('san diego', 474),
 ('chick fil', 472),
 ('taco bell', 464),
 ('apple com', 413),
 ('dollar tr', 380),
 ('help uber', 379),
 ('uber com', 365),
 ('publix super', 343),
 ('new york', 333),
 ('burger king', 332),
 ('home depot', 313),
 ('little caesars', 296),
 ('debit signature', 293),
 ('signature purchase', 293),
 ('pos deb', 289),
 ('family dollar', 279),
 ('las vegas', 275),
 ('costco whse', 262),
 ('pos pur', 244),
 ('super mar', 242),
 ('sonic drive', 230),
 ('mart com', 225),
 ('amzn com', 219),
 ('amzn mktp', 205),
 ('sams club', 202),
 ('los angeles', 202),
 ('non pin', 193),
 ('hunt valley', 192),
 ('san antonio', 187),
 ('trip help', 186),
 ('stop shop', 184),
 ('signature debit', 181),
 ('eats help', 174),
 ('fort myers', 171),

In [30]:
top_3grams

[('com xxx xxx', 805),
 ('wal mart sup', 515),
 ('help uber com', 356),
 ('debit signature purchase', 293),
 ('publix super mar', 242),
 ('wal mart super', 172),
 ('trip help uber', 169),
 ('eats help uber', 167),
 ('apple com xxx', 162),
 ('mobile purchase sign', 159),
 ('purchase sign based', 159),
 ('amazon com seattle', 155),
 ('withdrawal signature debit', 119),
 ('nayax hunt valley', 115),
 ('bath body works', 94),
 ('purchase amazon com', 93),
 ('debit pin purchase', 92),
 ('com aa xxx', 88),
 ('aa xxx xxx', 88),
 ('xxx xxx troy', 87),
 ('withdrawal amazon com', 85),
 ('pur amazon com', 81),
 ('mart com aa', 80),
 ('pos amazon com', 78),
 ('salt lake cit', 77),
 ('info target om', 76),
 ('domino xxx xxx', 75),
 ('point sale debitl340', 73),
 ('purchase cash app', 72),
 ('help hbomax com', 72),
 ('klover app boost', 70),
 ('merchant issued payment', 68),
 ('issued payment target', 67),
 ('payment target target', 67),
 ('ppd info target', 66),
 ('pin amazon com', 65),
 ('fresh cof

In [31]:
# Use 1 grams to find prefixes