# Phase 1: Preprocessing with Regular Expressions

## 1. Load Dataset


In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [2]:
df = pd.read_csv("memos.csv")
row_count = df.size
row_count

528766

In [3]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     BJS WHOLESALE #0 XXXX BEN 06-19-22 FORT MYERS ...
1     CHECKCARD XXXX CAKERY CLASSIC BUTT gosq.com MO...
2           FERRY PARKING LARKSP SAN FRANCSICO CA 07/29
3     MIDWESTERN MEATS RESTAU MESA AZ              0...
4     NNT TIPSY T'S DISCO00 INDIANAPOLIS INXXXXXX  0...
5     NST THE HOME D 05/15 #XXXXXXXXX PURCHASE XXXX ...
6         Online Payment XXXXXXXXXXX To JC PENNEY 12/01
7     POS Debit - Visa Check Card XXXX - 7-ELEVEN SA...
8     POS Debit - Visa Check Card XXXX - TST* STANDA...
9     PURCHASE AUTHORIZED ON 03/06 DOORDASH*JACK IN ...
10    PURCHASE AUTHORIZED ON 07/18 SHOPRITE HOWELL S...
11    PURCHASE AUTHORIZED ON 08/26 Amazon.com*MM3KB0...
12    PURCHASE AUTHORIZED ON 08/31 TACO BELL XXXX ST...
13    PURCHASE AUTHORIZED ON 10/24 COSTCO WHSE #XXXX...
14    PURCHASE AUTHORIZED ON 11/29 WAL-MART #XXXX PH...
15    PURCHASE AUTHORIZED ON 12/20 FISHERMEN'S CATCH...
16                     Parkers Goose Creek Sc PIN Purch
17    ROYAL HOUSE OF 10/13 #XXXXXXXXX PURCHASE R

## 2. Define Regex Rules

In [4]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Prevents matching "IN N OUT"
    "KS", "KY",
    "LA(?!\\s+HACIENDA|\\s+FITNESS)", # Prevents matching "LA HACIENDA"
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Prevents matching "ADORE ME"
    "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [41]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), 
    (r"\s{2,}", " "), 

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), 
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+CARD\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "),

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "),
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "),
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "),

    # === 5) Merchant-terminal boilerplate ===
    # [MODIFIED] Removed '(?:\s+\w+)?' to stop it from consuming merchant names
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\bPAYMENT\s+CARD\s+TARGET\s+DEBIT\s+CRD(?:\s+ACH(?:\s+TRAN)?)?\b", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b.*", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "),
    
    # === 6) Misc tails ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b.*", " "),
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT|SEARS\s+BILL\s+PAYMENT)\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), 
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "),
    (r"\b\d{3}-\d{4}\b", " "),
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), 
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "),

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "),
    (r"[-]{2,}", " "),
]

In [42]:
REGEX_POST = [
    # --- Prefixes from previous iterations ---
    re.compile(r"^(ACI\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(KING\s*#\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(KING\s*#\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(ACE\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(7-ELEVEN\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(7\s+11\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(ZG\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(YSI\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(DD\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(CCI\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(PY\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(ANC\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(IC\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(J2\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(OSP\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(PL\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(RTI\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(FSP\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(DD\s*BR\s*\*?#?\s*[A-Z\s0-9'.-]+).*"), 
    re.compile(r"^(PT\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(PP\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(CHR\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(USA\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(SP\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(TST\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(CKE\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(SQ\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(SP\+AFF\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(IC\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(SIE\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(PAYPAL\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    
    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [43]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 35.8 s, sys: 312 ms, total: 36.1 s
Wall time: 36.1 s


In [44]:
%%time
# Second pass
def apply_regex(memo_cleaned):
    for pattern in REGEX_POST:
        match = pattern.match(memo_cleaned)
        if match:
            return match.group(1).strip()
    return memo_cleaned
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 4.74 s, sys: 35.9 ms, total: 4.78 s
Wall time: 4.78 s


In [9]:
# Save to "memos_P1.csv"
df.to_csv('memos_P1.csv', index=False) 

In [10]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
37809,Amazon.com*HY1F93ZL2 Amzn.com/bill WA 03/30,AMAZON.COM*HY1F93ZL2 AMZN.COM BILL,AMAZON.COM
254495,POS Withdrawal AMZN Mktp / US*1K6TX1Amzn.com/b...,AMZN MKTP US*1K6TX1AMZN.COM BILL,AMZN MKTP US
359007,PURCHASE AUTHORIZED ON 09/19 PUBLIX SUPER MAR ...,PUBLIX SUPER MAR PORT CHARLOTT P,PUBLIX SUPER MAR PORT CHARLOTT P
110836,Check Card Purchase / COURTYARD BY MARRIOTT OR...,PURCHASE COURTYARD BY MARRIOTT ORLANDO,PURCHASE COURTYARD BY MARRIOTT ORLANDO
87509,CHECKCARD XXXX SNACK SODA VENDIN HOLLYWOOD FL ...,SNACK SODA VENDIN HOLLYWOOD,SNACK SODA VENDIN HOLLYWOOD
278128,PURCHASE AUTHORIZED ON 02/20 THE HOME DEPOT #X...,THE HOME DEPOT # ORANGE CITY P,THE HOME DEPOT
326930,PURCHASE AUTHORIZED ON 06/28 TOCAYA KIERLAND C...,TOCAYA KIERLAND SCOTTSDALE S,TOCAYA KIERLAND SCOTTSDALE S
314120,PURCHASE AUTHORIZED ON 05/25 ULTA#XXXX LAS VEG...,ULTA# LAS VEGAS S,ULTA
386106,PURCHASE AUTHORIZED ON 11/27 Wal-Mart Super Ce...,WAL-MART SUPER CENTER FAYETTEVILLE P,WAL-MART SUPER CENTER FAYETTEVILLE P
508646,Withdrawal Debit DUNKIN #XXXXXX Q35 MANCHESTER...,WITHDRAWAL DEBIT DUNKIN # Q35 MANCHESTER 973 20 #,WITHDRAWAL DEBIT DUNKIN


In [50]:
unique_df = df.drop_duplicates(subset='memo_post')

result = (
    unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
    .sort_values(by='memo_pre')[300:400]
)
result

Unnamed: 0,memo,memo_pre,memo_post
526686,amoledwfs,AMOLEDWFS,AMOLEDWFS
228966,POS Debit - Visa Check Card XXXX - AMPHORABAK ...,AMPHORABAK,AMPHORABAK
17865,AMS*MNHUNTFISH TX 06/12,AMS*MNHUNTFISH,AMS
17866,AMSHINE,AMSHINE,AMSHINE
67111,CHECKCARD XXXX AMSOIL XXXXXXXXXX WI XXXXXXXXXX...,AMSOIL,AMSOIL
...,...,...,...
40284,Astroline.Today,ASTROLINE.TODAY,ASTROLINE.TODAY
238944,POS Debit ASURINT /,ASURINT,ASURINT
40291,AsurionWireles,ASURIONWIRELES,ASURIONWIRELES
230125,POS Debit - Visa Check Card XXXX - ATG,ATG,ATG


In [37]:
df[(df['memo'].str.contains('QVC')) & (df['memo_post'] == '')]['memo'].iloc[0]

'POS PURCHASE / MERCHANT PURCHASE TERMINAL XXXXXXXX QVC*XXXXXXXXXXXX* 800-367-9 PA XXXXXXXXXXXXXXXX 10-09-21 12:00 AM'

In [14]:
print(df[df['memo_post'].str.startswith('ZG')]['memo_post'].to_string())

94056             ZG * RENTAPPLICAT XXX-XXX- RECURRING CKCD
94057               ZG * RENTAPPLICATION XXX-XXX- RECURRING
143523                         ZG * RENTAPPLICATION XXX-XXX
420670                                 ZG * RENTAPPLICATI S
421189                        ZG * RENTAPPLICATI XXX-XXX- S
426374                        ZG * RENTAPPLICATI XXX-XXX- S
430735                                 ZG * RENTAPPLICATI S
525904                                 ZG * RENTAPPLICATION
525905                      ZG * RENTAPPLICATION WITHDRAWAL
525906                                 ZG * RENTAPPLICATION
525907                                 ZG * RENTAPPLICATION
525908                         ZG * RENTAPPLICATION XXX-XXX
525909                         ZG * RENTAPPLICATION XXX-XXX
525910    ZG * RENTAPPLICATION XXX-XXX- US PREAUTH HOLD-...
525911                        ZG * RENTAPPLICATION XXX-XXX-
526568                                 ZG * RENTAPPLICATION
526569                                  

In [15]:
merchants = ['AMAZON.COM', 'DOORDASH', 'GOOGLE', 'WAL-MART', 'CHICK-FIL-A', 'UBER']

# Phase 2: Extract & Analyze N-Grams

In [15]:
df_p2 = pd.read_csv("memos_P1.csv")

In [16]:
df.sample(15)

Unnamed: 0,memo,memo_pre,memo_post
436055,Recurring Debit Purchase 07/24 Card XXXXget It...,RECURRING GET IT NOW XXX,RECURRING GET IT NOW XXX
133085,DOLLAR GENERAL BEEBE AR 10/29/20 16:10:05,DOLLAR GENERAL BEEBE 16:10:05,DOLLAR GENERAL BEEBE 16:10:05
53476,CAPE CENTER 02-01 CAPE CHARLES VA XXXX DEBIT C...,CAPE CENTER CAPE CHARLES,CAPE CENTER CAPE CHARLES
475554,UBER * EATS PENDING AMSTERDAM 1...,UBER * EATS PENDING AMSTERDAM MX NU PESO 264.0...,UBER
414725,Pos Debit- XXXX XXXX Louisiana Seafood Deca...,LOUISIANA SEAFOOD DECATUR,LOUISIANA SEAFOOD DECATUR
439798,SAN DIEGO ROCK SUPPLY EL CAJON CA 10/23,SAN DIEGO ROCK SUPPLY EL CAJON,SAN DIEGO ROCK SUPPLY EL CAJON
232107,POS Debit - Visa Check Card XXXX - DICK'S SPOR...,DICK'S SPORTING GO CHESAPEAKE,DICK'S SPORTING GO CHESAPEAKE
281838,PURCHASE AUTHORIZED ON 03/03 365 Market 888 43...,365 MARKET 888 43 TROY S,365 MARKET 888 43 TROY S
300558,PURCHASE AUTHORIZED ON 04/19 TWIN PEAKS AUTO S...,TWIN PEAKS AUTO SAN FRANCISCO P,TWIN PEAKS AUTO SAN FRANCISCO P
390321,PURCHASE AUTHORIZED ON 12/08 PICK UP STIX XXXX...,PICK UP STIX S,PICK UP STIX S
