# Phase 1: Preprocessing with Regular Expressions

## 1. Load Dataset


In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [2]:
df = pd.read_csv("memos.csv")
row_count = df.size
row_count

528766

In [3]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0                        AFTERPAY 185-XXXXXXXX CA 09/01
1                                  AMAZON.COM*2Y6FH8VA2
2     AMZN Mktp US*3I5F48F Amzn.com/bill WA        0...
3     Amazon Tips*HW4ZV562 Amzn.com/bill WA        1...
4     Brigit-com PROTECTION 46BXXXXF1F954F0 WEB ID: ...
5     CHECKCARD XXXX HARDEES XXXXXXX JEFFERSON GA XX...
6                               Danny Hair Beauty Suppl
7                  IC* INSTACART HTTPSINSTACAR CA 09/15
8                   IN N OUT BURGER 336 UPLAND CA 06/04
9     POS Debit - Visa Check Card XXXX - CASH APP*CR...
10    POS Withdrawal - JACKSONS XXXX 2 - MEMO=POS Wi...
11    PURCHASE AUTHORIZED ON 02/19 DOLLARTRE 105 S T...
12    PURCHASE AUTHORIZED ON 02/26 WAL-MART #XXXX HO...
13    PURCHASE AUTHORIZED ON 04/02 JACK IN THE BOX 6...
14    PURCHASE AUTHORIZED ON 09/01 ISLAND JERK RE Hu...
15    PURCHASE AUTHORIZED ON 10/17 COLDSTONE #XXXXX ...
16    PURCHASE AUTHORIZED ON 10/18 SMITH'S F 689 NOR...
17    PURCHASE AUTHORIZED ON 10/27 ORCA WA SXXXX

## 2. Define Regex Rules

In [4]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Prevents matching "IN N OUT"
    "KS", "KY",
    "LA(?!\\s+HACIENDA|\\s+FITNESS)", # Prevents matching "LA HACIENDA"
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Prevents matching "ADORE ME"
    
    # [MODIFIED] Now checks for ' and `
    "MI", "MN", "MO(?!['`])", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    
    "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [5]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), 
    (r"\s{2,}", " "), 

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), 
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "),

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "),
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "),
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "),

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "),
    
    # === 6) Misc tails ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    
    # [CHANGE 1 - MODIFIED] The '|SEARS\s+BILL\s+PAYMENT' part MUST be deleted from this line.
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    
    # [CHANGE 2 - ADDED] This new line must be added.
    (r"\bBILL\s+PAYMENT\b", " "),
    
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), 
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "),
    (r"\b\d{3}-\d{4}\b", " "),
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), 
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "),

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "),
    (r"[-]{2,}", " "),
]

In [6]:
REGEX_POST = [
    # --- Prefixes from previous iterations ---
    re.compile(r"^(TARGET)\b.*"), 
    re.compile(r"^(ACI\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(KING\s*#\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(KING\s*#\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(ACE\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(7-ELEVEN\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(7\s+11\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(ZG\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(YSI\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(DD\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(CCI\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(PY\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(ANC\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(IC\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(J2\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(OSP\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(PL\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(RTI\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(FSP\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(DD\s*BR\s*\*?#?\s*[A-Z\s0-9'.-]+).*"), 
    re.compile(r"^(PT\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(PP\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(CHR\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(USA\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(SP\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(TST\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(CKE\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(SQ\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(SP\+AFF\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(IC\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    re.compile(r"^(SIE\s*\*?\s*[A-Z\s0-9'.-]+).*"),

    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),
    
    re.compile(r"^(PAYPAL\s*\*?\s*[A-Z\s0-9'.-]+).*"),
    
    # [ADDED] Specific rule for 'AMS * MERCHANT' pattern
    # This will capture 'MNHUNTFISH' from 'AMS*MNHUNTFISH'
    re.compile(r"^AMS\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [7]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 31.3 s, sys: 256 ms, total: 31.6 s
Wall time: 31.6 s


In [8]:
%%time
# Second pass
def apply_regex(memo_cleaned):
    for pattern in REGEX_POST:
        match = pattern.match(memo_cleaned)
        if match:
            return match.group(1).strip()
    return memo_cleaned
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 4.91 s, sys: 11.9 ms, total: 4.92 s
Wall time: 4.92 s


In [9]:
# Save to "memos_P1.csv"
df.to_csv('memos_P1.csv', index=False) 

In [10]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
2965,229 DBT CRD XXXX 02/08/21 XXXXXXXXeBay O*21-XX...,229 DBT CRD EBAY O*21- - XXX- C#,229 DBT CRD EBAY O*21- - XXX- C#
5602,7-ELEVEN 10/22 #XXXXXXXXX PURCHASE 7-ELEVEN DA...,7-ELEVEN # PURCHASE 7-ELEVEN DAVENPORT,7-ELEVEN
41887,BASHAS'' 172 GILBERT AZ XXXXXX 06/10,BASHAS'' 172 GILBERT,BASHAS'' 172 GILBERT
135814,DONUT PALACE BROWNWOOD TX 0...,DONUT PALACE BROWNWOOD,DONUT PALACE BROWNWOOD
410067,Point Of Sale Withdrawal EMPIRE WINE + LIQUOR ...,EMPIRE WINE + LIQUOR 155 THOMASTON AVENUE WATE...,EMPIRE WINE + LIQUOR 155 THOMASTON AVENUE WATE...
463004,"TEQUILA CENTRALIA, IL, USA","TEQUILA CENTRALIA, ,","TEQUILA CENTRALIA, ,"
237618,POS Debit - Visa Check Card XXXX - TST* A SLIC...,TST* A SLICE OF TH BRADFORD P,TST* A SLICE OF TH BRADFORD P
237653,POS Debit - Visa Check Card XXXX - TST* CHADWI...,TST* CHADWICKS - O ALEXANDRIA,TST* CHADWICKS - O ALEXANDRIA
472530,TST* THIRSTY LION GASTR TIGARD OR 01/19,TST* THIRSTY LION GASTR TIGARD,TST* THIRSTY LION GASTR TIGARD
92693,CHECKCARD XXXX WAG-A-LOT DECATUR DECATUR GA XX...,WAG-A-LOT DECATUR DECATUR,WAG-A-LOT DECATUR DECATUR


In [11]:
unique_df = df.drop_duplicates(subset='memo_post')

In [12]:
result = (
    unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
    .sort_values(by='memo_pre')[300:400]
)
result

Unnamed: 0,memo,memo_pre,memo_post
17865,AMS*MNHUNTFISH TX 06/12,AMS*MNHUNTFISH,MNHUNTFISH
17866,AMSHINE,AMSHINE,HINE
67111,CHECKCARD XXXX AMSOIL XXXXXXXXXX WI XXXXXXXXXX...,AMSOIL,OIL
39665,Amtrak,AMTRAK,AMTRAK
408711,Point Of Sale Withdrawal - AMZN,AMZN,AMZN
...,...,...,...
40291,AsurionWireles,ASURIONWIRELES,ASURIONWIRELES
230125,POS Debit - Visa Check Card XXXX - ATG,ATG,ATG
29905,ATGTHEATRETICKETS NY 08/20,ATGTHEATRETICKETS,ATGTHEATRETICKETS
40299,Athens,ATHENS,ATHENS


In [13]:
print(df[df['memo_post'] == ''].to_string())

                                                 memo memo_pre memo_post
71408                           CHECKCARD XXXX ARXXXX                   
74774                       CHECKCARD XXXX CK XXXXXXX                   
82514   CHECKCARD XXXX MCC NC XXXXXXXXXXXXXXXXXXXXXXX                   
93843                     CHECKCARD XXXX XXXXXXXXXXXX                   
93846                   CHECKCARD XXXX XXXXXXXXXXXXXX                   
98408                                      CK XXXXXXX                   
104841                                           Card                   
112295                                     Ck XXXXXXX                   
112576                                             Co                   
147714           Debit Purchase 11/12 Card XXXXilXXXX                   
159148                                         FLXXXX                   
178224                                         ILXXXX                   
193400                                             

In [14]:
merchants = ['AMAZON.COM', 'DOORDASH', 'GOOGLE', 'WAL-MART', 'CHICK-FIL-A', 'UBER', 'ULTA']

In [18]:
df[df['memo'].str.contains('SMOKEHOUSE')]

Unnamed: 0,memo,memo_pre,memo_post
4750,4 RIVERS SMOKEHOUSE OF TA954-257-XXXX,4 RIVERS SMOKEHOUSE OF TA954-257,4 RIVERS SMOKEHOUSE OF TA954-257
4818,4RIVERS SMOKEHOUSE OF ORLANDO FL Date 06/10/0 ...,4RIVERS SMOKEHOUSE OF ORLANDO 0 2 MERCHANT CAT...,4RIVERS SMOKEHOUSE OF ORLANDO 0 2 MERCHANT CAT...
30741,AVIATOR SMOKEHOUSE INC,AVIATOR SMOKEHOUSE INC,AVIATOR SMOKEHOUSE INC
42487,BBQ KING SMOKEHOUSE WOODSTOCK IL 03/26,BBQ KING SMOKEHOUSE WOODSTOCK,BBQ KING SMOKEHOUSE WOODSTOCK
42488,BBQ KING SMOKEHOUSE WOODSTOCK IL 04/21,BBQ KING SMOKEHOUSE WOODSTOCK,BBQ KING SMOKEHOUSE WOODSTOCK
47316,BUBBAS SMOKEHOUSE,BUBBAS SMOKEHOUSE,BUBBAS SMOKEHOUSE
63242,CHECKCARD XXXX 4RIVERS SMOKEHOUSE SODO ORLANDO...,4RIVERS SMOKEHOUSE SODO ORLANDO,4RIVERS SMOKEHOUSE SODO ORLANDO
72645,CHECKCARD XXXX BIG SHANTY SMOKEHOUSE KENNESAW ...,BIG SHANTY SMOKEHOUSE KENNESAW,BIG SHANTY SMOKEHOUSE KENNESAW
77288,CHECKCARD XXXX ETHYL`S SMOKEHOUSE O FALLON MO ...,ETHYL`S SMOKEHOUSE O FALLON,ETHYL`S SMOKEHOUSE O FALLON
83452,CHECKCARD XXXX MO`S SMOKEHOUSE BBQ PISMO BEACH...,MO`S SMOKEHOUSE BBQ PISMO BEACH,MO`S SMOKEHOUSE BBQ PISMO BEACH


# Phase 2: Extract & Analyze N-Grams

In [15]:
df_p2 = pd.read_csv("memos_P1.csv")

In [16]:
df.sample(15)

Unnamed: 0,memo,memo_pre,memo_post
436055,Recurring Debit Purchase 07/24 Card XXXXget It...,RECURRING GET IT NOW XXX,RECURRING GET IT NOW XXX
133085,DOLLAR GENERAL BEEBE AR 10/29/20 16:10:05,DOLLAR GENERAL BEEBE 16:10:05,DOLLAR GENERAL BEEBE 16:10:05
53476,CAPE CENTER 02-01 CAPE CHARLES VA XXXX DEBIT C...,CAPE CENTER CAPE CHARLES,CAPE CENTER CAPE CHARLES
475554,UBER * EATS PENDING AMSTERDAM 1...,UBER * EATS PENDING AMSTERDAM MX NU PESO 264.0...,UBER
414725,Pos Debit- XXXX XXXX Louisiana Seafood Deca...,LOUISIANA SEAFOOD DECATUR,LOUISIANA SEAFOOD DECATUR
439798,SAN DIEGO ROCK SUPPLY EL CAJON CA 10/23,SAN DIEGO ROCK SUPPLY EL CAJON,SAN DIEGO ROCK SUPPLY EL CAJON
232107,POS Debit - Visa Check Card XXXX - DICK'S SPOR...,DICK'S SPORTING GO CHESAPEAKE,DICK'S SPORTING GO CHESAPEAKE
281838,PURCHASE AUTHORIZED ON 03/03 365 Market 888 43...,365 MARKET 888 43 TROY S,365 MARKET 888 43 TROY S
300558,PURCHASE AUTHORIZED ON 04/19 TWIN PEAKS AUTO S...,TWIN PEAKS AUTO SAN FRANCISCO P,TWIN PEAKS AUTO SAN FRANCISCO P
390321,PURCHASE AUTHORIZED ON 12/08 PICK UP STIX XXXX...,PICK UP STIX S,PICK UP STIX S
