# Phase 1: Preprocess with Regular Expressions

## 1. Load Dataset


In [142]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [20]:
df = pd.read_csv("memos.csv")
row_count = df.size
row_count

528766

In [97]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0                 AMZN DIGITAL*MVXXXXI11 07/24 PURCHASE
1                           CASH APP*CAMERON XXXXXXXXXX
2     CHECKCARD XXXX BTA IST YENI HAVA ISTANBUL XXXX...
3     CHECKCARD XXXX TY TA UG HAMBURG XXXXXXXXXXXXXX...
4                  CLASSY NAILS SPA SCOTTSDALE AZ 12/05
5     DBT CRD XXXX 11/25/22 DJPIBX5W 365 MARKET 888 ...
6     DDA PUR 4EF2IJR654V7 AMAZON.COM SEATTLE WA AMA...
7     DEBIT CARD PURCHASE AT BULLDOGS - AURORA, AURO...
8     Debit: Signature purchase from XXXXXXXXXXXXXXX...
9                    PANERA BREAD #XXXXXX P COLUMBUS GA
10    PET SUPERMARKE 01/22 #XXXXXXXXX PURCHASE PET S...
11    POS Debit - Visa Check Card XXXX - BBAIKDABANG...
12    POS Debit DICKSSPORTINGGOODS.COM DBT CRD XXXX ...
13    POS PURCHASE TERMINAL XXXXXXXX KROGER #5 XXXXX...
14      PROTECTION Brigit-com 59AF97A436FXXXX ACH DEBIT
15    PURCHASE AUTHORIZED ON 02/02 99 RANCH #XXXX UN...
16    PURCHASE AUTHORIZED ON 04/28 SAMS CLUB #XXXX X...
17    PURCHASE AUTHORIZED ON 08/25 CIRCLE K XXXX

## 2. Define Regex Rules

In [110]:
# State abbreviations
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", "IN(?!\\s+N\\s+OUT\\s+BURGER)", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", # Special expression for "In N Out"
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", 
    "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", 
    "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

# Order of execution
REGEX = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "),  # Non-breaking space
    (r"\s{2,}", " "),  # Multiple spaces

    # === 0.5) Remove leading dates left over from boilerplate ===
    (r"^\d{1,2}\s+", " "),

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2})?\b", " "),
    (r"\bPURCHASE\s+AUTHORIZED\s+ON(?:\s+\d{2})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}(?:\s+\d{2})?\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "),
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "),
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "),
    (r"\bCDX{4,}\b", " "),
    (r"\b(?:X{2,}|X{2,}\b)", " "),
    (r"\b[SP]X{6,}\b", " "),
    (r"\bDEBIT\s+CARD\b", " "),
    (r"\bDEBIT\s+PURCHASE\b", " "),
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "),

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "),
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "),
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*[AP]?\b", " "),

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b(?:\s+\w+)?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b", " "),
    (r"\bPAYMENT\s+CARD\s+TARGET\s+DEBIT\s+CRD(?:\s+ACH(?:\s+TRAN)?)?\b", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b.*", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "),

    # === 6) Misc tails ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b.*", " "),
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b.*", " "),
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT|SEARS\s+BILL\s+PAYMENT)\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"\b(?:USA|US)\b$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "),
    (r"\b\d{3}-\d{4}\b", " "),
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 123 4567

    # === 8) URLs/domains ===
    # Specific junk billing info is still good to remove
    (r"\b(?:AMZN\.COM\/BILL|AMZNAMZN\.COM\s*BILL|AMAZON\.COM\/BILL|APPLE\.COM\/BILL|G\.CO\/HELPPAY#|SUPPORT\.GOOGLE?)\b", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "),

    # === 10) Final Tidy (Punctuation) ===
    # Preserving * and # for Phase 2
    (r"[|%_=;\\/]+", " "),
    (r"[-]{2,}", " "),
]

## 3. Apply Regex

In [99]:
%%time
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_cleaned'] = memos

CPU times: user 16.6 s, sys: 42.7 ms, total: 16.7 s
Wall time: 16.7 s


In [105]:
# Add column for removed characters
memo_size = df['memo'].str.len()
cleaned_memo_size = df['memo_cleaned'].str.len()
df['removed_chars'] = memo_size - cleaned_memo_size 

In [112]:
# Save to "memos_P1.csv"
df.to_csv('memos_P1.csv', index=False) 

In [137]:
df.sample(10).sort_values(by='memo').reset_index(drop=True)

Unnamed: 0,memo,memo_cleaned,removed_chars
0,Amazon Prime*GY69S06 Amzn.com/bill WA 0...,AMAZON PRIME*GY69S06,30
1,Amazon.com*2P6G81WT0 Amzn.com/bill WA 08/10,AMAZON.COM*2P6G81WT0,23
2,BACKYARD BIRD SHOP #11 LAKE OSWEGO ORXXXXXX 08/15,BACKYARD BIRD SHOP #11 LAKE OSWEGO ORXXXXXX,6
3,GLAM-O-RAMA CLEANERS 10/02 PURCHASE,GLAM-O-RAMA CLEANERS PURCHASE,6
4,Jules Kae pay.amazon.co FL 08/19,JULES KAE PAY.AMAZON.,11
5,PURCHASE AUTHORIZED ON 03/26 AMZN Mktp US*164X...,26 AMZN MKTP US*164X7,70
6,PURCHASE AUTHORIZED ON 04/16 SQ *PUFF N PASS F...,16 SQ *PUFF N PASS FORT LAUDERDA,56
7,PURCHASE AUTHORIZED ON 10/03 Subway XXXXX Idah...,03 SUBWAY IDAHO FALLS,62
8,PURCHASE AUTHORIZED ON 10/11 ROSS STORES #XXXX...,11 ROSS STORES # SCHAUMBURG,60
9,RECURRING PAYMENT AUTHORIZED ON 10/17 APPLE.CO...,17,93


# Phase 2: Extract & Analyze N-Grams

## 1. Load Updated Dataset

In [626]:
df_p2 = pd.read_csv("memos_P1.csv")

## 2. Extract N-Grams

## 3. Extract N-Grams