# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
# subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
# subprocess.run(['git', 'commit', '-m', 'regex'])

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [73]:
# Load the raw memos and work on a random sample so the regex passes stay fast.
df_in = pd.read_csv("memos.csv")
# Cap the sample at the number of available rows; sample() raises if asked for
# more rows than the frame holds.
df = df_in.sample(min(100_000, len(df_in))).copy()
# len() counts rows. DataFrame.size is rows * columns, which only equals the
# row count while the frame has exactly one column.
row_count = len(df)
row_count

100000

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     229 DBT CRD XXXX 04/06/21 XXXXXXXXLA GRAN FIES...
1     AMZN Mktp US*VW89L3W Amzn.com/bill WA        0...
2     BENEFIT EVANSTON EVANSTON IL                 0...
3                                   CASH APP ROMARO WIL
4     CHECKCARD XXXX BREAKOUT GAMES VIRGINIA BEACVA ...
5     CHECKCARD XXXX GRANT PARK MARKET LLC ATLANTA G...
6     CHECKCARD XXXX VILLAGE SPA SAN JUAN XXXXXXXXXX...
7     DEBIT CARD PURCHASE XXXXXXXXX ZAZZLE INC XXXXX...
8                                    E Z MART WILLIS TX
9     MCDONALD'S MXXXX OF TX GRAPEVINE CARD: XXXXXXX...
10                        Neelys Market Murfreesboro Tn
11    POS PURCHASE TERMINAL XXXXXXXX AMAZON.COM*PH6O...
12    PURCHASE AUTHORIZED ON 03/29 PIGGLY WIGGLY # C...
13    PURCHASE AUTHORIZED ON 04/04 UNC CHAR SA HRL I...
14    PURCHASE AUTHORIZED ON 08/21 Amazon.com*IC3XD9...
15    PURCHASE AUTHORIZED ON 08/25 23RD & MISSION PR...
16    PURCHASE AUTHORIZED ON 08/26 DD DOORDASH DUNKI...
17    PURCHASE XXXX DD DOORDASH AZULM XXXXXXXXXX

## 2. Define Regex Rules

In [6]:
# Two-letter US state codes (plus DC) to strip as standalone tokens.
# Entries are regex fragments, written as raw strings so that escapes such as
# `\.` and `\s` reach the regex engine untouched instead of relying on Python
# passing unknown string escapes through (which emits a warning).
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA",
    r"(?<!\.)CO(?!['`])",  # not ".CO" (domain tail) and not followed by a quote/backtick
    "CT", "DC", "DE", "FL", "GA", "HI", "IA",
    "ID", "IL",
    r"IN(?!\s+N\s+OUT\s+BURGER)",  # keep the IN of "IN N OUT BURGER"
    "KS", "KY",
    r"(?<!['`])LA(?!\s+HACIENDA|\s+FITNESS|\s+LA'S|['`])",  # keep LA HACIENDA, LA FITNESS, LA LA'S, quoted LA
    "MA", "MD",
    r"ME(?!\s+DIA)",  # keep the ME of "ME DIA"
    "MI", "MN", "MO(?!['`])",
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY",
    "OH", "OK", "OR",
    "PA(?!['`])",
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA",
    "WI", "WV", "WY",
]
# The surrounding \b...\b is what keeps codes embedded in longer words
# (e.g. the CO in COSTCO) from matching.
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [55]:
# City / locality tokens (some truncated exactly as they appear in memos) that
# are stripped as whole words during the first cleaning pass.
CITY_LIST = [
    "MIAMI",
    "PHOENIX",
    "SEATTLE",
    "HOUSTON",
    "ORLANDO",
    "CHICAGO",
    "ATLANTA",
    "LAS VEGAS",
    "CHARLOTTE",
    "TAMPA",
    "GREENVILLE",
    "BROOKLYN",
    "DENVER",
    "LOS ANGELES",
    "SAN ANTONIO",
    "MEMPHIS",
    "NEW YORK",
    "RICHMOND",
    "MONT BELVIEU",
    "INDIANAPOLIS",
    "PINECREST",
    "COLUMBUS",
    # Found while reviewing the 2-gram frequency tables
    "WAXAHACHIE",
    "EL CAJON",
    "PASO ROBLES",
    "BUENA VI",
    "CHULA VISTA",
    "BOCA RATON",
    "PINE PLAINS",
    "HIGHLANDS RAN",
    "RENTON",
    "SALT",
    "BOGOT",
    "SAN DIEGO",
]
# One alternation over every entry, anchored on word boundaries.
CITY_REGEX = r"\b({})\b".format("|".join(CITY_LIST))

In [79]:
# Boilerplate tokens that carry no merchant information; each is removed when
# it appears as a whole word.
NOISE_WORDS = [
    "DBT",
    "PURCH",
    "TRANSACTION",
    "HTTPSWWW",
    "WWW",
    "CONSUMER",
    "CKCD",
    "CRD",
    "PUR",
    "LLC",
    "SIGNATURE",
    "WEB",
    "PAYMENT",
    "DEB",
    "INTL",
    "RECURRING",
    "DIGIT",
    "ONLINE",
    "800",
    "888",
    "WITHDRAWAL",
    "STORE",
    "STORES",
    "RESTAURANT",
    "E-COMMERCE",
    "STOP",
    "BUSINESS",
    # Found in the 1-gram frequency table
    "POS",
    "PURCHASE",
    "DEBIT",
    # Found in the 2-gram frequency table
    "HELP",
    "HTTPSHBOMAX",
    "HTTPSINSTACAR",
]
# Word-boundary-anchored alternation over every noise token.
NOISE_WORDS_REGEX = rf"\b({'|'.join(NOISE_WORDS)})\b"

In [83]:
# Ordered (pattern, replacement) rules for the first cleaning pass.
# The first-pass cell applies them one after another with Series.str.replace,
# so every rule sees the output of the rules above it -- order is significant.
# Memos are upper-cased before these rules run, hence the upper-case patterns.
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "),  # Replace non-breaking space with regular space
    (r"\s{2,}", " "),  # Collapse runs of whitespace into one space

    # === 1) "Authorized / Recurring" headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "),
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "),
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "),
    (r"\bAUTH\s*#\s*-?", " "),
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "),
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "),
    (r"\bCDX{4,}\b", " "),
    (r"\bF[XF]{4,}\b", " "),  # Handle FXXXXX, FXXXX
    (r"\bXXX\b", " "),  # Standalone 3-character mask seen in the n-gram tables

    # Letter-prefixed masks such as SXXXXX / PXXXXX. Kept *before* the generic
    # X{4,} rule so the leading letter is removed together with the mask.
    (r"\b[A-Z]X{5,}\b", " "),

    (r"X{4,}", " "),  # Remove any remaining run of 4+ mask characters

    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "),
    (r"\bDEBIT\s+PURCHASE\b", " "),
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "),  # Common card-related keywords
    (r"^\s*PURCHASE\b", " "),  # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "),  # "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"),  # DD/BR, DD\BR or DD BR -> DDBR

    # === 3) State + mask tails ===
    # NOTE(review): both rules need a run of X{5,}, but the X{4,} rule in
    # section 2 has already removed every such run, so they appear to be
    # unreachable in this position. Left in place to avoid changing results;
    # confirm the intended ordering before moving them.
    (r"\b[A-Z]{2}\s+[A-Z]?X{5,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[A-Z]?X{5,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "),  # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "),  # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "),  # Times like 10:23 AM

    # === 4.5) Noise numbers (from n-grams) ===
    (r"\b(?:00|10|15|16|20|365)\b", " "),

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "),  # Drop "CO ID <id> WEB|PPD" and everything after it

    # === 6) Misc tails ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "),
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "),
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    # The leading \b restricts this to a standalone USA/US token at the end;
    # without it, words that merely end in "US" (e.g. COLUMBUS) were truncated.
    (r"\b(?:USA|US)$", " "),
    (r"\s+FSP$", " "),
    (r"\bL\d{3}\b", " "),  # Handle L340

    # === 7) Phone numbers ===
    # NOTE(review): the XXX-XXX-XXXX alternative looks unreachable because the
    # \bXXX\b and X{4,} rules in section 2 run first -- confirm.
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "),  # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "),  # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "),  # space-separated digit groups, e.g. 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),
    (r"\.COM\b", " "),  # Remove ".COM" anywhere
    (r"\s+COM$", " "),  # Remove a bare trailing "COM" token (no dot) at end of string

    # === 9) State abbreviations ===
    (STATE_REGEX, " "),  # Remove standalone state codes

    # === 9.5) City names ===
    (CITY_REGEX, " "),  # Remove standalone city names

    # === 10) Noise Words (from 1-grams) ===
    (NOISE_WORDS_REGEX, " "),

    # === 11) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "),  # Remove misc separators
    (r"[-]{2,}", " "),  # Replace runs of 2+ hyphens with a space
    (r"^\s*-\s*|\s*-\s*$", " "),  # Remove a leading/trailing hyphen
]

In [82]:
# Second-pass rules: each compiled pattern captures a canonical merchant name
# in group 1 and swallows the rest of the memo. Rules are tried in list order.
REGEX_POST = [
    # Unanchored: the greedy leading .* picks up the *last* GODADDY occurrence.
    re.compile(r".*(GODADDY\.COM|GODADDY)\b.*"),

    # High-frequency merchants (from the n-gram tables): the name must start
    # the memo and end on a word boundary -> ^(<name>)\b.*
    *(
        re.compile(r"^(" + merchant + r")\b.*")
        for merchant in (
            r"AFTERPAY",
            r"ALDI",
            r"AMZN(?:\s*MKTP)?",  # amzn, amzn mktp
            r"AMAZON(?:\.COM|\s+PRIME)?",  # amazon, amazon.com, amazon prime
            r"APPLE(?:\.COM)?",
            r"BRIGIT",
            r"BURGER\s+KING",
            r"CASH\s+APP",
            r"CHICK-FIL-A",
            r"CHIPOTLE",
            r"CIRCLE\s+K",
            r"COSTCO",
            r"DAIRY\s+QUEEN",
            r"DOLLAR\s+GENERAL",
            r"DOLLAR\s+TREE",
            r"DOORDASH",
            r"DUNKIN",
            r"EBAY",
            r"ETSY",
            r"FAMILY\s+DOLLAR",
            r"FOOD\s+LION",
            r"FRYS",
            r"GOOGLE",
            r"HELPPAY",
            r"HOME\s+DEPOT",
            r"INSTACART",
            r"KFC",
            r"KROGER",
            r"LITTLE\s+CAESARS",
            r"LOWE'?S",
            r"LYFT",
            r"MCDONALD'?S",
            r"MICROSOFT",
            r"PUBLIX",
            r"ROSS",
            r"SAFEWAY",
            r"SAMS\s*CLUB|SAMSCLUB",
            r"SHOPRITE",
            r"SONIC",
            r"STARBUCKS",
            r"SUBWAY",
            r"TACO\s+BELL",
            r"TARGET",
            r"UBER(?:\s+EATS)?",
            r"USPS",
            r"VONS",
            # wal, mart, wm, superc, supercenter variants
            r"WALMART(?:\s*SUPERCENTER|\s*SUPER\s*C)?|WM\s*SUPERCENTER|WAL-MART|WAL\s*MART",
            r"WENDY'?S",
        )
    ),

    # Merchants followed by an optional "*" / "#" store marker -> ^(<name>)\s*\*?#?.*
    *(
        re.compile(r"^(" + merchant + r")\s*\*?#?.*")
        for merchant in (
            r"7(?:-ELEVEN|\s+11)",
            r"DDBR",
        )
    ),
]

## 3. Apply Regex

In [84]:
%%time
# First pass: normalise the raw memo text and strip boilerplate with REGEX_PRE.
# fillna('') has to run before astype(str): casting first turns missing values
# into the literal string "nan", which would then be cleaned as if it were a memo.
memos = df['memo'].fillna('').astype(str).str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

# Rules are order-dependent; each one operates on the previous rule's output.
for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)

# Second noise-word sweep: the punctuation tidy at the end of REGEX_PRE can
# expose tokens that were previously glued to a separator such as "_".
memos = memos.str.replace(NOISE_WORDS_REGEX, " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
# Trim leftover whitespace / hyphens at both ends.
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 6.99 s, sys: 12 ms, total: 7 s
Wall time: 7 s


In [85]:
%%time
# Second pass
def apply_regex(memo, patterns=None):
    """Collapse a pre-cleaned memo to a canonical merchant name.

    Each pattern is tried in order with ``pattern.match`` (anchored at the
    start of ``memo``). For the first pattern that matches and captured
    something, the last capture group that actually participated in the match
    is returned, stripped of surrounding whitespace.

    Parameters
    ----------
    memo : str
        The memo text to canonicalise.
    patterns : iterable of compiled regex patterns, optional
        Rules to try, in priority order. Defaults to the notebook-level
        ``REGEX_POST`` list.

    Returns
    -------
    str
        The captured merchant name, or ``memo`` unchanged when no rule yields
        a capture.
    """
    if patterns is None:
        patterns = REGEX_POST
    for pattern in patterns:
        match = pattern.match(memo)
        if not match:
            continue
        # Skip groups that did not participate (None): taking groups()[-1]
        # blindly would call .strip() on None for alternations with several
        # capture groups.
        captured = [group for group in match.groups() if group is not None]
        if captured:
            return captured[-1].strip()
    return memo

# Canonicalise every pre-cleaned memo; apply_regex returns the memo unchanged
# when no rule matches, so memo_post is populated for every row.
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 657 ms, sys: 40 µs, total: 657 ms
Wall time: 654 ms


In [122]:
# Drop hyphens and apostrophes so spelling variants collapse together
# (e.g. WAL-MART -> WALMART).
df['memo_post'] = df['memo_post'].str.replace('-', '').str.replace("'", '')

# For memos containing "*", discard everything up to and including the first
# "*" and remove any further "*" characters from the remainder.
has_star = df['memo_post'].str.contains("*", regex=False)
df.loc[has_star, 'memo_post'] = (
    df.loc[has_star, 'memo_post'].str.split('*').str[1::].str.join('')
)

In [13]:
df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [14]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [69]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
342047,PURCHASE AUTHORIZED ON 08/07 FOOD4LESS XXXXX B...,FOOD4LESS BEAR V VICTORVILLE,FOOD4LESS BEAR V VICTORVILLE
160210,FOODSCO # XXXX S.BROAD SANTA MARIA CA 0...,FOODSCO # S.BROAD SANTA MARIA,FOODSCO # S.BROAD SANTA MARIA
518616,XXXXX POS SIGNATURE IN N OUT BURGER COLORADO S...,IN N OUT BURGER COLORADO SPRI,IN N OUT BURGER COLORADO SPRI
181701,JENNY'S CAFE 01-25 CONYERS GA XXXX DEBIT CARD ...,JENNY'S CAFE CONYERS,JENNYS CAFE CONYERS
83364,CHECKCARD XXXX MONARCAS MEAT MARKET CA XXXXXXX...,MONARCAS MEAT MARKET,MONARCAS MEAT MARKET
218097,PARKING - DTD DLR ANAHIEM CA 04/21,PARKING - DTD DLR ANAHIEM,PARKING DTD DLR ANAHIEM
299822,PURCHASE AUTHORIZED ON 04/17 PUBLIX SUPER MAR ...,PUBLIX SUPER MAR TOW,PUBLIX
382130,PURCHASE AUTHORIZED ON 11/17 ROVER.COM support...,ROVER SUPPORT@ROVER,ROVER SUPPORT@ROVER
296760,PURCHASE AUTHORIZED ON 04/09 SONIC DRIVE IN #4...,SONIC DRIVE #41,SONIC
449453,SQ *PAT AND TONI?S SWEE DeLand FL 1...,SQ *PAT AND TONI?S SWEE DELAND,SQ *PAT AND TONI?S SWEE DELAND


In [125]:
df[df['memo'].str.contains("WAL-")]

Unnamed: 0,memo,memo_pre,memo_post
276025,PURCHASE AUTHORIZED ON 02/14 WAL-MART #XXXX SA...,WAL-MART # SAN JOSE,WALMART
275667,PURCHASE AUTHORIZED ON 02/13 WAL-MART #XXXX TA...,WAL-MART # TALLADEGA,WALMART
302910,PURCHASE AUTHORIZED ON 04/25 WAL-MART #XXXX HO...,WAL-MART # HOLLY SPRINGS,WALMART
487762,WAL-MART #XXXX 12/27 #XXXXXXXXX PURCHASE WAL-M...,WAL-MART # # WAL-MART #,WALMART
102119,CRD PUR MDJ0UJK70 XXXX / WAL-MART #XXXX NOBLES...,MDJ0UJK70 WAL-MART # NOBLESVILLE,MDJ0UJK70 WALMART # NOBLESVILLE
...,...,...,...
352895,PURCHASE AUTHORIZED ON 09/03 WAL-MART #XXXX CI...,WAL-MART # CITY OF INDUS,WALMART
512985,XXXX 24 WM SUPERCENTER # WAL-MART SUPER CENT S...,24 WM SUPERCENTER # WAL-MART SUPER CENT SPARTA...,24 WM SUPERCENTER # WALMART SUPER CENT SPARTAN...
275300,PURCHASE AUTHORIZED ON 02/12 WAL-MART #XXXX FA...,WAL-MART # FAYETTEVILLE,WALMART
488203,WAL-MART #XXXX HUNTSVILLE TX,WAL-MART # HUNTSVILLE,WALMART


In [121]:
df[df['memo_pre'].str.contains('TURO')]

Unnamed: 0,memo,memo_pre,memo_post
119536,DBT Purchase ARTUROS RESTAURANTNORTH OLMSTEDOH...,ARTUROS RESTAURANTNORTH OLMSTEDOH,ARTUROS RESTAURANTNORTH OLMSTEDOH
29662,ARTUROS MEXICAN RESTAU ALBUQUERQUE NM 0...,ARTUROS MEXICAN RESTAU ALBUQUERQUE,ARTUROS MEXICAN RESTAU ALBUQUERQUE
398950,PURCHASE AUTHORIZED ON 12/29 TURO INC.* TRIP A...,TURO INC.* TRIP AU HTTPSTURO,TRIP AU HTTPSTURO
475311,Turo Inc.* Trip Jul,TURO INC.* TRIP JUL,TRIP JUL
473191,TURO INC.* TRIP APR HTTPSTURO.COM CA 04/09,TURO INC.* TRIP APR HTTPSTURO,TRIP APR HTTPSTURO
126760,"DEBIT CARD PURCHASE AT TURO INC.* TRIP DE, SAN...","AT TURO INC.* TRIP , SAN FRANCISCO, ON FROM #:","TRIP , SAN FRANCISCO, ON FROM #:"


In [110]:
# NOTE(review): this is the same "*"-prefix stripping statement as in the
# In [122] cleanup cell. Once that cell has run, no "*" is left in memo_post
# and this line selects no rows -- consider deleting this cell.
df.loc[df['memo_post'].str.contains("*", regex=False), 'memo_post'] = df.loc[df['memo_post'].str.contains("*", regex=False), 'memo_post'].str.split('*').str[1::].str.join('')

In [19]:
# Single-token merchant names already considered clean.
merchants_clean = [
    'AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY',
    'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER',
    'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH',
    'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
    'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART',
    'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK',
    'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS',
    'DDBR',
]

# Merchant names that contain punctuation.
merchants_punc = [
    "DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN',
    "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S", "BASHAS'",
]

# Merchant names written as domains.
merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

# Non-merchant values (including the empty string) treated as known.
merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

# Groups of spellings that refer to the same merchant.
multiples = [
    ['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'],
    ['LOWES', "LOWE'S"],
    ['BASKIN', 'DDBR'],
]

# Every value currently recognised, in the order the lists are declared above.
merchants = [*merchants_clean, *merchants_punc, *merchants_sites, *merchant_cats]

In [20]:
# Print every one-word memo_post value that is not yet in `merchants`,
# most frequent first.
is_single_word = df['memo_post'].str.split().str.len() == 1
single_word_counts = (
    df.loc[is_single_word, 'memo_post']
    .value_counts()
    .sort_values(ascending=False)
)
for memo in single_word_counts.index:
    if memo in merchants:
        continue
    print(memo)

MCDONALD'S
APPLE
TARGET
STARBUCKS
AMZN
EBAY
SUBWAY
USPS
COSTCO
ROSS
BRIGIT
KFC
ETSY
SHOPRITE
CHIPOTLE
MARKET@WORK
WAL
CHEWY
PLAYSTATIONNETWORK
MCDONALDS
DOMINO'S
SUNPASS*ACC
GODADDY
CRYPTO
BETMGM
PARKMOBILE
TOLLWAY-AUTOREPLEN
PLAYRIX
SOUTHWES
CLEO
UNITED
WISH
MARKET
MACYS
POPEYES
HELLOFRESH
NIKE
SEI
PETCO
WORLDREMIT
SHEIN
VERIZONWRLSS*RTCCR
PARAMOUNT+
STEAM
EZPASS
HOMEDEPOT
ANC*ANCESTRY
VOLA
DOLLARSHAVECLUBUS
CMSVEND*CV
BANFIELD-PET*WPPAYMEN
SCENTSY,INC.
KIMS
SIE*PLAYSTATIONNET
COOPERSHAWK
PLAYSUGARHOUSE
PRESSNET
SIMPLISAFE
ALIEXPRESS
POSHMARK
BIGBADTOYSTORE
FIVERR
ZULILY
VETSOURCE
HLLFRSH*
EXPRESSVPN
KATAPULT
ECHST.NET
NYTIMES*NYTIMES
CRAIGSLIST.ORG
WINGSTOP
FEDEX
FRG*FANATICS
EVERYPLATE
H&M
.HOTTOPIC
GOPUFF
FACTOR75
DOCUSIGN
ICM*INSTANTCHECKMA
UNITEDHEALTHONE
HARDEES
STAPLES
TOUCHTUNES.HELPSHI
UCHEALTH
TROPSUNPROD
#
SNOOZE
IBI*FABLETICS
THREDUP
PLAYTKASLOTOMANIA.
BANFIELD-PET*WPPAY
QDOBA
ZAXBYS
IHOP
JCPENNEY
JACKSON
FOODSMART
RESUME-NOW
BURLINGTON
OUTBACK
FRG*NFLSHOP
PUBLICDATA
PLANT

In [119]:
df[df['memo'].str.contains('PROMENADE')]

Unnamed: 0,memo,memo_pre,memo_post


In [115]:
# Memos shaped like "<PREFIX> * <MERCHANT ...>" (e.g. "TST* BETTER BUZZ").
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

has_star_prefix = df['memo_pre'].str.contains(regex_pattern, na=False)
prefix_star_merchants = df[has_star_prefix]

# Show a random 20 of those whose memo_post is not a recognised merchant yet.
is_unlisted = ~prefix_star_merchants['memo_post'].isin(merchants)
prefix_star_merchants.loc[is_unlisted, ['memo_pre', 'memo_post']].sample(20)

Unnamed: 0,memo_pre,memo_post
469641,TST* BETTER BUZZ MIRAMA,BETTER BUZZ MIRAMA
123219,DDA * MACY S RT 9 SO FREEHOLD *,MACY S RT 9 SO FREEHOLD
244161,SIE*PLAYSTATIONNET WORK,PLAYSTATIONNET WORK
526064,ZIP.CO* SHEIN .ZIP.,SHEIN .ZIP.
87381,SKIPLAGGED*MNKIID HTTPSSKIPLAGGNY,MNKIID HTTPSSKIPLAGGNY
512233,SUNPASS*ACC,ACC
423319,OTT* UPFAITHANDFAM ABARNES@UPTV.,UPFAITHANDFAM ABARNES@UPTV.
89809,TEAMSNAP* HOHOKUSSADDL PAYMENTS@TEAMNJ,HOHOKUSSADDL PAYMENTS@TEAMNJ
471858,TST* ROOSTERS - PICK,ROOSTERS PICK
411202,TST* CAFE FIORE BALLANTCHARLOTTE,CAFE FIORE BALLANTCHARLOTTE


In [62]:
df[df['memo_pre'].str.contains("ONLYF")]

Unnamed: 0,memo,memo_pre,memo_post
422131,RECURRING PAYMENT AUTHORIZED ON 02/27 CCBill.c...,CCBILL *ONLYF,CCBILL *ONLYF
422861,RECURRING PAYMENT AUTHORIZED ON 03/21 CCBill.c...,CCBILL *ONLYF,CCBILL *ONLYF
510174,Withdrawal PFCU Check Card / CCBill.com *OnlyF...,PFCU CCBILL *ONLYFANS 30 #,PFCU CCBILL *ONLYFANS 30 #
422966,RECURRING PAYMENT AUTHORIZED ON 03/24 CCBill.c...,CCBILL *ONLYF,CCBILL *ONLYF
61742,CCBill.com *OnlyFans XXX-XXXXXXX CA 0...,CCBILL *ONLYFANS,CCBILL *ONLYFANS
424144,RECURRING PAYMENT AUTHORIZED ON 05/02 CCBill.c...,CCBILL *ONLYF,CCBILL *ONLYF
61759,CCBill.com *OnlyFans XXX-XXXXXXX CA 0...,CCBILL *ONLYFANS,CCBILL *ONLYFANS
422393,RECURRING PAYMENT AUTHORIZED ON 03/06 CCBill.c...,CCBILL *ONLYF,CCBILL *ONLYF
61790,CCBill.com *OnlyFans XXX-XXXXXXX CARD: XXXXXXX...,CCBILL *ONLYFANS - : :31,CCBILL *ONLYFANS - : :31
61744,CCBill.com *OnlyFans XXX-XXXXXXX CA 0...,CCBILL *ONLYFANS,CCBILL *ONLYFANS


In [66]:
df.loc[422966]['memo']

'RECURRING PAYMENT AUTHORIZED ON 03/24 CCBill.com *OnlyF CA SXXXXXXXXXXXXXXX CARD XXXX'

In [24]:
print(pd.Series(prefix_star_merchants['memo_pre'].str.split('*').str[0].unique()).sort_values().to_string())

829              .BBQKINGSM
372                 .BROWAR
658                 .EASTVE
842                 .ELFENI
830               .GBPRIMEP
831                 .GETPUR
779                .HBOMAX 
843               .IPAY.UA.
708                 .MASSAG
373                 .MRPICK
853                 .MYSOBO
703              .NCARB.ORG
832                 .PLAYGR
833                 .RIBCIT
374                 .STYLES
854                .STYLESE
834               .STYLESEA
855                 .THECEL
835                 .WATANA
1                13FLDENVER
2                 1PASSWORD
3                   2CHECKO
4                      2CO 
5                    2COCOM
112                     3CI
6                       4TE
7                         5
8                   99PLEDG
9                     9FOLD
10                       A 
45                      AAV
11                      AB 
12                      ABC
13                ABCMOUSE 
720             ACCOUNTANT 
14                  

In [25]:
df[df['memo_pre'].str.contains('PAYPAL')]

Unnamed: 0,memo,memo_pre,memo_post
1199,#XXXXXX PAYPAL ADOBE INC,# PAYPAL ADOBE INC,# PAYPAL ADOBE INC
1200,#XXXXXX PAYPAL BARKBOX INC,# PAYPAL BARKBOX INC,# PAYPAL BARKBOX INC
1201,#XXXXXX PAYPAL ETSY INC,# PAYPAL ETSY INC,# PAYPAL ETSY INC
1202,#XXXXXX PAYPAL MICROSOFT,# PAYPAL MICROSOFT,# PAYPAL MICROSOFT
1203,#XXXXXX PAYPAL MICROSOFT M,# PAYPAL MICROSOFT M,# PAYPAL MICROSOFT M
...,...,...,...
523714,XXXXXXXK4S11X9Y5S PAYPAL *FINISH LINE XXX-XXX-...,K4S11X9Y5S PAYPAL *FINISH LINE,K4S11X9Y5S PAYPAL *FINISH LINE
523740,XXXXXXXL8S0R63EHS PAYPAL *FINISH LINE XXX-XXX-...,L8S0R63EHS PAYPAL *FINISH LINE,L8S0R63EHS PAYPAL *FINISH LINE
523747,XXXXXXXLES10PS2QN PAYPAL *FINISH LINE XXX-XXX-...,LES10PS2QN PAYPAL *FINISH LINE,LES10PS2QN PAYPAL *FINISH LINE
523758,XXXXXXXLVS0LKWQK3 PAYPAL *FINISH LINE XXX-XXX-...,LVS0LKWQK3 PAYPAL *FINISH LINE,LVS0LKWQK3 PAYPAL *FINISH LINE


In [26]:
full_df = pd.read_parquet('../q1-ucsd-outlfows.pqt')

In [27]:
full_df[full_df['memo'].str.contains('PRIME')][['memo', 'category']].sample(50)

Unnamed: 0,memo,category
2198689,VISA - 02/16 AMAZON PRIME*979JR2N33 AMZN.COM/B...,GENERAL_MERCHANDISE
1959379,POS Debit - Visa Check Card XXXX - AMAZON PRIM...,GENERAL_MERCHANDISE
2446191,AMAZON PRIME PMTS Amzn.com/bill,GENERAL_MERCHANDISE
2263334,XXXX VSA RECUR AMAZON PRIME 5D0UE6J13 AMZN.COM...,GENERAL_MERCHANDISE
2072724,DBT CRD XXXX 32 PRIME STORAGE SC 864-XXXXXXXSC...,GENERAL_MERCHANDISE
1638327,AMAZON PRIME PMTS XXX-XXX-XXXX W,GENERAL_MERCHANDISE
1563849,AMAZON PRIME*1M6CI6C21,GENERAL_MERCHANDISE
940838,PRIME VIDEO CHA AMZN.COM/BILL,GENERAL_MERCHANDISE
1960718,POS Debit - Visa Check Card XXXX - AMAZON PRIM...,GENERAL_MERCHANDISE
1981605,PURCHASE XXXX GRUBHUBPRIMETIMER XXXXXXXXXX NY ...,GENERAL_MERCHANDISE


# Phase 2: Extract & Analyze N-Grams

In [28]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [29]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    """Return the ``top_n`` most frequent n-grams found in ``corpus``.

    Parameters
    ----------
    corpus : pd.Series
        Text documents to analyse (one memo per element).
    n_gram_range : tuple
        ``(min_n, max_n)`` passed to ``CountVectorizer(ngram_range=...)``.
    top_n : int, default 200
        Maximum number of n-grams to return.

    Returns
    -------
    list of (str, count) tuples
        N-grams with their total occurrence count over the whole corpus,
        sorted by count in descending order. English stop words are excluded
        by the vectorizer.
    """
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None,  # count every n-gram; truncation happens below
    )

    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass, instead of tokenising the whole corpus once in fit() and
    # again in transform().
    bag_of_words = vec.fit_transform(corpus)

    # Column sums = total occurrences of each n-gram across all documents.
    sum_words = bag_of_words.sum(axis=0)

    # vocabulary_ maps each n-gram to its column index in the matrix.
    words_freq = [
        (word, sum_words[0, idx])
        for word, idx in vec.vocabulary_.items()
    ]

    # Most frequent first.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_n]

In [30]:
%%time
# N-grams are computed on the *raw* memo column (not memo_pre / memo_post) so
# that boilerplate still present in the source text shows up in the counts.
corpus = df['memo'].fillna('')
print(f"Analyzing {len(corpus)} raw memos...")
# Keep the 100 most frequent n-grams of each size
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=100)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=100)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=100)
print("--- N-gram Analysis Complete ---")

Analyzing 528766 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 49.7 s, sys: 401 ms, total: 50.1 s
Wall time: 50.1 s


In [31]:
# Frequent 1-grams that NOISE_WORDS does not cover yet. The n-gram is
# upper-cased for the lookup because NOISE_WORDS holds upper-case tokens.
ngrams_1 = [
    ngram
    for ngram, _count in top_1grams
    if ngram.upper() not in NOISE_WORDS
]

In [32]:
top_1grams

[('xxxx', 405924),
 ('card', 231945),
 ('purchase', 220149),
 ('authorized', 154350),
 ('sxxxxxxxxxxxxxxx', 118231),
 ('ca', 102989),
 ('com', 92840),
 ('xxx', 70724),
 ('debit', 67658),
 ('amzn', 59044),
 ('xxxxxx', 46682),
 ('10', 46644),
 ('pos', 46347),
 ('12', 45199),
 ('09', 44177),
 ('xxxxxxxxxxxxxxxxxxxxxxx', 44176),
 ('08', 43975),
 ('wa', 43142),
 ('07', 43058),
 ('11', 42206),
 ('05', 41251),
 ('03', 40744),
 ('06', 40271),
 ('04', 40218),
 ('01', 39908),
 ('02', 36771),
 ('fl', 36570),
 ('xxxxx', 33927),
 ('amazon', 33190),
 ('22', 33110),
 ('checkcard', 31265),
 ('tx', 27586),
 ('xxxxxxxxxx', 27141),
 ('21', 24588),
 ('xxxxxxxxx', 24415),
 ('withdrawal', 24081),
 ('cash', 24046),
 ('pxxxxxxxxxxxxxxx', 21521),
 ('date', 21400),
 ('xxxxxxx', 20939),
 ('visa', 20424),
 ('ga', 20375),
 ('app', 19612),
 ('mktp', 19161),
 ('xxxxxxxx', 18908),
 ('doordash', 16966),
 ('mart', 16380),
 ('15', 16297),
 ('20', 16203),
 ('check', 15385),
 ('wal', 15348),
 ('recurring', 15112),
 ('16',

In [33]:
ngrams_1

['xxxx',
 'card',
 'authorized',
 'sxxxxxxxxxxxxxxx',
 'ca',
 'com',
 'xxx',
 'amzn',
 'xxxxxx',
 '10',
 '12',
 '09',
 'xxxxxxxxxxxxxxxxxxxxxxx',
 '08',
 'wa',
 '07',
 '11',
 '05',
 '03',
 '06',
 '04',
 '01',
 '02',
 'fl',
 'xxxxx',
 'amazon',
 '22',
 'checkcard',
 'tx',
 'xxxxxxxxxx',
 '21',
 'xxxxxxxxx',
 'cash',
 'pxxxxxxxxxxxxxxx',
 'date',
 'xxxxxxx',
 'visa',
 'ga',
 'app',
 'mktp',
 'xxxxxxxx',
 'doordash',
 '15',
 '20',
 'check',
 'wal',
 '16',
 '23',
 'ny',
 'az',
 'pxxxxxxxxxxxxxxxxx',
 '17',
 '14',
 '18',
 '19',
 '30',
 '13',
 '24',
 '26',
 'san',
 '28',
 '25',
 '27',
 '29',
 'va',
 'il',
 'market',
 'tst',
 'mcdonald',
 'dollar',
 'apple',
 'nc',
 'billwa',
 'food',
 'id',
 'xxxxxxxxxxxx',
 'pin',
 'pa',
 'point',
 'xxxxxxxxxxxxxxxx',
 'nj',
 'target',
 '31',
 'google',
 '00',
 'oh',
 'mi']

In [34]:
ngrams_1

['xxxx',
 'card',
 'authorized',
 'sxxxxxxxxxxxxxxx',
 'ca',
 'com',
 'xxx',
 'amzn',
 'xxxxxx',
 '10',
 '12',
 '09',
 'xxxxxxxxxxxxxxxxxxxxxxx',
 '08',
 'wa',
 '07',
 '11',
 '05',
 '03',
 '06',
 '04',
 '01',
 '02',
 'fl',
 'xxxxx',
 'amazon',
 '22',
 'checkcard',
 'tx',
 'xxxxxxxxxx',
 '21',
 'xxxxxxxxx',
 'cash',
 'pxxxxxxxxxxxxxxx',
 'date',
 'xxxxxxx',
 'visa',
 'ga',
 'app',
 'mktp',
 'xxxxxxxx',
 'doordash',
 '15',
 '20',
 'check',
 'wal',
 '16',
 '23',
 'ny',
 'az',
 'pxxxxxxxxxxxxxxxxx',
 '17',
 '14',
 '18',
 '19',
 '30',
 '13',
 '24',
 '26',
 'san',
 '28',
 '25',
 '27',
 '29',
 'va',
 'il',
 'market',
 'tst',
 'mcdonald',
 'dollar',
 'apple',
 'nc',
 'billwa',
 'food',
 'id',
 'xxxxxxxxxxxx',
 'pin',
 'pa',
 'point',
 'xxxxxxxxxxxxxxxx',
 'nj',
 'target',
 '31',
 'google',
 '00',
 'oh',
 'mi']

In [35]:
top_2grams

[('card xxxx', 173894),
 ('purchase authorized', 140141),
 ('sxxxxxxxxxxxxxxx card', 118231),
 ('amzn com', 37465),
 ('ca sxxxxxxxxxxxxxxx', 37250),
 ('checkcard xxxx', 31234),
 ('debit card', 30134),
 ('xxx xxx', 28777),
 ('xxx xxxx', 28777),
 ('amazon com', 25341),
 ('com wa', 25079),
 ('card purchase', 23151),
 ('pxxxxxxxxxxxxxxx card', 21521),
 ('amzn mktp', 18860),
 ('cash app', 18859),
 ('xxxxxxxxx purchase', 17845),
 ('pos debit', 15296),
 ('check card', 15180),
 ('pxxxxxxxxxxxxxxxxx card', 14541),
 ('wa sxxxxxxxxxxxxxxx', 14215),
 ('authorized 12', 13927),
 ('authorized 10', 13740),
 ('xxxxxxxxxx ca', 13369),
 ('authorized 03', 13219),
 ('authorized 08', 13157),
 ('authorized 07', 13053),
 ('xxx xxxxxxx', 13034),
 ('authorized 09', 12936),
 ('authorized 04', 12850),
 ('wal mart', 12720),
 ('authorized 05', 12711),
 ('authorized 11', 12679),
 ('authorized 06', 12327),
 ('authorized 01', 12270),
 ('fl sxxxxxxxxxxxxxxx', 12117),
 ('visa check', 11708),
 ('recurring payment', 11506

In [36]:
ngrams_2 = [ngram for ngram, count in top_2grams]
ngrams_2

['card xxxx',
 'purchase authorized',
 'sxxxxxxxxxxxxxxx card',
 'amzn com',
 'ca sxxxxxxxxxxxxxxx',
 'checkcard xxxx',
 'debit card',
 'xxx xxx',
 'xxx xxxx',
 'amazon com',
 'com wa',
 'card purchase',
 'pxxxxxxxxxxxxxxx card',
 'amzn mktp',
 'cash app',
 'xxxxxxxxx purchase',
 'pos debit',
 'check card',
 'pxxxxxxxxxxxxxxxxx card',
 'wa sxxxxxxxxxxxxxxx',
 'authorized 12',
 'authorized 10',
 'xxxxxxxxxx ca',
 'authorized 03',
 'authorized 08',
 'authorized 07',
 'xxx xxxxxxx',
 'authorized 09',
 'authorized 04',
 'wal mart',
 'authorized 05',
 'authorized 11',
 'authorized 06',
 'authorized 01',
 'fl sxxxxxxxxxxxxxxx',
 'visa check',
 'recurring payment',
 'authorized 02',
 'xxxx debit',
 'payment authorized',
 'debit visa',
 'xxxx ca',
 'xxxx card',
 'withdrawal debit',
 'purchase xxxx',
 'tx sxxxxxxxxxxxxxxx',
 'pos purchase',
 'com billwa',
 'billwa xxxxxxxxxxxxxxxxxxxxxxx',
 'ga sxxxxxxxxxxxxxxx',
 'ca xxxxxxxxxxxxxxxxxxxxxxx',
 'crd xxxx',
 'dbt crd',
 'apple com',
 'com ca',
 

In [37]:
top_3grams

[('sxxxxxxxxxxxxxxx card xxxx', 118231),
 ('ca sxxxxxxxxxxxxxxx card', 37250),
 ('xxx xxx xxxx', 28777),
 ('amzn com wa', 24880),
 ('pxxxxxxxxxxxxxxx card xxxx', 21521),
 ('debit card purchase', 17673),
 ('pxxxxxxxxxxxxxxxxx card xxxx', 14541),
 ('wa sxxxxxxxxxxxxxxx card', 14215),
 ('purchase authorized 12', 12668),
 ('purchase authorized 10', 12430),
 ('fl sxxxxxxxxxxxxxxx card', 12117),
 ('purchase authorized 08', 11992),
 ('purchase authorized 03', 11984),
 ('purchase authorized 07', 11912),
 ('purchase authorized 04', 11756),
 ('purchase authorized 09', 11700),
 ('visa check card', 11667),
 ('purchase authorized 05', 11601),
 ('purchase authorized 11', 11467),
 ('xxxx debit card', 11399),
 ('recurring payment authorized', 11334),
 ('debit visa check', 11291),
 ('pos debit visa', 11250),
 ('check card xxxx', 11250),
 ('com wa sxxxxxxxxxxxxxxx', 11214),
 ('purchase authorized 06', 11207),
 ('purchase authorized 01', 11066),
 ('xxx xxxx ca', 10701),
 ('purchase authorized 02', 10358)

In [38]:
# Use 1 grams to find prefixes