# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main a15ad2c] regex
 1 file changed, 759 insertions(+), 4312 deletions(-)
 rewrite Week 2/Haris_Saif.ipynb (84%)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in#.sample(100_000).copy()
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0      AMAZON.COM*1N2HQ2CLAMZN.COM/BILL WAUS Card #XXXX
1     ARBYS XXXX 07-24 CLERMONT FL XXXX DEBIT CARD P...
2                                         BUSINESSOLVER
3     CHECKCARD XXXX AMZN MKTP US*1L45S4FE2 AMZN.COM...
4     CHERRY VALLEY MARKET PL YONKERS NY           1...
5     CYPRESS 08/08 #XXXXXXXXX PURCHASE CYPRESS PLEA...
6     DUNKIN #XXXXXX Q35 XXX-XXX-XXXX NY           0...
7     Debit Purchase 10/16 Card XXXXcost Cutters 169...
8     FRY'S XXX-XXX-XXXX AZ                        0...
9     MOD PIZZA DOWN SPOKANE WA Date 02/24/22 XXXXXXXXX
10    MR VAPE GURU MONTICELLO NY                   0...
11    POS Card purchase STATERBROS177 XXXXX BEN XXXX...
12    POS SIG 11/26 VISA #XXXX LOWES #XXXXX* JACKSON...
13                                    POSHMARK CA 03/22
14    PURCHASE AUTHORIZED ON 05/16 PlaystationNetwor...
15    PURCHASE AUTHORIZED ON 07/02 PANDA EXPRESS XXX...
16    PURCHASE AUTHORIZED ON 09/03 EARTH ORIGINS MAR...
17    PURCHASE AUTHORIZED ON 12/21 TROYFAMILYDEN

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
# From 1-gram
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [8]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"\bX{4,}\b", " "), # PRECISE: Remove standalone masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times (INCLUSIVE) ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{4}-\d{2}-\d{2}\b", " "), # YYYY-MM-DD
    (r"\b\d{1,2}-(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-\d{2,4}\b", " "), # DD-MMM-YYYY
    (r"\b(?:19|20)\d{2}(?:0[1-9]|1[0-2])(?:0[1-9]|[12]\d|3[01])\b", " "), # YYYYMMDD
    (r"\b(?:0[1-9]|1[0-2])(?:0[1-9]|[12]\d|3[01])\d{2}\b", " "), # MMDDYY
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails (INCLUSIVE) ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\bID\b", " "), # Remove standalone 'ID' (from 'ID: DSW')
    (r"\b(?:REF|TERM|TRN|INV|ACCT|TID|MID)\s*#?[\d\w-]+\b", " "), # REF 123, TERM 001, etc.
    (r"\bAUTH\s+CODE\s*[\d\w-]+\b", " "), # AUTH CODE 0123
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers (INCLUSIVE) ===
    (r"\b1[\s.-]\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}\b", " "), # 1-800-555-1212
    (r"\b\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}\b", " "), # (800) 555-1212, 800.555.1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\bXXX-XXX-XXXX\b", " "), # Masked phone

    # === 8) URLs/domains (INCLUSIVE) ===
    (r"^\.COM\s+BILL\b.*", " "),
    (r"\s+\.(?:COM|NET|ORG|GOV|EDU|IO|CO)\b", " "), # Remove trailing .COM, .NET etc.

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
    (r"^\s*-\s+", " ") # Remove leading hyphens
]

In [9]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    # CORRECTED: [A-Z\s0-9'.*&-]+?
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*&-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    # CORRECTED: [A-Z\s0-9'.*&-]+?
    re.compile(r"^([A-Z\s0-9'.*&-]+?)\s+PAYPAL.*$"),

    # Handle 'PAYPAL *... MERCHANT' (from INST XFER)
    # CORRECTED: [A-Z\s0-9'.&-]+
    re.compile(r"^PAYPAL\s*\*+\s*([A-Z\s0-9'.&-]+).*"),
    # Handle 'PAYPAL : MERCHANT' (from INST XFER / ID:)
    # CORRECTED: [A-Z\s0-9'.&-]+
    re.compile(r"^PAYPAL\s*:\s*([A-Z\s0-9'.&-]+).*"),

    # --- NEW: High-Frequency Merchants (Capture prefix as merchant) ---
    re.compile(r"^(7-ELEVEN|7\s+11)\s*\*?#?.*"),
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    re.compile(r"^(AMZN\s*MKTP|AMAZON\s*MKTPLACE|AMAZON\.COM)\b.*"),
    re.compile(r"^(WAL-MART|WM\s+SUPER)\b.*"),
    re.compile(r"^(UBER|LYFT)\b.*"),
    re.compile(r"^(DOORDASH|GRUBHUB)\b.*"),
    re.compile(r"^(STARBUCKS|SBUX)\b.*"),
    re.compile(r"^(MCDONALD'S|MCDONALDS)\b.*"),
    re.compile(r"^(TARGET)\b.*"),
    re.compile(r"^(COSTCO)\b.*"),

    # --- Standardized Prefix Rules (Capture *after* prefix) ---
    # CORRECTED: [A-Z\s0-9'.-&] -> [A-Z\s0-9'.&-_]
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    
    # === TST RULES (Precise, non-greedy) ===
    # CORRECTED: [A-Z\s0-9'.-&] -> [A-Z\s0-9'.&-]
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.&-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.&-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.&-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\b.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.&-]+?)\s+[A-Z]{2,}\b.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.&-]+).*"), 
    
    # === SQ RULES (Precise, non-greedy) ===
    # CORRECTED: [A-Z\s0-9'.-&] -> [A-Z\s0-9'.&-]
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.&-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\b.*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.&-]+?)\s+[A-Z]{2,}\b.*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    
    # CORRECTED: [A-Z\s0-9'.-&] -> [A-Z\s0-9'.&-]
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.&-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.&-]+).*"),
    
    # Generic PAYPAL rule, catches PAYPAL*MERCHANT
    # CORRECTED: [A-Z\s0-9'.-&] -> [A-Z\s0-9'.&-]
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.&-]+).*"), 

    # --- Specific '*' prefix rules (already correct) ---
    # CORRECTED: [A-Z\s0-9'.-&] -> [A-Z\s0-9'.&-]
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.&-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.&-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    # CORRECTED: [A-Z\s'.-&][A-Z\s0-9'.-&]*? -> [A-Z\s'.&-][A-Z\s0-9'.&-]*?
    re.compile(r"^([A-Z\s'.&-][A-Z\s0-9'.&-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    # CORRECTED: [A-Z\s'.-&][A-Z\s0-9'.-&]*? -> [A-Z\s'.&-][A-Z\s0-9'.&-]*?
    re.compile(r"^([A-Z\s'.&-][A-Z\s0-9'.&-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    # CORRECTED: [A-Z\s'.-&][A-Z\s0-9'.-&]*? -> [A-Z\s'.&-][A-Z\s0-9'.&-]*?
    re.compile(r"^([A-Z\s'.&-][A-Z\s0-9'.&-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [10]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)

memos = memos.str.replace(NOISE_WORDS_REGEX, " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 24.8 s, sys: 202 ms, total: 25 s
Wall time: 25 s


In [11]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 3.83 s, sys: 400 µs, total: 3.83 s
Wall time: 3.83 s


In [12]:
# df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [13]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [14]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
37212,Amazon.com*H20JC8I41 Amzn.com/bill WA 1...,AMAZON.COM*H20JC8I41 AMZN.COM BILL,AMAZON.COM
236589,POS Debit - Visa Check Card XXXX - SP HOTLIFEC...,SP HOTLIFECHARMS HOTWIFECHARM,HOTLIFECHARMS HOTWIFECHARM
311763,PURCHASE AUTHORIZED ON 05/19 LL*PAMCLUBXXXXXXX...,LL*PAMCLUBXXXXXXXX SWITZERLAND CHE,LL
498879,Withdrawal #XXXXXXXXXXXX / NNT PGA TOUR SUPERS...,NNT PGA TOUR SUPERSXXXXXX N. ARIZONA AVENUE,NNT PGA TOUR SUPERSXXXXXX N. ARIZONA AVENUE
214824,OREGANOS PIZZA XXXX,OREGANOS PIZZA,OREGANOS PIZZA
305585,PURCHASE AUTHORIZED ON 05/02 STARBUCKS STORE 0...,STARBUCKS STORE 08 HALLANDALE BE,STARBUCKS
461259,TARGET ST La Canada Fli CA 09/15,TARGET ST CANADA FLI,TARGET
488924,WAL-MART Wal- 11/02 #XXXXXXXXX PURCHASE WAL-MA...,WAL-MART WAL- # PURCHASE WAL-MART WAL-MAR PITT...,WAL-MART
401866,PURCHASE WITH CASH BACK $ 12.00 AUTHORIZED ON ...,WITH CASH BACK $ 12.00 PUBLIX SUPER MAR 101 E ...,WITH CASH BACK $ 12.00 PUBLIX SUPER MAR 101 E ...
265863,PURCHASE AUTHORIZED ON 01/17 WM SUPERC Wal-Mar...,WM SUPERC WAL-MART SUP PICO RIVERA,WM SUPERC WAL-MART SUP PICO RIVERA


In [15]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [16]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

MCDONALD'S
TARGET
STARBUCKS
COSTCO
THE
FRESH
MCDONALDS
XXXXAMAZON.COM
AMZNFREETIME
CHEWY.COM
WINN-DIXIE
DES
SHIPT
BASKIN
SLICE
RALPHS
BETMGM
PETCO
GOFAN
XXXXTST
IHOP
UARE
INTUIT
STEAK-N-SHAKE
DILLONS
*STARBUCKS
STATERBRO
AYRIX
SKILLZ
VANS
TOLLWAY-AUTOREPLEN
FRG
PRICELN
RIOT
STORE
BANFIELD-PET
MCW
NOTHING
.KOHLS.COM
XXXXGOOGLE
GAMESTOP
MARINA
SAVEMART
OCULUS
BASHAS''
FOODMAXX
CANVA
RAINBOW
NYTIMES
SEZZLE
XXXXSQ
WALGREENS
VERIZONWRLSS
CLAIRE'S
ABC
POPEYES
GNC
EL
SUBWAY
FRYS-MKTPLACE
BELK
LUCKY
FOOD4LESS
FIV
V
STAPLES
*EBAY
AMZ
*MICROSOFT
TILLYS
SHOPIFY
IBI
QFC
UPS
OTT
BLUESKY
*UBER
HELLOFRESH
GOODWILL
DROPBOX
PIZZA
LIBRARY
JACK'S
POTBELLY
JAMBA
L
QUADPAY
WEGMANS
EVI
SOUTHWES
ETT
NORTON
RVT
CHECKERS
CRYPTO.COM
ENMARKET
PARKMOBILE
SALAD
HOME
ABCMOUSE.COM
FBPAY
EA
JOURNEYS
TACOS
OPC
TLG
CRT
PRESSNET
GERALD
SEDANOS
NORDSTROM
LJS
MEIJER
REI
VOLA
ANCESTRY.COM
FH
KING
EBAY
NEWSSTAND
EZPASS
EVERYPLATE
PAR
FIVERR
E-Z
HLLFRSH
ZTL
*STEAM
XXXXWAL-MART
LUIS
MOE'S
RGP
TRTHFDR
EMPRESSO
DRI
PACSUN
NIKE.

In [17]:
df[df['memo_post'] == '']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
74774,CHECKCARD XXXX CK XXXXXXX,,
82514,CHECKCARD XXXX MCC NC XXXXXXXXXXXXXXXXXXXXXXX,,
88157,"CHECKCARD XXXX SQ *""IT'S ALL PEACHY"" F Centenn...","SQ *""IT'S ALL PEACHY"" F CENTENNIAL",
93788,CHECKCARD XXXX XXXX XXXXXXXXXX TX XXXXXXXXXXXX...,,
93843,CHECKCARD XXXX XXXXXXXXXXXX,,
93846,CHECKCARD XXXX XXXXXXXXXXXXXX,,
98408,CK XXXXXXX,,
104841,Card,,
112295,Ck XXXXXXX,,
112576,Co,,


In [18]:
df[df['memo_pre'].str.contains('ELEVEN')].sample(10)

Unnamed: 0,memo,memo_pre,memo_post
384253,PURCHASE AUTHORIZED ON 11/23 7-ELEVEN FORT WOR...,7-ELEVEN FORT WORTH,7-ELEVEN
227667,POS Debit - Visa Check Card XXXX - 7-ELEVEN BU...,7-ELEVEN BUDA,7-ELEVEN
306734,PURCHASE AUTHORIZED ON 05/06 7-ELEVEN ORLANDO ...,7-ELEVEN ORLANDO,7-ELEVEN
6073,7-ELEVEN DES PLAINES IL 0...,7-ELEVEN DES PLAINES,7-ELEVEN
520099,XXXXXX POS DDA W/D 01/22 22:32 7-ELEVEN WESTMI...,POS DDA W D 22:32 7-ELEVEN WESTMINSTER 00MTXXXX,POS DDA W D 22:32 7-ELEVEN WESTMINSTER 00MTXXXX
6509,7-ELEVEN PACOIMA CA 0...,7-ELEVEN PACOIMA,7-ELEVEN
289984,PURCHASE AUTHORIZED ON 03/24 7-ELEVEN Murrieta...,7-ELEVEN MURRIETA,7-ELEVEN
227692,POS Debit - Visa Check Card XXXX - 7-ELEVEN LO...,7-ELEVEN LORTON,7-ELEVEN
6951,7-ELEVEN XXXXX BOCA RATON FL 0...,7-ELEVEN BOCA RATON,7-ELEVEN
6983,7-ELEVEN XXXXX GLENDALE CA 0...,7-ELEVEN GLENDALE,7-ELEVEN


In [19]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

365050                                       AMAZON.COM
34198                              AMAZON COM XH5ZA96O3
516506            VSA PUR PP APPLE.COM BILL ( 01:38:19)
478436                                SOUTHCOM AIR VEND
37829                                        AMAZON.COM
155483                          ETSY.COM THE CRAFT ENGI
523427                SIG COMPLETION KFC LXXXXXX DUBLIN
51240             BRIGIT-COM PROTECTION 111FDF079ADF49D
12995                  AMAZON LUNA AMZN. COM BILLWAXXXX
78850     GRAMMARLY COVKECFKO GRAMMARLY.COMCA RECURRING
64353                                        AMAZON.COM
28671                        APPLE.COM BILL866-712- WDR
348055                           WALMART.COM AY XXX-XXX
66602                                        AMAZON.COM
125664                                    AT AMAZON.COM
486232                           W D SVC ARIS 20 TACOMA
392402                                     NIKE.COM XXX
517843                                   XXXXAMA

In [20]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
110,#00 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#00 BP APPLE COM BILL CUPERTINO 10 #,#00 BP APPLE COM BILL CUPERTINO 10 #
135,#04 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#04 BP APPLE COM BILL CUPERTINO 10 #,#04 BP APPLE COM BILL CUPERTINO 10 #
150,#07 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#07 BP APPLE COM BILL CUPERTINO 10 #,#07 BP APPLE COM BILL CUPERTINO 10 #
179,#12 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#12 BP APPLE COM BILL CUPERTINO 10 #,#12 BP APPLE COM BILL CUPERTINO 10 #
188,#14 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#14 BP APPLE COM BILL CUPERTINO 10 #,#14 BP APPLE COM BILL CUPERTINO 10 #
...,...,...,...
526991,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 1 6,APPLE.COM BILL ONE APPLE PARK WAY 1 6
526992,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 8 2,APPLE.COM BILL ONE APPLE PARK WAY 8 2
526993,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 2 6,APPLE.COM BILL ONE APPLE PARK WAY 2 6
526994,debit card APPLE.COM/BILL ONE APPLE PARK XXXXX...,APPLE.COM BILL ONE APPLE PARK C 2 4,APPLE.COM BILL ONE APPLE PARK C 2 4


In [21]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
2212,13FLDENVER* 13THFL HTTPSWWW.13TH CO,13FLDENVER* 13THFL .13TH,13FLDENVER* 13THFL .13TH
2331,1PASSWORD* TRIAL OVER TORONTO ON 01/07,1PASSWORD* TRIAL OVER TORONTO ON,1PASSWORD* TRIAL OVER TORONTO ON
3607,2CHECKO*KILOHEA Alpharetta GA USA,2CHECKO*KILOHEA ALPHARETTA,2CHECKO*KILOHEA ALPHARETTA
3608,2CO.COM*slideupli XXXXXXXXXX 04/03,2CO.COM*SLIDEUPLI,2CO.COM*SLIDEUPLI
3609,2COCOM*BITDEFENDER.COM,2COCOM*BITDEFENDER.COM,2COCOM*BITDEFENDER.COM
...,...,...,...
528725,www.Playgr* Bidiboo.Co,.PLAYGR* BIDIBOO.CO,.PLAYGR
528726,www.Playgr* Littlemiss,.PLAYGR* LITTLEMISS,.PLAYGR
528732,www.Styles* Luv N Hair,.STYLES* LUV N HAIR,.STYLES
528733,www.Stylese* West Brim,.STYLESE* WEST BRIM,.STYLESE


In [22]:
df[df['memo_post'] == '']

Unnamed: 0,memo,memo_pre,memo_post
74774,CHECKCARD XXXX CK XXXXXXX,,
82514,CHECKCARD XXXX MCC NC XXXXXXXXXXXXXXXXXXXXXXX,,
88157,"CHECKCARD XXXX SQ *""IT'S ALL PEACHY"" F Centenn...","SQ *""IT'S ALL PEACHY"" F CENTENNIAL",
93788,CHECKCARD XXXX XXXX XXXXXXXXXX TX XXXXXXXXXXXX...,,
93843,CHECKCARD XXXX XXXXXXXXXXXX,,
93846,CHECKCARD XXXX XXXXXXXXXXXXXX,,
98408,CK XXXXXXX,,
104841,Card,,
112295,Ck XXXXXXX,,
112576,Co,,


In [23]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

143560               [AMAZON.COM, 1A8TW6H40 A AMZN.COM BIL]
34549                 [AMAZON.COM, 1B8FVXXXX AMZN.COM BILL]
34589           [AMAZON.COM, 1F7BJ AMZN.COM BILL SIG 05:05]
34767                 [AMAZON.COM, 1K3G12EJ0 AMZN.COM BILL]
34878                 [AMAZON.COM, 1M0Z10HK1 AMZN.COM BILL]
404106                 [AMAZON.COM, 1VXXXX AMZN.COM BILLWA]
283077                  [AMAZON.COM, 1W8H78T AMZN.COM BILL]
35361                 [AMAZON.COM, 1XXXXXFE2 AMZN.COM BILL]
352013                  [AMAZON.COM, 250CP4K AMZN.COM BILL]
14440     [AMAZON.COM, 251BEXXXX SEATT LE WAXXXX XXXXWTX...
35537                 [AMAZON.COM, 274HG8NZ0 AMZN.COM BILL]
14587                             [AMAZON.COM, 296A57QE0 A]
35729                 [AMAZON.COM, 2D6ZY6VZ0 AMZN.COM BILL]
254431                              [AMAZON.COM, 2L0PP0AK0]
14922           [AMAZON.COM, 2LXXXXXI0 AMZNAMZN.COM BILLWA]
254441                              [AMAZON.COM, 2R6HR5K80]
65340            [AMAZON.COM, 429C71B33 

# Phase 2: Extract & Analyze N-Grams

In [24]:
df_p2 = pd.read_csv("memos_P1.csv")

In [25]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [26]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [27]:
%%time
corpus = df_p2['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
print(f"--- N-gram Analysis Complete ---")

Analyzing 100000 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...
Analyzing (3, 3) n-grams...
--- N-gram Analysis Complete ---
CPU times: user 2.06 s, sys: 8.1 ms, total: 2.07 s
Wall time: 2.07 s


In [30]:
top_1grams.sort(key=lambda x: x[0])
ngrams_1 = []
for ngram, value in top_1grams:
    if ngram.upper() not in NOISE_WORDS:
        ngrams_1 += [ngram]

In [31]:
ngrams_1

['abc',
 'ach',
 'afterpay',
 'air',
 'aldi',
 'amazon',
 'american',
 'amzn',
 'angeles',
 'antonio',
 'app',
 'apple',
 'arbys',
 'atlanta',
 'bakery',
 'based',
 'bbq',
 'beach',
 'bell',
 'big',
 'box',
 'brooklyn',
 'burger',
 'caesars',
 'canteen',
 'canton',
 'cash',
 'charlotte',
 'chicago',
 'chick',
 'chicken',
 'chipotle',
 'circle',
 'club',
 'cmsvend',
 'collins',
 'com',
 'costco',
 'crd',
 'creek',
 'dairy',
 'deb',
 'debit',
 'del',
 'denver',
 'depot',
 'des',
 'diego',
 'dollar',
 'doordash',
 'drive',
 'dunkin',
 'east',
 'eats',
 'el',
 'express',
 'family',
 'fee',
 'fil',
 'folsom',
 'fort',
 'francisco',
 'fresh',
 'frys',
 'garden',
 'general',
 'giant',
 'glendale',
 'google',
 'greenville',
 'help',
 'helppay',
 'hill',
 'home',
 'house',
 'houston',
 'httpsinstacar',
 'hunt',
 'indn',
 'info',
 'inn',
 'instacart',
 'intl',
 'jack',
 'jacksonville',
 'kfc',
 'king',
 'kroger',
 'l340',
 'lake',
 'las',
 'lion',
 'little',
 'llc',
 'los',
 'lowe',
 'lowes',
 '

In [None]:
top_2grams

In [None]:
top_3grams

In [None]:
# Use 1 grams to find prefixes