# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main 13af3c8] regex
 1 file changed, 3035 insertions(+), 3200 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in#.sample(100_000).copy()
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0                              AMAZON COM 1A3A199U0 AMZ
1           ANYTIME STORAGE ARIZO ARIZONA CITY AZ 08/01
2     Afterpay XXX-XXXXXXX CA                      0...
3               Amazon Prime*ZA4CL02T3 Amzn.com/bill WA
4     Amazon.com*1OXXXXGC1 Amzn.com/bill WA        0...
5              Debit Purchase -visa Card XXXXvons #XXXX
6                               Jerrys Restaurant &Loun
7     MCDONALD'S FXXXX COVINGTON LA                1...
8                               Osaka Jr Japanese Expre
9     POS PURCHASE / MERCHANT PURCHASE TERMINAL XXXX...
10            POS XXXXXX MCDONALD'S F22 TEMPE AZ ##XXXX
11    PURCHASE 06/15 CIRCLE K # XXXX CIRCLE K # XXXX...
12    PURCHASE AUTHORIZED ON 02/25 TST* KUBA CABANA ...
13    PURCHASE AUTHORIZED ON 05/05 VA ABC STORE 348 ...
14    PURCHASE AUTHORIZED ON 05/13 PUBLIX #XXXX SARA...
15    PURCHASE AUTHORIZED ON 05/18 BIG WAVE COFFEEHO...
16    PURCHASE XXXX CITY PARKING INC XXX-XXX-XXXX FL...
17                                      SUGARTOW

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
# From 1-gram
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [8]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"\bX{4,}\b", " "), # PRECISE: Remove standalone masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times (INCLUSIVE) ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{4}-\d{2}-\d{2}\b", " "), # YYYY-MM-DD
    (r"\b\d{1,2}-(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-\d{2,4}\b", " "), # DD-MMM-YYYY
    (r"\b(?:19|20)\d{2}(?:0[1-9]|1[0-2])(?:0[1-9]|[12]\d|3[01])\b", " "), # YYYYMMDD
    (r"\b(?:0[1-9]|1[0-2])(?:0[1-9]|[12]\d|3[01])\d{2}\b", " "), # MMDDYY
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails (INCLUSIVE) ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\bID\b", " "), # Remove standalone 'ID' (from 'ID: DSW')
    (r"\b(?:REF|TERM|TRN|INV|ACCT|TID|MID)\s*#?[\d\w-]+\b", " "), # REF 123, TERM 001, etc.
    (r"\bAUTH\s+CODE\s*[\d\w-]+\b", " "), # AUTH CODE 0123
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers (INCLUSIVE) ===
    (r"\b1[\s.-]\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}\b", " "), # 1-800-555-1212
    (r"\b\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}\b", " "), # (800) 555-1212, 800.555.1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\bXXX-XXX-XXXX\b", " "), # Masked phone

    # === 8) URLs/domains (INCLUSIVE) ===
    (r"^\.COM\s+BILL\b.*", " "),
    (r"\s+\.(?:COM|NET|ORG|GOV|EDU|IO|CO)\b", " "), # Remove trailing .COM, .NET etc.

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
    (r"^\s*-\s+", " ") # Remove leading hyphens
]

In [10]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-&]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-&]+?)\s+PAYPAL.*$"),

    # Handle 'PAYPAL *... MERCHANT' (from INST XFER)
    re.compile(r"^PAYPAL\s*\*+\s*([A-Z\s0-9'.-&]+).*"),
    # Handle 'PAYPAL : MERCHANT' (from INST XFER / ID:)
    re.compile(r"^PAYPAL\s*:\s*([A-Z\s0-9'.-&]+).*"),

    # --- NEW: High-Frequency Merchants (Capture prefix as merchant) ---
    re.compile(r"^(7-ELEVEN|7\s+11)\s*\*?#?.*"),
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    re.compile(r"^(AMZN\s*MKTP|AMAZON\s*MKTPLACE|AMAZON\.COM)\b.*"),
    re.compile(r"^(WAL-MART|WM\s+SUPER)\b.*"),
    re.compile(r"^(UBER|LYFT)\b.*"),
    re.compile(r"^(DOORDASH|GRUBHUB)\b.*"),
    re.compile(r"^(STARBUCKS|SBUX)\b.*"),
    re.compile(r"^(MCDONALD'S|MCDONALDS)\b.*"),
    re.compile(r"^(TARGET)\b.*"),
    re.compile(r"^(COSTCO)\b.*"),

    # --- Standardized Prefix Rules (Capture *after* prefix) ---
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    
    # === TST RULES (Precise, non-greedy) ===
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-&]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-&]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    # NEW: Stop at City, ST
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-&]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\b.*"),
    # NEW: Stop at City
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-&]+?)\s+[A-Z]{2,}\b.*"),
    # Fallback: (POS) TST* MERCHANT... (Greedy)
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-&]+).*"), 
    
    # === SQ RULES (Precise, non-greedy) ===
    # NEW: Stop at City, ST
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-&]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\b.*"),
    # NEW: Stop at City
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-&]+?)\s+[A-Z]{2,}\b.*"),
    # Fallback: SQ* MERCHANT... (Greedy)
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-&]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-&]+).*"),
    
    # Generic PAYPAL rule, catches PAYPAL*MERCHANT
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-&]+).*"), 

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-&]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-&]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-&][A-Z\s0-9'.-&]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-&][A-Z\s0-9'.-&]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-&][A-Z\s0-9'.-&]*?)\s+-\s+EXP.*"),
]

error: bad character range *-& at position 22

## 3. Apply Regex

In [None]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)

memos = memos.str.replace(NOISE_WORDS_REGEX, " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

In [None]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

In [None]:
# df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [None]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [None]:
df.sample(10).sort_values(by='memo_post')

In [None]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [None]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

In [None]:
df[df['memo_post'] == '']#.iloc[0]['memo']

In [None]:
df[df['memo_pre'].str.contains('ELEVEN')].sample(10)

In [None]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

In [None]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

In [None]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

In [None]:
df[df['memo_post'] == '']

In [None]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

# Phase 2: Extract & Analyze N-Grams

In [None]:
df_p2 = pd.read_csv("memos_P1.csv")

In [None]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [None]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [None]:
%%time
corpus = df_p2['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
print(f"--- N-gram Analysis Complete ---")

In [None]:
top_1grams.sort(key=lambda x: x[0])
top_1 = [ngram for ngram, count if ngram not in NOISE_WORDS]

In [None]:
top_2grams

In [None]:
top_3grams

In [None]:
# Use 1 grams to find prefixes