# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main c5f2764] regex
 1 file changed, 4525 insertions(+), 9443 deletions(-)
 rewrite Week 2/Haris_Saif.ipynb (65%)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in.sample(100_000).copy()
row_count = df.size
row_count

100000

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     7-ELEVEN LYNWOOD CA                          0...
1     ACH IAT DEBIT, CANVA PTY LIMITE IAT PAYPAL ***...
2                                AMZN Mktp US HB3FA9W33
3     AMZN Mktp US*FE6DE0E Amzn.com/bill WA        0...
4     AMZN Mktp US*I93B58J Amzn.com/bill WA        0...
5     CHECKCARD XXXX AMZN MKTP US*1K87Q08Z2 AMZN.COM...
6                                       GRUB BURGER BAR
7     INSTACART 09-11 HTTPSINSTACAR CAXXXX DEBIT CAR...
8     LA DINASTIA NEW YORK NY                      0...
9                        NEKTER JUICE BAR XXXX WA 12/19
10    NORTHGATE MARK 01/21 #XXXXXXXXX PURCHASE NORTH...
11    POPEYES XXXX NARANJA FL                      0...
12    POS PURCHASE NON-PIN APPLE.COM/BILL CA XXXXXX ...
13    PUBLIX #XXXX AVENTURA FL                     0...
14    PUBLIX #XXXX PUBLIX #XXXX ORLANDO FLUS XXXXXX ...
15    PURCHASE AUTHORIZED ON 01/29 DD DOORDASH AMERI...
16    PURCHASE AUTHORIZED ON 03/21 LANIER PARKING 21...
17    PURCHASE AUTHORIZED ON 05/20 YAYAS COOKBOO

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
# From 1-gram
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "VENDING", "SUPERCENTER", "SUPERC", "STORE", "STORES", "RESTAURANT", 
    "PROTECTION", "PARKING", "GRILL", "MARKET", "LIQUOR", "LIQUORS", "GROCERY", 
    "FOOD", "FOODS", "DIGITAL", "DIGIT", "DELI", "COFFEE", "CITY", "CENTER", 
    "CAFE", "BUSINESS", "BEAUTY", "BAR", "STREET"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [8]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), # Remove generic masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
]

In [9]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # === NEW PAYPAL RULES ===
    # Handle 'PAYPAL *... MERCHANT' (from INST XFER)
    re.compile(r"^PAYPAL\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    # Handle 'PAYPAL : MERCHANT' (from INST XFER / ID:)
    re.compile(r"^PAYPAL\s*:\s*([A-Z\s0-9'.-]+).*"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === 7-ELEVEN RULES ===
    # These now capture '7-ELEVEN' or '7 11' as the merchant
    re.compile(r"^(7-ELEVEN)\s*\*?#?.*"),
    re.compile(r"^(7\s+11)\s*\*?#?.*"),
    
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # === UPDATED RULE ORDER ===
    # Specific DDBR rule must come *before* the general DD rule
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    # re.compile(r"^DD\s*BR\s*\*?#?\s*([A-Z\s0-9'.-]+).*"), # Removed, redundant
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    # Specific: (POS) TST* MERCHANT [TWO WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Specific: (POS) TST* MERCHANT [ONE WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Fallback: (POS) TST* MERCHANT...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # Generic PAYPAL rule, catches PAYPAL*MERCHANT
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"), 

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [10]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)

memos = memos.str.replace(NOISE_WORDS_REGEX, " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 4.55 s, sys: 32.5 ms, total: 4.58 s
Wall time: 4.58 s


In [11]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 668 ms, sys: 4.02 ms, total: 672 ms
Wall time: 670 ms


In [12]:
# df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [13]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [14]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
68651,CHECKCARD XXXX AMZN DIGITAL*XXXXD56X1 WA XXXXX...,AMZN * D56X1,AMZN
372676,PURCHASE AUTHORIZED ON 10/24 AMZN Mktp US*2Y2B...,AMZN MKTP US*2Y2BF AMZN.COM BILL S,AMZN MKTP US
395692,PURCHASE AUTHORIZED ON 12/21 CHICKEN N PICKLE ...,CHICKEN N PICKLE GRAND PRAIRIE S,CHICKEN N PICKLE GRAND PRAIRIE S
336594,PURCHASE AUTHORIZED ON 07/24 DOLLARTRE XXXX W ...,DOLLARTRE W HAMME STOCKTON P,DOLLARTRE W HAMME STOCKTON P
142066,Debit Card JOHNNY GRITS TARPON SPRING FL Date ...,JOHNNY GRITS TARPON SPRING,JOHNNY GRITS TARPON SPRING
364795,PURCHASE AUTHORIZED ON 10/04 FOOD LION #XXXX R...,LION # RICHMOND P,LION
198239,MCDONALD'S FXXXX OAK FOREST IL 0...,MCDONALD'S F OAK FOREST,MCDONALD'S F OAK FOREST
89206,CHECKCARD XXXX Scribd Inc San FranciscoCA XXXX...,SCRIBD INC SAN FRANCISCOCA 00BUF3I0 RECURRING,SCRIBD INC SAN FRANCISCOCA 00BUF3I0 RECURRING
270708,PURCHASE AUTHORIZED ON 01/31 STOP & SHOP XXXX ...,STOP & SHOP 80 TO ROCKY HILL P,STOP & SHOP 80 TO ROCKY HILL P
492837,WING EXPRESS DELI. TUPELO MS,WING EXPRESS . TUPELO,WING EXPRESS . TUPELO


In [15]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [16]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

AMZN
WM
ROSS
LION
WINCO
TARGET.COM
EXPRESS
AMZNFREETIME
SALLY
PETCO
FAIRWAY
DES
SHIPT
BASKIN
WINN-DIXIE
GOFAN
CHEWY.COM
RIOT
SLICE
RALPHS
NORTHGATE
INTUIT
TOLLWAY-AUTOREPLEN
BELL''S
DILLONS
*STARBUCKS
IHOP
STATERBRO
STEAK-N-SHAKE
SKILLZ
DEPOT
VANS
PRICELN
SEZZLE
CANVA
.KOHLS.COM
BETMGM
VERIZONWRLSS
FRG
OCULUS
BASHAS''
FRYS-MKTPLACE
POPEYES
SAVEMART
V
QFC
MARINA
AMZ
ABC
WALGREENS
P
L
SMITHS
JOURNEYS
*EBAY
TILLYS
BEAN
MCW
RAINBOW
FOOD4LESS
CRYPTO.COM
CLAIRE'S
FOODMAXX
CHECKERS
OTT
BORO
RVT
MEIJER
HLLFRSH
OPC
EBAY
EVI
POTBELLY
NYTIMES
IBI
STAPLES
UPS
SHOPIFY
SUBWAY
BANFIELD-PET
GAMESTOP
*MICROSOFT
DRI
ABCMOUSE.COM
TLG
ETT
BELK
FRED-MEYER
HOME
FIV
AT
DAVIS
WEGMANS
TRTHFDR
BUCKLE
FACEBK
GOODWILL
QUADPAY
DROPBOX
SHEIN
GNC
*UBER
NEWSSTAND
BLUESKY
*STEAM
CUB
FBPAY
PAVILIONS
PAM
CKO
ENMARKET
GERALD
JACK'S
SEDANOS
SMITH'S
ZTL
APPLE.COM
TORRID
JEANNETTE
ROSES
PACSUN
CDSR
AIRBNB
EIG
FH
DOLLARTRE
EZPASS
ANCESTRY.COM
AGI
SAVE-A-LOT
PAR
RONAN
CHILI'S
MGM
MOE'S
TLF
NORDSTROM
LUCKY
WHOLEFDS
BOXYCHARM
S

In [17]:
df[df['memo_post'] == '']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
236931,POS Debit - Visa Check Card XXXX - STORE,,
513319,XXXX CA 07/14,,
93843,CHECKCARD XXXX XXXXXXXXXXXX,,
214749,OR LIQUOR STORE,,
512862,XXXX,,
416586,Purchase CAFE,,
112295,Ck XXXXXXX,,
523779,XXXXXXXX XXXXXXXXXX,,
418414,Purchase XXXX LIQUORS,,
513318,XXXX CA 03/03,,


In [18]:
df[df['memo_pre'].str.contains('ELEVEN')].sample(10)

Unnamed: 0,memo,memo_pre,memo_post
395590,PURCHASE AUTHORIZED ON 12/21 7-ELEVEN Port Cha...,7-ELEVEN PORT CHARLOTT P,7-ELEVEN
227713,POS Debit - Visa Check Card XXXX - 7-ELEVEN SA...,7-ELEVEN SANFORD MARY E MURRAY POS,7-ELEVEN
6209,7-ELEVEN HUNTINGTON ST NY 0...,7-ELEVEN HUNTINGTON ST,7-ELEVEN
321770,PURCHASE AUTHORIZED ON 06/15 7-ELEVEN LEWISVIL...,7-ELEVEN LEWISVILLE P,7-ELEVEN
326692,PURCHASE AUTHORIZED ON 06/28 7-ELEVEN LEHIGH A...,7-ELEVEN LEHIGH ACRES P,7-ELEVEN
5844,7-ELEVEN BOCA RATON FL 0...,7-ELEVEN BOCA RATON,7-ELEVEN
298657,PURCHASE AUTHORIZED ON 04/15 7-ELEVEN LEHIGH A...,7-ELEVEN LEHIGH ACRES P,7-ELEVEN
6728,7-ELEVEN SCHILLER PARK IL 0...,7-ELEVEN SCHILLER PARK,7-ELEVEN
6071,7-ELEVEN DES PLAINES IL 0...,7-ELEVEN DES PLAINES,7-ELEVEN
5530,7-ELEVEN 09/22 #XXXXXXXXX PURCHASE 7-ELEVEN FR...,7-ELEVEN # PURCHASE 7-ELEVEN FRISCO,7-ELEVEN


In [19]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

36206                                            AMAZON.COM
141267                                           AMAZON.COM
65324                                            AMAZON.COM
521429                                           AMAZON.COM
27088                                        APPLE.COM BILL
65129                                            AMAZON.COM
17444                                            AMAZON.COM
327448                     GRUBHUBTACOCABANA2 GRUBHUB.COM S
36368                                            AMAZON.COM
326705                                           AMAZON.COM
476123                              UBER TRIP HELP.UBER.COM
265191                            WALMART.COM AA XXX-XXX- S
125628                                        AT AMAZON.COM
7961                        AARONS F370 EZPA HTTPS: .A 07:0
290135                        FORMSWIFT.COM CHAR XXX-XXX- S
288241                          GROUPON, INC. GROUPON.COM S
115749                      CRD 21 REI.C

In [20]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
102184,CRD PUR MDJC986S2 XXXX / APPLE.COM/BILL CA POI...,CRD PUR MDJC986S2 APPLE.COM BILL,CRD PUR MDJC986S2 APPLE.COM BILL
501292,Withdrawal CONSUMER DEBIT / APPLE.COM/BILL XXX...,WITHDRAWAL DEBIT APPLE.COM BILL XXX-XXX- 0 0 15 #,WITHDRAWAL DEBIT APPLE.COM BILL XXX-XXX- 0 0 15
308061,PURCHASE AUTHORIZED ON 05/09 APPLE.COM/BILL CA...,APPLE.COM BILL S,APPLE.COM BILL S
483828,VISA DDA PUR XXXXXX APPLE COM BILL CUPERTINO *,DDA PUR APPLE COM BILL CUPERTINO *,A PUR APPLE COM BILL CUPERTINO
423062,RECURRING PAYMENT AUTHORIZED ON 03/27 APPLE.CO...,APPLE.COM BILL S,APPLE.COM BILL S
...,...,...,...
261535,PURCHASE AUTHORIZED ON 01/06 APPLE.COM/BILL XX...,APPLE.COM BILL XXX-XXX- S,APPLE.COM BILL XXX-XXX- S
413063,Point of Sale Debit L340 DATE 12-12 APPLE COM/...,L340 APPLE COM BILL,L340 APPLE COM BILL
409609,Point Of Sale Withdrawal APPLE COM BILL CUPERT...,APPLE COM BILL CUPERTINO,APPLE COM BILL CUPERTINO
27512,APPLE.COM/BILL C,APPLE.COM BILL C,APPLE.COM BILL C


In [21]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
362329,PURCHASE AUTHORIZED ON 09/28 GOOGLE *Pandora g...,GOOGLE *PANDORA G.CO HELPPAY# S,GOOGLE
42469,BB* CENTER BAR&THE CHO RUFFS DALE PAXXX...,BB* &THE CHO RUFFS DALE,BB
419175,QVC*XXXXXXXXXXXX*5OF5 XXX-XXX-XXXX PA 1...,QVC* *5OF5 XXX-XXX,QVC
469847,TST* Brown Bag Seafood Chicago IL 08/21,TST* BROWN BAG SEAFOOD CHICAGO,BROWN BAG SEAFOOD CHICAGO
12893,AMAZON DIGIT*1W2OS1UI0,AMAZON *1W2OS1UI0,AMAZON
...,...,...,...
68247,CHECKCARD XXXX AMZN DIGITAL*HZ2I XXX-XXX-XXXX ...,AMZN *HZ2I XXX-XXX,AMZN
332050,PURCHASE AUTHORIZED ON 07/12 FACEBK *ZQ46Y5K5R...,FACEBK *ZQ46Y5K5R2 FB. ADS IRL S,FACEBK
15894,AMAZON.COM*GG09L1E53,AMAZON.COM*GG09L1E53,AMAZON.COM
277096,PURCHASE AUTHORIZED ON 02/18 AMAZON.COM*1I6TX2...,AMAZON.COM*1I6TX2O AMZN.COM BILL S,AMAZON.COM


In [22]:
df[df['memo_post'] == '']

Unnamed: 0,memo,memo_pre,memo_post
236931,POS Debit - Visa Check Card XXXX - STORE,,
513319,XXXX CA 07/14,,
93843,CHECKCARD XXXX XXXXXXXXXXXX,,
214749,OR LIQUOR STORE,,
512862,XXXX,,
416586,Purchase CAFE,,
112295,Ck XXXXXXX,,
523779,XXXXXXXX XXXXXXXXXX,,
418414,Purchase XXXX LIQUORS,,
513318,XXXX CA 03/03,,


In [23]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

429840                      [AGI, RENTERS CONDO XXX-XXX- S]
31871                         [AMAZON , 1H I AMZN.COM BILL]
291599                [AMAZON.COM, 164SP04 AMZN.COM BILL S]
404039                 [AMAZON.COM, 1L02H3 AMZN.COM BILLWA]
35049                 [AMAZON.COM, 1Q3FJ28U0 AMZN.COM BILL]
14274                      [AMAZON.COM, 1V4H AMZN.COM BILL]
35863                               [AMAZON.COM, 2G5X82IM1]
65198            [AMAZON.COM, 2W8RH19P2 AM AMZN.COM BILLWA]
517840                      [AMAZON.COM, 2Y0 AMZN.COM BILL]
36206                               [AMAZON.COM, 371DX0ZU3]
259640                            [AMAZON.COM, 814NA5B63 A]
36631                       [AMAZON.COM, 9O AMZN.COM BILLW]
65577            [AMAZON.COM, AC62U68A3 AM AMZN.COM BILLWA]
15599                          [AMAZON.COM, B889L2PG3 AMZN]
37057               [AMAZON.COM, FP2IA8M33 AMZN.COM BILLWA]
404200                 [AMAZON.COM, GA8VG2 AMZN.COM BILLWA]
66099            [AMAZON.COM, HT9AM4Q62 

# Phase 2: Extract & Analyze N-Grams

In [24]:
df_p2 = pd.read_csv("memos_P1.csv")

In [25]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [26]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [None]:
%%time
corpus = df_p2['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
print(f"--- N-gram Analysis Complete ---")

Analyzing 100000 cleaned memos...
Analyzing (1, 1) n-grams...
Analyzing (2, 2) n-grams...


In [None]:
top_1grams.sort(key=lambda x: x[0])
top_1grams

In [None]:
top_2grams

In [None]:
top_3grams

In [None]:
# Use 1 grams to find prefixes