# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main f5a900e] regex
 1 file changed, 100 insertions(+), 228 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df_in = pd.read_csv("memos.csv")
df = df_in#.sample(100_000).copy()
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     ARCHIBALDS DRIVE THRU MENIFEE CA Date 02/01/23...
1     Audible*HB3LA1R90 11-14 Amzn.com/bill NJ XXXX ...
2     CHECKCARD XXXX CHEESECAKE DOLPHIN MALL MIAMI F...
3     MOBILE PURCHASE XXXX SQ *HOP HOP - HAYWARD Hay...
4                        MPA PARKING PAY BY PH FL 03/19
5                                       NAYAX VENDING 6
6     OVERDRAFT ITEM FEE FOR ACTIVITY OF 02-10 ELECT...
7                          PIN ONORE CLOTHING LLCSPARTA
8     PURCHASE AUTHORIZED ON 01/06 eBay O*27-XXXXX-1...
9     PURCHASE AUTHORIZED ON 04/14 THE FLAME BROILER...
10    PURCHASE AUTHORIZED ON 06/11 BURKES OUT Clevel...
11    PURCHASE AUTHORIZED ON 06/23 GOOGLE *Big Fish ...
12    PURCHASE AUTHORIZED ON 07/25 CAPTAIN D'S XXXX ...
13    PURCHASE AUTHORIZED ON 07/28 FAMILY DOLLAR #57...
14    PURCHASE AUTHORIZED ON 08/24 REG'S 7 MILE STEA...
15    PURCHASE AUTHORIZED ON 11/24 PARK FIRST - BLOC...
16    PURCHASE AUTHORIZED ON 12/25 TINEE GIANT #249 ...
17    PURCHASE XXXX NOOCUBE GLASGOW XXXXXXXXXXXX

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
# From 1-gram
NOISE_WORDS = [
    "DBT", "PURCH", "TRANSACTION", "PMT", "PMTS", "HTTPSWWW", "WWW", "CONSUMER", 
    "CKCD", "VENDING", "SUPERCENTER", "SUPERC", "STORE", "STORES", "RESTAURANT", 
    "PROTECTION", "PARKING", "GRILL", "MARKET", "LIQUOR", "LIQUORS", "GROCERY", 
    "FOOD", "FOODS", "DIGITAL", "DIGIT", "DELI", "COFFEE", "CITY", "CENTER", 
    "CAFE", "BUSINESS", "BEAUTY", "BAR", "STREET"
]
NOISE_WORDS_REGEX = r"\b(" + "|".join(NOISE_WORDS) + r")\b"

In [8]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), # Remove generic masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
]

In [45]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === UPDATED 7-ELEVEN RULES ===
    # These now capture '7-ELEVEN' or '7 11' as the merchant
    re.compile(r"^(7-ELEVEN)\s*\*?#?.*"),
    re.compile(r"^(7\s+11)\s*\*?#?.*"),
    
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # === UPDATED RULE ORDER ===
    # Specific DDBR rule must come *before* the general DD rule
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    # re.compile(r"^DD\s*BR\s*\*?#?\s*([A-Z\s0-9'.-]+).*"), # Removed, redundant
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    # Specific: (POS) TST* MERCHANT [TWO WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Specific: (POS) TST* MERCHANT [ONE WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Fallback: (POS) TST* MERCHANT...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    
    # === FIXED RULE ===
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"), # Corrected '0a-Z' to '0-9'
    
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0F'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [10]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 21.9 s, sys: 191 ms, total: 22.1 s
Wall time: 22.1 s


In [46]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 3.39 s, sys: 4.5 ms, total: 3.4 s
Wall time: 3.39 s


In [12]:
# df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [13]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [53]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
227206,POS DEBIT XXXX 11/29 9:12 AMAZON COM,9:12 AMAZON COM,9:12 AMAZON COM
391332,PURCHASE AUTHORIZED ON 12/11 AMAZON.COM*TY1LE7...,AMAZON.COM*TY1LE7W AMZN.COM BILL S,AMAZON.COM
67619,CHECKCARD XXXX AMZN DIGITAL*2D0SXXXXX WA XXXXX...,AMZN DIGITAL*2D0S,AMZN DIGITAL
122303,DD DOORDASH ZAXBYS,DD DOORDASH ZAXBYS,DOORDASH ZAXBYS
163875,Ferdinand Grill,FERDINAND GRILL,FERDINAND GRILL
242692,POS PURCHASE / HOMELAND #469,HOMELAND #469,HOMELAND
503886,Withdrawal DEBIT CHIP / MCDONALD'S FXXXX TIPP ...,MCDONALD'S F TIPP CITY,MCDONALD'S F TIPP CITY
257339,PUBLIX #376 KEY LARGO FL 1...,PUBLIX #376 KEY LARGO,PUBLIX
420174,REALPAGE RENTREPORTING REALPAGECO TX 1...,REALPAGE RENTREPORTING REALPAGECO,REALPAGE RENTREPORTING REALPAGECO
352317,PURCHASE AUTHORIZED ON 09/02 SAIMAI THAI CHICA...,SAIMAI THAI CHICAGO S,SAIMAI THAI CHICAGO S


In [15]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [16]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

TARGET.COM
AMZNFREETIME
WINN-DIXIE
DES
SHIPT
BASKIN
RALPHS
SLICE
BETMGM
PETCO
GOFAN
CHEWY.COM
IHOP
INTUIT
STEAK-N-SHAKE
DILLONS
STATERBRO
*STARBUCKS
SKILLZ
VANS
FRG
TOLLWAY-AUTOREPLEN
PRICELN
RIOT
WWW.KOHLS.COM
BANFIELD-PET
STORE
MCW
P
SAVEMART
GAMESTOP
MARINA
OCULUS
FOODMAXX
BASHAS''
VERIZONWRLSS
RAINBOW
CANVA
WALGREENS
CLAIRE'S
SEZZLE
ABC
GNC
FOOD4LESS
BELK
QFC
POPEYES
FRYS-MKTPLACE
SUBWAY
STAPLES
NYTIMES
FIV
AMZ
*EBAY
L
V
TILLYS
SHOPIFY
IBI
*MICROSOFT
UPS
OTT
BLUESKY
*UBER
POTBELLY
GOODWILL
DROPBOX
JACK'S
QUADPAY
WEGMANS
CHECKERS
EVI
LUCKY
MEIJER
ETT
RVT
ABCMOUSE.COM
NORTON
ENMARKET
FBPAY
JOURNEYS
EA
CRYPTO.COM
TLG
OPC
HOME
CRT
GERALD
HLLFRSH
SEDANOS
NORDSTROM
EBAY
REI
NEWSSTAND
FH
LJS
STOCKTON
EZPASS
ZTL
PARKMOBILE
EVERYPLATE
FRED-MEYER
*STEAM
PACSUN
TRTHFDR
RGP
PAR
M
E-Z
DRI
UBR
TACOMA
EPC
MOE'S
TLF
APPLE.COM
PCH
EXPRESS
BUCKLE
PEET'S
FACEBK
CKO
ECS
BOXYCHARM
PAM
GOFNDME
SOUTHWES
ARBY'S
TOMMY'S
FAMOUSFOOTWEAR
GLOSS
VONS.COM
LYNWOOD
OPS
LEGALSHIELD
PAVILIONS
AF
VOLA
CDSR
OFFICE
COL

In [42]:
df[df['memo_post'] == '']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
5088,7-ELEVEN #XXXXX 46-047 K,7-ELEVEN # 46-047 K,
5089,7-ELEVEN #XXXXX 94-429 U,7-ELEVEN # 94-429 U,
5090,7-ELEVEN #XXXXX XXXX N S,7-ELEVEN # N S,
5092,7-ELEVEN 01/01 #XXXXXXXXX PURCHASE 7-ELEVEN CH...,7-ELEVEN # PURCHASE 7-ELEVEN CHARLOTTESVIL,
5093,7-ELEVEN 01/01 #XXXXXXXXX PURCHASE XXXX W GLEN...,7-ELEVEN # PURCHASE W GLENVIEW R GLENVIEW CKCD,
...,...,...,...
523814,XXXXXXXXXX,,
523840,XXXXXXXXXXX XXXXXX,,
523860,XXXXXXXXXXXX,,
524051,XXXXXXXXXXXXX,,


In [38]:
df[df['memo_pre'].str.contains('ELEVEN')].sample(10)

Unnamed: 0,memo,memo_pre,memo_post
278850,PURCHASE AUTHORIZED ON 02/23 7-ELEVEN ORLANDO ...,7-ELEVEN ORLANDO P,ORLANDO P
324777,PURCHASE AUTHORIZED ON 06/23 7-ELEVEN XXXXX RI...,7-ELEVEN RICHMOND S,RICHMOND S
356499,PURCHASE AUTHORIZED ON 09/13 7-ELEVEN SAN DIEG...,7-ELEVEN SAN DIEGO P,SAN DIEGO P
6186,7-ELEVEN HUNTINGTON ST NY 0...,7-ELEVEN HUNTINGTON ST,HUNTINGTON ST
5533,7-ELEVEN 09/23 #XXXXXXXXX PURCHASE 7-ELEVEN PI...,7-ELEVEN # PURCHASE 7-ELEVEN PITTSBURG,
290801,PURCHASE AUTHORIZED ON 03/26 7-ELEVEN SAN JOSE...,7-ELEVEN SAN JOSE P,SAN JOSE P
136066,DOORDASH*7-ELEVEN WWW.DOORDASH. CA 1...,DOORDASH*7-ELEVEN WWW.DOORDASH.,DOORDASH
517260,XXXX XXXXXXXX 7-ELEVEN XXXX PALM AVE PEMBROKE ...,7-ELEVEN PALM AVE PEMBROKE PINEFL C#,PALM AVE PEMBROKE PINEFL C
275703,PURCHASE AUTHORIZED ON 02/14 7-ELEVEN FORT WOR...,7-ELEVEN FORT WORTH P,FORT WORTH P
278190,PURCHASE AUTHORIZED ON 02/21 7-ELEVEN SAN JOSE...,7-ELEVEN SAN JOSE P,SAN JOSE P


In [41]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

178362                       IMPERIAL VENDING SERVIC TACOMA
228568                                           AMAZON.COM
430865                                  PROBILLER.COM CYP S
43786                         BETPL (855)842- HTTPSWWW.BET.
432946                     RING UNLIMITED MON HTTPSRING.COM
43507                      BESTBUYCOM RICHFIELD US PURCHASE
523373              SIG COMPLETION CSA-GGA-DAM TRIP XXX-XXX
476614                UBER TRIP HELP. HELP.UBER.COM NLD NLD
117198    DBT CRD DSV9IRG8 SQ LOVE, LIFE, AND LI GOSQ.CO...
177427                       COSTCO BY INSTAC WWW.COSTCO.CO
506821                  WITHDRAWAL WISH.COM XXX-XXX- CADATE
14008                                            AMAZON.COM
344219                                 ETSY.COM - WEDOHON S
427716                                 CHEWY.COM XXX-XXX- S
224841                               POS AMAZON.COM SEATTLE
148902    DEBIT: SIGNATURE PURCHASE FROM AMAZON.COM*L53H...
255015                                  

In [40]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
110,#00 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#00 BP APPLE COM BILL CUPERTINO 10 #,#00 BP APPLE COM BILL CUPERTINO 10 #
135,#04 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#04 BP APPLE COM BILL CUPERTINO 10 #,#04 BP APPLE COM BILL CUPERTINO 10 #
150,#07 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#07 BP APPLE COM BILL CUPERTINO 10 #,#07 BP APPLE COM BILL CUPERTINO 10 #
179,#12 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#12 BP APPLE COM BILL CUPERTINO 10 #,#12 BP APPLE COM BILL CUPERTINO 10 #
188,#14 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#14 BP APPLE COM BILL CUPERTINO 10 #,#14 BP APPLE COM BILL CUPERTINO 10 #
...,...,...,...
526991,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 1 6,APPLE.COM BILL ONE APPLE PARK WAY 1 6
526992,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 8 2,APPLE.COM BILL ONE APPLE PARK WAY 8 2
526993,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 2 6,APPLE.COM BILL ONE APPLE PARK WAY 2 6
526994,debit card APPLE.COM/BILL ONE APPLE PARK XXXXX...,APPLE.COM BILL ONE APPLE PARK C 2 4,APPLE.COM BILL ONE APPLE PARK C 2 4


In [39]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]
prefix_star_merchants

Unnamed: 0,memo,memo_pre,memo_post
2212,13FLDENVER* 13THFL HTTPSWWW.13TH CO,13FLDENVER* 13THFL HTTPSWWW.13TH,13FLDENVER* 13THFL HTTPSWWW.13TH
2331,1PASSWORD* TRIAL OVER TORONTO ON 01/07,1PASSWORD* TRIAL OVER TORONTO ON,1PASSWORD* TRIAL OVER TORONTO ON
3607,2CHECKO*KILOHEA Alpharetta GA USA,2CHECKO*KILOHEA ALPHARETTA,2CHECKO*KILOHEA ALPHARETTA
3608,2CO.COM*slideupli XXXXXXXXXX 04/03,2CO.COM*SLIDEUPLI,2CO.COM*SLIDEUPLI
3609,2COCOM*BITDEFENDER.COM,2COCOM*BITDEFENDER.COM,2COCOM*BITDEFENDER.COM
...,...,...,...
528725,www.Playgr* Bidiboo.Co,WWW.PLAYGR* BIDIBOO.CO,WWW.PLAYGR
528726,www.Playgr* Littlemiss,WWW.PLAYGR* LITTLEMISS,WWW.PLAYGR
528732,www.Styles* Luv N Hair,WWW.STYLES* LUV N HAIR,WWW.STYLES
528733,www.Stylese* West Brim,WWW.STYLESE* WEST BRIM,WWW.STYLESE


In [36]:
df[df['memo_post'] == '']

Unnamed: 0,memo,memo_pre,memo_post
5088,7-ELEVEN #XXXXX 46-047 K,7-ELEVEN # 46-047 K,
5089,7-ELEVEN #XXXXX 94-429 U,7-ELEVEN # 94-429 U,
5090,7-ELEVEN #XXXXX XXXX N S,7-ELEVEN # N S,
5092,7-ELEVEN 01/01 #XXXXXXXXX PURCHASE 7-ELEVEN CH...,7-ELEVEN # PURCHASE 7-ELEVEN CHARLOTTESVIL,
5093,7-ELEVEN 01/01 #XXXXXXXXX PURCHASE XXXX W GLEN...,7-ELEVEN # PURCHASE W GLENVIEW R GLENVIEW CKCD,
...,...,...,...
523814,XXXXXXXXXX,,
523840,XXXXXXXXXXX XXXXXX,,
523860,XXXXXXXXXXXX,,
524051,XXXXXXXXXXXXX,,


In [22]:
print(prefix_star_merchants['memo_pre'].str.split('*').sample(100).sort_values().to_string())

9640                              [AF, THE MOUNT WASHINGT.]
365051                   [AMAZON.COM,  G7S AMZN.COM BILL S]
72017                     [AMAZON.COM,  L2 AMZN.COM BILLWA]
64025            [AMAZON.COM, 130L31LW1 AM AMZN.COM BILLWA]
298668                [AMAZON.COM, 1A08B20 AMZN.COM BILL S]
13803                               [AMAZON.COM, 1A3SE5H12]
34560                 [AMAZON.COM, 1F0U04LF0 AMZN.COM BILL]
128553                  [AMAZON.COM, 1F3R67US1 AMZN.COM BI]
310639                [AMAZON.COM, 1L58H0V AMZN.COM BILL S]
34852                               [AMAZON.COM, 1L7DD7B50]
14051            [AMAZON.COM, 1M ME2 AMAZON.COM SEATTLE, ,]
64335            [AMAZON.COM, 1N9Y84SZ0 AM AMZN.COM BILLWA]
14126                             [AMAZON.COM, 1O6CJ1JQ2 A]
304028                [AMAZON.COM, 1Q6PO8Z AMZN.COM BILL S]
14190                               [AMAZON.COM, 1R08T9JZ1]
352047                [AMAZON.COM, 257GR8S AMZN.COM BILL S]
14477             [AMAZON.COM, 257UI0CH1

# Phase 2: Extract & Analyze N-Grams

In [23]:
df_p2 = pd.read_csv("memos_P1.csv")

In [24]:
# print(df[df['memo_post'].str.len() < 4]['memo_post'].to_string())

In [25]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [None]:
%%time
corpus = df_p2['memo_post'].fillna('')
print(f"Analyzing {len(corpus)} cleaned memos...")
# Get the top 200 of each n-gram type
top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
print(f"--- N-gram Analysis Complete ---")

In [27]:
top_1grams.sort(reverse=True)
top_1grams

NameError: name 'top_1grams' is not defined

In [None]:
top_2grams

In [None]:
top_3grams

In [None]:
# Use 1 grams to find prefixes