# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main 52fe792] regex
 1 file changed, 6589 insertions(+), 6433 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df = pd.read_csv("memos.csv")
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     AFTERPAY 07-22 185-XXXXXXXX CA XXXX DEBIT CARD...
1     AMAZON.COM*O425A5Q83 SEATTLE WACard XXXX/Withd...
2                CASH APP*CHONG*ADD XXXXXXXXXX CA 09/11
3     CHECKCARD XXXX EIG*CONSTANTCONTA XXX-XXXXXXX M...
4     CHECKCARD XXXX GRID* #0 ADV REPM SAN FRANCISCO...
5     CHICK-FIL-A #XXXXX FORT WORTH CARD: XXXXXXX 05...
6     CIRCLE K # XXXXX XXXXX SAHUARITA AZ          1...
7     CRD PUR MDJIUS2YF XXXX / Amazon.com*2C03 Amzn....
8     Check Card Purchase / CASH APP*JASON HARR XXXX...
9     DEBIT CARD DEBIT / auth #XXXXXX 08-15-XXXX SNA...
10          FIGURE8 - ULTIMATE + WWW.BODYFX.CO FL 02/18
11    GRAND TEQUILA REST SAN ANTONIO TX            0...
12    MCDONALD'S FXXXXX 05-20 SPRING HILL TN XXXX DE...
13    MERCHANT PAYMENT - 014 CIRCLE K # XXXXX XXXXX ...
14    POS Debit - Visa Check Card XXXX - DOORDASH*CA...
15       POS TRADER JOE'S # TORRANCE CA   Card 16 #XXXX
16    PURCHASE AUTHORIZED ON 03/04 ROBERTITOS TACO S...
17    PURCHASE AUTHORIZED ON 04/23 S G KOMEN FOR

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [7]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), # Remove generic masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
]

In [25]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^TARGET\b(.*)"), # Special case for TARGET, might capture store #
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7-ELEVEN\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7\s+11\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^DD\s*BR\s*\*?#?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === UPDATED TST RULES ===
    # Specific: (POS) TST* MERCHANT CITY STATE...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+(?:[A-Z]{2,}\s*|[A-Z]{2}$).*"), 
    # Fallback: (POS) TST* MERCHANT...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), # Catches duplicate IC rule
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s09'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s09'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s09'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [9]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 22 s, sys: 181 ms, total: 22.1 s
Wall time: 22.1 s


In [26]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 3.33 s, sys: 11.9 ms, total: 3.34 s
Wall time: 3.34 s


In [11]:
df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [12]:
# Manually check
unique_df = df.drop_duplicates(subset='memo_post')
result = (
    unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
    .sort_values(by='memo_pre')[100:200].to_string()
)
print(result)

                                                                                          memo                      memo_pre                     memo_post
7979                                                                               AB'S XXXXXX                          AB'S                          AB'S
110174  Check Card Purchase / ABCMOUSE CA Date 07/02/21 XXXXXXXXXXXXXXXXXXXXXXX XXXX Card XXXX                      ABCMOUSE                      ABCMOUSE
8087                                                                             ABCMOUSE.COM*                 ABCMOUSE.COM*                  ABCMOUSE.COM
30935                                                                  Abelardosmexicanfreshar       ABELARDOSMEXICANFRESHAR       ABELARDOSMEXICANFRESHAR
30937                                                                              Abercrombie                   ABERCROMBIE                   ABERCROMBIE
251857                                                        POS Sign

In [13]:
df.sample(20).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
3082,229 DBT CRD XXXX 05/29/21 XXXXXXXXROVER.COM WW...,229 DBT CRD ROVER.COM WWW.ROVER.COM C#,229 DBT CRD ROVER.COM WWW.ROVER.COM C#
7558,99 CENTS ONLY STORES REDONDO BEACH CA 0...,99 CENTS ONLY STORES REDONDO BEACH,99 CENTS ONLY STORES REDONDO BEACH
111831,Check Card: STEVES BURGERS HEMET CA 07/24/22,: STEVES BURGERS HEMET,: STEVES BURGERS HEMET
30608,AVALON ACE HDWE ORLANDO FL 1...,AVALON ACE HDWE ORLANDO,AVALON ACE HDWE ORLANDO
335689,PURCHASE AUTHORIZED ON 07/22 CASH APP*HENSON S...,CASH APP*HENSON SA S,CASH APP
115382,DBT CRD XXXX 09/23/22 DJW0LM0N 365 MARKET 888 ...,DBT CRD DJW0LM0N 365 MARKET 888 432-32 TROY C#,DBT CRD DJW0LM0N 365 MARKET 888 432-32 TROY C#
155304,Etsy.com - SvgMode XXX-XXXXXXX NY 1...,ETSY.COM - SVGMODE XXX,ETSY.COM - SVGMODE XXX
258959,PURCHASE 04/19 FAMILY DOLLAR # XXXX N MARINE B...,FAMILY DOLLAR # N MARINE BLVDJACKSONVILLE,FAMILY DOLLAR
77744,CHECKCARD XXXX FIRED PIE-DP AVONDALE AZ XXXXXX...,FIRED PIE-DP AVONDALE,FIRED PIE-DP AVONDALE
333947,PURCHASE AUTHORIZED ON 07/17 FRYS-MKTP XXXX IN...,FRYS-MKTP INDIAN PHOENIX P,FRYS-MKTP INDIAN PHOENIX P


In [31]:
df[df['memo_pre'].str.contains('HOSPITALITY ROCK SPRINGS')]

Unnamed: 0,memo,memo_pre,memo_post
252252,POS TST* LEW''S HOSPITALITY ROCK SPRINGS WY ON...,POS TST* LEW''S HOSPITALITY ROCK SPRINGS ON ##,LEW''S


In [14]:
merchants_clean = ['AMAZON', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'TARGET', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE's"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT']

multiples = [['AMAZON', 'AMAZON.COM'], ['LOWES', "LOWE'S"]]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [15]:
df[df['memo_post'] == 'BR']#iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
75816,CHECKCARD XXXX DD/BR #XXXXXX APEX NC XXXXXXXXX...,DD BR # APEX,BR
75817,CHECKCARD XXXX DD/BR #XXXXXX ATLANTA GA XXXXXX...,DD BR # ATLANTA,BR
75818,CHECKCARD XXXX DD/BR #XXXXXX AURORA CO XXXXXXX...,DD BR # AURORA,BR
75819,CHECKCARD XXXX DD/BR #XXXXXX AUSTIN TX XXXXXXX...,DD BR # AUSTIN,BR
75820,CHECKCARD XXXX DD/BR #XXXXXX BELLWOOD IL XXXXX...,DD BR # BELLWOOD,BR
...,...,...,...
414502,Pos Debit- XXXX XXXX Dd/br #XXXXXX Cary NC,DD BR # CARY,BR
416882,Purchase DD/BR #XXXXXX Q35,DD BR # Q35,BR
519008,XXXXX PURCHASE DD/BR #XXXXXX MOORE OK XXXXXXX ...,DD BR # MOORE,BR
520834,XXXXXX POS PURCHASE 04/20 DD/BR #XXXXXX WAYNE ...,DD BR # WAYNE,BR


In [16]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

BR
BUC-EE'S
AMZNFREETIME
ALBERTSONS
ADOBE
WINN-DIXIE
SONIC
LIDL
DES
SHIPT
WITHDRAWAL
BASKIN
RALPHS
SLICE
BETMGM
PETCO
GOFAN
CHEWY.COM
BASHAS'
IHOP
INTUIT
STEAK-N-SHAKE
STATERBRO
DILLONS
*STARBUCKS
SKILLZ
VANS
TOLLWAY-AUTOREPLEN
FRG
PRICELN
RIOT
WWW.KOHLS.COM
P
BANFIELD-PET
MCW
STORE
SAVEMART
MARINA
GAMESTOP
FOODMAXX
OCULUS
BASHAS''
VERIZONWRLSS
RAINBOW
CANVA
CLAIRE'S
WALGREENS
SEZZLE
ABC
GNC
QFC
BELK
FRYS-MKTPLACE
POPEYES
STAPLES
SUBWAY
NYTIMES
FIV
*EBAY
AMZ
L
V
TILLYS
SHOPIFY
IBI
UPS
*MICROSOFT
OTT
BLUESKY
*UBER
GOODWILL
POTBELLY
DROPBOX
JACK'S
QUADPAY
WEGMANS
CHECKERS
LUCKY
EVI
MEIJER
ETT
ABCMOUSE.COM
RVT
NORTON
ENMARKET
FBPAY
JOURNEYS
EA
CRYPTO.COM
OPC
TLG
HOME
CRT
HLLFRSH
GERALD
MESA
SEDANOS
NORDSTROM
EBAY
REI
NEWSSTAND
FH
LJS
STOCKTON
EZPASS
ZTL
FRED-MEYER
PARKMOBILE
PAR
EVERYPLATE
*STEAM
TRTHFDR
E-Z
PACSUN
RGP
M
DRI
TACOMA
UBR
APPLE.COM
TLF
MOE'S
EPC
PCH
BUCKLE
EXPRESS
PEET'S
BOXYCHARM
CKO
FACEBK
ECS
ARBY'S
PAM
FAMOUSFOOTWEAR
GOFNDME
SOUTHWES
TOMMY'S
LYNWOOD
OPS
LEGALSHIELD
VONS.

In [17]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

446356                     SOULVATION SOCHTTPSSOULVATI NDUS
65802                                            AMAZON.COM
430865                                  PROBILLER.COM CYP S
155322                               ETSY.COM - THEMYSTICCO
397345                                    .COM * XXX-XXX- S
78839         GRAMMARLY CO3X8JRVV GRAMMARLY.COMCA RECURRING
116858             DBT CRD DSJFYXL0 SAMSCLUB.COM XXX-XXX- C
504780    WITHDRAWAL DEBIT APPLE.COM BILL XXX-XXX- 327 20 #
16175                                            AMAZON.COM
34102                                  AMAZON COM 2G9L57L01
715         #U0G1G3TKRT1D AMAZON.COM*2X4IK0XT0 SEATTLE 20 #
253740    (FIS) DOORDASH THE DOORDASH THE COPPER WWW.DOO...
446420                         US.ODABASH.COM HTTPSODABASHU
484066                              A PUR OSMO WWW PLAYOSMO
485339                                             VONS.COM
228336                                           AMAZON.COM
434676            ROUND UP .19 VSA PUR B

In [18]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
110,#00 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#00 BP APPLE COM BILL CUPERTINO 10 #,#00 BP APPLE COM BILL CUPERTINO 10 #
135,#04 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#04 BP APPLE COM BILL CUPERTINO 10 #,#04 BP APPLE COM BILL CUPERTINO 10 #
150,#07 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#07 BP APPLE COM BILL CUPERTINO 10 #,#07 BP APPLE COM BILL CUPERTINO 10 #
179,#12 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#12 BP APPLE COM BILL CUPERTINO 10 #,#12 BP APPLE COM BILL CUPERTINO 10 #
188,#14 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#14 BP APPLE COM BILL CUPERTINO 10 #,#14 BP APPLE COM BILL CUPERTINO 10 #
...,...,...,...
526991,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 1 6,APPLE.COM BILL ONE APPLE PARK WAY 1 6
526992,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 8 2,APPLE.COM BILL ONE APPLE PARK WAY 8 2
526993,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 2 6,APPLE.COM BILL ONE APPLE PARK WAY 2 6
526994,debit card APPLE.COM/BILL ONE APPLE PARK XXXXX...,APPLE.COM BILL ONE APPLE PARK C 2 4,APPLE.COM BILL ONE APPLE PARK C 2 4


In [19]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]

In [20]:
print(prefix_star_merchants['memo_pre'].str.split().sample(100).to_string())

14562                                    [AMAZON.COM*290FE]
406109                     [PP*DR, DRI*AVG, TEC, RECURRING]
168570               [GOOGLE, *YOUTUBE, TV, G.CO, HELPPAY#]
98711               [CKE*BLUAQUA, RESTROBAR, MANMANCHESTER]
38975                [AMAZON.COM*TN2OW4LU3, AMZN.COM, BILL]
13974     [AMAZON.COM*1K, KO0, A, AMZN.COM, BILL, :, 11:27]
8913                           [ACI*FIRSTNATIONAL, XXX-XXX]
34351                [AMAZON.COM*138M93E50, AMZN.COM, BILL]
291265              [AMAZON.COM*360PN04, AMZN.COM, BILL, S]
177432             [IC*, FRYS, VIA, INSTACA, HTTPSDELIVERY]
361506              [AMAZON.COM*1U91P89, AMZN.COM, BILL, S]
448625                    [SQ, *COFFEE, TIME, SANTA, MARIA]
295248              [AMAZON.COM*AX8ZF4J, AMZN.COM, BILL, S]
137510              [DOORDASH*SMART, &, FIN, WWW.DOORDASH.]
472614                          [TST*, TAQUERIAS, ATOTONIL]
427179                       [GOOGLE, *FUNJOY, XXX-XXX-, S]
193023                            [LYFT,

# Phase 2: Extract & Analyze N-Grams

In [21]:
df_p2 = pd.read_csv("memos_P1.csv")

In [22]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [23]:
%%time
# corpus = df_p2['memo_post'].fillna('')
# print(f"Analyzing {len(corpus)} cleaned memos...")
# # Get the top 200 of each n-gram type
# top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
# top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
# top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
# print(f"--- N-gram Analysis Complete ---")

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.3 µs


In [24]:
top_1grams

NameError: name 'top_1grams' is not defined

In [None]:
top_2grams

In [None]:
top_3grams