# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main 0625cee] regex
 1 file changed, 381 insertions(+), 316 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df = pd.read_csv("memos.csv")
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0                 AD PARKING POS EAST RUTHERFO NJ 05/28
1            ATHLETICO-ACCELERATED - OAK BROOK IL 09/25
2     CENTER EXPRESS 01/21 #XXXXXXXXX PURCHASE XXXXX...
3     CHECKCARD XXXX PEREZ NURSERY LANDSCAP BRENTWOO...
4     CHECKCARD XXXX STEWARTS SHOP 415 QUEENSBURY NY...
5     CHICK-FIL-A #XXXXX AMERICAN FORK UT          0...
6     Check Card Purchase / RGP*Arbor-Nomics GA Date...
7                                 Dd Doordash Freshtrea
8                                     ETSY INC NY 02/23
9     GIANT-EAG 900 Northfie Bedford OH            1...
10                   LONGHORN STEAK STRONGSVILLE OH USA
11    MCDONALD'S FXXXX BUELLTON CA                 0...
12                                           Mcalisters
13                     NNT BISHOP'S BBQ BEXXXXXX BELDEN
14    PAPA JOHN'S #XXXX XXX-XXX-XXXX FL            1...
15    POS PUR- *****XXXX 11/06 01:41 / CASH APP*AMAN...
16    PURCHASE AUTHORIZED ON 01/01 STARY LTD CA SXXX...
17    PURCHASE AUTHORIZED ON 07/07 McDonald's XX

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [44]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), # Remove generic masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"
    (r"\bDD\s*(?:[\\/]\s*)?BR\b", "DDBR"), # Combine DD/BR or DD BR -> DDBR

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails ===
    # === UPDATED RULE ===
    (r"\b(?:INST\s+XFER|RETRY\s+PYMT)\s+(?:ID)?\b", " "), # Handles PAYPAL DES:INST XFER ID...
    (r"\bPAYPAL\s+XFER\b", " "), # Handles PAYPAL XFER
    
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
]

In [71]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # === UPDATED PAYPAL RULES ===
    # Rules for 'PAYPAL DES:...' format
    re.compile(r"^PAYPAL\s+DES:.*?:(.*?)(?:\s+INDN:.*)?$"),
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),
    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^TARGET\b(.*)"), # Special case for TARGET, might capture store #
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7-ELEVEN\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7\s+11\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0Note-9'.-]+).*"),

    # Specific DDBR rule must come *before* the general DD rule
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    # Specific: (POS) TST* MERCHANT [TWO WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Specific: (POS) TST* MERCHANT [ONE WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Fallback: (POS) TST* MERCHANT...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # This generic PAYPAL rule must come *after* specific PAYPAL rules
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [47]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 22.2 s, sys: 153 ms, total: 22.3 s
Wall time: 22.3 s


In [72]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 3.19 s, sys: 7.97 ms, total: 3.2 s
Wall time: 3.2 s


In [11]:
df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [28]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [73]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
68643,CHECKCARD XXXX AMZN DIGITAL*XQ6E XXX-XXX-XXXX ...,AMZN DIGITAL*XQ6E XXX-XXX,AMZN DIGITAL
75258,CHECKCARD XXXX CRAZY HIBACHI NORTH LITTLE AR X...,CRAZY HIBACHI NORTH LITTLE,CRAZY HIBACHI NORTH LITTLE
332038,PURCHASE AUTHORIZED ON 07/12 DOLLARTRE 310 N E...,DOLLARTRE 310 N ED CAR HARLINGEN P,DOLLARTRE 310 N ED CAR HARLINGEN P
380559,PURCHASE AUTHORIZED ON 11/13 DOORDASH*SUBWAY W...,DOORDASH*SUBWAY WWW.DOORDASH. S,DOORDASH
382428,PURCHASE AUTHORIZED ON 11/18 HASBRO PULSEPURCH...,HASBRO PULSEPURCHA XXX-XXX- S,HASBRO PULSEPURCHA XXX-XXX- S
187320,KROGER #7 #1 T 08/21 #XXXXXXXXX PURCHASE KROGE...,KROGER #7 #1 T # PURCHASE KROGER #7 #1 TROY TROY,KROGER
295934,PURCHASE AUTHORIZED ON 04/07 TRACTOR S XXXX HI...,TRACTOR S HIGHWAY PORT ARTHUR P,TRACTOR S HIGHWAY PORT ARTHUR P
479492,USPS PO XXXXXXXXXX,USPS PO,USPS PO
489717,WALTON'S GREENHOUSE AN TUPELO MS,WALTON'S GREENHOUSE AN TUPELO,WALTON'S GREENHOUSE AN TUPELO
402883,PURCHASE WITH CASH BACK $ 55.00 AUTHORIZED ON ...,WITH CASH BACK $ 55.00 SHOPRITE WOODBRIDGE S1 ...,WITH CASH BACK $ 55.00 SHOPRITE WOODBRIDGE S1 ...


In [88]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'TARGET', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 'CASH APP', 'AMAZON PRIME']

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [89]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

AMZNFREETIME
WINN-DIXIE
DES
SHIPT
BASKIN
RALPHS
SLICE
BETMGM
PETCO
GOFAN
CHEWY.COM
BASHAS'
IHOP
INTUIT
STEAK-N-SHAKE
STATERBRO
DILLONS
*STARBUCKS
SKILLZ
VANS
FRG
TOLLWAY-AUTOREPLEN
PRICELN
RIOT
WWW.KOHLS.COM
BANFIELD-PET
MCW
P
STORE
GAMESTOP
SAVEMART
MARINA
OCULUS
FOODMAXX
VERIZONWRLSS
BASHAS''
CANVA
RAINBOW
WALGREENS
CLAIRE'S
SEZZLE
ABC
GNC
FOOD4LESS
QFC
BELK
FRYS-MKTPLACE
POPEYES
SUBWAY
STAPLES
FIV
NYTIMES
*EBAY
AMZ
L
V
IBI
SHOPIFY
TILLYS
OTT
UPS
*MICROSOFT
BLUESKY
*UBER
POTBELLY
GOODWILL
DROPBOX
JACK'S
QUADPAY
WEGMANS
LUCKY
EVI
CHECKERS
MEIJER
ETT
ABCMOUSE.COM
RVT
ENMARKET
NORTON
FBPAY
JOURNEYS
EA
CRYPTO.COM
OPC
TLG
HOME
CRT
GERALD
HLLFRSH
MESA
SEDANOS
NORDSTROM
REI
EBAY
NEWSSTAND
FH
LJS
ZTL
STOCKTON
EZPASS
PAR
PARKMOBILE
FRED-MEYER
*STEAM
EVERYPLATE
PACSUN
TRTHFDR
E-Z
M
RGP
DRI
TACOMA
UBR
APPLE.COM
TLF
EPC
MOE'S
PCH
EXPRESS
BUCKLE
PEET'S
CKO
FACEBK
ECS
BOXYCHARM
PAM
FAMOUSFOOTWEAR
GOFNDME
SOUTHWES
TOMMY'S
ARBY'S
OPS
LEGALSHIELD
PAVILIONS
VONS.COM
LYNWOOD
GLOSS
OFFICE
AF
CDSR
VOLA
C

In [90]:
df[df['memo_post'] == 'DES']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
219348,PAYPAL DES:INST XFER ID:ADOBE INC INDN:BRIAN H...,PAYPAL DES: :ADOBE INC INDN:BRIAN HEIST :PAYPA...,DES
219349,PAYPAL DES:INST XFER ID:ADOBE INC INDN:DINUK B...,PAYPAL DES: :ADOBE INC INDN:DINUK BATUVANTUDAV...,DES
219350,PAYPAL DES:INST XFER ID:ADOBE INC INDN:EM WEB ...,PAYPAL DES: :ADOBE INC INDN:EM WEB DESIGN AND ...,DES
219351,PAYPAL DES:INST XFER ID:ADOBE INC INDN:JACQUEL...,PAYPAL DES: :ADOBE INC INDN:JACQUELINE LOPEZ :...,DES
219352,PAYPAL DES:INST XFER ID:ADOBE INC INDN:JESSICA...,PAYPAL DES: :ADOBE INC INDN:JESSICA RENCHEN :P...,DES
...,...,...,...
219549,PAYPAL DES:RETRY PYMT ID:INSTACART INDN:REBEKA...,PAYPAL DES:RETRY PYMT :INSTACART INDN:REBEKAH ...,DES
219550,PAYPAL DES:RETRY PYMT ID:MICROSOFT ULTIM INDN:...,PAYPAL DES:RETRY PYMT :MICROSOFT ULTIM INDN:SH...,DES
219551,PAYPAL DES:RETRY PYMT ID:MICROSOFT XBOX INDN:T...,PAYPAL DES:RETRY PYMT :MICROSOFT XBOX INDN:TIT...,DES
219552,PAYPAL DES:RETRY PYMT ID:UBER EATS INDN:JASON ...,PAYPAL DES:RETRY PYMT :UBER EATS INDN:JASON NA...,DES


In [18]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

520917      09:52 DOORDASH*DAVES H WWW.DOORDASH. APGEHDCX
248595                         NON-PIN AMAZON.COM SEATTLE
475729                            UBER EATS HELP.UBER.COM
222340                                     PIN AMAZON.COM
400565        INTL UBER TRIP HELP.UBE HELP.UBER.COM NLD S
341407                             WALMART.COM XXX-XXX- S
92915                                 WALMART.COM XXX-XXX
346268                      RENTSPREE.COM RENTSPREE.COM S
92547                                             VIO.COM
347327                                         AMAZON.COM
64784                                          AMAZON.COM
102330    CRD PUR MDJKDFDA4 AMZN MKTP US*H0 AMZN.COM BILL
405521                  GRUBHUBNENOSMEXICANGO GRUBHUB.COM
428596                    HELP.HBOMAX.COM HTTPSHBOMAX.C S
243152                                     APPLE.COM BILL
436435      WITHDRAWAL - DEBIT GOLD APPLE.COM BILL CADATE
36990                                          AMAZON.COM
125699        

In [19]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
110,#00 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#00 BP APPLE COM BILL CUPERTINO 10 #,#00 BP APPLE COM BILL CUPERTINO 10 #
135,#04 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#04 BP APPLE COM BILL CUPERTINO 10 #,#04 BP APPLE COM BILL CUPERTINO 10 #
150,#07 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#07 BP APPLE COM BILL CUPERTINO 10 #,#07 BP APPLE COM BILL CUPERTINO 10 #
179,#12 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#12 BP APPLE COM BILL CUPERTINO 10 #,#12 BP APPLE COM BILL CUPERTINO 10 #
188,#14 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#14 BP APPLE COM BILL CUPERTINO 10 #,#14 BP APPLE COM BILL CUPERTINO 10 #
...,...,...,...
526991,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 1 6,APPLE.COM BILL ONE APPLE PARK WAY 1 6
526992,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 8 2,APPLE.COM BILL ONE APPLE PARK WAY 8 2
526993,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 2 6,APPLE.COM BILL ONE APPLE PARK WAY 2 6
526994,debit card APPLE.COM/BILL ONE APPLE PARK XXXXX...,APPLE.COM BILL ONE APPLE PARK C 2 4,APPLE.COM BILL ONE APPLE PARK C 2 4


In [20]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]

In [21]:
print(prefix_star_merchants['memo_pre'].str.split().sample(100).to_string())

351104                    [PP*APPLE.COM, BILL, XXX-XXX-, S]
103207                           [CRUMBL*, OWASSO, 180-, #]
112554                          [CMSVEND*DOUBLE, R, VENDIN]
136563               [DOORDASH*DUCHESS, RES, WWW.DOORDASH.]
448730                      [SQ, *EL, CAMPEON, ALBUQUERQUE]
496601              [WWP*NEXT, LEVEL, T, SIG, PURCH, 13:34]
219063                                   [PAYPAL, *PERFUME]
320438               [INTUIT, *PROSERIES, CL.INTUIT.COM, S]
243638                    [DOORDASH*TACO, BELL, SAN, FRANC]
527062    [CMSVEND*CV, LOS, KNOTT, ST, GARDEN, GROVE, 5, 9]
137690                   [DOORDASH*TAQUERIA, WWW.DOORDASH.]
16861              [AMAZON.COM*RS70, SEATTLE, US, PURCHASE]
422669                  [GOOGLE, *GRAM, GAMES, XXX-XXX-, S]
472649    [TST*, THE, CORNER, PUB-OFAOFALLON, MOUS, :, P...
504337                             [TARGET.COM, *, XXX-XXX]
323763              [AMAZON.COM*US03J7S, AMZN.COM, BILL, S]
92255                          [USA*CSC,

# Phase 2: Extract & Analyze N-Grams

In [22]:
df_p2 = pd.read_csv("memos_P1.csv")

In [23]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [24]:
%%time
# corpus = df_p2['memo_post'].fillna('')
# print(f"Analyzing {len(corpus)} cleaned memos...")
# # Get the top 200 of each n-gram type
# top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
# top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
# top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
# print(f"--- N-gram Analysis Complete ---")

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.58 µs


In [25]:
top_1grams

NameError: name 'top_1grams' is not defined

In [None]:
top_2grams

In [None]:
top_3grams

In [None]:
# Use 1 grams to find prefixes