# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [25]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main 488b78c] regex
 1 file changed, 104 insertions(+), 73 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [3]:
df = pd.read_csv("memos.csv")
row_count = df.size
row_count

528766

In [4]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0                  AMZN Mktp US*FX Amzn.com/bill WA USA
1     AMZN Mktp US*Q53PV6Y Amzn.com/bill WA        0...
2     CHECKCARD XXXX AMAZON.COM*U203H2G23 AM AMZN.CO...
3     CHECKCARD XXXX UBER *EATS HELP.U San Francisco...
4     Guideline Retire/GUIDELINE ST-H5P9K7Q4Z5C2 GUI...
5     MCDONALD'S FXXXX DDA PIN POS PUR        CDXXXX...
6        POS/Check Card Withdrawal 58 / MAIN BBQ & BREW
7     PURCHASE AUTHORIZED ON 01/05 ZAXBY'S #XXXXX SP...
8     PURCHASE AUTHORIZED ON 01/13 KOBE JAPAN VICTOR...
9     PURCHASE AUTHORIZED ON 02/07 SAMSCLUB #XXXX ST...
10    PURCHASE AUTHORIZED ON 04/16 CRO ST JULIANS ML...
11    PURCHASE AUTHORIZED ON 05/01 MCDONALD'S FXXXXX...
12    PURCHASE AUTHORIZED ON 07/29 ACCESS SECURE DEP...
13    PURCHASE AUTHORIZED ON 09/01 THEBESTWOK GREAT ...
14    PURCHASE AUTHORIZED ON 09/19 DOORDASH*J.R. CRI...
15    PURCHASE AUTHORIZED ON 10/11 KROGER #502 XXXX ...
16    PURCHASE AUTHORIZED ON 10/17 CASH APP* XXXXXXX...
17    PURCHASE AUTHORIZED ON 11/02 PUREFORMULAS.

## 2. Define Regex Rules

In [5]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [6]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), # Remove generic masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
    
    # NEW RULE: Remove leading numeric IDs (e.g., "232005 - ")
    (r"^\d+\s*-\s*", " "), 
]

In [7]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),

    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^TARGET\b(.*)"), # Special case for TARGET, might capture store #
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7-ELEVEN\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7\s+11\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^DD\s*BR\s*\*?#?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^TST\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), # Catches duplicate IC rule
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s09'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s09'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s09'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [8]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 27.7 s, sys: 411 ms, total: 28.1 s
Wall time: 28.1 s


In [9]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 4.26 s, sys: 40.5 ms, total: 4.3 s
Wall time: 4.3 s


In [10]:
df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [11]:
# Manually check
unique_df = df.drop_duplicates(subset='memo_post')
result = (
    unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
    .sort_values(by='memo_pre')[100:200].to_string()
)
print(result)

                                                                                          memo                      memo_pre                     memo_post
110174  Check Card Purchase / ABCMOUSE CA Date 07/02/21 XXXXXXXXXXXXXXXXXXXXXXX XXXX Card XXXX                      ABCMOUSE                      ABCMOUSE
8087                                                                             ABCMOUSE.COM*                 ABCMOUSE.COM*                  ABCMOUSE.COM
30935                                                                  Abelardosmexicanfreshar       ABELARDOSMEXICANFRESHAR       ABELARDOSMEXICANFRESHAR
30937                                                                              Abercrombie                   ABERCROMBIE                   ABERCROMBIE
251857                                                        POS Signature Purchase -  ABERDE                        ABERDE                        ABERDE
30938                                                                 

In [12]:
df.sample(20).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
8261,ACADEMY SPORTS + OUTDO XXXXX XXXXX TXXXX...,ACADEMY SPORTS + OUTDO T,ACADEMY SPORTS + OUTDO T
319439,PURCHASE AUTHORIZED ON 06/09 AMAZON.COM*6X55B8...,AMAZON.COM*6X55B8Q AMZN.COM BILL S,AMAZON.COM
25067,AMZN Mktp US*TO0UQ2QG3 AMZN.COM/BILLWA,AMZN MKTP US*TO0UQ2QG3 AMZN.COM BILLWA,AMZN MKTP US
229873,POS Debit - Visa Check Card XXXX - AMZN MKTP U...,AMZN MKTP US*UO30S AMZN.COM B OTHA L KIMBROUGH...,AMZN MKTP US
421164,RECURRING PAYMENT AUTHORIZED ON 01/26 APPLE.CO...,APPLE.COM BILL XXX-XXX- S,APPLE.COM BILL XXX-XXX- S
40155,Arepalicious,AREPALICIO,AREPALICIO
49462,Bahama Buck's,BAHAMA BUCK'S,BAHAMA BUCK'S
56832,CASH APP*FRANCISCO XXXXXXXXXX CA 0...,CASH APP*FRANCISCO,CASH APP
102720,CRD PUR XXXXXXXXXXXX XXXX / DOLLAR-GENERAL #XX...,CRD PUR DOLLAR-GENERAL # NOBLESVILLE,CRD PUR DOLLAR-GENERAL
102791,CRD PUR XXXXXXXXXXXX XXXX / WISH.COM CA POINT ...,CRD PUR WISH.COM,CRD PUR WISH.COM


In [32]:
df.loc[8261]['memo']

'ACADEMY SPORTS + OUTDO     XXXXX XXXXX   TXXXXX XXXX'

In [14]:
merchants_clean = ['AMAZON', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'TARGET', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE's"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT']

multiples = [['AMAZON', 'AMAZON.COM'], ['LOWES', "LOWE'S"]]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [15]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

BR
ELEVEN
BUC-EE'S
AMZNFREETIME
ALBERTSONS
ADOBE
WINN-DIXIE
SONIC
LIDL
DES
SHIPT
WITHDRAWAL
BASKIN
RALPHS
SLICE
BETMGM
PETCO
GOFAN
CHEWY.COM
BASHAS'
IHOP
INTUIT
STEAK-N-SHAKE
STATERBRO
DILLONS
*STARBUCKS
SKILLZ
VANS
TOLLWAY-AUTOREPLEN
FRG
PRICELN
RIOT
WWW.KOHLS.COM
STORE
MCW
P
BANFIELD-PET
GAMESTOP
SAVEMART
FOODMAXX
MARINA
OCULUS
BASHAS''
VERIZONWRLSS
RAINBOW
CANVA
CLAIRE'S
WALGREENS
SEZZLE
ABC
GNC
QFC
BELK
FRYS-MKTPLACE
POPEYES
STAPLES
SUBWAY
FIV
NYTIMES
AMZ
*EBAY
V
L
SHOPIFY
TILLYS
IBI
*MICROSOFT
OTT
UPS
BLUESKY
*UBER
POTBELLY
GOODWILL
DROPBOX
JACK'S
QUADPAY
WEGMANS
LUCKY
EVI
CHECKERS
ETT
MEIJER
ABCMOUSE.COM
RVT
ENMARKET
NORTON
FBPAY
EA
JOURNEYS
CRYPTO.COM
TLG
HOME
OPC
CRT
HLLFRSH
GERALD
NORDSTROM
SEDANOS
REI
EBAY
FH
LJS
NEWSSTAND
EZPASS
ZTL
*STEAM
FRED-MEYER
PARKMOBILE
PAR
EVERYPLATE
E-Z
PACSUN
RGP
M
TRTHFDR
DRI
UBR
TLF
EPC
APPLE.COM
MOE'S
PCH
BUCKLE
EXPRESS
PEET'S
BOXYCHARM
ECS
CKO
FACEBK
FAMOUSFOOTWEAR
TOMMY'S
SOUTHWES
GOFNDME
PAM
ARBY'S
OPS
VONS.COM
PAVILIONS
CENTS-ONLY
GLOSS
LEG

In [16]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

270083                                           AMAZON.COM
432975                   RING UNLIMITED MONTH HTTPSRING.COM
65132                                            AMAZON.COM
395284                                 BKGBOOKING.COM HOT S
314543                                           AMAZON.COM
254407                                           AMAZON.COM
85960            QUPBBQ AKA GANGSTABBQ QUPBBQ.COM RECURRING
227890    AFTERPAY AFTERPAY.COM TASHA T MCMURRAY POS TRA...
65131                                            AMAZON.COM
265232                                           AMAZON.COM
484822                       VOLARIS M6HJQL VOLARIS.COM 0 7
50986     BRIGIT-COM DES:MEMBERSHIP : FC249F C INDN:NICO...
239540                 POS EJZ6NY AMAZON.COM*OM0 SEATTLE ##
423299                            UBER PASS HELP.UBER.COM S
424091                      WWW.ONXMAPS.COM HTTPSWWW.ONXM S
422606                            APPLE.COM BILL XXX-XXX- S
14569                                   

In [17]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

Unnamed: 0,memo,memo_pre,memo_post
110,#00 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#00 BP APPLE COM BILL CUPERTINO 10 #,#00 BP APPLE COM BILL CUPERTINO 10 #
135,#04 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#04 BP APPLE COM BILL CUPERTINO 10 #,#04 BP APPLE COM BILL CUPERTINO 10 #
150,#07 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#07 BP APPLE COM BILL CUPERTINO 10 #,#07 BP APPLE COM BILL CUPERTINO 10 #
179,#12 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#12 BP APPLE COM BILL CUPERTINO 10 #,#12 BP APPLE COM BILL CUPERTINO 10 #
188,#14 BP APPLE COM BILL CUPERTINO CA Card 10 #XX...,#14 BP APPLE COM BILL CUPERTINO 10 #,#14 BP APPLE COM BILL CUPERTINO 10 #
...,...,...,...
526991,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 1 6,APPLE.COM BILL ONE APPLE PARK WAY 1 6
526992,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 8 2,APPLE.COM BILL ONE APPLE PARK WAY 8 2
526993,debit card APPLE.COM/BILL ONE APPLE PARK WAY X...,APPLE.COM BILL ONE APPLE PARK WAY 2 6,APPLE.COM BILL ONE APPLE PARK WAY 2 6
526994,debit card APPLE.COM/BILL ONE APPLE PARK XXXXX...,APPLE.COM BILL ONE APPLE PARK C 2 4,APPLE.COM BILL ONE APPLE PARK C 2 4


In [18]:
df.loc[478436]

memo         USA*SOUTHCOM AIR VEND
memo_pre     USA*SOUTHCOM AIR VEND
memo_post        SOUTHCOM AIR VEND
Name: 478436, dtype: object

In [19]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]

In [20]:
print(prefix_star_merchants['memo_pre'].str.split().sample(100).to_string())

256011                       [PP*GOOGLE, FUNMINAP, XXX-XXX]
67037          [AMAZON.COM*ZH71L20G3, AM, AMZN.COM, BILLWA]
37434                [AMAZON.COM*HC27G0WK2, AMZN.COM, BILL]
147227                       [TST*, STARRY, LANESAN, DIEGO]
35427             [AMAZON.COM*1H6QB1WK0, A, AMZN.COM, BILL]
15085          [AMAZON.COM*2X3TE5US2, AMZNAMZN.COM, BILLWA]
524935                             [YSI*BJB, PROPERTIES, 7]
13981           [AMAZON.COM*1L1MI, AMAZON.COM, SEATTLE,, ,]
85817          [PYL*THE, MANAGEMENT, GROU, XXX-, RECURRING]
364486                  [PAR*PINECREST, BAKE, PINECREST, S]
405629                       [IC*, INSTACART*ALDI, ALDI.US]
138351                  [DROPBOX*1ZJ3DHGN3LT2, DROPBOX.COM]
161421                           [FRG*TEAMFANSHOP, FLUS, #]
167595                             [GOOGLE, *KING, XXX-XXX]
361232             [GOOGLE, *ZYNGA, INC, G.CO, HELPPAY#, S]
421859                  [CNP*THE, NEW, YORKER, XXX-XXX-, S]
312705              [USA*SNACK, SODA, VE

# Phase 2: Extract & Analyze N-Grams

In [21]:
df_p2 = pd.read_csv("memos_P1.csv")

In [22]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [23]:
%%time
# corpus = df_p2['memo_post'].fillna('')
# print(f"Analyzing {len(corpus)} cleaned memos...")
# # Get the top 200 of each n-gram type
# top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
# top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
# top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
# print(f"--- N-gram Analysis Complete ---")

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 14.3 µs


In [24]:
top_1grams

NameError: name 'top_1grams' is not defined

In [None]:
top_2grams

In [None]:
top_3grams