# Phase 1: Preprocessing with Regular Expressions

In [1]:
import subprocess
subprocess.run(['git', 'pull'])

Already up to date.


CompletedProcess(args=['git', 'pull'], returncode=0)

In [2]:
subprocess.run(['git', 'add', 'Haris_Saif.ipynb'])
subprocess.run(['git', 'commit', '-m', 'regex'])

[main e8e5305] regex
 1 file changed, 5029 insertions(+), 5232 deletions(-)


CompletedProcess(args=['git', 'commit', '-m', 'regex'], returncode=0)

## 1. Load Dataset


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [4]:
df = pd.read_csv("memos.csv")
row_count = df.size
row_count

528766

In [5]:
df['memo'].sample(25).sort_values().reset_index(drop=True)

0     BUHRE MINI MART DELI CO BRONX NY             0...
1     CHECKCARD XXXX AMZN MKTP US*2P37 AMZN.COM/BILL...
2     CHECKCARD XXXX CORNER BAKERY CAFE XXXX MURRIET...
3     DBT CRD XXXX 03/14/22 XXXXXXX CIRCLE K # XXXXX...
4                             DD DOORDASH LUSH CA 12/30
5     DNH*GODADDY.COM https://www.g AZ             0...
6             Debit Card Purchase DD DOORDASH PENNSTATI
7     FOOD CITY #123 MESA AZ               XXXXXX  0...
8     HMR WEIGHT MGMT SVC CO MA Date 03/24/22 0 XXXX...
9                    ILA FORUM - CON INGLEWOOD CA 07/28
10                                JETS PIZZA OF CAILLAC
11                                         MEIJER # 212
12                               Mohegan Sun Box Office
13    NNT CENTER EXP 08/03 #XXXXXXXXX PURCHASE XXXXX...
14    NNT WSS #9 MAIN XXXXXX 08-17-21LOS ANGELES CA ...
15                            PDQ TRINITY NEW PORT RICH
16    PURCHASE AUTHORIZED ON 02/19 CASH APP*PETRI TI...
17    PURCHASE AUTHORIZED ON 05/29 YOUR DEKALB F

## 2. Define Regex Rules

In [6]:
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", 
    "(?<!\.)CO(?!['`])", # Negative lookbehind/ahead for CO (e.g., not .CO or COSTCO)
    "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", 
    "IN(?!\\s+N\\s+OUT\\s+BURGER)", # Negative lookahead for IN (not IN N OUT BURGER)
    "KS", "KY",
    "(?<!['`])LA(?!\\s+HACIENDA|\\s+FITNESS|\\s+LA'S|['`])", # Negative lookaheads for LA
    "MA", "MD", 
    "ME(?!\\s+DIA)", # Negative lookahead for ME (not ME DIA)
    "MI", "MN", "MO(?!['`])", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", 
    "OH", "OK", "OR", 
    "PA(?!['`])", 
    "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", 
    "WI", "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

In [68]:
REGEX_PRE = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "), # Replace non-breaking space with regular space
    (r"\s{2,}", " "), # Collapse multiple spaces into one

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\b(?:P?URCHASE\s+)?AUTHORIZED\s+ON(?:\s+\d{2}[/-]\d{2,4})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}[/-]\d{2,4}\b", " "),
    (r"\bRECURRING\s+PYMT\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "), 
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\bDEBIT\s+CARD\s+DEBIT\s*/", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\b(?:WITHDRAWAL|POS)\s*#", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CHIP\b", " "),
    (r"\bPOS\s+PUR-\s*(?:\*+)?", " "), 
    (r"\bAUTH\s*#\s*-?", " "), 
    (r"\bCK\s*X+\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "), 
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "), 
    (r"\bCDX{4,}\b", " "),
    (r"X{4,}", " "), # Remove generic masked numbers
    (r"\b[SP]X{6,}\b", " "), 
    (r"\bDEBIT\s+(?:CARD|CRD)\b", " "), 
    (r"\bDEBIT\s+PURCHASE\b", " "), 
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "), # Remove common card-related keywords
    (r"^\s*PURCHASE\b", " "), # Remove "PURCHASE" if at start
    (r"^\s*REC\s+POS\b", " "),
    (r"^\s*RECURRING\b", " "),

    # === 2.5) Prefix Normalization ===
    (r"\b(DNH)(?=[A-Z]{2,})", r"\1 "), # Fix "DNHGODADDYCOM" -> "DNH GODADDYCOM"

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "), # Dates like 10/23, 10/23/2025
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "), # 10 23 25 PM
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*(?:AM|PM)\b", " "), # Times like 10:23 AM

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b\s*-?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b\s*-?", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "), # Remove CO ID...
    
    # === 6) Misc tails ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b", " "), 
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b", " "), 
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT)\b", " "),
    (r"\bBILL\s+PAYMENT\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"-\s*MEMO=", " "),
    (r"(?:USA|US)$", " "), # Remove USA or US at the end
    (r"\s+FSP$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "), # 800-555-1212
    (r"\b\d{3}-\d{4}\b", " "), # 555-1212
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 555 1212 or 1 800 555 1212
    (r"\b#?\s*\d{3}-\d{3}-\d{1,4}\s*(?:AM|PM)?\b", " "),

    # === 8) URLs/domains ===
    (r"^\.COM\s+BILL\b.*", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "), # Remove standalone state codes

    # === 10) Final Tidy (Punctuation) ===
    (r"[|%_=;\\/]+", " "), # Remove misc separators
    (r"[-]{2,}", " "), # Collapse multiple hyphens
]

In [69]:
REGEX_POST = [
    # --- Specific/Tricky Rules ---
    # This greedily finds the *last* instance of GODADDY.COM or GODADD
    re.compile(r".*(GODADDY\.COM|GODADD)\b.*"),
    
    # === PAYPAL RULES ===
    # Rules for 'PAYPAL DES:...' format
    re.compile(r"^PAYPAL\s+DES:.*?:(.*?)(?:\s+INDN:.*)?$"),
    # Rules for 'PAYPAL [MERCHANT] INTERNET PAYMENT' format
    re.compile(r"^PAYPAL\s+([A-Z\s0-9'.*-]+?)\s+(?:INTERNET\s+PAYMENT|COID.*)?.*$"),
    # Rules for '[MERCHANT] ... PAYPAL' format
    re.compile(r"^([A-Z\s0-9'.*-]+?)\s+PAYPAL.*$"),

    # --- Standardized Prefix Rules ---
    # (Capture group is placed *after* the prefix)
    re.compile(r"^TARGET\b(.*)"), # Special case for TARGET, might capture store #
    re.compile(r"^ACI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^KING\s*#\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ACE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7-ELEVEN\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^7\s+11\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZG\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^YSI\s*\*?\s*([A-Z\s0-9'.-]+).*"), # Corrected

    # Specific DDBR rule must come *before* the general DD rule
    re.compile(r"^(DDBR)\s*\*?#?.*"), 
    
    re.compile(r"^DD\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PY\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ANC\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^J2\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^OSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PL\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^RTI\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^FSP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PT\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^PP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CHR\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^USA\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # === TST RULES ===
    # Specific: (POS) TST* MERCHANT [TWO WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Specific: (POS) TST* MERCHANT [ONE WORD CITY] ON ##...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+?)\s+[A-Z]{2,}\s+ON\s*##.*"),
    # Fallback: (POS) TST* MERCHANT...
    re.compile(r"^(?:POS\s+)?TST\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    
    re.compile(r"^CKE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SQ\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^SP\+AFF\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^IC\s*\*?\s*([A-Z\s0-9'.-]+).*"), 
    re.compile(r"^SIE\s*\*?\s*([A-Z\s0-9'.-]+).*"),
    
    # This generic PAYPAL rule must come *after* specific PAYPAL rules
    re.compile(r"^PAYPAL\s*\*?\s*([A-Z\s0-9'.-]+).*"),

    # --- Specific '*' prefix rules (already correct) ---
    re.compile(r"^AMS\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+ZSK\s*\*+([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZSK\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^CCM\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^ZIP\.CO\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^XSOLLA\s*\*+\s*([A-Z\s0-9'.-]+).*"),
    re.compile(r"^NOR\s*\*+\s*([A-Z\s0-9'.-]+).*"),

    
    # --- Generic Fallback Rules ---
    # These rules try to find the merchant *before* a common delimiter.
    
    # Pattern for "MERCHANT NAME * JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*\*.*"),
    
    # Pattern for "MERCHANT NAME # JUNK"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s*#.*"),
    
    # Pattern for "MERCHANT NAME - Exp"
    re.compile(r"^([A-Z\s'.-][A-Z\s0-9'.-]*?)\s+-\s+EXP.*"),
]

## 3. Apply Regex

In [70]:
%%time
# First pass
memos = df['memo'].astype(str).fillna('').str.upper()
memos = memos.str.replace(r"\u00A0", " ", regex=True)
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.strip()

for pattern, repl in REGEX_PRE:
    memos = memos.str.replace(pattern, repl, regex=True)
        
memos = memos.str.replace(r"\s{2,}", " ", regex=True)
memos = memos.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)
df['memo_pre'] = memos

CPU times: user 22.9 s, sys: 68.2 ms, total: 23 s
Wall time: 23 s


In [71]:
%%time
# Second pass
def apply_regex(memo):
    for pattern in REGEX_POST:
        match = pattern.match(memo)
        if match:
            return match.group(1).strip()
    return memo
df['memo_post'] = df['memo_pre'].apply(apply_regex)

CPU times: user 3.48 s, sys: 12 ms, total: 3.49 s
Wall time: 3.49 s


In [72]:
df.to_csv('memos_P1.csv', index=False) # Save to CSV

In [None]:
# Manually check
# unique_df = df.drop_duplicates(subset='memo_post')
# result = (
#     unique_df[unique_df['memo_pre'].str.split().str.len() == 1]
#     .sort_values(by='memo_pre')[100:200].to_string()
# )
# print(result)

In [87]:
df.sample(10).sort_values(by='memo_post')

Unnamed: 0,memo,memo_pre,memo_post
122326,DD'S DISCOUNT 08/24 #XXXXXXXXX PURCHASE DD'S D...,DD'S DISCOUNT # PURCHASE DD'S DISCOUNT #52,'S DISCOUNT
295628,PURCHASE AUTHORIZED ON 04/07 ARK 1 SAN ANTONIO...,ARK 1 SAN ANTONIO S,ARK 1 SAN ANTONIO S
50989,Brigit-com DES:MEMBERSHIP ID:XXXXXFFAF0BA429 I...,BRIGIT-COM DES:MEMBERSHIP : FFAF0BA429 INDN:KA...,BRIGIT-COM DES:MEMBERSHIP : FFAF0BA429 INDN:KA...
101501,COSTCO WHSE #XXXX MERCED CA US / Withdrawal @ ...,COSTCO WHSE # MERCED US WITHDRAWAL @ COSTCO WH...,COSTCO WHSE
114634,DBT CRD XXXX 04/15/21 XXXXXXXX TACO BELL XXXX ...,DBT CRD TACO BELL TACO BELL KILLEEN C#,DBT CRD TACO BELL TACO BELL KILLEEN C
461056,TARGET DEBIT CRD DES:ACH TRAN ID:XXXXXXXXXXXXX...,TARGET DES: : INDN:L LOPEZ : WEB PMT INFO: 576...,DES: : INDN:L LOPEZ : WEB PMT INFO: 576 TARGET...
131679,DISCOVERY POINT SILVE FL 09/08,DISCOVERY POINT SILVE,DISCOVERY POINT SILVE
84216,CHECKCARD XXXX OASIS CLEANERS VAN NUYS CA XXXX...,OASIS CLEANERS VAN NUYS,OASIS CLEANERS VAN NUYS
393759,PURCHASE AUTHORIZED ON 12/16 XXXX - SEPHORA XX...,SEPHORA PR PLANO P,SEPHORA PR PLANO P
510042,Withdrawal PFCU Check Card / Afterpay CA Date ...,WITHDRAWAL PFCU AFTERPAY 30 #,WITHDRAWAL PFCU AFTERPAY 30


In [30]:
merchants_clean = ['AMAZON', 'ALBERTSONS', 'ADOBE', 'SONIC', 'LIDL', 'AFTERPAY', 'ARBYS', 'ALDI', 'AUDIBLE', 'DOLLARTREE', 'LOWES', 'KROGER', 'DUNKIN', 'HP', 'PUBLIX', 'FRYS', 'SAFEWAY', 'DOORDASH', 'GOOGLE', 'WALMART', 'TARGET', 'CHICK-FIL-A', 'SAMSCLUB', 'MICROSOFT',
             'UBER', 'ULTA', 'H-E-B', 'VONS', 'CMSVEND', 'INSTACART', 'LYFT', 'TJMAXX', 'PETSMART', 'THORNTONS', 'PAYPAL', 'MAVERIK', 'WENDYS', 'MARSHALLS', 'ALLSUPS', 'SUNPASS', 'QVC', 'PRIZEPICKS', 'DDBR']

merchants_punc = ["DENNY'S", 'WAL-MART', "LOWE'S", 'DOLLAR-GENERAL', '7-ELEVEN', "WENDY'S", "ZAXBY'S", 'FRYS-FOOD-DRG', "BUC-EE'S",
                 "BASHAS'"]

merchants_sites = ['AMAZON.COM', 'GODADDY.COM', 'CCBILL.COM']

merchant_cats = ['', 'OVERDRAFT', 'WITHDRAWAL']

multiples = [['AMAZON', 'AMAZON.COM', 'AMAZON PRIME'], ['LOWES', "LOWE'S"], ['BASKIN', 'DDBR']]

merchants = merchants_clean + merchants_punc + merchants_sites + merchant_cats

In [81]:
for memo in df[df['memo_post'].str.split().str.len() == 1]['memo_post'].value_counts().sort_values(ascending=False).index:
    if memo not in merchants:
        print(memo)

AMZNFREETIME
WINN-DIXIE
SHIPT
BASKIN
RALPHS
SLICE
PETCO
BETMGM
GOFAN
CHEWY.COM
IHOP
INTUIT
STEAK-N-SHAKE
DILLONS
STATERBRO
*STARBUCKS
SKILLZ
VANS
FRG
TOLLWAY-AUTOREPLEN
PRICELN
RIOT
WWW.KOHLS.COM
MCW
BANFIELD-PET
STORE
P
MARINA
SAVEMART
GAMESTOP
FOODMAXX
OCULUS
BASHAS''
VERIZONWRLSS
CANVA
RAINBOW
WALGREENS
CLAIRE'S
SEZZLE
ABC
GNC
FOOD4LESS
QFC
BELK
POPEYES
FRYS-MKTPLACE
STAPLES
SUBWAY
FIV
NYTIMES
*EBAY
AMZ
L
V
SHOPIFY
IBI
TILLYS
*MICROSOFT
UPS
OTT
BLUESKY
DROPBOX
*UBER
GOODWILL
POTBELLY
JACK'S
QUADPAY
WEGMANS
EBAY
EVI
LUCKY
CHECKERS
MEIJER
ETT
ABCMOUSE.COM
RVT
ENMARKET
NORTON
FBPAY
JOURNEYS
EA
CRYPTO.COM
HOME
TLG
OPC
CRT
NORDSTROM
HLLFRSH
GERALD
PARKMOBILE
MESA
SEDANOS
REI
NEWSSTAND
LJS
FH
APPLE.COM
STOCKTON
ZTL
EZPASS
*STEAM
EVERYPLATE
FRED-MEYER
PAR
TRTHFDR
RGP
PACSUN
E-Z
M
TACOMA
DRI
UBR
TLF
MOE'S
EPC
PCH
BUCKLE
EXPRESS
PEET'S
BOXYCHARM
FACEBK
CKO
ECS
SOUTHWES
FAMOUSFOOTWEAR
PAM
GOFNDME
TOMMY'S
ARBY'S
PAVILIONS
GLOSS
OPS
LYNWOOD
LEGALSHIELD
VONS.COM
CDSR
OFFICE
VOLA
AF
COLDSTONE
ETS

In [57]:
df[df['memo_post'] == 'POS PUR-']#.iloc[0]['memo']

Unnamed: 0,memo,memo_pre,memo_post
241112,POS PUR- *****XXXX 01/01 14:16 / ALLSUP XXXXXX...,POS PUR- ***** 14:16 ALLSUP ARCHER CITY,POS PUR-
241113,POS PUR- *****XXXX 01/02 01:44 / SUBWAY XXXXX ...,POS PUR- ***** 01:44 SUBWAY FORT WORTH,POS PUR-
241114,POS PUR- *****XXXX 01/03 04:51 / CANTEEN VENDI...,POS PUR- ***** 04:51 CANTEEN VENDING FORT WORTH,POS PUR-
241115,POS PUR- *****XXXX 01/04 01:48 / SPECTRUM XXX-...,POS PUR- ***** 01:48 SPECTRUM XXX-XXX,POS PUR-
241116,POS PUR- *****XXXX 01/04 04:55 / CANTEEN VENDI...,POS PUR- ***** 04:55 CANTEEN VENDING FORT WORTH,POS PUR-
...,...,...,...
242207,POS PUR- XXXXXX *****XXXX 12/31 / 01:50 CASH A...,POS PUR- ***** 01:50 CASH APP*CARLA J,POS PUR-
242208,POS PUR- XXXXXX *****XXXX 12/31 / 04:51 GOLDEN...,POS PUR- ***** 04:51 GOLDEN CHICK MID MIDLOTHIAN,POS PUR-
242209,POS PUR- XXXXXX *****XXXX 12/31 / 10:52 SIERRA...,POS PUR- ***** 10:52 SIERRA EXPRESS C FORT WORTH,POS PUR-
242210,POS PUR- XXXXXX *****XXXX 12/31 / 11:03 LYFT 1...,POS PUR- ***** 11:03 LYFT 1 RIDE 12,POS PUR-


In [28]:
df[df['memo'].str.contains('BASHAS')]

Unnamed: 0,memo,memo_pre,memo_post
1049,#XXXXXX BASHAS' #052 TUCSON AZ Card 15 #XXXX,# BASHAS' #052 TUCSON 15 #,# BASHAS' #052 TUCSON 15 #
30828,AZ LOT BASHAS 17 99 S GILBERT AZ XXXXXX 04/23,LOT BASHAS 17 99 S GILBERT,LOT BASHAS 17 99 S GILBERT
30829,AZ LOT BASHAS 17 99 S GILBERT AZ XXXXXX 12/06,LOT BASHAS 17 99 S GILBERT,LOT BASHAS 17 99 S GILBERT
30830,AZ LOT BASHAS 17 99 S GILBERT AZ XXXXXX 12/20,LOT BASHAS 17 99 S GILBERT,LOT BASHAS 17 99 S GILBERT
41638,BASHAS #007 POS XXXX 03/28/21 XXXXXXXX,BASHAS #007 POS,BASHAS
...,...,...,...
401892,PURCHASE WITH CASH BACK $ 20.00 AUTHORIZED ON ...,WITH CASH BACK $ 20.00 BASHAS' #073 MESA,WITH CASH BACK $ 20.00 BASHAS' #073 MESA
402032,PURCHASE WITH CASH BACK $ 20.00 AUTHORIZED ON ...,WITH CASH BACK $ 20.00 BASHAS' #073 MESA,WITH CASH BACK $ 20.00 BASHAS' #073 MESA
402036,PURCHASE WITH CASH BACK $ 20.00 AUTHORIZED ON ...,WITH CASH BACK $ 20.00 BASHAS' #073 MESA,WITH CASH BACK $ 20.00 BASHAS' #073 MESA
485788,Visa Checking BASHAS' #052 TUCSON AZ Date 03/1...,CHECKING BASHAS' #052 TUCSON 0 0 15 #,CHECKING BASHAS'


In [None]:
df[(df['memo_post'].str.contains('COM')) | (df['memo_post'].str.contains('WWW')) | (df['memo_post'].str.contains('HTTP'))].sample(30)['memo_post']#.iloc[0]['memo']

In [None]:
df[(df['memo_post'].str.contains('APPLE')) & (df['memo_post'].str.contains('COM'))]

In [None]:
regex_pattern = r"^[A-Z0-9\.]+\s*\*\s*[A-Z\s0-9'.-]+"

prefix_star_merchants = df[df['memo_pre'].str.contains(regex_pattern, na=False)]

In [None]:
print(prefix_star_merchants['memo_pre'].str.split().sample(100).to_string())

# Phase 2: Extract & Analyze N-Grams

In [None]:
df_p2 = pd.read_csv("memos_P1.csv")

In [None]:
def top_ngrams(corpus: pd.Series, n_gram_range: tuple, top_n: int = 200):
    print(f"Analyzing {n_gram_range} n-grams...")
    vec = CountVectorizer(
        ngram_range=n_gram_range,
        stop_words='english',
        max_features=None  # We want to count all n-grams first
    ).fit(corpus)
    
    # Get the counts
    bag_of_words = vec.transform(corpus)
    
    # Sum the counts for each n-gram
    sum_words = bag_of_words.sum(axis=0)
    
    # Map n-grams to their frequencies
    words_freq = [
        (word, sum_words[0, idx]) 
        for word, idx in vec.vocabulary_.items()
    ]
    
    # Sort by frequency (descending)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:top_n]

In [None]:
%%time
# corpus = df_p2['memo_post'].fillna('')
# print(f"Analyzing {len(corpus)} cleaned memos...")
# # Get the top 200 of each n-gram type
# top_1grams = top_ngrams(corpus, n_gram_range=(1, 1), top_n=200)
# top_2grams = top_ngrams(corpus, n_gram_range=(2, 2), top_n=200)
# top_3grams = top_ngrams(corpus, n_gram_range=(3, 3), top_n=200)
# print(f"--- N-gram Analysis Complete ---")

In [None]:
top_1grams

In [None]:
top_2grams

In [None]:
top_3grams

In [None]:
# Use 1 grams to find prefixes