# Phase 1: Initial Data Cleaning

## 1. Load the Dataset


In [328]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import time
import sys
import re

In [333]:
df = pd.read_csv("memos.csv")
row_count = df.size
row_count

528766

In [383]:
sample = df['memo'].sample(50).sort_values().reset_index(drop=True)
sample

0                                                                                  Arizona Snowbowl Snow Report
1                                                                                            BACK 40 URBAN FRES
2                                     CHECKCARD XXXX FEDEX OFFICXXXXXXXXXXX RICHMOND VA XXXXXXXXXXXXXXXXXXXXXXX
3                CHECKCARD XXXX SCENTSY,INC. XXX-XXX-XXXX ID XXXXXXXXXXXXXXXXXXXXXXX CKCD XXXX XXXXXXXXXXXXXXXX
4                                          CIRCLE K XXXXX 07/26 #XXXXXXXXX PURCHASE CIRCLE K XXXXX 65 TUCSON AZ
5                                            COSTCO WHSE #0 08/16 #XXXXXXXXX PURCHASE COSTCO WHSE #06 AUSTIN TX
6                                                            CRASHBOAT 08 AGUADILLA AGUADILLA PR          04/26
7                                                                     CULVERS OF VALPARAISO VALPARAISO IN 06/08
8                                                     DBT CRD XXXX XXXXXXXX DOUBLETREE HOTELS AURORA CO 

## 2. Define Regular Expressions

In [367]:
# State abbreviations
STATE_LIST = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", 
    "ID", "IL", "IN(?!\\s+N\\s+OUT\\s+BURGER)", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", 
    "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", 
    "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", 
    "WV", "WY"
]
STATE_REGEX = r"\b(" + "|".join(STATE_LIST) + r")\b"

# In order of execution
CLEANING_RULES = [
    # === 0) Normalize spaces first ===
    (r"\u00A0", " "),  # Non-breaking space
    (r"\s{2,}", " "),  # Multiple spaces

    # === 0.5) Remove leading dates left over from boilerplate ===
    (r"^\d{1,2}\s+", " "),

    # === 1) “Authorized / Recurring” headers ===
    (r"\b(?:RECURRING\s+)?PAYMENT\s+AUTHORIZED\s+ON(?:\s+\d{2})?\b", " "),
    (r"\bPURCHASE\s+AUTHORIZED\s+ON(?:\s+\d{2})?\b", " "),
    (r"\bAUTHORIZED\s+ON\s+\d{2}(?:\s+\d{2})?\b", " "),

    # === 2) Card & mask boilerplate ===
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER)\s+CHECK\s+CARD\b", " "),
    (r"\bCHECK\s*CARD\b(?:\s*X+)?", " "),
    (r"\bCARD(?:\s+ENDING\s+IN)?\s*X{4}\b", " "),
    (r"\b(?:DEBIT|CREDIT)\s+CARD\s+(?:PURCHASE|DEBIT|AUTH(?:ORIZATION)?)\b", " "),
    (r"\bPOS\s+(?:PURCHASE|WITHDRAWAL|DEBIT)\b", " "),
    (r"\b(?:DDA\s+)?PIN\s+POS\s+PUR\b", " "),
    (r"\bCDX{4,}\b", " "),
    (r"\b(?:X{2,}|X{2,}\b)", " "),
    (r"\b[SP]X{6,}\b", " "),
    (r"\bDEBIT\s+CARD\b", " "),
    (r"\bDEBIT\s+PURCHASE\b", " "),
    (r"\bPOS\s+SIGNATURE\b", " "),
    (r"\b(?:VISA|MASTERCARD|AMEX|DISCOVER|CARD|DATE|MCC)\b", " "),

    # === 3) State + mask tails ===
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\s+CARD\s+X{4}\b", " "),
    (r"\b[A-Z]{2}\s+[SP]?X{6,}\b", " "),

    # === 4) Dates/times ===
    (r"\b#?\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b", " "),
    (r"\b\d{1,2}\s+\d{2}\s+\d{2}\s*(?:AM|PM)\b", " "),
    (r"\b\d{1,2}:\d{2}(?::\d{2})?\s*[AP]?\b", " "),

    # === 5) Merchant-terminal boilerplate ===
    (r"\bMERCHANT\s+PURCHASE\s+TERMINAL\b(?:\s+\w+)?", " "),
    (r"\bPOINT\s+OF\s+SALE\s+(?:WITHDRAWAL|DEBIT)\b", " "),
    (r"\bPAYMENT\s+CARD\s+TARGET\s+DEBIT\s+CRD(?:\s+ACH(?:\s+TRAN)?)?\b", " "),
    (r"\b(?:CRD|ACH)\s+TRAN(?:\s+PPD(?:\s+ID)?)?\b.*", " "),
    (r"\bCO\s+ID\s+\w+\s+(?:WEB|PPD)\b.*", " "),

    # === 6) Misc tails ===
    (r"\b(?:INST|PAYPAL)\s+XFER\b.*", " "),
    (r"\b(?:XFER|WEB)\s+ID\b.*", " "),
    (r"\b(?:ELECTRONIC|EXTERNAL)\s+WITHDRAWAL\b.*", " "),
    (r"\bWITHDRAWAL\s+DEBIT\s+CARD\b(?:\s+DEBIT)?", " "),
    (r"\bO(?:F)?\s+SALE\s+DEBIT\s+L\d{3}\b.*", " "),
    (r"\b(?:ITEM|OVERDRAFT)\s+FEE\s+FOR\s+ACTIVITY\b.*", " "),
    (r"\b(?:GENESIS[-\s]*FS\s+CARD\s+PAYMENT|SEARS\s+BILL\s+PAYMENT)\b", " "),
    (r"\b(?:US|WA)\s+CARD\s+PURCHASE\b", " "),
    (r"\b(?:USA|US)\b$", " "),

    # === 7) Phone numbers ===
    (r"\b(?:\d{3}-\d{3}-\d{4}|XXX-XXX-XXXX)\b", " "),
    (r"\b\d{3}-\d{4}\b", " "),
    (r"\b(?:\d{3}\s*){1,2}\d{3}\s*\d{3,4}\b", " "), # 800 123 4567

    # === 8) URLs/domains ===
    # Specific junk billing info is still good to remove
    (r"\b(?:AMZN\.COM\/BILL|AMZNAMZN\.COM\s*BILL|AMAZON\.COM\/BILL|APPLE\.COM\/BILL|G\.CO\/HELPPAY#|SUPPORT\.GOOGLE?)\b", " "),

    # === 9) State abbreviations ===
    (STATE_REGEX, " "),

    # === 10) Final Tidy (Punctuation) ===
    # Preserving * and # for Phase 2
    (r"[|%_=;\\/]+", " "),
    (r"[-]{2,}", " "),
]

## 3. Define Cleaning Function

In [309]:
def clean_memo(memo):
    memos.cleaned = s.astype(str).fillna('').str.upper()

    # Normalize spaces
    memos.cleaned = memos.cleaned.str.replace(r"\u00A0", " ", regex=True)
    memos.cleaned = memos.cleaned.str.replace(r"\s{2,}", " ", regex=True)
    memos.cleaned = memos.cleaned.str.strip()
    
    # Apply Regex sequentially
    for pattern, repl in CLEANING_RULES:
        memos.cleaned = memos.cleaned.str.replace(pattern, repl, regex=True)

    # Fix spaces
    memos.cleaned = memos.cleaned.str.replace(r"\s{2,}", " ", regex=True)
    memos.cleaned = memos.cleaned.str.replace(r"^[\s-]+|[\s-]+$", "", regex=True)

    return memos.cleaned

## 4. Clean Data (First Pass)

In [372]:
print('Cleaning values in "memo"...')
start_time = time.time()

df['memo_cleaned'] = clean_memo_series(df['memo'])
end_time = time.time()

print('Cleaning complete.')
print(f"Time taken: {end_time - start_time:.3f} seconds.")

# Save to CSV
df[['memo', 'memo_cleaned']].to_csv('memos_cleaned.csv', index=False)
print(Results saved to: memos_cleaned.csv)

Cleaning values in "memo"...
Cleaning complete.
Time taken: 16.809 seconds.
Results saved to: memos_cleaned.csv



In [421]:
# Display updates
sample_df = df.sample(10)
memo_size = sample_df['memo'].str.len()
cleaned_memo_size = sample_df['memo_cleaned'].str.len()
sample_df['removed_chars'] = memo_size - cleaned_memo_size
sample_df.sort_values(by='memo').reset_index(drop=True)

Unnamed: 0,memo,memo_cleaned,removed_chars
0,#49 SAHARA VEGAS-XXXXXX XXXX E SAHARA AVE #A LAS VEGAS NV Card 20 #XXXX,#49 SAHARA VEGAS- E SAHARA AVE #A LAS VEGAS 20 #,23
1,CHECKCARD XXXX SPORTSLINE XXXXXXXXXX CA XXXXXXXXXXXXXXXXXXXXXXX,SPORTSLINE,53
2,POS Debit BOARD AND BREW - LA / HA,BOARD AND BREW - HA,15
3,PURCHASE AUTHORIZED ON 04/05 APPLE VACATIONS TS XXXXXXXXXX PA SXXXXXXXXXXXXXXX CARD XXXX,05 APPLE VACATIONS TS,67
4,PURCHASE AUTHORIZED ON 08/27 BURGER KING #XXXXX RICHMOND VA SXXXXXXXXXXXXXXX CARD XXXX,27 BURGER KING # RICHMOND,61
5,PURCHASE AUTHORIZED ON 11/18 NAYAX VENDING 14 HUNT VALLEY MD SXXXXXXXXXXXXXXX CARD XXXX,18 NAYAX VENDING 14 HUNT VALLEY,56
6,PURCHASE AUTHORIZED ON 12/24 CASH APP*MEGAN COT XXXXXXXXXX CA SXXXXXXXXXXXXXXX CARD XXXX,24 CASH APP*MEGAN COT,67
7,THE HOME DEPOT #XXXX IRVING TX XXXXXX 09/14,THE HOME DEPOT # IRVING,27
8,WWW.MYHWH.COM/BETHEA,WWW.MYHWH.COM BETHEA,0
9,debit card APPLE.COM/BILL ONE APPLE PARK WAY 866-712- Date 05/23/22 9 XXXXXXXXX 0 XXXX MCC XXXX,ONE APPLE PARK WAY 866-712- 9 0,64


# Phase 2: 