In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind

import re
from typing import List, Tuple, Iterable, Optional
from collections import Counter

# Load Data
## Train Test Split

In [2]:
inflow = pd.read_parquet('/uss/hdsi-prismdata/q1-ucsd-inflows.pqt')
outflow = pd.read_parquet('/uss/hdsi-prismdata/q1-ucsd-outflows.pqt')

outflow_ids = set(outflow["prism_consumer_id"].unique())
inflow_ids = set(inflow["prism_consumer_id"].unique())

in_not_out = inflow_ids - outflow_ids
out_not_in = outflow_ids - inflow_ids

#consumers in both inflow and outflow
consumers_both = sorted(set(inflow["prism_consumer_id"]).intersection(outflow["prism_consumer_id"]))

#80-20 train test split
train_ids, test_ids = train_test_split(consumers_both, test_size=0.2, random_state=42)

inflow_train = inflow[inflow["prism_consumer_id"].isin(train_ids)]
inflow_test  = inflow[inflow["prism_consumer_id"].isin(test_ids)]

outflow_train = outflow[outflow["prism_consumer_id"].isin(train_ids)]
outflow_test  = outflow[outflow["prism_consumer_id"].isin(test_ids)]

## Remove Observed Patterns

In [3]:
## use train_memos.txt to observe patterns from original outflow_train
# unique_memos = np.sort(outflow_train[outflow_train['memo']!=outflow_train['category']]['memo'].str.lower().unique())
# np.savetxt("train_memos.txt", unique_memos, fmt="%s")

In [4]:
m = outflow_train[outflow_train['memo']!=outflow_train['category']]['memo'].str.lower()
clean = m.copy(deep = True)

websites = re.compile(r'\b(?:www\.)?([a-wy-z0-9]+)[.\s](com|com|net|org|gov)\b', flags=re.IGNORECASE)

# extract website domain
def extract_domain(text):
    m = websites.search(text)
    if m:
        return m.group(1)
    else:
        return text

clean = clean.apply(extract_domain)

In [5]:
patterns = [r'^#\d*x*\s', #starting number signs with following 0 or more digits inlcuding x then space
            r'\(?(?:\d|x){3}\)?(?:\s|-)(?:\d|x){3}(?:\s|-)(?:\d|x){4}', # phone numbers in various formats
            r'(?:\s|^)[^A-Za-z0-9]?x+(?:[^A-Za-z0-9]+x+)*(?:\s|$)', #various sequences of trailing x]
            r'[^A-Za-z0-9](al|ak|az|ar|ca|co|ct|de|fl|ga|hi|id|il|ia|ks|ky|la|me|md|ma|mi|mn|ms|mo|mt|ne|nv|nh|nj|nm|ny|nc|nd|oh|ok|or|pa|ri|sc|sd|tn|tx|ut|vt|va|wa|wv|wi|wy|dc)(?![A-Za-z0-9])',
            #state abbreviations besides 'in' bc ex: in n out
            r'\bin$', #ending 'in' state
            r'(?:in)?\scard\s\d{2}\s?',
            # format ex: card 20 unless in card 20
            r'\b\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b', #dates
            r'\d{2}\:\d{2}(?:\:\d{2})?[ap]?m?', #time stamps
            r'(?:date\s)?\%\%(?:\smcc)?', #format ex: %% mcc
            r'#[A-Za-z0-9]+$' #ending #_____
           ]

for p in patterns:
    clean = clean.str.replace(p, ' ', regex=True)
    clean = clean.str.replace(r'\s+', ' ', regex=True).str.strip()

changed_mask = m != clean
changed_items = m[changed_mask]
changed_count = changed_mask.sum()

print("Changed count:", changed_count)

Changed count: 601037


In [6]:
clean = clean.reindex(outflow_train.index)
outflow_train['cleaned_memo'] = clean.fillna(outflow_train['memo'])
outflow_train = outflow_train[['prism_consumer_id', 'prism_account_id', 'memo', 'cleaned_memo', 'amount', 'posted_date', 'category']]  
outflow_train[outflow_train['cleaned_memo'] != outflow_train['memo']][['memo','cleaned_memo']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outflow_train['cleaned_memo'] = clean.fillna(outflow_train['memo'])


Unnamed: 0,memo,cleaned_memo
646,PURCHASE AUTHORIZED ON 03/11 POKE POKU HENDERS...,purchase authorized on poke poku henderson sxx...
651,PURCHASE AUTHORIZED ON 10/01 LIQUOR CITY HENDE...,purchase authorized on liquor city hender hend...
657,PURCHASE INTL AUTHORIZED ON 10/20 Rituals Cosm...,purchase intl authorized on rituals cosmetics ...
658,Trader Joe''s,trader joe''s
660,PURCHASE AUTHORIZED ON 05/28 VANS #174 LAS VEG...,purchase authorized on vans #174 las vegas pxx...
...,...,...
2597457,DEBIT CARD WITHDRAWAL PURCHASEAmazon Prime*TI4...,amzn
2597462,POS WITHDRAWALAZ LOT QUIKTRIP XXXX XXXX E INDI...,pos withdrawalaz lot quiktrip e indian school ...
2597465,POS WITHDRAWALWAL-MART #XXXX XXXX E MCKELLIPS ...,pos withdrawalwal-mart e mckellips rd mesa mcc
2597468,WITHDRAWAL Salt River ProjeTYPE: ONLINE PMT CO...,withdrawal salt river projetype: online pmt :s...


## Build n-gram

In [7]:
# n-gram
def tokenize(text: str, lowercase: bool = True) -> List[str]:
    if lowercase:
        text = text.lower()
    # Keep words, numbers, and apostrophes; split by comma later
    return re.findall(r"[a-z0-9']+", text)

def generate_ngrams(tokens: List[str], n: int) -> Iterable[Tuple[str, ...]]:
    """Generate n-grams of size n from tokens."""
    if n <= 0:
        raise ValueError("n must be >= 1")
    if len(tokens) < n:
        return []
    return zip(*[tokens[i:] for i in range(n)])

def most_common_ngrams(
    text: str,
    n: int = 1,
    top_k: int = 10,
    stopwords: Optional[Iterable[str]] = None,
    min_token_len: int = 1,
) -> List[Tuple[str, int]]:
    # Split the text into comma-separated segments
    segments = [seg.strip() for seg in text.split(",") if seg.strip()]

    all_tokens = []
    for seg in segments:
        tokens = tokenize(seg)
        if stopwords:
            sw = set(stopwords)
            tokens = [t for t in tokens if t not in sw]
        if min_token_len > 1:
            tokens = [t for t in tokens if len(t) >= min_token_len]
        all_tokens.append(tokens)

    # Build all n-grams up to length n within each segment
    grams = []
    for tokens in all_tokens:
        for i in range(1, n + 1):
            grams.extend(" ".join(g) for g in generate_ngrams(tokens, i))

    counts = Counter(grams)
    return counts.most_common(top_k)

In [8]:
# perform n-gram to find most common patterns with <= n tokens

# create corpus using all unique cleaned_memos in outflow_train
memos = outflow_train[outflow_train['memo']!=outflow_train['category']]['cleaned_memo'].unique()
memos = ', '.join(memos)
memos[:1000]

"purchase authorized on poke poku henderson sxxxxxxxxxxxxxxx card, purchase authorized on liquor city hender henderson sxxxxxxxxxxxxxxx card, purchase intl authorized on rituals cosmetics dublin irl sxxxxxxxxxxxxxxx card, trader joe''s, purchase authorized on vans #174 las vegas pxxxxxxxxxxxxxxxxx card, purchase authorized on michael kors - las vegas pxxxxxxxxxxxxxxxxx card, purchase authorized on nails r us henderson sxxxxxxxxxxxxxxx card, purchase authorized on joshua tree nation twentynine sxxxxxxxxxxxxxxx card, albertsons, sephora, lowe''s, purchase authorized on salon evo stud henderson sxxxxxxxxxxxxxxx card, purchase authorized on lee''s dis lee''s discou henderson pxxxxxxxxxxxxxxxxx card, amzn, olo, purchase authorized on star nursery las vegas sxxxxxxxxxxxxxxx card, gosq, purchase authorized on tst* scrambled - h henderson sxxxxxxxxxxxxxxx card, purchase authorized on wahoos fish taco - henderson sxxxxxxxxxxxxxxx card, purchase authorized on i love liquor miami beach sxxxxxxxxx

In [9]:
n_grams = most_common_ngrams(memos, n=10, top_k=50)
# longer patterns may include shorter patterns --> remove longer ones first
sorted_by_len = sorted(n_grams, key=lambda x: len(x[0]), reverse = True)
sorted_by_len[:10]

[('pos debit visa check card', 6611),
 ('pxxxxxxxxxxxxxxxxx card', 3837),
 ('purchase authorized on', 33420),
 ('sxxxxxxxxxxxxxxx card', 26381),
 ('debit visa check card', 6611),
 ('pxxxxxxxxxxxxxxx card', 5700),
 ('pos debit visa check', 6611),
 ('purchase authorized', 33420),
 ('debit card purchase', 5571),
 ('pxxxxxxxxxxxxxxxxx', 3837)]

In [10]:
m = outflow_train[outflow_train['memo']!=outflow_train['category']]['cleaned_memo'].str.lower()
clean = m.copy(deep = True)

patterns = [r'\b(?:in)?\s[sp]xxxxxxxxxxxxxxx\scard',
            r'\b[sp]?x{3,}',
            r'(?:purchase\s)?authorized(?:\son)?',
            r'debit(?:\scard)?',
            r'visa\s(?:check\s)?card',
            r'withdrawal',
            r'recurring',
            r'checkcard',
            r'purchase',
            r'^[^A-Za-z0-9]|\-\s',
            r'\#[a-zA-Z0-9]+\s'
           ]

for p in patterns:
    clean = clean.str.replace(p, ' ', regex=True)
    clean = clean.str.replace(r'\s+', ' ', regex=True).str.strip()

changed_mask = m != clean
changed_items = m[changed_mask]
changed_count = changed_mask.sum()

print("Changed count:", changed_count)

Changed count: 353859


### check if any other common patterns can be removed

In [11]:
memos = ', '.join(clean)

max_len = int(outflow_train['cleaned_memo'].str.split().str.len().max())
# find top 50 patterns with <= max_len tokens
n_grams = most_common_ngrams(memos, n=max_len, top_k=50)
# longer patterns may include shorter patterns --> remove longer ones first
sorted_by_len = sorted(n_grams, key=lambda x: len(x[0]), reverse = True)
sorted_by_len[:10]

[('dollar general', 11072),
 ("mcdonald's", 29473),
 ('starbucks', 15682),
 ('cash app', 38841),
 ('doordash', 23205),
 ('7 eleven', 16388),
 ('wal mart', 15836),
 ('circle k', 12485),
 ('walmart', 28231),
 ('payment', 16542)]

## Use txt file to observe again

In [12]:
# np.savetxt("after_ngram_memos.txt", np.sort(clean.unique()), fmt="%s")

patterns = [r'^\d{2}\-\d{2}(?:\-\d{2,4})?\b', #leading dates
            r'c\#\sdbt\scrd', #format ex: c# dbt crd
            r'\bc\#?$', #ending c or c#
            r'\bcard#$', #ending card#
            r'\s[a-z]{0,2}x{3,}\s', #ex: fxxxx, cxxxx
           ]

for p in patterns:
    clean = clean.str.replace(p, ' ', regex=True)
    clean = clean.str.replace(r'\s+', ' ', regex=True).str.strip()

changed_mask = m != clean
changed_items = m[changed_mask]
changed_count = changed_mask.sum()

print("Changed count:", changed_count)

Changed count: 366611


In [13]:
clean = clean.reindex(outflow_train.index)
outflow_train['cleaned_memo'] = clean.fillna(outflow_train['memo'])
outflow_train = outflow_train[['prism_consumer_id', 'prism_account_id', 'memo', 'cleaned_memo', 'amount', 'posted_date', 'category']]  
outflow_train.head(5)

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
645,2,acc_3,ENTERTAINMENT,ENTERTAINMENT,59.0,2021-01-07,ENTERTAINMENT
646,2,acc_3,PURCHASE AUTHORIZED ON 03/11 POKE POKU HENDERS...,poke poku henderson,35.08,2021-03-15,FOOD_AND_BEVERAGES
647,2,acc_3,EXTERNAL_TRANSFER,EXTERNAL_TRANSFER,1.0,2021-01-07,EXTERNAL_TRANSFER
648,2,acc_3,ACCOUNT_FEES,ACCOUNT_FEES,3.84,2021-01-13,ACCOUNT_FEES
649,2,acc_3,ATM_CASH,ATM_CASH,1500.0,2021-08-16,ATM_CASH


In [14]:
outflow_train.sample(50).groupby(['memo','cleaned_memo'])[['amount']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,amount
memo,cleaned_memo,Unnamed: 2_level_1
ARBYS #XXXX NORTHWOOD,arbys northwood,1
ATM_CASH,ATM_CASH,2
AUTOMOTIVE,AUTOMOTIVE,4
Amazon,amazon,1
CANTEEN VENDING BLOOMINGDALE IL 04/11,canteen vending bloomingdale,1
CASH APP*TANESHA TR,cash app*tanesha tr,1
CASH APP*YUMRON TAY XXX-XXX-XXXX CA,cash app*yumron tay,1
CREDIT_CARD_PAYMENT,CREDIT_CARD_PAYMENT,2
Chick-fil-A,chick-fil-a,1
Circle K,circle k,1


In [15]:
# clean_memos = np.sort(outflow_train[outflow_train['memo']!=outflow_train['category']]['cleaned_memo'].unique())
# np.savetxt("cleaned_train_memos.txt", clean_memos, fmt="%s")

In [16]:
outflow_train.sort_values(by='cleaned_memo')

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
393569,869,acc_2508,Debit Card Purchase 03/08 03:05p #XXXX,,24.23,2023-03-10,FOOD_AND_BEVERAGES
763289,1617,acc_4488,XXX-XXX-XXXX,,633.42,2021-06-25,FOOD_AND_BEVERAGES
2385209,5543,acc_9126,XXXXXXXXXXX XXXXXX,,1283.65,2022-05-31,FOOD_AND_BEVERAGES
307332,672,acc_1961,# XXXX,,24.94,2022-04-14,FOOD_AND_BEVERAGES
361768,801,acc_2299,Debit Card Purchase 02/07 03:59p #XXXX,,1.36,2023-02-10,FOOD_AND_BEVERAGES
...,...,...,...,...,...,...,...
5128,12,acc_28,DEBIT CARD PURCHASE / | ON 10/03 AT TST SUNFLO...,| on at tst sunflower drive in| fair oaks |,35.59,2020-10-05,FOOD_AND_BEVERAGES
4931,12,acc_28,DEBIT CARD PURCHASE / | ON 09/15 AT TUG BOAT F...,| on at tug boat fish n chips 2| fair oaks |,45.75,2021-09-17,FOOD_AND_BEVERAGES
5089,12,acc_28,DEBIT CARD PURCHASE / | ON 05/15 AT VINTAGE CH...,| on at vintage charm antiques| loomis |,158.02,2021-05-17,GENERAL_MERCHANDISE
5207,12,acc_28,DEBIT CARD PURCHASE / | ON 01/06 AT WAYSIDE LU...,| on at wayside lumber inc| |,262.18,2021-01-07,GENERAL_MERCHANDISE


In [17]:
outflow_train.loc[307802]

prism_consumer_id                   672
prism_account_id               acc_1961
memo                             # XXXX
cleaned_memo                           
amount                            150.0
posted_date                  2022-04-28
category             FOOD_AND_BEVERAGES
Name: 307802, dtype: object

## Questions

### 1. Many empty memos --> how to categorize?

In [18]:
empty_memo = outflow_train[outflow_train['cleaned_memo'] == '']
empty_memo

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
36942,79,acc_225,*,,42.80,2021-05-05,FOOD_AND_BEVERAGES
109807,241,acc_722,Debit Card Purchase 05/09 08:04p #XXXX,,2500.00,2023-05-11,FOOD_AND_BEVERAGES
109874,241,acc_722,Debit Card Purchase 01/11 12:03p #XXXX,,1000.00,2023-01-13,GENERAL_MERCHANDISE
109903,241,acc_722,Debit Card Purchase 12/12 03:10p #XXXX,,500.00,2022-12-14,FOOD_AND_BEVERAGES
124326,283,acc_866,#BREEDOESHAIR9,,30.00,2020-12-17,FOOD_AND_BEVERAGES
...,...,...,...,...,...,...,...
1931010,4691,acc_8274,XXXXXXXX,,38.89,2021-07-12,FOOD_AND_BEVERAGES
1931019,4691,acc_8274,XXXXXXXX,,99.00,2021-07-27,FOOD_AND_BEVERAGES
2239655,5262,acc_8845,XXXX,,196.00,2022-11-03,FOOD_AND_BEVERAGES
2385209,5543,acc_9126,XXXXXXXXXXX XXXXXX,,1283.65,2022-05-31,FOOD_AND_BEVERAGES


In [19]:
empty_memo['category'].unique()

array(['FOOD_AND_BEVERAGES', 'GENERAL_MERCHANDISE'], dtype=object)

In [20]:
empty_memo['memo'].unique()

array(['*', 'Debit Card Purchase 05/09 08:04p #XXXX',
       'Debit Card Purchase 01/11 12:03p #XXXX',
       'Debit Card Purchase 12/12 03:10p #XXXX', '#BREEDOESHAIR9',
       'XXXXXXXXXX',
       'PURCHASE AUTHORIZED ON 09/21 CT CT SXXXXXXXXXXXXXXX CARD XXXX',
       '# XXXX', 'CHECKCARD XXXX XXXXXXXXXXXXXX',
       'Debit Card Purchase 03/03 01:59p #XXXX',
       'Debit Card Purchase 08/31 03:35p #XXXX',
       'Debit Card Purchase 12/22 03:10p #XXXX',
       'Debit Card Purchase 11/09 12:07p #XXXX',
       'Debit Card Purchase 12/29 08:00p #XXXX',
       'Debit Card Purchase 03/19 02:16p #XXXX',
       'Debit Card Purchase 02/07 03:59p #XXXX',
       'Debit Card Purchase 09/05 01:01p #XXXX',
       'Debit Card Purchase 05/09 02:16p #XXXX',
       'Debit Card Purchase 09/16 07:59p #XXXX',
       'Debit Card Purchase 03/17 03:25p #XXXX',
       'Debit Card Purchase 09/19 03:00p #XXXX',
       'Debit Card Purchase 12/17 12:25a #XXXX',
       'Debit Card Purchase 09/20 03:16p #XXXX',
 