In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind

import re
from typing import List, Tuple, Iterable, Optional
from collections import Counter

# Load Data
## Train Test Split

In [None]:
inflow = pd.read_parquet('/uss/hdsi-prismdata/q1-ucsd-inflows.pqt')
outflow = pd.read_parquet('/uss/hdsi-prismdata/q1-ucsd-outflows.pqt')

# remove rows where memo = category
outflow = outflow[outflow['memo'] != outflow['category']]

outflow_ids = set(outflow["prism_consumer_id"].unique())
inflow_ids = set(inflow["prism_consumer_id"].unique())

in_not_out = inflow_ids - outflow_ids
out_not_in = outflow_ids - inflow_ids

#consumers in both inflow and outflow
consumers_both = sorted(set(inflow["prism_consumer_id"]).intersection(outflow["prism_consumer_id"]))

#80-20 train test split
train_ids, test_ids = train_test_split(consumers_both, test_size=0.2, random_state=42)

inflow_train = inflow[inflow["prism_consumer_id"].isin(train_ids)]
inflow_test  = inflow[inflow["prism_consumer_id"].isin(test_ids)]

outflow_train = outflow[outflow["prism_consumer_id"].isin(train_ids)]
outflow_test  = outflow[outflow["prism_consumer_id"].isin(test_ids)]

# print(f'Inflow_train: {inflow_train["amount"].median()}\nInflow_test: {inflow_test["amount"].median()}\nOutflow_train: {outflow_train["amount"].median()}\nOutflow_test: {outflow_test["amount"].median()}')

FileNotFoundError: [Errno 2] No such file or directory: 'q1-ucsd-inflows.pqt'

## Remove Observed Patterns

In [3]:
## use train_memos.txt to observe patterns from original outflow_train
# unique_memos = np.sort(outflow_train[outflow_train['memo']!=outflow_train['category']]['memo'].str.lower().unique())
# np.savetxt("train_memos.txt", unique_memos, fmt="%s")

In [4]:
m = outflow_train[outflow_train['memo']!=outflow_train['category']]['memo'].str.lower()
clean = m.copy(deep = True)

websites = re.compile(r'\b(?:www\.)?([a-wy-z0-9]+)[.\s](com|com|net|org|gov)\b', flags=re.IGNORECASE)

# extract website domain
def extract_domain(text):
    m = websites.search(text)
    if m:
        return m.group(1)
    else:
        return text

clean = clean.apply(extract_domain)

In [5]:
patterns = [r'^#\d*x*\s', #starting number signs with following 0 or more digits inlcuding x then space
            r'\(?(?:\d|x){3}\)?(?:\s|-)(?:\d|x){3}(?:\s|-)(?:\d|x){4}', # phone numbers in various formats
            r'(?:\s|^)[^A-Za-z0-9]?x+(?:[^A-Za-z0-9]+x+)*(?:\s|$)', #various sequences of trailing x
            r'\b[^A-Za-z0-9](al|ak|az|ar|ca|co|ct|de|fl|ga|hi|id|il|ia|ks|ky|me|md|ma|mi|mn|ms|mo|mt|ne|nv|nh|nj|nm|ny|nc|nd|oh|ok|or|pa|ri|sc|sd|tn|tx|ut|vt|va|wa|wv|wi|wy|dc)(?![A-Za-z0-9])\b',
            #state abbreviations besides 'in' bc ex: in n out
            r'\bin$', #ending 'in' state and la
            r'(?:in)?\scard\s\d{2}\s?',
            # format ex: card 20 unless in card 20
            r'\b\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b', #dates
            r'\d{2}\:\d{2}(?:\:\d{2})?[ap]?m?', #time stamps
            r'(?:date\s)?\%\%(?:\smcc)?', #format ex: %% mcc
            r'#[A-Za-z0-9]+$' #ending #_____
           ]

for p in patterns:
    clean = clean.str.replace(p, ' ', regex=True)
    clean = clean.str.replace(r'\s+', ' ', regex=True).str.strip()

changed_mask = m != clean
changed_items = m[changed_mask]
changed_count = changed_mask.sum()

print("Changed count:", changed_count)

Changed count: 605120


In [6]:
clean = clean.reindex(outflow_train.index)
outflow_train['cleaned_memo'] = clean.fillna(outflow_train['memo'])
outflow_train = outflow_train[['prism_consumer_id', 'prism_account_id', 'memo', 'cleaned_memo', 'amount', 'posted_date', 'category']]  
outflow_train[outflow_train['cleaned_memo'] != outflow_train['memo']][['memo','cleaned_memo']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outflow_train['cleaned_memo'] = clean.fillna(outflow_train['memo'])


Unnamed: 0,memo,cleaned_memo
2,TST* Casa Del Rio - Exp Fairlawn OH 09/24,tst* casa del rio - exp fairlawn
4,Buffalo Wild Wings,buffalo wild wings
6,Oculus CA 04/16,oculus
7,LOS GIRASOLES STOW OH 03/08,los girasoles stow
8,BUZZIS LAUNDRY 1 OH 03/28,buzzis laundry 1
...,...,...
2595705,PURCHASE 01-14 PUREKANA XXX-XXXXXXX NJ VNT XXXX,purchase purekana vnt
2595706,PURCHASE 01-18 WAL-MART #XXXX HUTCHINSON KS S...,purchase wal-mart hutchinson srf
2595708,PURCHASE 01-27 CLKBANK*MoonReading XXX-XXX-XXX...,purchase clkbank*moonreading vnt
2595709,PURCHASE 01-26 CASH APP*DIANA STIE XXXXXXXXXX ...,purchase cash app*diana stie vnt


## Build n-gram

In [7]:
# n-gram
def tokenize(text: str, lowercase: bool = True) -> List[str]:
    if lowercase:
        text = text.lower()
    # Keep words, numbers, and apostrophes; split by comma later
    return re.findall(r"[a-z0-9']+", text)

def generate_ngrams(tokens: List[str], n: int) -> Iterable[Tuple[str, ...]]:
    """Generate n-grams of size n from tokens."""
    if n <= 0:
        raise ValueError("n must be >= 1")
    if len(tokens) < n:
        return []
    return zip(*[tokens[i:] for i in range(n)])

def most_common_ngrams(
    text: str,
    n: int = 1,
    top_k: int = 10,
    stopwords: Optional[Iterable[str]] = None,
    min_token_len: int = 1,
) -> List[Tuple[str, int]]:
    # Split the text into comma-separated segments
    segments = [seg.strip() for seg in text.split(",") if seg.strip()]

    all_tokens = []
    for seg in segments:
        tokens = tokenize(seg)
        if stopwords:
            sw = set(stopwords)
            tokens = [t for t in tokens if t not in sw]
        if min_token_len > 1:
            tokens = [t for t in tokens if len(t) >= min_token_len]
        all_tokens.append(tokens)

    # Build all n-grams up to length n within each segment
    grams = []
    for tokens in all_tokens:
        for i in range(1, n + 1):
            grams.extend(" ".join(g) for g in generate_ngrams(tokens, i))

    counts = Counter(grams)
    return counts.most_common(top_k)

In [8]:
# perform n-gram to find most common patterns with <= n tokens

# create corpus using all unique cleaned_memos in outflow_train
memos = outflow_train[outflow_train['memo']!=outflow_train['category']]['cleaned_memo'].unique()
memos = ', '.join(memos)
memos[:1000]

"tst* casa del rio - exp fairlawn, buffalo wild wings, oculus, los girasoles stow, buzzis laundry 1, tgi fridays stow, tst* the basement sp cuyahoga fall, lowe's, piada - 39, grubhub, hardees akron, market di state cuyahoga fall, swensons - montrose akron, great clips, apple, wing warehouse cuyah cuyahoga fall, winking lizard - 30, longhorn steak cuyahoga fall, on tap - cuyahoga fa cuyahoga fall, home depot, falls discount tobacc cuyahoga fls, burger king, o'charley's, homedepot, dairy queen, east of chicago - cu cuyahoga fall, giant-eag corpora uniontown, fin's bar & chill pigeon forge, acme no. 12 bai cuyahoga fall ohxxxxxx, taco bell, chick-fil-a, walmart, moe's sw grill cuyahoga fall, texas roadhouse, 39 piada cuyahoga fall, cleveland gaming fairview park, circle k, iah cnbc smartshop houston, bob evans rest stow, kohl's, moe's sw grill akron, wendy's, rays place of fairlawn fairlawn, get go st cuyahoga fall ohxxxxxx, dd doordash dashmart, chipotle mexican grill, swensons - north a

In [9]:
n_grams = most_common_ngrams(memos, n=10, top_k=50)
# longer patterns may include shorter patterns --> remove longer ones first
sorted_by_len = sorted(n_grams, key=lambda x: len(x[0]), reverse = True)
sorted_by_len[:10]

[('pos debit visa check card', 6856),
 ('pxxxxxxxxxxxxxxxxx card', 4003),
 ('purchase authorized on', 33710),
 ('sxxxxxxxxxxxxxxx card', 26726),
 ('debit visa check card', 6856),
 ('pxxxxxxxxxxxxxxx card', 5666),
 ('pos debit visa check', 6856),
 ('purchase authorized', 33710),
 ('debit card purchase', 6352),
 ('pxxxxxxxxxxxxxxxxx', 4003)]

In [10]:
m = outflow_train[outflow_train['memo']!=outflow_train['category']]['cleaned_memo'].str.lower()
clean = m.copy(deep = True)

patterns = [r'\b(?:in)?\s[sp]xxxxxxxxxxxxxxx\scard',
            r'\b[sp]?x{3,}',
            r'(?:purchase\s)?authorized(?:\son)?',
            r'debit(?:\scard)?',
            r'visa\s(?:check\s)?card',
            r'withdrawal',
            r'recurring',
            r'checkcard',
            r'purchase',
            r'^[^A-Za-z0-9]|\-\s',
            r'\#[a-zA-Z0-9]+\s'
           ]

for p in patterns:
    clean = clean.str.replace(p, ' ', regex=True)
    clean = clean.str.replace(r'\s+', ' ', regex=True).str.strip()

changed_mask = m != clean
changed_items = m[changed_mask]
changed_count = changed_mask.sum()

print("Changed count:", changed_count)

Changed count: 367753


### check if any other common patterns can be removed

In [11]:
memos = ', '.join(clean)

max_len = int(outflow_train['cleaned_memo'].str.split().str.len().max())
# find top 50 patterns with <= max_len tokens
n_grams = most_common_ngrams(memos, n=max_len, top_k=50)
# longer patterns may include shorter patterns --> remove longer ones first
sorted_by_len = sorted(n_grams, key=lambda x: len(x[0]), reverse = True)
sorted_by_len[:10]

[('dollar general', 10898),
 ("mcdonald's", 29782),
 ('starbucks', 15166),
 ('cash app', 37056),
 ('doordash', 22096),
 ('7 eleven', 16599),
 ('wal mart', 16281),
 ('circle k', 12553),
 ('afterpay', 10472),
 ('point of', 10160)]

## Use txt file to observe again

In [12]:
# np.savetxt("after_ngram_memos.txt", np.sort(clean.unique()), fmt="%s")

patterns = [r'^\d{2}\-\d{2}(?:\-\d{2,4})?\b', #leading dates
            r'c\#\sdbt\scrd', #format ex: c# dbt crd
            r'\bc\#?$', #ending c or c#
            r'\bcard#$', #ending card#
            r'\s[a-z]{0,2}x{3,}\s', #ex: fxxxx, cxxxx
           ]

for p in patterns:
    clean = clean.str.replace(p, ' ', regex=True)
    clean = clean.str.replace(r'\s+', ' ', regex=True).str.strip()

changed_mask = m != clean
changed_items = m[changed_mask]
changed_count = changed_mask.sum()

print("Changed count:", changed_count)

Changed count: 380312


In [13]:
clean = clean.reindex(outflow_train.index)
outflow_train['cleaned_memo'] = clean.fillna(outflow_train['memo'])
outflow_train = outflow_train[['prism_consumer_id', 'prism_account_id', 'memo', 'cleaned_memo', 'amount', 'posted_date', 'category']]  
outflow_train.head(5)

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,tst* casa del rio exp fairlawn,18.42,2022-09-26,FOOD_AND_BEVERAGES
4,0,acc_0,Buffalo Wild Wings,buffalo wild wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
6,0,acc_0,Oculus CA 04/16,oculus,11.73,2022-04-18,GENERAL_MERCHANDISE
7,0,acc_0,LOS GIRASOLES STOW OH 03/08,los girasoles stow,30.04,2022-03-09,FOOD_AND_BEVERAGES
8,0,acc_0,BUZZIS LAUNDRY 1 OH 03/28,buzzis laundry 1,4.16,2022-03-29,GENERAL_MERCHANDISE


In [14]:
outflow_train.sample(50).groupby(['memo','cleaned_memo'])[['amount']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,amount
memo,cleaned_memo,Unnamed: 2_level_1
#XXXXXX POS CASH APP*GOLD H SQUAREUP.COM SAN FRANCISCO CA MCC XXXX,squareup,1
AJ'S #087 CHANDLER AZ XXXXXX 08/31,aj's chandler,1
Amazon,amazon,2
Amazon Prime,amazon prime,1
Annas Liquor,annas liquor,1
Apple,apple,1
Audible,audible,1
CHECKCARD XXXX AMAZON.COM*1M0VI8J90 AM AMZN.COM/BILLWA XXXXXXXXXXXXXXXXXXXXXXX,amazon,1
CHECKCARD XXXX CRYPTO.COM XXX-XXXXXXX DE XXXXXXXXXXXXXXXXXXXXXXX,crypto,1
CHECKCARD XXXX LITTLE CAESARS 32 SC XXXXXXXXXXXXXXXXXXXXXXX,little caesars 32,1


In [15]:
# clean_memos = np.sort(outflow_train[outflow_train['memo']!=outflow_train['category']]['cleaned_memo'].unique())
# np.savetxt("cleaned_train_memos.txt", clean_memos, fmt="%s")

In [16]:
outflow_train.sort_values(by='cleaned_memo')

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
763744,1617,acc_4488,XXX-XXX-XXXX,,633.42,2021-09-17,FOOD_AND_BEVERAGES
306986,672,acc_1961,# XXXX,,150.00,2022-01-19,FOOD_AND_BEVERAGES
306988,672,acc_1961,# XXXX,,250.00,2022-05-26,FOOD_AND_BEVERAGES
307058,672,acc_1961,# XXXX,,33.00,2022-03-18,FOOD_AND_BEVERAGES
307066,672,acc_1961,# XXXX,,10.00,2022-05-27,FOOD_AND_BEVERAGES
...,...,...,...,...,...,...,...
5128,12,acc_28,DEBIT CARD PURCHASE / | ON 10/03 AT TST SUNFLO...,| on at tst sunflower drive in| fair oaks |,35.59,2020-10-05,FOOD_AND_BEVERAGES
4931,12,acc_28,DEBIT CARD PURCHASE / | ON 09/15 AT TUG BOAT F...,| on at tug boat fish n chips 2| fair oaks |,45.75,2021-09-17,FOOD_AND_BEVERAGES
5089,12,acc_28,DEBIT CARD PURCHASE / | ON 05/15 AT VINTAGE CH...,| on at vintage charm antiques| loomis |,158.02,2021-05-17,GENERAL_MERCHANDISE
5207,12,acc_28,DEBIT CARD PURCHASE / | ON 01/06 AT WAYSIDE LU...,| on at wayside lumber inc| |,262.18,2021-01-07,GENERAL_MERCHANDISE


In [17]:
outflow_train.loc[307802]

prism_consumer_id                   672
prism_account_id               acc_1961
memo                             # XXXX
cleaned_memo                           
amount                            150.0
posted_date                  2022-04-28
category             FOOD_AND_BEVERAGES
Name: 307802, dtype: object

## Questions

### 1. Many empty memos --> how to categorize?

In [18]:
empty_memo = outflow_train[outflow_train['cleaned_memo'] == '']
empty_memo

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
36942,79,acc_225,*,,42.80,2021-05-05,FOOD_AND_BEVERAGES
109807,241,acc_722,Debit Card Purchase 05/09 08:04p #XXXX,,2500.00,2023-05-11,FOOD_AND_BEVERAGES
109874,241,acc_722,Debit Card Purchase 01/11 12:03p #XXXX,,1000.00,2023-01-13,GENERAL_MERCHANDISE
109903,241,acc_722,Debit Card Purchase 12/12 03:10p #XXXX,,500.00,2022-12-14,FOOD_AND_BEVERAGES
124326,283,acc_866,#BREEDOESHAIR9,,30.00,2020-12-17,FOOD_AND_BEVERAGES
...,...,...,...,...,...,...,...
1846481,4531,acc_8114,XXXXXXXXXXXXXXX,,1.60,2022-10-07,FOOD_AND_BEVERAGES
1846492,4531,acc_8114,XXXXXXXXXXXXXXX,,1.60,2022-10-09,FOOD_AND_BEVERAGES
1876720,4595,acc_8178,XXXXX,,41.11,2022-11-25,FOOD_AND_BEVERAGES
2385209,5543,acc_9126,XXXXXXXXXXX XXXXXX,,1283.65,2022-05-31,FOOD_AND_BEVERAGES


In [19]:
empty_memo['category'].unique()

array(['FOOD_AND_BEVERAGES', 'GENERAL_MERCHANDISE'], dtype=object)

In [20]:
empty_memo['memo'].unique()

array(['*', 'Debit Card Purchase 05/09 08:04p #XXXX',
       'Debit Card Purchase 01/11 12:03p #XXXX',
       'Debit Card Purchase 12/12 03:10p #XXXX', '#BREEDOESHAIR9',
       'XXXXXXXX', 'XXXXXXXXXX',
       'PURCHASE AUTHORIZED ON 09/21 CT CT SXXXXXXXXXXXXXXX CARD XXXX',
       '# XXXX', 'CHECKCARD XXXX XXXXXXXXXXXXXX',
       'Debit Card Purchase 03/03 01:59p #XXXX',
       'Debit Card Purchase 08/31 03:35p #XXXX',
       'Debit Card Purchase 12/22 03:10p #XXXX',
       'Debit Card Purchase 11/09 12:07p #XXXX',
       'Debit Card Purchase 12/29 08:00p #XXXX',
       'Debit Card Purchase 03/19 02:16p #XXXX',
       'Debit Card Purchase 02/07 03:59p #XXXX',
       'Debit Card Purchase 09/05 01:01p #XXXX',
       'Debit Card Purchase 05/09 02:16p #XXXX',
       'Debit Card Purchase 09/16 07:59p #XXXX',
       'Debit Card Purchase 03/17 03:25p #XXXX',
       'Debit Card Purchase 09/19 03:00p #XXXX',
       'Debit Card Purchase 12/17 12:25a #XXXX',
       'Debit Card Purchase 09/20 03:1