In [1]:
import pandas as pd

In [2]:
# no need to clean inflow memo bc memo = category
outflow = pd.read_parquet('/uss/hdsi-prismdata/q1-ucsd-outflows.pqt')

In [3]:
print(outflow.shape)
outflow.sample(5)

(2597488, 6)


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
1708194,4291,acc_7874,AFTERPAY SAN FRANCISCO CA USA,60.68,2021-05-27,GENERAL_MERCHANDISE
253518,558,acc_1652,PURCHASE AUTHORIZED ON 02/08 THE RUCK NY SXXXX...,55.98,2021-02-10,FOOD_AND_BEVERAGES
535021,1162,acc_3280,RECURRING APPLE.COM/BILL,5.99,2022-02-17,GENERAL_MERCHANDISE
152886,359,acc_1066,ESSENTIAL_SERVICES,381.86,2021-07-06,ESSENTIAL_SERVICES
2027196,4865,acc_8448,PURCHASE AUTHORIZED ON 09/17 LIDS XXXX BOCA RA...,48.14,2022-09-19,GENERAL_MERCHANDISE


In [4]:
import re
from typing import List, Tuple, Iterable, Optional
from collections import Counter

def tokenize(text: str, lowercase: bool = True) -> List[str]:
    if lowercase:
        text = text.lower()
    # Keep words, numbers, and apostrophes; split by comma later
    return re.findall(r"[a-z0-9']+", text)

def generate_ngrams(tokens: List[str], n: int) -> Iterable[Tuple[str, ...]]:
    """Generate n-grams of size n from tokens."""
    if n <= 0:
        raise ValueError("n must be >= 1")
    if len(tokens) < n:
        return []
    return zip(*[tokens[i:] for i in range(n)])

def most_common_ngrams(
    text: str,
    n: int = 1,
    top_k: int = 10,
    stopwords: Optional[Iterable[str]] = None,
    min_token_len: int = 1,
) -> List[Tuple[str, int]]:
    # Split the text into comma-separated segments
    segments = [seg.strip() for seg in text.split(",") if seg.strip()]

    all_tokens = []
    for seg in segments:
        tokens = tokenize(seg)
        if stopwords:
            sw = set(stopwords)
            tokens = [t for t in tokens if t not in sw]
        if min_token_len > 1:
            tokens = [t for t in tokens if len(t) >= min_token_len]
        all_tokens.append(tokens)

    # Build all n-grams up to length n within each segment
    grams = []
    for tokens in all_tokens:
        for i in range(1, n + 1):
            grams.extend(" ".join(g) for g in generate_ngrams(tokens, i))

    counts = Counter(grams)
    return counts.most_common(top_k)

## round 1

In [5]:
memos = outflow[outflow['memo']!=outflow['category']]['memo'].unique()
memos = ', '.join(memos)

In [6]:
memos[:1000]

"TST* Casa Del Rio - Exp Fairlawn OH 09/24, Buffalo Wild Wings, Oculus CA 04/16, LOS GIRASOLES STOW OH 03/08, BUZZIS LAUNDRY 1 OH 03/28, BUZZIS LAUNDRY 1 OH 02/13, TGI FRIDAYS XXXX STOW OH 12/31, TST* The Basement Sp Cuyahoga Fall OH 06/06, Lowe's, PIADA - 39 OH 08/23, GrubHub, HARDEES XXXXXXX AKRON OH 05/29, MARKET DI XXXX State Cuyahoga Fall OH 04/06, SWENSONS - MONTROSE AKRON OH 06/29, Great Clips, APPLE.COM/BILL CA 04/07, APPLE.COM/BILL CA 04/29, APPLE.COM/BILL CA 01/28, LOS GIRASOLES STOW OH 05/24, WING WAREHOUSE CUYAH CUYAHOGA FALL OH 03/29, WINKING LIZARD - 30 MA OH 05/23, LONGHORN STEAK XXXXX CUYAHOGA FALL OH 10/09, ON TAP - CUYAHOGA FA CUYAHOGA FALL OH 12/06, TST* The Basement Sp Cuyahoga Fall OH 08/26, Oculus CA 03/12, Home Depot, FALLS DISCOUNT TOBACC CUYAHOGA FLS OH 05/20, Burger King, O'Charley's, HOMEDEPOT.COM GA 10/08, Dairy Queen, EAST OF CHICAGO - CU CUYAHOGA FALL OH 03/08, GIANT-EAG XXXX Corpora Uniontown OH 09/28, WING WAREHOUSE CUYAH CUYAHOGA FALL OH 10/12, FIN'S BA

In [7]:
outflow['memo'].str.split().str.len().max()

41

In [8]:
n_grams = most_common_ngrams(memos, n=41, top_k=50)
n_grams

[('xxxx', 405935),
 ('card', 231945),
 ('purchase', 220149),
 ('card xxxx', 173894),
 ('on', 159821),
 ('authorized', 154350),
 ('authorized on', 154307),
 ('purchase authorized', 140141),
 ('purchase authorized on', 140141),
 ('sxxxxxxxxxxxxxxx', 118231),
 ('sxxxxxxxxxxxxxxx card', 118231),
 ('sxxxxxxxxxxxxxxx card xxxx', 118231),
 ('ca', 103001),
 ('com', 92857),
 ('xxx', 70724),
 ('debit', 67661),
 ('amzn', 59044),
 ('xxxxxx', 46670),
 ('10', 46644),
 ('pos', 46347),
 ('12', 45201),
 ('09', 44181),
 ('xxxxxxxxxxxxxxxxxxxxxxx', 44176),
 ('08', 43977),
 ('wa', 43142),
 ('07', 43060),
 ('11', 42208),
 ('05', 41251),
 ('03', 40748),
 ('06', 40271),
 ('04', 40219),
 ('01', 39908),
 ('amzn com', 37465),
 ('ca sxxxxxxxxxxxxxxx', 37239),
 ('ca sxxxxxxxxxxxxxxx card', 37239),
 ('ca sxxxxxxxxxxxxxxx card xxxx', 37239),
 ('02', 36771),
 ('fl', 36570),
 ('bill', 34079),
 ('xxxxx', 33923),
 ('amazon', 33190),
 ('22', 33110),
 ('com bill', 33095),
 ('checkcard', 31265),
 ('checkcard xxxx', 31234)

In [9]:
sorted_by_len = sorted(n_grams, key=lambda x: len(x[0]), reverse = True)
for item in sorted_by_len:
    print(item)

('ca sxxxxxxxxxxxxxxx card xxxx', 37239)
('sxxxxxxxxxxxxxxx card xxxx', 118231)
('ca sxxxxxxxxxxxxxxx card', 37239)
('xxxxxxxxxxxxxxxxxxxxxxx', 44176)
('purchase authorized on', 140141)
('sxxxxxxxxxxxxxxx card', 118231)
('purchase authorized', 140141)
('ca sxxxxxxxxxxxxxxx', 37239)
('sxxxxxxxxxxxxxxx', 118231)
('checkcard xxxx', 31234)
('authorized on', 154307)
('xxx xxx xxxx', 28777)
('authorized', 154350)
('debit card', 30134)
('card xxxx', 173894)
('checkcard', 31265)
('purchase', 220149)
('amzn com', 37465)
('com bill', 33095)
('xxx xxxx', 28777)
('xxx xxx', 28777)
('xxxxxx', 46670)
('amazon', 33190)
('debit', 67661)
('xxxxx', 33923)
('xxxx', 405935)
('card', 231945)
('amzn', 59044)
('bill', 34079)
('com', 92857)
('xxx', 70724)
('pos', 46347)
('on', 159821)
('ca', 103001)
('10', 46644)
('12', 45201)
('09', 44181)
('08', 43977)
('wa', 43142)
('07', 43060)
('11', 42208)
('05', 41251)
('03', 40748)
('06', 40271)
('04', 40219)
('01', 39908)
('02', 36771)
('fl', 36570)
('22', 33110)
('t

In [10]:
m = outflow[outflow['memo']!=outflow['category']]['memo'].str.lower()
clean = m.copy(deep = True)
# 236785
# patterns = [r'[a-z]{2}\son\s+xxxxxx\sfrom\s+card#:?\sxxxxxxxxxxxxxxxx',
#             r'debit\scard\spurchase\sat\s', 
#             r'\sxxx-xxx-xxxx\s[a-z]{2}\ssxxxxxxxxxxxxxxx\scard\sxxxx',
#             r'purchase\sauthorized\son\s\d\d\/\d\d',
#             r'point\sof\ssale\spurchase\sus\sut\ssalt\slake\scit',
#             r'business\sto\sbusiness\sach',
#             r'\sxxxx\sxxxxxxxxxxxxxxxxxxxxxxx\sxxxxxx',
#             r'xxx\-xxx\-xxxx',
#             r',auth#\sxxxxxx\sdebit\scard\sdebit',
#             r'\ssxxxxxxxxxxxxxxx\scard\sxxxx',
#             r'xxxxxxxx\sxxxxxx',
#             r'id\:x+']
patterns = [r'sxxxxxxxxxxxxxxx\scard\sxxxx',
            r'sxxxxxxxxxxxxxxx\scard',
            r'[sp]?x{3,}',
            r'(?:purchase\s)?authorized(?:\son)?',
            r'debit card',
            r'\d{2}\/\d{2}\/\d{4}|\d{4}\/\d{2}\/\d{2}|\d{2}\/\d{2}',
            r'/[,\s]al[,\s]|[,\s]ak[,\s]|[,\s]az[,\s]|[,\s]ar[,\s]|[,\s]ca[,\s]|[,\s]co[,\s]|[,\s]ct[,\s]|[,\s]de[,\s]|[,\s]fl[,\s]|[,\s]ga[,\s]|[,\s]hi[,\s]|[,\s]id[,\s]|[,\s]il[,\s]|[,\s]in[,\s]|[,\s]ia[,\s]|[,\s]ks[,\s]|[,\s]ky[,\s]|[,\s]la[,\s]|[,\s]me[,\s]|[,\s]md[,\s]|[,\s]ma[,\s]|[,\s]mi[,\s]|[,\s]mn[,\s]|[,\s]ms[,\s]|[,\s]mo[,\s]|[,\s]mt[,\s]|[,\s]ne[,\s]|[,\s]nv[,\s]|[,\s]nh[,\s]|[,\s]nj[,\s]|[,\s]nm[,\s]|[,\s]ny[,\s]|[,\s]nc[,\s]|[,\s]nd[,\s]|[,\s]oh[,\s]|[,\s]ok[,\s]|[,\s]or[,\s]|[,\s]pa[,\s]|[,\s]ri[,\s]|[,\s]sc[,\s]|[,\s]sd[,\s]|[,\s]tn[,\s]|[,\s]tx[,\s]|[,\s]ut[,\s]|[,\s]vt[,\s]|[,\s]va[,\s]|[,\s]wa[,\s]|[,\s]wv[,\s]|[,\s]wi[,\s]|[,\s]wy[,\s]|[,\s]dc[,\s]',
            ]

for p in patterns:
    clean = clean.str.replace(p, ' ', regex=True)

changed_mask = m != clean
changed_items = m[changed_mask]
changed_count = changed_mask.sum()

print("Changed count:", changed_count)

Changed count: 734733


In [11]:
changed = pd.DataFrame({'original': m[changed_mask], 'cleaned': clean[changed_mask]})

In [12]:
changed

Unnamed: 0,original,cleaned
2,tst* casa del rio - exp fairlawn oh 09/24,tst* casa del rio - exp fairlawn
6,oculus ca 04/16,oculus
7,los girasoles stow oh 03/08,los girasoles stow
8,buzzis laundry 1 oh 03/28,buzzis laundry 1
9,buzzis laundry 1 oh 02/13,buzzis laundry 1
...,...,...
2597457,debit card withdrawal purchaseamazon prime*ti4...,withdrawal purchaseamazon prime*ti40l27r3 am...
2597462,pos withdrawalaz lot quiktrip xxxx xxxx e indi...,pos withdrawalaz lot quiktrip e indian sch...
2597465,pos withdrawalwal-mart #xxxx xxxx e mckellips ...,pos withdrawalwal-mart # e mckellips rd mes...
2597468,withdrawal salt river projetype: online pmt co...,withdrawal salt river projetype: online pmt co...


In [13]:
clean_all = pd.DataFrame({'original': outflow[outflow['memo']!=outflow['category']]['memo'], 'cleaned': clean})
# clean_all.loc[clean_all['cleaned'].isna(), 'cleaned'] = clean_all['original']
clean_all

Unnamed: 0,original,cleaned
2,TST* Casa Del Rio - Exp Fairlawn OH 09/24,tst* casa del rio - exp fairlawn
4,Buffalo Wild Wings,buffalo wild wings
6,Oculus CA 04/16,oculus
7,LOS GIRASOLES STOW OH 03/08,los girasoles stow
8,BUZZIS LAUNDRY 1 OH 03/28,buzzis laundry 1
...,...,...
2597457,DEBIT CARD WITHDRAWAL PURCHASEAmazon Prime*TI4...,withdrawal purchaseamazon prime*ti40l27r3 am...
2597462,POS WITHDRAWALAZ LOT QUIKTRIP XXXX XXXX E INDI...,pos withdrawalaz lot quiktrip e indian sch...
2597465,POS WITHDRAWALWAL-MART #XXXX XXXX E MCKELLIPS ...,pos withdrawalwal-mart # e mckellips rd mes...
2597468,WITHDRAWAL Salt River ProjeTYPE: ONLINE PMT CO...,withdrawal salt river projetype: online pmt co...


## round 2

In [14]:
# continue n-grams
memos = clean_all[clean_all['cleaned'] != clean_all['original']]['cleaned'].unique()
memos = ', '.join(memos)
memos[:1000]

"tst* casa del rio - exp fairlawn  , buffalo wild wings, oculus  , los girasoles stow  , buzzis laundry 1  , tgi fridays   stow  , tst* the basement sp cuyahoga fall  , lowe's, piada - 39  , grubhub, hardees   akron  , market di   state cuyahoga fall  , swensons - montrose akron  , great clips, apple.com/bill  , wing warehouse cuyah cuyahoga fall  , winking lizard - 30 oh  , longhorn steak   cuyahoga fall  , on tap - cuyahoga fa cuyahoga fall  , home depot, falls discount tobacc cuyahoga fls  , burger king, o'charley's, homedepot.com  , dairy queen, east of chicago - cu cuyahoga fall  , giant-eag   corpora uniontown  , fin's bar & chill pigeon forge  , acme no. 12   bai cuyahoga fall   , taco bell, chick-fil-a, walmart, moe's sw grill #  cuyahoga fall  , texas roadhouse, 39 piada cuyahoga fall  , cleveland gaming fairview park  , circle k, iah cnbc smartshop houston  , bob evans rest #  stow  , kohl's, moe's sw grill #  akron  , wendy's, rays place of fairlawn fairlawn  , get go #    s

In [15]:
clean_all['cleaned'].str.split().str.len().max()

35

In [16]:
n_grams = most_common_ngrams(memos, n=35, top_k=50)
n_grams

[('com', 78759),
 ('purchase', 60407),
 ('amzn', 58420),
 ('card', 49212),
 ('pos', 41730),
 ('amzn com', 37069),
 ('amazon', 33285),
 ('debit', 32279),
 ('checkcard', 30931),
 ('bill', 29968),
 ('com bill', 29040),
 ('amzn com bill', 26107),
 ('amazon com', 25537),
 ('us', 23799),
 ('mktp', 18711),
 ('amzn mktp', 18644),
 ('mktp us', 18581),
 ('amzn mktp us', 18535),
 ('withdrawal', 17377),
 ('visa', 17038),
 ('pos debit', 14672),
 ('22', 14563),
 ('cash', 13902),
 ('check', 13541),
 ('date', 13458),
 ('check card', 13346),
 ('visa check', 11463),
 ('visa check card', 11421),
 ('debit visa', 11252),
 ('debit visa check', 11252),
 ('pos debit visa', 11211),
 ('pos debit visa check', 11211),
 ('debit visa check card', 11211),
 ('pos debit visa check card', 11211),
 ('app', 10495),
 ('21', 10194),
 ('cash app', 10093),
 ('doordash', 9641),
 ('of', 9529),
 ('c', 8297),
 ('pur', 8283),
 ('billwa', 8267),
 ('com billwa', 8267),
 ('s', 8117),
 ('pos purchase', 8026),
 ('amzn com billwa', 789

In [17]:
sorted_by_len = sorted(n_grams, key=lambda x: len(x[0]), reverse = True)
for item in sorted_by_len:
    print(item)

('pos debit visa check card', 11211)
('debit visa check card', 11211)
('pos debit visa check', 11211)
('debit visa check', 11252)
('visa check card', 11421)
('amzn com billwa', 7896)
('pos debit visa', 11211)
('amzn com bill', 26107)
('amzn mktp us', 18535)
('pos purchase', 8026)
('amazon com', 25537)
('withdrawal', 17377)
('check card', 13346)
('visa check', 11463)
('debit visa', 11252)
('com billwa', 8267)
('checkcard', 30931)
('amzn mktp', 18644)
('pos debit', 14672)
('purchase', 60407)
('amzn com', 37069)
('com bill', 29040)
('cash app', 10093)
('doordash', 9641)
('mktp us', 18581)
('amazon', 33285)
('billwa', 8267)
('debit', 32279)
('check', 13541)
('amzn', 58420)
('card', 49212)
('bill', 29968)
('mktp', 18711)
('visa', 17038)
('cash', 13902)
('date', 13458)
('mart', 6729)
('com', 78759)
('pos', 41730)
('app', 10495)
('pur', 8283)
('the', 7380)
('pin', 7267)
('tst', 7263)
('us', 23799)
('22', 14563)
('21', 10194)
('of', 9529)
('c', 8297)
('s', 8117)


In [18]:
m = clean_all['cleaned'].str.lower()
clean = m.copy(deep = True)

patterns = [r'(?:pos\s)?(?:debit\s)?(?:visa\s)+(?:check\s)?(?:card)?',
            r'pos\spurchase',
            r'withdraw',
            r'\s{2,}']

for p in patterns:
    clean = clean.str.replace(p, ' ', regex=True)

changed_mask = m != clean
changed_items = m[changed_mask]
changed_count = changed_mask.sum()

print("Changed count:", changed_count)

Changed count: 726258


In [19]:
changed = pd.DataFrame({'original': m[changed_mask], 'cleaned': clean[changed_mask]})
changed

Unnamed: 0,original,cleaned
2,tst* casa del rio - exp fairlawn,tst* casa del rio - exp fairlawn
6,oculus,oculus
7,los girasoles stow,los girasoles stow
8,buzzis laundry 1,buzzis laundry 1
9,buzzis laundry 1,buzzis laundry 1
...,...,...
2597457,withdrawal purchaseamazon prime*ti40l27r3 am...,al purchaseamazon prime*ti40l27r3 amzn.com/bi...
2597462,pos withdrawalaz lot quiktrip e indian sch...,pos alaz lot quiktrip e indian school rd phoen...
2597465,pos withdrawalwal-mart # e mckellips rd mes...,pos alwal-mart # e mckellips rd mesa card 15 #...
2597468,withdrawal salt river projetype: online pmt co...,al salt river projetype: online pmt co:salt r...


In [20]:
clean_all = pd.DataFrame({'original': outflow[outflow['memo']!=outflow['category']]['memo'], 'cleaned': clean})
clean_all

Unnamed: 0,original,cleaned
2,TST* Casa Del Rio - Exp Fairlawn OH 09/24,tst* casa del rio - exp fairlawn
4,Buffalo Wild Wings,buffalo wild wings
6,Oculus CA 04/16,oculus
7,LOS GIRASOLES STOW OH 03/08,los girasoles stow
8,BUZZIS LAUNDRY 1 OH 03/28,buzzis laundry 1
...,...,...
2597457,DEBIT CARD WITHDRAWAL PURCHASEAmazon Prime*TI4...,al purchaseamazon prime*ti40l27r3 amzn.com/bi...
2597462,POS WITHDRAWALAZ LOT QUIKTRIP XXXX XXXX E INDI...,pos alaz lot quiktrip e indian school rd phoen...
2597465,POS WITHDRAWALWAL-MART #XXXX XXXX E MCKELLIPS ...,pos alwal-mart # e mckellips rd mesa card 15 #...
2597468,WITHDRAWAL Salt River ProjeTYPE: ONLINE PMT CO...,al salt river projetype: online pmt co:salt r...


## round 3

In [21]:
# continue n-grams
memos = clean_all['cleaned'].unique()
memos = ', '.join(memos)
memos[:1000]

"tst* casa del rio - exp fairlawn , buffalo wild wings, oculus , los girasoles stow , buzzis laundry 1 , tgi fridays stow , tst* the basement sp cuyahoga fall , lowe's, piada - 39 , grubhub, hardees akron , market di state cuyahoga fall , swensons - montrose akron , great clips, apple.com/bill , wing warehouse cuyah cuyahoga fall , winking lizard - 30 oh , longhorn steak cuyahoga fall , on tap - cuyahoga fa cuyahoga fall , home depot, falls discount tobacc cuyahoga fls , burger king, o'charley's, homedepot.com , dairy queen, east of chicago - cu cuyahoga fall , giant-eag corpora uniontown , fin's bar & chill pigeon forge , acme no. 12 bai cuyahoga fall , taco bell, chick-fil-a, walmart, moe's sw grill # cuyahoga fall , texas roadhouse, 39 piada cuyahoga fall , cleveland gaming fairview park , circle k, iah cnbc smartshop houston , bob evans rest # stow , kohl's, moe's sw grill # akron , wendy's, rays place of fairlawn fairlawn , get go # st cuyahoga fall , dd doordash dashmart , chipot

In [22]:
clean_all['cleaned'].str.split().str.len().max()

35

In [23]:
n_grams = most_common_ngrams(memos, n=35, top_k=50)
sorted_by_len = sorted(n_grams, key=lambda x: len(x[0]), reverse = True)
for item in sorted_by_len:
    print(item)

('amzn com billwa', 7896)
('amzn com bill', 26103)
('point of sale', 6162)
('amzn mktp us', 18535)
('amazon com', 25530)
('com billwa', 8267)
('checkcard', 30862)
('amzn mktp', 18644)
('pos debit', 14657)
('recurring', 5559)
('purchase', 52168)
('amzn com', 37066)
('com bill', 28988)
('cash app', 10014)
('doordash', 9555)
('point of', 6162)
('mktp us', 18581)
('of sale', 6162)
('date 22', 5492)
('amazon', 33276)
('billwa', 8267)
('debit', 32080)
('point', 6695)
('amzn', 58417)
('card', 35678)
('bill', 29916)
('mktp', 18709)
('cash', 13816)
('date', 13456)
('mart', 6695)
('sale', 6196)
('com', 78531)
('pos', 33641)
('app', 10414)
('pur', 8265)
('the', 7330)
('pin', 7256)
('tst', 7249)
('san', 6582)
('crd', 6407)
('wal', 6168)
('us', 23778)
('al', 18630)
('22', 14549)
('21', 10183)
('of', 9519)
('id', 6350)
('12', 6071)
('c', 8269)
('s', 8088)


In [24]:
[i for i in clean_all['cleaned']]

['tst* casa del rio - exp fairlawn ',
 'buffalo wild wings',
 'oculus ',
 'los girasoles stow ',
 'buzzis laundry 1 ',
 'buzzis laundry 1 ',
 'tgi fridays stow ',
 'tst* the basement sp cuyahoga fall ',
 "lowe's",
 'piada - 39 ',
 'grubhub',
 'hardees akron ',
 'market di state cuyahoga fall ',
 'swensons - montrose akron ',
 'great clips',
 'apple.com/bill ',
 'apple.com/bill ',
 'apple.com/bill ',
 'los girasoles stow ',
 'wing warehouse cuyah cuyahoga fall ',
 'winking lizard - 30 oh ',
 'longhorn steak cuyahoga fall ',
 'on tap - cuyahoga fa cuyahoga fall ',
 'tst* the basement sp cuyahoga fall ',
 'great clips',
 'oculus ',
 'home depot',
 "lowe's",
 'falls discount tobacc cuyahoga fls ',
 'burger king',
 "lowe's",
 "o'charley's",
 'homedepot.com ',
 'dairy queen',
 'east of chicago - cu cuyahoga fall ',
 'giant-eag corpora uniontown ',
 'wing warehouse cuyah cuyahoga fall ',
 "fin's bar & chill pigeon forge ",
 'acme no. 12 bai cuyahoga fall ',
 'taco bell',
 'chick-fil-a',
 'app

In [25]:
# see what else to remove --> especially addresses

In [26]:
clean_all['cleaned'].value_counts()

cleaned
amazon                                                 31725
walmart                                                31619
mcdonald's                                             22671
starbucks                                              12778
7-eleven                                               11757
                                                       ...  
 ale house bar & gr shamokin                               1
 arbys # milroy                                            1
 arthritis & osteop wyomissing                             1
 big top beverage abington                                 1
pos alwal-mart # e mckellips rd mesa card 15 # mcc         1
Name: count, Length: 332690, dtype: int64

In [27]:
# unique memo count
# 240076 ~ 10%
sum(clean_all['cleaned'].value_counts()==1)

240076

In [28]:
clean_all = clean_all.reindex(outflow.index)
outflow['cleaned_memo'] = clean_all['cleaned'].fillna(outflow['memo'])
outflow = outflow[['prism_consumer_id', 'prism_account_id', 'memo', 'cleaned_memo', 
                   'amount', 'posted_date', 'category']]               
outflow

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
0,0,acc_0,LOAN,LOAN,900.60,2022-07-05,LOAN
1,0,acc_0,ATM_CASH,ATM_CASH,80.00,2022-03-25,ATM_CASH
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,tst* casa del rio - exp fairlawn,18.42,2022-09-26,FOOD_AND_BEVERAGES
3,0,acc_0,LOAN,LOAN,634.00,2023-01-10,LOAN
4,0,acc_0,Buffalo Wild Wings,buffalo wild wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
...,...,...,...,...,...,...,...
2597483,5941,acc_9524,ATM_CASH,ATM_CASH,8.42,2023-01-25,ATM_CASH
2597484,5941,acc_9524,ATM_CASH,ATM_CASH,2.06,2023-01-25,ATM_CASH
2597485,5941,acc_9524,ATM_CASH,ATM_CASH,262.88,2023-01-25,ATM_CASH
2597486,5941,acc_9524,ATM_CASH,ATM_CASH,10.00,2023-01-25,ATM_CASH


In [29]:
outflow[outflow['memo']!=outflow['cleaned_memo']]

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,tst* casa del rio - exp fairlawn,18.42,2022-09-26,FOOD_AND_BEVERAGES
4,0,acc_0,Buffalo Wild Wings,buffalo wild wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
6,0,acc_0,Oculus CA 04/16,oculus,11.73,2022-04-18,GENERAL_MERCHANDISE
7,0,acc_0,LOS GIRASOLES STOW OH 03/08,los girasoles stow,30.04,2022-03-09,FOOD_AND_BEVERAGES
8,0,acc_0,BUZZIS LAUNDRY 1 OH 03/28,buzzis laundry 1,4.16,2022-03-29,GENERAL_MERCHANDISE
...,...,...,...,...,...,...,...
2597457,5941,acc_9524,DEBIT CARD WITHDRAWAL PURCHASEAmazon Prime*TI4...,al purchaseamazon prime*ti40l27r3 amzn.com/bi...,15.93,2023-01-16,GENERAL_MERCHANDISE
2597462,5941,acc_9524,POS WITHDRAWALAZ LOT QUIKTRIP XXXX XXXX E INDI...,pos alaz lot quiktrip e indian school rd phoen...,25.00,2023-01-18,EDUCATION
2597465,5941,acc_9524,POS WITHDRAWALWAL-MART #XXXX XXXX E MCKELLIPS ...,pos alwal-mart # e mckellips rd mesa card 15 #...,3.68,2023-01-18,FOOD_AND_BEVERAGES
2597468,5941,acc_9524,WITHDRAWAL Salt River ProjeTYPE: ONLINE PMT CO...,al salt river projetype: online pmt co:salt r...,90.00,2023-01-20,FOOD_AND_BEVERAGES


In [30]:
# changed.iloc[0]['cleaned']

In [31]:
# without_cat = clean_all['cleaned']
# sample = without_cat[without_cat.str.lower().str.contains('com billwa', regex=True)]
# sample

In [32]:
# sample.iloc[0]

In [33]:
# outflow[outflow['memo']!=outflow['category']]

## Test Category Imbalance

In [41]:
nine_cat=outflow.groupby('category')['memo'].nunique().reset_index(name='unique_memo_count').sort_values(by='unique_memo_count', ascending=False)[:9]['category']

In [43]:
outflow[outflow['category'].isin(nine_cat)].groupby('category')['memo'].count()

category
EDUCATION                4499
FOOD_AND_BEVERAGES     481994
GENERAL_MERCHANDISE    524063
GROCERIES              219331
MORTGAGE                 1119
OVERDRAFT                3386
PETS                     9266
RENT                     3147
TRAVEL                  59647
Name: memo, dtype: int64

## Finds

#1: Payroll info in memos under category: 'General Merchandise

In [35]:
outflow[outflow['memo'].str.contains('GUSTO')].head(10)

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
141680,326,acc_966,GUSTO/NET XXXXXX / 6semjq46avb Excelerate Acad...,gusto/net / 6semjq46avb excelerate academy llc,1227.46,2021-07-23,EDUCATION
141689,326,acc_966,GUSTO/TAX XXXXXX / 6semjra3md0 Excelerate Acad...,gusto/tax / 6semjra3md0 excelerate academy llc,70.4,2022-03-03,EDUCATION
141700,326,acc_966,GUSTO/TAX XXXXXX / 6semjrcmemm Excelerate Acad...,gusto/tax / 6semjrcmemm excelerate academy llc,48.13,2022-03-17,EDUCATION
141803,326,acc_966,GUSTO/TAX XXXXXX / 6semjqaf1tl Excelerate Acad...,gusto/tax / 6semjqaf1tl excelerate academy llc,70.69,2021-09-02,EDUCATION
141878,326,acc_966,GUSTO/NET XXXXXX / 6semjq1ao5i Excelerate Acad...,gusto/net / 6semjq1ao5i excelerate academy llc,566.64,2021-07-06,EDUCATION
141883,326,acc_966,GUSTO/NET XXXXXX / 6semjq63sle Excelerate Acad...,gusto/net / 6semjq63sle excelerate academy llc,1303.6,2021-08-05,EDUCATION
141917,326,acc_966,GUSTO/NET XXXXXX / 6semjr4u83j Excelerate Acad...,gusto/net / 6semjr4u83j excelerate academy llc,1441.36,2022-02-03,EDUCATION
141978,326,acc_966,GUSTO/NET XXXXXX / 6semjqqigkr Excelerate Acad...,gusto/net / 6semjqqigkr excelerate academy llc,690.03,2021-12-09,EDUCATION
141981,326,acc_966,GUSTO/NET XXXXXX / 6semjqt17td Excelerate Acad...,gusto/net / 6semjqt17td excelerate academy llc,267.81,2021-12-23,EDUCATION
141984,326,acc_966,GUSTO/NET XXXXXX / 6semjqjeho5 Excelerate Acad...,gusto/net / 6semjqjeho5 excelerate academy llc,286.29,2021-10-28,EDUCATION


In [36]:
outflow[outflow['memo'].str.contains('GUSTO')]['memo'].loc[572115]

'GUSTO DES:CND XXXXXX ID:6semjrdkv9j INDN:Newhouse Law Group, PC CO ID:XXXXXXXXXX CCD'

In [37]:
outflow[outflow['memo'].str.contains('GUSTO')]['memo'].iloc[0]

'GUSTO/NET XXXXXX / 6semjq46avb Excelerate Academy LLC'