In [44]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import re

In [37]:
# Directory holding the q1 UCSD parquet extracts; change it here once if the
# data lives somewhere else on your machine.
DATA_DIR = '/uss/hdsi-prismdata'

# Load both transaction tables from the same directory.
inflow = pd.read_parquet(f'{DATA_DIR}/q1-ucsd-inflows.pqt')
outflow = pd.read_parquet(f'{DATA_DIR}/q1-ucsd-outflows.pqt')

In [95]:
# Distinct memo strings taken from outflow rows whose memo is not simply a
# copy of the category value. Wrapping the unique() result in a Series gives
# it a fresh 0..n-1 index.
memos = pd.Series(
    outflow.loc[outflow['category'].ne(outflow['memo']), 'memo'].unique()
)
memos

0                 TST* Casa Del Rio - Exp Fairlawn OH 09/24
1                                        Buffalo Wild Wings
2                                           Oculus CA 04/16
3                               LOS GIRASOLES STOW OH 03/08
4                                 BUZZIS LAUNDRY 1 OH 03/28
                                ...                        
528761    DEBIT CARD WITHDRAWAL PURCHASEAudible*Z295DXXX...
528762    DEBIT CARD WITHDRAWAL PURCHASEJACK IN THE BOX ...
528763    DEBIT CARD WITHDRAWAL PURCHASEAmazon Prime*TI4...
528764    POS WITHDRAWALAZ LOT QUIKTRIP XXXX XXXX E INDI...
528765    POS WITHDRAWALWAL-MART #XXXX XXXX E MCKELLIPS ...
Length: 528766, dtype: object

In [78]:
# Disabled scratch code: prints every memo, one per line (528,766 lines for
# the data shown above).
# for memo in memos:
#     print(memo)

In [79]:
# Disabled scratch code: collects the memos that contain a run of two or more
# consecutive upper-case 'X' characters (presumably masked values -- confirm).
# pattern = r"X{2,}"
# matches = [s for s in memos if re.search(pattern, s)]
# matches

In [161]:
# Normalise the memo text so equivalent memos tokenise identically:
#   1. lower-case everything;
#   2. delete every character that is not a-z, 0-9 or whitespace -- the
#      character is removed, not replaced by a space, so "09/24" -> "0924";
#   3. collapse each run of whitespace to one space and trim both ends.
# Vectorised .str methods replace the per-element apply(re.sub) calls; they
# give the same strings and pass missing values through instead of raising.
memos_clean = (
    memos.str.lower()
    .str.replace(r'[^a-z0-9\s]', '', regex=True)  # drop special characters
    .str.replace(r'\s+', ' ', regex=True)         # collapse whitespace runs
    .str.strip()                                  # trim leading/trailing space
)
print(memos)
print(memos_clean)

0                 TST* Casa Del Rio - Exp Fairlawn OH 09/24
1                                        Buffalo Wild Wings
2                                           Oculus CA 04/16
3                               LOS GIRASOLES STOW OH 03/08
4                                 BUZZIS LAUNDRY 1 OH 03/28
                                ...                        
528761    DEBIT CARD WITHDRAWAL PURCHASEAudible*Z295DXXX...
528762    DEBIT CARD WITHDRAWAL PURCHASEJACK IN THE BOX ...
528763    DEBIT CARD WITHDRAWAL PURCHASEAmazon Prime*TI4...
528764    POS WITHDRAWALAZ LOT QUIKTRIP XXXX XXXX E INDI...
528765    POS WITHDRAWALWAL-MART #XXXX XXXX E MCKELLIPS ...
Length: 528766, dtype: object
0                     tst casa del rio exp fairlawn oh 0924
1                                        buffalo wild wings
2                                            oculus ca 0416
3                                los girasoles stow oh 0308
4                                  buzzis laundry 1 oh 0328
          

In [162]:
# Count word 5-grams over the cleaned memos.
#   ngram_range=(5, 5) -> only sequences of exactly five tokens are features
#   min_df=10          -> a 5-gram must occur in at least 10 memos to be kept
# NOTE(review): CountVectorizer's default token pattern only keeps tokens of
# two or more characters, so one-character tokens never appear in an n-gram.
vectorizer = CountVectorizer(ngram_range=(5, 5), min_df=10)
X = vectorizer.fit_transform(memos_clean)

# Column sums of the memo-by-term matrix: total occurrences of each 5-gram.
term_counts = X.sum(axis=0)

# vocabulary_ maps each 5-gram to its column index in X.
frequencies = [
    (ngram, term_counts[0, column])
    for ngram, column in vectorizer.vocabulary_.items()
]

# Most frequent first; `frequencies` itself is left in vocabulary order.
sorted_frequencies = list(frequencies)
sorted_frequencies.sort(key=lambda pair: pair[1], reverse=True)

In [163]:
# Show the ten highest-count 5-grams, one per line, as "term": count.
top_ten = sorted_frequencies[:10]
for term, freq in top_ten:
    print(f'"{term}": {freq}')

"xxxxxxxxxx ca sxxxxxxxxxxxxxxx card xxxx": 12881
"pos debit visa check card": 11250
"debit visa check card xxxx": 11250
"amzncombill wa sxxxxxxxxxxxxxxx card xxxx": 11150
"wwwdoordash ca sxxxxxxxxxxxxxxx card xxxx": 2239
"pos purchase merchant purchase terminal": 2008
"purchase merchant purchase terminal xxxxxxxx": 2007
"point of sale debit l340": 1639
"on xxxxxx from card xxxxxxxxxxxxxxxx": 1609
"xxxxxxxxxx fl sxxxxxxxxxxxxxxx card xxxx": 1550


In [164]:
# Pull the cleaned memos that contain the most frequent 5-gram so the
# underlying transactions can be inspected.
memos_clean.loc[
    memos_clean.str.contains('xxxxxxxxxx ca sxxxxxxxxxxxxxxx card xxxx')
]

4190      purchase authorized on 0830 cash appbenjamin x...
4789      purchase authorized on 1220 cash appnakia w xx...
4793      purchase authorized on 0329 cash appkaylen xxx...
4794      purchase authorized on 0419 cash appktr xxxxxx...
4800      purchase authorized on 0226 cash appkaylen xxx...
                                ...                        
525149    purchase authorized on 1024 cash appelizabeth ...
525207    purchase authorized on 1108 cash appelizabeth ...
525275    purchase authorized on 1201 cash appelizabeth ...
525342    purchase authorized on 1220 cash appabby eriq ...
525387    purchase authorized on 0104 cash appelizabeth ...
Length: 12881, dtype: object

In [160]:
# Cleaned memos mentioning "visa check card". memos_clean is already
# lower-cased when it is built, so a second .str.lower() pass over every memo
# is unnecessary; regex=False makes this a plain substring test.
memos_clean[memos_clean.str.contains('visa check card', regex=False)]

7828      pos debit visa check card xxxx amazoncom1x3bt3...
7830      pos debit visa check card xxxx instacart159 ht...
7834      pos debit visa check card xxxx amazoncom1r2gz0...
7839      pos debit visa check card xxxx amazoncom6b0ir4...
7841      pos debit visa check card xxxx amazoncompg26a1...
                                ...                        
526419    pos debit visa check card xxxx mcdonalds fxxxx...
526420    pos debit visa check card xxxx fastfix jewelry...
526421    pos debit visa check card xxxx lowes xxxx fair...
526422    pos debit visa check card xxxx target t xxxx h...
526423    pos debit visa check card xxxx sams club vacav...
Length: 11667, dtype: object

In [62]:
# Disabled scratch code: splits each memo on whitespace and prints the token
# lists whose first token contains a '/'.
# NOTE(review): memo[0] would raise IndexError for a memo that splits to an
# empty list -- guard against that if this is re-enabled.
# for memo in pd.Series(memos).str.split():
#     if '/' in memo[0]:
#         print(memo)