### Datasets

In [1]:
import pandas as pd

In [2]:
# Directory that holds the two parquet extracts; edit this single line if the data moves.
DATA_DIR = '/uss/hdsi-prismdata'

# Transactions are split across two files: one for inflows, one for outflows.
inflow = pd.read_parquet(f'{DATA_DIR}/q1-ucsd-inflows.pqt')
outflow = pd.read_parquet(f'{DATA_DIR}/q1-ucsd-outflows.pqt')

In [3]:
# Preview five random inflow rows; the fixed seed makes the preview identical on every re-run.
inflow.sample(5, random_state=42)

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
267653,3115,acc_6698,EXTERNAL_TRANSFER,200.0,2022-06-14,EXTERNAL_TRANSFER
414687,4817,acc_8400,EXTERNAL_TRANSFER,175.0,2021-07-20,EXTERNAL_TRANSFER
148695,1569,acc_4345,SELF_TRANSFER,100.0,2022-04-29,SELF_TRANSFER
327190,3844,acc_7427,EXTERNAL_TRANSFER,73.69,2022-10-21,EXTERNAL_TRANSFER
241096,2760,acc_6343,EXTERNAL_TRANSFER,825.63,2022-06-29,EXTERNAL_TRANSFER


In [4]:
# Preview five random outflow rows; the fixed seed makes the preview identical on every re-run.
outflow.sample(5, random_state=42)

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
609852,1304,acc_3648,Chick-fil-A,15.58,2020-12-02,FOOD_AND_BEVERAGES
2301857,5401,acc_8984,CHECKCARD XXXX TARGET.COM * XXX-XXX-XXXX MN XX...,9.77,2021-12-01,GROCERIES
421180,920,acc_2638,EXTERNAL_TRANSFER,5.0,2023-03-29,EXTERNAL_TRANSFER
1149327,2743,acc_6326,CHECKCARD XXXX KLOVER APP BOOST Chicago IL XXX...,10.0,2022-10-27,GENERAL_MERCHANDISE
500920,1089,acc_3081,CHECKCARD XXXX CHOWKING VALLEJO VALLEJO CA XXX...,23.34,2022-11-14,FOOD_AND_BEVERAGES


# What are the most common merchants per category? [pick 5-10 categories]

### Looking at the dataframes above, it seems that merchant data is stored in the memo column. But before digging into the merchants, we will browse through the categories.

In [5]:
# Number of inflow transactions under each category label, most frequent first.
inflow.loc[:, 'category'].value_counts()

category
EXTERNAL_TRANSFER        156533
SELF_TRANSFER            110437
DEPOSIT                   61345
MISCELLANEOUS             55648
PAYCHECK                  33138
PAYCHECK_PLACEHOLDER      26087
REFUND                    23220
INVESTMENT_INCOME         17325
SMALL_DOLLAR_ADVANCE      13621
OTHER_BENEFITS             7708
TAX                        3405
LOAN                       2513
UNEMPLOYMENT_BENEFITS      1961
INSURANCE                   174
Name: count, dtype: int64

In [6]:
# Number of outflow transactions under each category label, most frequent first.
outflow.loc[:, 'category'].value_counts()

category
GENERAL_MERCHANDISE    524063
FOOD_AND_BEVERAGES     481994
EXTERNAL_TRANSFER      320998
GROCERIES              219331
AUTOMOTIVE             208579
ATM_CASH               117651
UNCATEGORIZED          117409
LOAN                    90945
ENTERTAINMENT           80885
ESSENTIAL_SERVICES      77137
CREDIT_CARD_PAYMENT     75506
SELF_TRANSFER           73281
TRAVEL                  59647
HEALTHCARE_MEDICAL      40842
ACCOUNT_FEES            37511
INSURANCE               30244
PETS                     9266
HOME_IMPROVEMENT         8600
GIFTS_DONATIONS          4719
EDUCATION                4499
OVERDRAFT                3386
TAX                      3186
RENT                     3147
BNPL                     1543
MORTGAGE                 1119
PAYCHECK                 1040
CHILD_DEPENDENTS          936
BILLS_UTILITIES            23
AUTO_LOAN                   1
Name: count, dtype: int64

### Let's look through each category and see what merchants there are.

In [7]:
# Frequency of each memo string among inflow rows categorised as EXTERNAL_TRANSFER.
inflow.loc[inflow['category'].eq('EXTERNAL_TRANSFER'), 'memo'].value_counts()

memo
EXTERNAL_TRANSFER    156533
Name: count, dtype: int64

In [8]:
# Frequency of each memo string among inflow rows categorised as SELF_TRANSFER.
inflow.loc[inflow['category'].eq('SELF_TRANSFER'), 'memo'].value_counts()

memo
SELF_TRANSFER    110437
Name: count, dtype: int64

In [9]:
# Frequency of every memo string across all inflow rows, regardless of category.
inflow.loc[:, 'memo'].value_counts()

memo
EXTERNAL_TRANSFER        156533
SELF_TRANSFER            110437
DEPOSIT                   61345
MISCELLANEOUS             55648
PAYCHECK                  33138
PAYCHECK_PLACEHOLDER      26087
REFUND                    23220
INVESTMENT_INCOME         17325
SMALL_DOLLAR_ADVANCE      13621
OTHER_BENEFITS             7708
TAX                        3405
LOAN                       2513
UNEMPLOYMENT_BENEFITS      1961
INSURANCE                   174
Name: count, dtype: int64

### It seems that there is no merchant information in the inflow df: every memo value is just the category name.

In [10]:
# Frequency of every memo string across all outflow rows, regardless of category.
outflow.loc[:, 'memo'].value_counts()

memo
EXTERNAL_TRANSFER                                                                    320998
AUTOMOTIVE                                                                           208579
ATM_CASH                                                                             117651
UNCATEGORIZED                                                                        117409
LOAN                                                                                  90945
                                                                                      ...  
Par Gators Dockside -                                                                     1
Chilis Mandarin                                                                           1
Chilis Bay Meadows                                                                        1
Southside Liquor                                                                          1
POS WITHDRAWALWAL-MART #XXXX XXXX E MCKELLIPS RD MESA AZ  Card 15 #XXXX  MC

### The outflow df does contain merchant names in the memo column, so let's explore it further.

In [11]:
# Frequency of each memo string among outflow rows categorised as GENERAL_MERCHANDISE.
outflow.loc[outflow['category'].eq('GENERAL_MERCHANDISE'), 'memo'].value_counts()

memo
Amazon                                                                                                                           31725
7-Eleven                                                                                                                         11675
Circle K                                                                                                                         10148
Dollar General                                                                                                                    7872
Apple                                                                                                                             7394
                                                                                                                                 ...  
PURCHASE 06-17 FLEXSHOPPER , FL VNT XXXX                                                                                             1
PURCHASE 08-10 PLAYSTATIO FOSTER CITY, CA STR XXXX

In [12]:
# Frequency of each memo string among outflow rows categorised as CHILD_DEPENDENTS.
outflow.loc[outflow['category'].eq('CHILD_DEPENDENTS'), 'memo'].value_counts()

memo
CHILD_DEPENDENTS    936
Name: count, dtype: int64

### As seen above, some categories have a large variety of merchants while others have few or none. We looked through each category in the same way and chose the following ones to work with because they have a wide variety of merchants.

In [13]:
# Outflow categories selected for the merchant analysis that follows.
chosen_cats = [
    'GENERAL_MERCHANDISE',
    'FOOD_AND_BEVERAGES',
    'GROCERIES',
    'TRAVEL',
    'PETS',
    'RENT',
]

### Let's see a few examples of the top 10 merchants.

In [14]:
# Ten most frequent memo strings among FOOD_AND_BEVERAGES outflows.
# Memo strings are counted exactly as stored, so spelling variants such as
# "McDonald's" and "McDonald''s" show up as separate entries.
outflow.loc[outflow['category'].eq('FOOD_AND_BEVERAGES'), 'memo'].value_counts().head(10)

memo
McDonald's                22670
Starbucks                 12777
Chick-fil-A                7933
Taco Bell                  6071
Wendy's                    4694
Burger King                4282
Dunkin' Donuts             4265
McDonald''s                3875
Sonic                      2925
Chipotle Mexican Grill     2679
Name: count, dtype: int64

In [15]:
# Ten most frequent memo strings among GENERAL_MERCHANDISE outflows.
# Memo strings are counted exactly as stored (e.g. "Apple" and "APPLE.COM/BILL"
# are separate entries).
outflow.loc[outflow['category'].eq('GENERAL_MERCHANDISE'), 'memo'].value_counts().head(10)

memo
Amazon            31725
7-Eleven          11675
Circle K          10148
Dollar General     7872
Apple              7394
Home Depot         7213
Costco             4760
Google             4567
APPLE.COM/BILL     3270
Amazon Prime       3159
Name: count, dtype: int64

### Let's do the same for the rest of the chosen categories.

In [16]:
def top_merchants(transactions, category, n=10):
    """Return the `n` most frequent memo values for one category.

    Parameters
    ----------
    transactions : pd.DataFrame
        Frame with at least a 'category' and a 'memo' column.
    category : str
        Value of the 'category' column to filter on.
    n : int, default 10
        Maximum number of memo values to keep.

    Returns
    -------
    pd.DataFrame
        Columns 'Category', 'Merchant', 'Count' (in that order), one row per
        memo value, ordered from most to least frequent, with a fresh
        0-based index.
    """
    memo_counts = (
        transactions.loc[transactions['category'] == category, 'memo']
        .value_counts()
        .head(n)
    )
    top = memo_counts.rename_axis('Merchant').reset_index(name='Count')
    top['Category'] = category
    return top[['Category', 'Merchant', 'Count']]


# Build every per-category table first and concatenate once; calling
# pd.concat inside the loop re-copies all previously accumulated rows
# on each iteration.
merchant_df = pd.concat(
    [top_merchants(outflow, cat) for cat in chosen_cats],
    axis=0,
)

merchant_df

Unnamed: 0,Category,Merchant,Count
0,GENERAL_MERCHANDISE,Amazon,31725
1,GENERAL_MERCHANDISE,7-Eleven,11675
2,GENERAL_MERCHANDISE,Circle K,10148
3,GENERAL_MERCHANDISE,Dollar General,7872
4,GENERAL_MERCHANDISE,Apple,7394
5,GENERAL_MERCHANDISE,Home Depot,7213
6,GENERAL_MERCHANDISE,Costco,4760
7,GENERAL_MERCHANDISE,Google,4567
8,GENERAL_MERCHANDISE,APPLE.COM/BILL,3270
9,GENERAL_MERCHANDISE,Amazon Prime,3159
