In [1]:
import pandas as pd
import numpy as np

In [63]:
# Load the inflow transactions from the parquet file (path is relative to this notebook).
# The preview below shows one row per transaction with the columns
# prism_consumer_id, prism_account_id, memo, amount, posted_date and category.
data = pd.read_parquet("../data/ucsd-inflows.pqt")
data

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,PAYCHECK,2477.02,2022-03-18,PAYCHECK
1,0,acc_0,EXTERNAL_TRANSFER,100.00,2022-10-25,EXTERNAL_TRANSFER
2,0,acc_0,MISCELLANEOUS,6.29,2022-08-26,MISCELLANEOUS
3,0,acc_0,EXTERNAL_TRANSFER,277.00,2022-06-03,EXTERNAL_TRANSFER
4,0,acc_0,EXTERNAL_TRANSFER,100.00,2022-07-29,EXTERNAL_TRANSFER
...,...,...,...,...,...,...
513110,5941,acc_9524,EXTERNAL_TRANSFER,8.66,2023-01-21,EXTERNAL_TRANSFER
513111,5941,acc_9524,EXTERNAL_TRANSFER,267.13,2023-01-23,EXTERNAL_TRANSFER
513112,5941,acc_9524,EXTERNAL_TRANSFER,2.00,2023-01-24,EXTERNAL_TRANSFER
513113,5941,acc_9524,EXTERNAL_TRANSFER,207.16,2023-01-24,EXTERNAL_TRANSFER


In [16]:
# Group the transactions by consumer. This is only the GroupBy object (nothing is
# aggregated yet); the cells below call count() / mean() on its `amount` column.
consumer_stats = data.groupby("prism_consumer_id")
consumer_stats

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1780308d0>

In [20]:
# number of transactions per consumer (count of non-null `amount` values in each group)
customer_transactions = consumer_stats['amount'].count()
customer_transactions

prism_consumer_id
0        91
2       113
4       152
7       129
9       375
       ... 
5930    210
5935     83
5939     91
5940    440
5941    297
Name: amount, Length: 2974, dtype: int64

In [21]:
# The count series above has 2974 entries, i.e. 2974 distinct consumers.
# Here we take the average number of transactions per consumer.
customer_transactions.mean()

172.5336247478144

In [23]:
# mean transaction amount for each individual consumer
cost_transactions = consumer_stats['amount'].mean()
cost_transactions

prism_consumer_id
0       1212.224505
2       3094.150708
4       3043.144079
7       1939.644109
9        574.245360
           ...     
5930     605.605286
5935     164.338916
5939     372.336813
5940     153.301045
5941     197.420572
Name: amount, Length: 2974, dtype: float64

In [24]:
# average of the per-consumer mean amounts (every consumer is weighted equally).
# NOTE: this is not the mean over all transactions, which would be data['amount'].mean()
cost_transactions.mean()

876.8506062831314

In [32]:
# list the distinct transaction categories so we can pick the ones to treat as income
data['category'].unique()

array(['PAYCHECK', 'EXTERNAL_TRANSFER', 'MISCELLANEOUS',
       'INVESTMENT_INCOME', 'TAX', 'DEPOSIT', 'SELF_TRANSFER', 'REFUND',
       'PAYCHECK_PLACEHOLDER', 'INSURANCE', 'OTHER_BENEFITS',
       'UNEMPLOYMENT_BENEFITS', 'LOAN', 'SMALL_DOLLAR_ADVANCE'],
      dtype=object)

In [82]:
# Normalise the category labels: fold PAYCHECK_PLACEHOLDER into PAYCHECK.
# The replacement is limited to the `category` column so that every other column
# (e.g. `memo`) keeps its original values; a frame-wide replace would rewrite them too.
# `assign` returns a new DataFrame, so `data` itself is left untouched.
cleaned_data = data.assign(
    category=data["category"].replace("PAYCHECK_PLACEHOLDER", "PAYCHECK")
)
cleaned_data['category'].unique()

array(['PAYCHECK', 'EXTERNAL_TRANSFER', 'MISCELLANEOUS',
       'INVESTMENT_INCOME', 'TAX', 'DEPOSIT', 'SELF_TRANSFER', 'REFUND',
       'INSURANCE', 'OTHER_BENEFITS', 'UNEMPLOYMENT_BENEFITS', 'LOAN',
       'SMALL_DOLLAR_ADVANCE'], dtype=object)

In [83]:
# Categories that count as income for the breakdown below.
# "PAYCHECK_PLACEHOLDER" is still listed although the previous cell already
# relabelled those rows as PAYCHECK.
income_categories = [
    "PAYCHECK",
    "EXTERNAL_TRANSFER",
    "INVESTMENT_INCOME",
    "DEPOSIT",
    "PAYCHECK_PLACEHOLDER",
    "INSURANCE",
    "OTHER_BENEFITS",
    "UNEMPLOYMENT_BENEFITS",
]
is_income = cleaned_data["category"].isin(income_categories)
cleaned_data = cleaned_data[is_income]

# group the remaining rows by (consumer, category) for the per-category totals
transaction_category = cleaned_data.groupby(["prism_consumer_id", "category"])

In [84]:
# total amount received per (consumer, category), flattened into a DataFrame with
# the columns prism_consumer_id, category and amount
category_sums = transaction_category["amount"].sum().reset_index()

In [91]:
# total income per consumer over the kept categories; this is the denominator
# for the percentage shares computed in the next cell
user_totals = (
    cleaned_data.groupby("prism_consumer_id")["amount"]
    .sum()
    .rename("total_income")
    .reset_index()
)

In [92]:
# attach each consumer's total to its per-category rows, then express every
# category sum as a percentage of that total (rounded to 2 decimals)
merged_df = category_sums.merge(user_totals, on="prism_consumer_id").assign(
    percentage=lambda df: (df["amount"] / df["total_income"] * 100).round(2)
)


In [94]:
# spot check: the income breakdown rows for consumer 0
merged_df.loc[merged_df["prism_consumer_id"].eq(0)]

Unnamed: 0,prism_consumer_id,category,amount,total_income,percentage
0,0,DEPOSIT,3375.18,105654.93,3.19
1,0,EXTERNAL_TRANSFER,42090.58,105654.93,39.84
2,0,INVESTMENT_INCOME,0.52,105654.93,0.0
3,0,PAYCHECK,60188.65,105654.93,56.97


In [95]:
# Reshape to one row per consumer and one column per category; the cell values
# are the percentage shares. A consumer with no income in a category has no row
# for it in merged_df, so the pivot leaves NaN there -- treat that as 0%.
pivot_table = merged_df.pivot(
    index="prism_consumer_id", columns="category", values="percentage"
).fillna(0)

In [96]:
# display each consumer's income share (in percent) per category
pivot_table

category,DEPOSIT,EXTERNAL_TRANSFER,INSURANCE,INVESTMENT_INCOME,OTHER_BENEFITS,PAYCHECK,UNEMPLOYMENT_BENEFITS
prism_consumer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3.19,39.84,0.00,0.00,0.00,56.97,0.00
2,13.69,4.83,0.02,81.46,0.00,0.00,0.00
4,15.01,3.60,0.00,0.00,0.00,81.40,0.00
7,0.04,1.07,0.00,0.35,0.00,98.55,0.00
9,56.23,12.70,0.00,0.00,10.03,2.27,18.77
...,...,...,...,...,...,...,...
5930,84.82,14.19,0.00,1.00,0.00,0.00,0.00
5935,8.58,91.42,0.00,0.00,0.00,0.00,0.00
5939,77.87,21.75,0.00,0.00,0.00,0.39,0.00
5940,35.08,27.64,0.00,2.47,0.00,34.81,0.00


In [97]:
# per-category mean and standard deviation of the percentage shares, taken across
# all consumers in the pivot table and rounded to 2 decimals
average_breakdown = pivot_table.mean().round(2)
std_breakdown = pivot_table.std().round(2)

# one row per category, largest average share first
summary_columns = {
    'Average Percentage': average_breakdown,
    'Standard Deviation': std_breakdown,
}
summary = pd.DataFrame(summary_columns).sort_values(
    by="Average Percentage", ascending=False
)

In [98]:
# display the per-category average share and its standard deviation
summary

Unnamed: 0_level_0,Average Percentage,Standard Deviation
category,Unnamed: 1_level_1,Unnamed: 2_level_1
PAYCHECK,41.37,35.49
EXTERNAL_TRANSFER,27.31,28.01
DEPOSIT,21.99,27.51
OTHER_BENEFITS,6.3,19.25
INVESTMENT_INCOME,2.3,9.63
UNEMPLOYMENT_BENEFITS,0.6,4.29
INSURANCE,0.14,2.7
