# Exploratory Analysis of Personal Finance Bookkeeping Activity
Per Official Account (OA) dashboard
1/31/2025  
628,181 Friends added  
312,619 Target reach  
288,289 Blocked count

Transaction Database  
651,793 users (acc_user)  
317,726 blocked (acc_user)  
399,125 unique users (acc_cashflow)  
254,583 users never logged any transaction (acc_user x acc_cashflow), of which 98,310 are not blocked

What is the motivation for conducting this analysis?


# Data Source
Loading, cleaning, and pre-processing
- acc_user
- acc_cashflow

In [None]:
import pandas as pd

# Load the pre-processed feather extracts used throughout the notebook.
# NOTE(review): `users` / `cashflow` presumably correspond to the acc_user and
# acc_cashflow tables listed above — confirm against the export pipeline.
users = pd.read_feather('../data/users.feather')
# users.info()

cashflow = pd.read_feather('../data/cashflow_cleaned.feather')
# cashflow.info()

# NOTE(review): `td` appears to be a per-user summary table (it carries
# first_entry / last_entry / tenure columns used below) — confirm.
td = pd.read_feather('../data/tidy.feather')
# td.info()

In [None]:
# observation period as a two-element DatetimeIndex; later cells use tsl[1]
# as the end of observation (reference time for days_since and censoring)
tsl = pd.to_datetime(['2018-06-01', '2025-02-01'])

# Never active

_n.b._ There are fewer than 20 users in the cashflow table that are not found in the users table.

In [None]:
# Keep only non-bot accounts, with just the join key and the is_agree flag.
not_bot = ~users['is_bot']
x = users.loc[not_bot, ['user_id', 'is_agree']]
x.shape

In [None]:
# Number of non-bot users for each is_agree value.
x.groupby('is_agree').size()

In [None]:
# Distinct user_ids that have at least one non-bad cashflow row. The constant
# `right` column is a marker: after a left merge onto the users it stays NaN
# for users with no such row.
# (Stricter alternative, not used: additionally require cashflow.amt != 0.)
y = (
    cashflow.loc[~cashflow['isBad'], ['user_id']]
    .drop_duplicates()
    .assign(right=True)
)
y.shape

In [None]:
# Rows / columns of the tidy table, for comparison with the counts above.
td.shape

In [None]:
# Left join keeps every row of x; `right` is NaN where the user_id has no
# match in y.
z = x.merge(y, on='user_id', how = 'left')

Number of users who never logged any transactions

In [None]:
# Rows of z without a match in y (right is NaN after the left merge).
print(z.right.isna().sum())

What is the split between blocked and non-blocked users?

In [None]:
# The same unmatched rows, counted per is_agree value.
z[z.right.isna()].groupby('is_agree').size()

# WIP: Group of One

What are the patterns of the cluster of users
who track income or expenses using one or more groups
that have no other members?

In [None]:
# Group membership table; grouped by group_id / member_id in the next cell.
mbr = pd.read_feather('../data/members.feather')

In [None]:
# Distinct member count per group, then the groups that have exactly one member.
x = mbr.groupby('group_id').agg(n_member=('member_id', 'nunique'))
y = x[x['n_member'] == 1]
grp_lst = list(y.index)

# Non-bad, non-zero cashflow rows that belong to one of those groups.
is_valid = ~cashflow.isBad & (cashflow.amt != 0)
in_grp_one = cashflow.group_id.isin(grp_lst)
grp_one = (
    cashflow[is_valid & in_grp_one]
    .groupby('group_id')
    .agg(
        first_entry=('ts', 'min'),
        last_entry=('ts', 'max'),
        nbr_entry=('amt', 'count'),
        # negative amounts feed the *_exp columns, positive ones the *_inc columns
        grp_exp=('amt', lambda amt: amt[amt < 0].sum()),
        nbr_grp_exp=('amt', lambda amt: amt[amt < 0].count()),
        grp_inc=('amt', lambda amt: amt[amt > 0].sum()),
        nbr_grp_inc=('amt', lambda amt: amt[amt > 0].count()),
    )
)

grp_one.describe()

In [None]:
# Share of all groups that have exactly one distinct member.
len(grp_lst) / x.shape[0]
# x.shape[0]

# Churned by Default

Users who have not blocked/unfollowed the account but have not logged any transaction for more than 365 days are treated as **churned** by default.

In [None]:
# Time elapsed between the end of the observation window and the last entry.
td['days_since'] = tsl[1] - td['last_entry']

# Churned = is_agree is False, or is_agree is True but the last entry is more
# than 365 days before the end of observation.
inactive_1y = td['days_since'] > pd.Timedelta(days=365)
td['churned'] = ~td['is_agree'] | inactive_1y
td.groupby('churned').size()

# Segmentation by `tenure`, `days_since`, `days_active` and `survival_time`
- `tenure` is the number of days between the user's first and last expense or income entries
- `days_since` is the number of days since the user made their last entry
- `days_active` is the number of days on which the user logged entries
- `survival_time` is the number of days from when the system first recognized the user's activity to either the last day of observation ('2025-01-31') or the time the user churned


Calculate survival time





In [None]:
# calculate start time: the earlier of user_ts and first_entry, i.e. the time
# when the user was first recognized or logged the first transaction
td['t0'] = td[['user_ts', 'first_entry']].min(axis=1)

`tsl[1]` is the observation end time  

If `churned` is True, set the user's end time to the later of
`last_entry` and the user record timestamp `user_ts` from the acc_user table;
otherwise the end time is the observation end time, i.e. the record is right-censored.

In [None]:
# End time: the observation end for everyone, then overwritten for churned
# users with the later of user_ts and last_entry. Survival time is end - start.
observation_end = tsl[1]
is_churned = td['churned']

td['t1'] = observation_end
td.loc[is_churned, 't1'] = (
    td.loc[is_churned, ['user_ts', 'last_entry']].max(axis=1, skipna=True)
)
td['survival_time'] = td['t1'] - td['t0']

In [None]:
# Survival time converted from seconds to (fractional) days.
x = td['survival_time'].dt.total_seconds() / 3600 / 24 # days
# Summary statistics with extra percentiles.
# NOTE(review): the .818 percentile is unexplained — document why it was chosen.
print(x.describe(percentiles=[.25, .5, .75, .8, .818, .9]))
# Assigning to `_` keeps the Axes repr out of the cell output.
_ = x.plot.hist(xlabel='survival time (days)')

In [None]:
# how many churned users have an end time at or beyond the end of the observation period
td.loc[td.churned & (td.t1 >= tsl[1])].shape[0]

In [None]:
# [(x.left.round('D').days, x.right.days) for x in pd.qcut(td.survival_time, q=10).unique().sort_values()]

In [None]:
# [(x.left.round('D').days, x.right.days) for x in pd.qcut(td.days_since, q=10).unique().sort_values()]

In [None]:
# [(round(x.left), round(x.right)) for x in pd.qcut(td.dats_active.dt.days, q=10, duplicates='drop').unique().sort_values()]
# td.info()

In [None]:
# count users with exactly one entry; per the original note these are the rows
# where time_elapsed is null (only one valid entry) — TODO confirm
td[td.nbr_entry == 1].shape

In [None]:
# [round(i.right) for i in pd.qcut(df.fq_mean, q=12, duplicates='drop').cat.categories]

In [None]:
# Number of users per churned value.
td.groupby('churned').size()

In [None]:
# Working frame for segmentation: non-churned users only, restricted to the
# identifier and the activity metrics. Copied so later column additions do
# not touch `td`.
segment_cols = [
    'user_id', 'days_since', 'tenure', 'days_active', 'pct_active',
    'survival_time', 'fq_mean', 'fq_median', 'time_elapsed_mean',
]
df = td.loc[~td['churned'], segment_cols].copy()
# df.info()
df.describe(percentiles=[.5, .6, .7, .8, .9])

In [None]:
def _label_by_right_edge(values, q, to_label):
    """Quantile-bin `values` and label every row with its bin's right edge.

    Parameters
    ----------
    values : pd.Series
        Series to bin (numeric or timedelta).
    q : int
        Number of quantiles requested from `pd.qcut`.
    to_label : callable
        Converts a bin's right edge into the label stored for that bin.

    Returns
    -------
    pd.Series
        One label per row of `values`.
    """
    # duplicates='drop' merges repeated bin edges instead of raising, so fewer
    # than `q` bins may come back on heavily tied data.
    bins = pd.qcut(values, q=q, duplicates='drop')
    return bins.apply(lambda interval: to_label(interval.right))


# cut series and tag quantiles
# NOTE(review): days_since uses 11 bins and the frequency / interval columns
# use 12, despite the "decile" naming — confirm this is intentional.
# int() truncates the edge exactly like the previous .astype('int'), but also
# works when the edge is a plain Python float.
df['days_since_decile'] = _label_by_right_edge(df.days_since.dt.ceil('D').dt.days, 11, int)
df['tenure_decile'] = _label_by_right_edge(df.tenure.dt.days, 10, round)
df['survival_time_decile'] = _label_by_right_edge(df.survival_time.dt.days, 10, round)
df['mean_wk_fq'] = _label_by_right_edge(df.fq_mean, 12, round)
df['median_wk_fq'] = _label_by_right_edge(df.fq_median, 12, round)
# Timedelta edge rounded to the nearest 6 hours, then expressed in days.
df['mean_interval'] = _label_by_right_edge(
    df.time_elapsed_mean, 12,
    lambda edge: edge.round('6h').total_seconds() / 3600 / 24,
)

In [None]:
# df.time_elapsed_mean.describe()
# df.median_interval.describe()
# x[53]#.right.seconds / 60 / 60
# df.mean_interval.describe()

In [None]:
import seaborn as sns

# TODO: same view against 'days_active_decile' (that column is not created in
# the cells above).
# User counts per (days_since_decile, survival_time_decile) pair, reshaped so
# that rows are survival-time bins and columns are days-since bins.
user_counts = (
    df.groupby(['days_since_decile', 'survival_time_decile'], observed=True)
    .agg({'user_id': 'count'})
    .reset_index()
)
df_plot = user_counts.pivot(
    index='survival_time_decile', columns='days_since_decile', values='user_id'
)

# Set figure size globally
sns.set_theme(rc={'figure.figsize': (12, 6)})

ax = sns.heatmap(df_plot, annot=False, cmap='Greens')
ax.set_title("Days Since Last Active vs Survival Time (censored)", y=1.02)
df_plot

In [None]:
import seaborn as sns

# TODO: same view against 'days_active_decile' (that column is not created in
# the cells above).
# User counts per (days_since_decile, mean_wk_fq) pair, reshaped so that rows
# are mean_wk_fq bins and columns are days-since bins.
user_counts = (
    df.groupby(['days_since_decile', 'mean_wk_fq'], observed=True)
    .agg({'user_id': 'count'})
    .reset_index()
)
df_plot = user_counts.pivot(
    index='mean_wk_fq', columns='days_since_decile', values='user_id'
)

# Set figure size globally
sns.set_theme(rc={'figure.figsize': (12, 6)})

ax = sns.heatmap(df_plot, annot=False, cmap='Purples')
df_plot

In [None]:
# User counts per (days_since_decile, mean_interval) pair, reshaped so that
# rows are mean_interval bins and columns are days-since bins.
user_counts = (
    df.groupby(['days_since_decile', 'mean_interval'], observed=True)
    .agg({'user_id': 'count'})
    .reset_index()
)
df_plot = user_counts.pivot(
    index='mean_interval', columns='days_since_decile', values='user_id'
)

# Set figure size globally
sns.set_theme(rc={'figure.figsize': (12, 6)})

ax = sns.heatmap(df_plot, annot=False, cmap='Blues')
df_plot

In [None]:
# Inspect an unusual combination: time_elapsed_mean of more than 74 whole days
# together with days_since of at most 2 whole days.
td.query("time_elapsed_mean.dt.days  > 74 & days_since.dt.days <= 2")[['time_elapsed_mean', 'days_since']]

In [None]:
# extreme cases: non-churned users whose fq_mean exceeds 70
td.loc[(td.fq_mean > 70) & ~td.churned,
['fq_mean', 'tenure', 'days_active', 'days_since', 'user_id', 'nbr_entry', 'n_grp', 'nbr_connection']]

In [None]:
# Summary row of one specific user.
# NOTE(review): hard-coded user identifier — check whether it (and this cell's
# output) should be shared or committed.
td[td.user_id == 'U8835e86e095f591d93b8d36454174525'][['fq_mean', 'tenure', 'days_active', 'days_since', 'nbr_entry', 'n_grp', 'nbr_connection']]

In [None]:
# Number of cashflow rows per yyyy_mm for that user, plotted with the default
# pandas plot.
cashflow.query("user_id == 'U8835e86e095f591d93b8d36454174525'").groupby('yyyy_mm').size().plot()

In [None]:
# Cashflow rows of another specific user with ts after 2025-01-25.
cashflow.query("user_id == 'Uff7dc69b55ff36a6cf8fa0bd1e0356c8' & ts > '2025-01-25'")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Compute correlation matrix.
# The input is built explicitly instead of relying on how DataFrame.corr
# treats non-numeric columns (its numeric_only default differs between pandas
# versions):
#   * timedelta columns are converted to float days (NaT becomes NaN);
#   * only numeric columns are kept, so non-numeric columns such as
#     categorical quantile tags (if present) are left out.
corr_input = df.drop(columns='user_id')
for col in corr_input.select_dtypes(include='timedelta').columns:
    corr_input[col] = corr_input[col].dt.total_seconds() / 86400
corr_matrix = corr_input.select_dtypes(include='number').corr()

# Create heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Create pairplot of every column except user_id; corner=True draws only the
# lower triangle of the grid.
sns.pairplot(df.drop(columns='user_id'), kind="scatter", corner=True)
plt.suptitle("Pairwise Scatterplots of Correlations", y=1.02)

# Explore Group 

In [None]:
# NOTE(review): same file that is loaded into `mbr` in the "Group of One" section.
members = pd.read_feather('../data/members.feather')

In [None]:
# number of members in each group
# NOTE(review): this counts distinct user_id, whereas the "Group of One" cell
# counts distinct member_id on the same file — confirm which column is intended.
nbr_mbr_grp = members.groupby('group_id')['user_id'].nunique()

# summary statistics of the per-group member counts
nbr_mbr_grp.agg(['min', 'max', 'mean', 'median'])

In [None]:
# Quantiles of the per-group member counts, including the upper tail.
nbr_mbr_grp.quantile([.25, .5, .75, .8, .9, .95, .99])

# Cash flow

In [None]:
# is this interesting?
# Quantiles of the per-user sum of amt over all of that user's cashflow rows.
# NOTE(review): unlike the earlier cells, isBad rows are not filtered out here.

cashflow.groupby(['user_id'])['amt'].sum().quantile([.1, .25, .5, .75, .9])