In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind

In [2]:
# Load the outflow and inflow transaction tables from their Parquet files.
outflow = pd.read_parquet(path='/uss/hdsi-prismdata/q1-ucsd-outflows.pqt')
inflow = pd.read_parquet(path='/uss/hdsi-prismdata/q1-ucsd-inflows.pqt')

In [3]:
# Preview the first five inflow rows (head() defaults to n=5).
inflow.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,PAYCHECK,2477.02,2022-03-18,PAYCHECK
1,0,acc_0,EXTERNAL_TRANSFER,100.0,2022-10-25,EXTERNAL_TRANSFER
2,0,acc_0,MISCELLANEOUS,6.29,2022-08-26,MISCELLANEOUS
3,0,acc_0,EXTERNAL_TRANSFER,277.0,2022-06-03,EXTERNAL_TRANSFER
4,0,acc_0,EXTERNAL_TRANSFER,100.0,2022-07-29,EXTERNAL_TRANSFER


In [4]:
# Preview the first five outflow rows (head() defaults to n=5).
outflow.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,LOAN,900.6,2022-07-05,LOAN
1,0,acc_0,ATM_CASH,80.0,2022-03-25,ATM_CASH
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
3,0,acc_0,LOAN,634.0,2023-01-10,LOAN
4,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES


In [5]:
# Distinct consumer ids present in each table.
inflow_ids = set(inflow["prism_consumer_id"].unique())
outflow_ids = set(outflow["prism_consumer_id"].unique())

# Consumers that have inflow rows but no outflow rows.
in_not_out = inflow_ids.difference(outflow_ids)
in_not_out

{2748, 4192, 4386, 4481, 4813, 4839, 5582}

In [6]:
# Consumers that have outflow rows but no inflow rows.
out_not_in = outflow_ids.difference(inflow_ids)
out_not_in

{5943}

In [7]:
# Keep only consumers that appear in both tables; sorting gives the splitter
# a deterministic input order.
consumers_both = sorted(
    set(inflow["prism_consumer_id"]) & set(outflow["prism_consumer_id"])
)

# 80/20 train/test split performed on consumer ids (not on individual rows),
# so all of a consumer's transactions land on the same side of the split.
train_ids, test_ids = train_test_split(
    consumers_both,
    test_size=0.2,
    random_state=42,
)

# Select each table's rows according to the side its consumer was assigned to.
inflow_train = inflow.loc[inflow["prism_consumer_id"].isin(train_ids)]
outflow_train = outflow.loc[outflow["prism_consumer_id"].isin(train_ids)]

inflow_test = inflow.loc[inflow["prism_consumer_id"].isin(test_ids)]
outflow_test = outflow.loc[outflow["prism_consumer_id"].isin(test_ids)]

In [8]:
# Number of distinct consumers in the inflow training split.
inflow_train['prism_consumer_id'].nunique(dropna=False)

2373

In [9]:
# Number of distinct consumers in the inflow test split.
inflow_test['prism_consumer_id'].nunique(dropna=False)

594

In [10]:
# Number of distinct consumers in the outflow training split.
outflow_train['prism_consumer_id'].nunique(dropna=False)

2373

In [11]:
# Number of distinct consumers in the outflow test split.
outflow_test['prism_consumer_id'].nunique(dropna=False)

594

In [12]:
# Compare transaction amounts between the train and test splits to check the
# split for bias. equal_var=False selects Welch's t-test, which does not
# assume the two samples share a variance.
ttest_inflow, ttest_outflow = (
    ttest_ind(train_part["amount"], test_part["amount"], equal_var=False)
    for train_part, test_part in (
        (inflow_train, inflow_test),
        (outflow_train, outflow_test),
    )
)

print("Inflow amount t-test p-value:", ttest_inflow.pvalue)
print("Outflow amount t-test p-value:", ttest_outflow.pvalue)

Inflow amount t-test p-value: 4.636829117538466e-15
Outflow amount t-test p-value: 1.4156782129930558e-21


In [13]:
# Compare medians instead: the amounts are severely skewed, so the mean-based
# t-test above is not an appropriate check.
print(
    f'Inflow_train: {inflow_train["amount"].median()}\n'
    f'Inflow_test: {inflow_test["amount"].median()}\n'
    f'Outflow_train: {outflow_train["amount"].median()}\n'
    f'Outflow_test: {outflow_test["amount"].median()}'
)

Inflow_train: 100.0
Inflow_test: 100.0
Outflow_train: 24.23
Outflow_test: 24.4
