In [1]:
import pandas as pd
import numpy as np

In [2]:
# Consumer table: the credit_score column is removed first, then every row that
# still contains a missing value is discarded.
cons = (
    pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-consDF.pqt")
    .drop(columns=["credit_score"])
    .dropna()
)

# Account, transaction and category-map tables are loaded unmodified.
acc = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-acctDF.pqt")
txn = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-trxnDF.pqt")
catmap = pd.read_csv("/uss/hdsi-prismdata/q2-ucsd-cat-map.csv")

# Sample of good and bad consumers

In [3]:
# Draw 100 consumer ids from each DQ_TARGET class ("bads" = 1, "goods" = 0).
# The fixed random_state makes both draws repeatable.
target_is_one = cons.loc[cons['DQ_TARGET'].eq(1)]
target_is_zero = cons.loc[cons['DQ_TARGET'].eq(0)]

bads = target_is_one.sample(n=100, random_state=42)['prism_consumer_id'].tolist()
goods = target_is_zero.sample(n=100, random_state=42)['prism_consumer_id'].tolist()

In [4]:
# Consumer-level rows of the sampled "bad" ids
cons_in_bad_sample = cons['prism_consumer_id'].isin(bads)
cons_bads = cons.loc[cons_in_bad_sample]
cons_bads

Unnamed: 0,prism_consumer_id,evaluation_date,DQ_TARGET
432,432,2021-01-01,1.0
959,959,2022-02-01,1.0
1368,1368,2021-06-01,1.0
1747,1747,2021-04-01,1.0
2283,2283,2021-06-01,1.0
...,...,...,...
12671,12671,2022-02-12,1.0
13029,13029,2022-01-12,1.0
13301,13301,2022-01-05,1.0
13620,13620,2022-02-05,1.0


In [5]:
# Consumer-level rows of the sampled "good" ids
cons_in_good_sample = cons['prism_consumer_id'].isin(goods)
cons_goods = cons.loc[cons_in_good_sample]
cons_goods

Unnamed: 0,prism_consumer_id,evaluation_date,DQ_TARGET
33,33,2021-11-01,0.0
110,110,2021-07-01,0.0
258,258,2022-03-01,0.0
311,311,2021-02-01,0.0
469,469,2021-03-01,0.0
...,...,...,...
13612,13612,2022-03-20,0.0
13727,13727,2022-02-10,0.0
13760,13760,2022-01-01,0.0
13960,13960,2022-02-26,0.0


In [6]:
# Accounts owned by the sampled "bad" consumers
acc_in_bad_sample = acc['prism_consumer_id'].isin(bads)
acc_bads = acc.loc[acc_in_bad_sample]
acc_bads

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
2234,2798,2234,SAVINGS,2022-02-28,0.04
2235,2798,2235,CHECKING,2022-02-28,522.16
3021,959,3021,CHECKING,2022-01-31,0.14
4138,3376,4138,SAVINGS,2022-03-31,0.00
4139,3376,4139,CHECKING,2022-03-31,3.48
...,...,...,...,...,...
22768,10199,22768,CREDIT CARD,2021-12-11,9070.87
22769,10199,22769,SAVINGS,2021-12-11,2881.56
22781,10199,22781,CHECKING,2021-12-11,475.00
22782,10199,22782,LINE OF CREDIT,2021-12-11,0.00


In [7]:
# Accounts owned by the sampled "good" consumers
acc_in_good_sample = acc['prism_consumer_id'].isin(goods)
acc_goods = acc.loc[acc_in_good_sample]
acc_goods

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
31,1439,31,SAVINGS,2021-02-28,1805.72
32,1439,32,CHECKING,2021-02-28,19941.06
218,1785,218,CHECKING,2021-08-31,660.48
243,1306,243,CHECKING,2021-05-31,500.00
446,1143,446,SAVINGS,2022-02-28,13360.16
...,...,...,...,...,...
23610,12971,23610,CREDIT CARD,2022-02-09,2424.36
23611,12971,23611,SAVINGS,2022-02-09,817.70
23612,12971,23612,SAVINGS,2022-02-09,785.70
24245,11285,24245,CHECKING,2022-02-02,700.00


In [8]:
# Transactions of the sampled "bad" consumers.
# .copy() gives txn_bads its own data: a later cell negates the DEBIT amounts of
# this frame in place, and doing that on a plain filtered slice of txn raises
# SettingWithCopyWarning.
txn_bads = txn.loc[txn['prism_consumer_id'].isin(bads)].copy()
txn_bads

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
353052,2798,352859,0,20.00,CREDIT,2021-09-19
353053,2798,352860,4,0.01,CREDIT,2021-10-15
353054,2798,352861,0,130.33,CREDIT,2022-01-25
353055,2798,352862,4,0.04,CREDIT,2022-02-18
353056,2798,352863,3,31.51,CREDIT,2021-09-19
...,...,...,...,...,...,...
5902603,12339,5900591,20,39.00,DEBIT,2021-11-30
5902604,12339,5900592,1,757.80,DEBIT,2021-12-01
5902605,12339,5900593,17,181.03,DEBIT,2021-12-02
5902606,12339,5900594,20,2.99,DEBIT,2021-12-02


In [9]:
# Transactions of the sampled "good" consumers.
# .copy() gives txn_goods its own data: a later cell negates the DEBIT amounts of
# this frame in place, and doing that on a plain filtered slice of txn raises
# SettingWithCopyWarning.
txn_goods = txn.loc[txn['prism_consumer_id'].isin(goods)].copy()
txn_goods

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
3020,1439,3020,49,2575.29,CREDIT,2020-10-01
3021,1439,3021,0,1500.00,CREDIT,2020-10-05
3022,1439,3022,1,2600.00,CREDIT,2020-10-06
3023,1439,3023,4,0.18,CREDIT,2020-10-09
3024,1439,3024,49,2575.29,CREDIT,2020-11-02
...,...,...,...,...,...,...
6293804,11285,6291792,0,777.78,DEBIT,2022-01-10
6293805,11285,6291793,19,500.00,DEBIT,2022-01-13
6293806,11285,6291794,23,4.95,DEBIT,2022-01-14
6293807,11285,6291795,19,500.00,DEBIT,2022-01-24


## Observations

### 1. Good consumers' accounts have higher checking and savings balances (and potentially lower credit card use and higher consumer balances)

In [10]:
# Mean balance and number of accounts per account type for the sampled "bad"
# consumers, listed from the most to the least common account type.
bad_balance_stats = acc_bads.groupby('account_type')[['balance']].agg(['mean', 'count'])
bad_balance_stats.sort_values(by=('balance', 'count'), ascending=False)

Unnamed: 0_level_0,balance,balance
Unnamed: 0_level_1,mean,count
account_type,Unnamed: 1_level_2,Unnamed: 2_level_2
CHECKING,720.343302,106
SAVINGS,766.007353,34
CREDIT CARD,3331.38,4
CONSUMER,5814.39,2
LINE OF CREDIT,7871.66,2
AUTO,12040.82,1


In [11]:
# Mean balance and number of accounts per account type for the sampled "good"
# consumers, listed from the most to the least common account type.
good_balance_stats = acc_goods.groupby('account_type')[['balance']].agg(['mean', 'count'])
good_balance_stats.sort_values(by=('balance', 'count'), ascending=False)

Unnamed: 0_level_0,balance,balance
Unnamed: 0_level_1,mean,count
account_type,Unnamed: 1_level_2,Unnamed: 2_level_2
CHECKING,16817.758351,97
SAVINGS,2962.56,53
CREDIT CARD,2403.271,10
MONEY MARKET,1006.035,2
CONSUMER,18679.06,1


### 2. Good consumers (average net sum per category): higher paychecks, much larger net external-transfer outflows, a negative net tax amount, etc.

In [12]:
def _signed_amounts(frame):
    """Return a copy of ``frame`` in which DEBIT rows carry a negated ``amount``.

    The input frame is not modified, so re-running this cell cannot flip the
    signs a second time and nothing is written to a slice of ``txn``.
    """
    is_debit = frame['credit_or_debit'] == 'DEBIT'
    return frame.assign(amount=frame['amount'].mask(is_debit, -frame['amount']))


def _category_profile(frame, prefix):
    """Average per-consumer net amount and transaction count for each category.

    Step 1 sums and counts the transactions of every consumer within each
    category; step 2 averages those per-consumer figures across consumers.
    The result is indexed by category id with the columns
    ``<prefix>_cons_mean`` (mean of the per-consumer net sums) and
    ``<prefix>_cons_count`` (mean of the per-consumer transaction counts).
    """
    mean_col, count_col = f'{prefix}_cons_mean', f'{prefix}_cons_count'
    per_consumer = (
        frame
        .groupby(['category', 'prism_consumer_id'])['amount']
        .agg(['sum', 'count'])
        .reset_index()
    )
    per_consumer.columns = ['category', 'id', mean_col, count_col]
    return per_consumer.groupby('category')[[mean_col, count_col]].mean()


# turn debit into negative values (bc spend), working on copies so that
# txn_bads / txn_goods keep their original amounts
per_cons_bad = _category_profile(_signed_amounts(txn_bads), 'bad')
per_cons_good = _category_profile(_signed_amounts(txn_goods), 'good')

# Inner join: only categories present in both samples are compared.
txn_df = per_cons_bad.merge(per_cons_good, left_index=True, right_index=True)
# Swap the numeric category id for its name from the category map.
txn_df = txn_df.merge(catmap, left_index=True, right_on='category_id').drop(columns='category_id')
txn_df = txn_df.set_index('category')[['bad_cons_mean', 'bad_cons_count', 'good_cons_mean', 'good_cons_count']]

txn_df['count_diff'] = txn_df['good_cons_count'] - txn_df['bad_cons_count']
txn_df['mean_diff'] = txn_df['good_cons_mean'] - txn_df['bad_cons_mean']

# Largest absolute good-vs-bad gap first (net amount, then transaction count).
txn_df = txn_df.sort_values(by=['mean_diff', 'count_diff'], key=lambda col: col.abs(), ascending=False)
txn_df

Unnamed: 0_level_0,bad_cons_mean,bad_cons_count,good_cons_mean,good_cons_count,count_diff,mean_diff
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PAYCHECK,13775.901667,18.5,44916.531912,15.985294,-2.514706,31140.630245
EXTERNAL_TRANSFER,-1583.986977,54.023256,-13038.709756,52.146341,-1.876914,-11454.722779
TAX,3168.451818,3.954545,-2730.546757,2.837838,-1.116708,-5898.998575
MISCELLANEOUS,-287.641848,37.836957,-6056.826292,42.505618,4.668661,-5769.184444
SELF_TRANSFER,1375.325714,84.238095,-3690.691923,65.525641,-18.712454,-5066.017637
PENSION,13757.838,8.4,9805.06,7.125,-1.275,-3952.778
TIME_OR_STUFF,136.263333,1.666667,3601.436,87.2,85.533333,3465.172667
GAMBLING,-897.77625,19.0,-4114.803333,21.666667,2.666667,-3217.027083
CREDIT_CARD_PAYMENT,-3197.404151,14.603774,-6004.542982,21.280702,6.676928,-2807.138832
ATM_CASH,-6131.74726,13.424658,-3869.821765,8.485294,-4.939363,2261.925496


## Features (income and balance related)

(brainstorm)
- average paycheck net amount in time periods
- net balance in time periods
- sum of inflow
- sum of outflow
- income in time periods
- grocery ratio to income & balance
- entertainment ratio to income

In [13]:
# Sign the amounts: money leaving the account (DEBIT) becomes negative.
# Written with -abs() instead of `*= -1` so that re-running this cell cannot flip
# the debits back to positive; every rolling feature below depends on this sign.
# NOTE(review): assumes the raw amounts are unsigned magnitudes, as in the samples
# displayed above - confirm against the data dictionary.
is_debit = txn['credit_or_debit'] == 'DEBIT'
txn['amount'] = txn['amount'].mask(is_debit, -txn['amount'].abs())

In [14]:
# net balance in time periods (1 month, 3 months, 6 months)

def _rolling_sum_per_consumer(frame, column, window):
    """Trailing time-window sum of ``column`` computed separately per consumer.

    Parameters
    ----------
    frame : DataFrame with 'prism_consumer_id', a datetime 'posted_date' and ``column``.
        It must already be sorted by ('prism_consumer_id', 'posted_date'): the
        values are returned as a bare array and written back by position, so
        the grouped result has to come back in the frame's own row order.
    column : name of the column to sum.
    window : pandas offset string such as '30D'.

    Returns
    -------
    numpy array with one rolling sum per row of ``frame``.
    """
    return (
        frame
        .groupby('prism_consumer_id')
        .rolling(window, on='posted_date')[column]
        .sum()
        .to_numpy()
    )


txn['posted_date'] = pd.to_datetime(txn['posted_date'])
txn = txn.sort_values(['prism_consumer_id', 'posted_date'])  # sort for correct rolling window behavior

# One net-flow column per look-back horizon: rolling_30d_net, rolling_90d_net, rolling_180d_net
for days in (30, 90, 180):
    txn[f'rolling_{days}d_net'] = _rolling_sum_per_consumer(txn, 'amount', f'{days}D')

# TODO: a per-consumer rolling OLS slope of rolling_30d_net (np.polyfit over a
# 30-row window, at least 5 points) was prototyped here and left disabled.

txn

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date,rolling_30d_net,rolling_90d_net,rolling_180d_net
136802,0,136738,14,-27.62,DEBIT,2021-03-16,-27.62,-27.62,-27.62
136767,0,136703,11,1400.00,CREDIT,2021-03-17,1372.38,1372.38,1372.38
136803,0,136739,39,-25.10,DEBIT,2021-03-17,1347.28,1347.28,1347.28
136804,0,136740,37,-500.00,DEBIT,2021-03-17,847.28,847.28,847.28
136805,0,136741,14,-25.00,DEBIT,2021-03-18,822.28,822.28,822.28
...,...,...,...,...,...,...,...,...,...
1524647,9999,1522635,16,-66.63,DEBIT,2023-08-08,-164.51,-274.02,-274.02
1524648,9999,1522636,14,-16.91,DEBIT,2023-08-08,-181.42,-290.93,-290.93
1524649,9999,1522637,14,-3.52,DEBIT,2023-08-08,-184.94,-294.45,-294.45
1524650,9999,1522638,16,-7.99,DEBIT,2023-08-08,-192.93,-302.44,-302.44


In [15]:
# rolling entertainment ratio to paycheck (improvement: include investment and other sources of income)
# Relies on txn already being sorted by consumer and posted_date (datetime), as
# done in the rolling-net cell: the rolling results are written back by position.

entertainment = catmap.loc[catmap['category'] == 'ENTERTAINMENT', 'category_id'].iloc[0]
paycheck = catmap.loc[catmap['category'] == 'PAYCHECK', 'category_id'].iloc[0]

# Entertainment spend as a positive number on entertainment debits, 0 elsewhere.
txn['ent_spend'] = np.where(
    (txn['category'] == entertainment) & (txn['amount'] < 0),
    -txn['amount'],   # make positive
    0
)
# Signed paycheck amount on paycheck rows, 0 elsewhere.
txn['paycheck'] = np.where(
    (txn['category'] == paycheck),
    txn['amount'],
    0
)

# Trailing 365-day totals per consumer for both helper columns.
for source_col, rolled_col in (('ent_spend', 'ent_1y_spend'), ('paycheck', 'income_1y')):
    txn[rolled_col] = (
        txn
        .groupby('prism_consumer_id')
        .rolling('365D', on='posted_date')[source_col]
        .sum()
        .to_numpy()
    )

# guard against divide-by-zero / no income.
# The ratio is only defined for positive income. Comparing the cent-rounded sum
# with > 0 (instead of the raw float with == 0) also blanks windows whose rolling
# sum is a floating-point residue or net-negative, which would otherwise yield
# huge or negative ratios.
has_income = txn['income_1y'].round(2) > 0
txn['ent_to_inc_1y'] = np.where(
    has_income,
    txn['ent_1y_spend'] / txn['income_1y'],
    np.nan
)

In [16]:
# rolling grocery ratio to paycheck (improvement: include investment and other sources of income)
# (rolling income 'income_1y' is computed in the entertainment-ratio cell above;
# txn must still be sorted by consumer and posted_date because the rolling
# result is written back by position)
grocery = catmap.loc[catmap['category'] == 'GROCERIES', 'category_id'].iloc[0]

# Grocery spend as a positive number on grocery debits, 0 elsewhere.
txn['groc_spend'] = np.where(
    (txn['category'] == grocery) & (txn['amount'] < 0),
    -txn['amount'],   # make positive
    0
)

# Trailing 365-day grocery spend per consumer.
txn['groc_1y_spend'] = (
    txn
    .groupby('prism_consumer_id')
    .rolling('365D', on='posted_date')['groc_spend']
    .sum()
    .to_numpy()
)

# guard against divide-by-zero / no income.
# The ratio is only defined for positive income. Comparing the cent-rounded sum
# with > 0 (instead of the raw float with == 0) also blanks windows whose rolling
# sum is a floating-point residue or net-negative, which would otherwise yield
# huge or negative ratios.
has_income = txn['income_1y'].round(2) > 0
txn['groc_to_inc_1y'] = np.where(
    has_income,
    txn['groc_1y_spend'] / txn['income_1y'],
    np.nan
)

In [17]:
# Helper columns that only exist to build the two ratio features.
helper_cols = ['ent_spend', 'paycheck', 'ent_1y_spend', 'income_1y', 'groc_spend', 'groc_1y_spend']

# drop() returns a new frame, so bind it to a name instead of discarding it.
# txn itself keeps the helper columns, which lets the ratio cells above be re-run.
txn_features = txn.drop(columns=helper_cols)
txn_features

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date,rolling_30d_net,rolling_90d_net,rolling_180d_net,ent_to_inc_1y,groc_to_inc_1y
136802,0,136738,14,-27.62,DEBIT,2021-03-16,-27.62,-27.62,-27.62,,
136767,0,136703,11,1400.00,CREDIT,2021-03-17,1372.38,1372.38,1372.38,,
136803,0,136739,39,-25.10,DEBIT,2021-03-17,1347.28,1347.28,1347.28,,
136804,0,136740,37,-500.00,DEBIT,2021-03-17,847.28,847.28,847.28,,
136805,0,136741,14,-25.00,DEBIT,2021-03-18,822.28,822.28,822.28,,
...,...,...,...,...,...,...,...,...,...,...,...
1524647,9999,1522635,16,-66.63,DEBIT,2023-08-08,-164.51,-274.02,-274.02,0.019701,0.032653
1524648,9999,1522636,14,-16.91,DEBIT,2023-08-08,-181.42,-290.93,-290.93,0.019701,0.032653
1524649,9999,1522637,14,-3.52,DEBIT,2023-08-08,-184.94,-294.45,-294.45,0.019701,0.032653
1524650,9999,1522638,16,-7.99,DEBIT,2023-08-08,-192.93,-302.44,-302.44,0.019701,0.032653
