In [1]:
import pandas as pd
import numpy as np

In [2]:
cons = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-consDF.pqt")
cons = cons.drop(columns = ["credit_score"])
cons = cons.dropna()
acc = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-acctDF.pqt")
txn = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-trxnDF.pqt")
catmap = pd.read_csv("/uss/hdsi-prismdata/q2-ucsd-cat-map.csv")

# account type counts per consumer (ohe)

In [3]:
acc_counts = acc.groupby("prism_consumer_id")["account_type"].value_counts().unstack(fill_value=0)
acc_counts.columns.name = None
cons = cons.merge(acc_counts, left_on= 'prism_consumer_id', right_index=True)
cons

Unnamed: 0,prism_consumer_id,evaluation_date,DQ_TARGET,401K,AUTO,BROKERAGE,CASH MANAGEMENT,CD,CHECKING,CONSUMER,...,MONEYMARKET,MORTGAGE,OTHER,OVERDRAFT,PREPAID,RETIREMENT,ROTH,SAVINGS,STOCK PLAN,STUDENT
0,0,2021-09-01,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,1,2021-07-01,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,2,2021-05-01,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,3,2021-03-01,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,4,2021-10-01,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,13995,2022-01-22,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
13996,13996,2022-02-01,0.0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,3,0,0
13997,13997,2021-12-24,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
13998,13998,2022-01-30,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,2,0,0


# over time attributes
Create some attributes based on differences in spending/income/balance over time

In [4]:
txn

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16
...,...,...,...,...,...,...
6407316,10533,6405304,31,4.96,DEBIT,2022-03-11
6407317,10533,6405305,12,63.48,DEBIT,2022-03-30
6407318,10533,6405306,12,53.99,DEBIT,2022-03-30
6407319,10533,6405307,12,175.98,DEBIT,2022-03-31


In [5]:
income_cats = [2,3,5,7,8,9,49]

## spending vs income ratios

In [6]:
txn_copy = txn.copy(deep=True)
txn_copy["posted_date"] = pd.to_datetime(txn_copy["posted_date"])
txn_copy["month"] = txn_copy["posted_date"].dt.to_period("M")

# 1) create filtered income + spend columns
txn_copy["income_amt"] = np.where((txn_copy["credit_or_debit"] == "CREDIT") &(txn_copy["category"].isin(income_cats)),
                                  txn_copy["amount"],0.0)

txn_copy["spend_amt"] = np.where(txn_copy["credit_or_debit"] == "DEBIT", txn_copy["amount"],0.0)

# 2) monthly aggregation
monthly = (txn_copy.groupby(["prism_consumer_id", "month"], as_index=False)
           .agg(income=("income_amt", "sum"), spend=("spend_amt", "sum"))
          )

# 3) monthly spend-to-income ratio
monthly["spend_to_income"] = np.where(monthly["income"] > 0, monthly["spend"] / monthly["income"], np.nan)

# 4) consumer-level mean
feat_spend_to_income = (monthly.groupby("prism_consumer_id")["spend_to_income"].agg(['mean','median']) # skips NaNs by default
                             .rename(columns={"mean": "mean_monthly_spend_to_income", "median": "med_monthly_spend_to_income"})
                             .reset_index()
                            )

# 5) impute NaNs with 95 percentile (bc higher ratio = higher risk)
impute_p95_mean = feat_spend_to_income['mean_monthly_spend_to_income'].quantile(0.95) 
feat_spend_to_income["mean_monthly_spend_to_income"] = feat_spend_to_income["mean_monthly_spend_to_income"].fillna(impute_p95_mean)
impute_p95_med = feat_spend_to_income['med_monthly_spend_to_income'].quantile(0.95)
feat_spend_to_income["med_monthly_spend_to_income"] = feat_spend_to_income["med_monthly_spend_to_income"].fillna(impute_p95_med)

monthly["income_zero"] = (monthly["income"] == 0).astype(int)

feat_income_zero_frac = (monthly.groupby("prism_consumer_id")[["income_zero"]].mean()
                         .rename(columns={"income_zero":"frac_months_income_zero"})
                         .reset_index()
                        )

In [18]:
# monthly
monthly[monthly['spend_to_income'].isna()]

89553

In [14]:
feat_spend_to_income
# sum(feat_spend_to_income['med_monthly_spend_to_income'].isna())

0

In [12]:
feat_income_zero_frac
# sum(feat_income_zero_frac['frac_months_income_zero'].isna())

0

### Stress Behavior (In the worst 10% of months, how bad does spending relative to income get?)
What do people look like in their worst financial months?

In [20]:
feat_p90_spend_to_income = (
    monthly
    .groupby("prism_consumer_id")["spend_to_income"]
    .quantile(0.90)
    .rename("p90_monthly_spend_to_income")
    .reset_index()
)

impute_p95_90 = feat_p90_spend_to_income['p90_monthly_spend_to_income'].quantile(0.95) 
feat_p90_spend_to_income["p90_monthly_spend_to_income"] = feat_p90_spend_to_income["p90_monthly_spend_to_income"].fillna(impute_p95_90)


feat_max_spend_to_income = (
    monthly
    .groupby("prism_consumer_id")["spend_to_income"]
    .max()
    .rename("max_monthly_spend_to_income")
    .reset_index()
)

impute_p95_max = feat_max_spend_to_income['max_monthly_spend_to_income'].quantile(0.95) 
feat_max_spend_to_income["max_monthly_spend_to_income"] = feat_max_spend_to_income["max_monthly_spend_to_income"].fillna(impute_p95_max)

In [24]:
feat_p90_spend_to_income

Unnamed: 0,prism_consumer_id,p90_monthly_spend_to_income
0,0,9.277178
1,1,2.833610
2,10,1.900582
3,100,2.676729
4,1000,2.612073
...,...,...
14487,9995,1.661085
14488,9996,43962.800000
14489,9997,1.214974
14490,9998,2.324081


In [25]:
feat_max_spend_to_income

Unnamed: 0,prism_consumer_id,max_monthly_spend_to_income
0,0,19.668121
1,1,3.819494
2,10,2.006956
3,100,3.215536
4,1000,3.011904
...,...,...
14487,9995,1.685508
14488,9996,45737.000000
14489,9997,1.305735
14490,9998,2.349866


### avg ratio of worst k months

In [30]:
def mean_of_worst_k(x, k=3):
    return x.nlargest(k).mean()

feat_worst3_avg = (
    monthly
    .groupby("prism_consumer_id")["spend_to_income"]
    .apply(mean_of_worst_k)
    .rename("avg_worst_3_months_spend_to_income")
    .reset_index()
)

impute_p95_worst3 = feat_worst3_avg['avg_worst_3_months_spend_to_income'].quantile(0.95) 
feat_worst3_avg["avg_worst_3_months_spend_to_income"] = feat_worst3_avg["avg_worst_3_months_spend_to_income"].fillna(impute_p95_worst3)


In [31]:
feat_worst3_avg
# sum(feat_worst3_avg["avg_worst_3_months_spend_to_income"].isna())

0

### are bad months getting worse (slope of months with ratios > 1)

In [36]:
from scipy.stats import linregress

def stress_slope(df):
    df = df.dropna(subset=["spend_to_income"])
    stress = df[df["spend_to_income"] > 1] #ratio >1
    if len(stress) < 3:
        return np.nan
    x = np.arange(len(stress))
    return linregress(x, stress["spend_to_income"]).slope

feat_stress_slope = (
    monthly
    .sort_values("month")
    .groupby("prism_consumer_id")
    .apply(stress_slope)
    .rename("stress_ratio_slope")
    .reset_index()
)

impute_p95_slope = feat_stress_slope['stress_ratio_slope'].quantile(0.95) 
feat_stress_slope["stress_ratio_slope"] = feat_stress_slope["stress_ratio_slope"].fillna(impute_p95_slope)

  .apply(stress_slope)


In [38]:
feat_stress_slope
# sum(feat_stress_slope["stress_ratio_slope"].isna())

Unnamed: 0,prism_consumer_id,stress_ratio_slope
0,0,0.066520
1,1,0.364657
2,10,-0.017239
3,100,-0.215102
4,1000,-0.335906
...,...,...
14487,9995,0.061057
14488,9996,11921.500000
14489,9997,9.501000
14490,9998,0.180084


In [40]:
# dfs must have 'prism_consumer_id' as columns, not as index
feat_dfs = [feat_spend_to_income, feat_income_zero_frac, feat_p90_spend_to_income, feat_max_spend_to_income, feat_stress_slope]
new_feats = cons.copy()
for feat in feat_dfs:
    new_feats = new_feats.merge(feat, left_on = 'prism_consumer_id', right_on = 'prism_consumer_id')

new_feats

Unnamed: 0,prism_consumer_id,evaluation_date,DQ_TARGET,401K,AUTO,BROKERAGE,CASH MANAGEMENT,CD,CHECKING,CONSUMER,...,ROTH,SAVINGS,STOCK PLAN,STUDENT,mean_monthly_spend_to_income,med_monthly_spend_to_income,frac_months_income_zero,p90_monthly_spend_to_income,max_monthly_spend_to_income,stress_ratio_slope
0,0,2021-09-01,0.0,0,0,0,0,0,1,0,...,0,1,0,0,3.953563,1.098073,0.000000,9.277178,19.668121,0.066520
1,1,2021-07-01,0.0,0,0,0,0,0,1,0,...,0,1,0,0,1.876386,1.660304,0.000000,2.833610,3.819494,0.364657
2,2,2021-05-01,0.0,0,0,0,0,0,1,0,...,0,1,0,0,13.466530,13.466530,0.714286,21.503546,23.512800,9.501000
3,3,2021-03-01,0.0,0,0,0,0,0,1,0,...,0,1,0,0,2.603152,1.853908,0.285714,4.712070,5.399628,0.382801
4,4,2021-10-01,0.0,0,0,0,0,0,1,0,...,0,1,0,0,1.678499,1.143839,0.000000,3.117119,4.383074,-0.140950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10117,13995,2022-01-22,0.0,0,0,0,0,0,1,0,...,0,1,0,0,988.575606,66.666667,0.111111,3129.473684,5200.000000,-619.514560
10118,13996,2022-02-01,0.0,0,0,0,0,0,2,0,...,0,3,0,0,20567.497913,5.988123,0.000000,37028.712789,185051.000000,-12337.478297
10119,13997,2021-12-24,0.0,0,0,0,0,0,1,0,...,0,1,0,0,1.759568,0.696240,0.000000,4.889872,7.180356,3.051428
10120,13998,2022-01-30,0.0,0,0,0,0,0,1,0,...,0,2,0,0,1.720148,1.038057,0.111111,3.088822,6.322488,0.708997


In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

feature_cols = new_feats.drop(
    columns=["prism_consumer_id", "DQ_TARGET", "evaluation_date"]
)

for col in feature_cols.columns:
    X = feature_cols[[col]]
    y = new_feats["DQ_TARGET"]
    
    model = LogisticRegression(class_weight='balanced')
    model.fit(X, y)
    
    preds = model.predict_proba(X)[:, 1]
    
    score = roc_auc_score(y, preds)
    
    print(col)
    print(score)

401K
0.5005253727637398
AUTO
0.5
BROKERAGE
0.5006224253073986
CASH MANAGEMENT
0.5004173071654812
CD
0.5012420347769738
CHECKING
0.5119556093804695
CONSUMER
0.5010075693475153
CREDIT CARD
0.5035480808062858
HOME EQUITY
0.5
HSA
0.5
IRA
0.5003780105842963
LINE OF CREDIT
0.501984227053143
LOAN
0.5015793721482603
MONEY MARKET
0.5006086589891493
MONEYMARKET
0.5
MORTGAGE
0.500972027216762
OTHER
0.500324009072254
OVERDRAFT
0.5
PREPAID
0.500565420235011
RETIREMENT
0.5
ROTH
0.5002700075602117
SAVINGS
0.6148664723446555
STOCK PLAN
0.5005793742757821
STUDENT
0.5005793742757821
mean_monthly_spend_to_income
0.5286592844993635
med_monthly_spend_to_income
0.5515283303946416
frac_months_income_zero
0.5086926790844698
p90_monthly_spend_to_income
0.5056548280918494
max_monthly_spend_to_income
0.4936074849724993
stress_ratio_slope
0.47313615627124
