In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

In [2]:
consumers = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-consDF.pqt")

In [3]:
consumers

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET
0,0,2021-09-01,726.0,0.0
1,1,2021-07-01,626.0,0.0
2,2,2021-05-01,680.0,0.0
3,3,2021-03-01,734.0,0.0
4,4,2021-10-01,676.0,0.0
...,...,...,...,...
14995,14995,2022-03-08,655.0,
14996,14996,2022-01-15,625.0,
14997,14997,2022-01-31,688.0,
14998,14998,2022-03-08,722.0,


In [4]:
transactions = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-trxnDF.pqt")

In [5]:
transactions

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16
...,...,...,...,...,...,...
6407316,10533,6405304,31,4.96,DEBIT,2022-03-11
6407317,10533,6405305,12,63.48,DEBIT,2022-03-30
6407318,10533,6405306,12,53.99,DEBIT,2022-03-30
6407319,10533,6405307,12,175.98,DEBIT,2022-03-31


In [6]:
accounts = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-acctDF.pqt")

In [7]:
accounts

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
0,3023,0,SAVINGS,2021-08-31,90.57
1,3023,1,CHECKING,2021-08-31,225.95
2,4416,2,SAVINGS,2022-03-31,15157.17
3,4416,3,CHECKING,2022-03-31,66.42
4,4227,4,CHECKING,2021-07-31,7042.90
...,...,...,...,...,...
24461,11500,24461,CHECKING,2022-03-27,732.75
24462,11615,24462,SAVINGS,2022-03-30,5.00
24463,11615,24463,CHECKING,2022-03-30,1956.46
24464,12210,24464,CHECKING,2022-03-28,2701.51


In [8]:
categories = pd.read_csv("/uss/hdsi-prismdata/q2-ucsd-cat-map.csv")

In [9]:
categories

Unnamed: 0,category_id,category
0,0,SELF_TRANSFER
1,1,EXTERNAL_TRANSFER
2,2,DEPOSIT
3,3,PAYCHECK
4,4,MISCELLANEOUS
5,5,PAYCHECK_PLACEHOLDER
6,6,REFUND
7,7,INVESTMENT_INCOME
8,8,OTHER_BENEFITS
9,9,UNEMPLOYMENT_BENEFITS


In [10]:
income_cats = [2, 3, 7, 8, 9, 49]

In [11]:
income_trxns = transactions[transactions['category'].isin(income_cats)]

In [15]:
income_trxns.loc['posted_date'] = pd.to_datetime(income_trxns['posted_date'])

In [16]:
monthly_income = income_trxns.groupby(['prism_consumer_id', income_trxns['posted_date'].dt.to_period('M')])['amount'].sum()

In [17]:
avg_monthly_income = monthly_income.groupby('prism_consumer_id').mean().rename('avg_monthly_income')

In [18]:
avg_monthly_income

prism_consumer_id
0       1331.508571
1       1915.941429
10      2202.962857
100         4068.63
1000    6911.228571
           ...     
9995        3742.28
9996           0.01
9997       4168.215
9998    2401.076667
9999        4185.22
Name: avg_monthly_income, Length: 13878, dtype: object

In [19]:
income_std = monthly_income.groupby('prism_consumer_id').std().rename('income_std')

In [20]:
income_std

prism_consumer_id
0        723.657216
1        621.198567
10       958.152740
100      949.527857
1000    3141.124869
           ...     
9995     918.977561
9996       0.000000
9997    1891.170898
9998     104.801560
9999    2853.394063
Name: income_std, Length: 13878, dtype: float64

In [21]:
avg_balance = accounts.groupby('prism_consumer_id')['balance'].mean().rename('avg_balance')

In [22]:
avg_balance

prism_consumer_id
0        160.185
1       1651.210
10       412.120
100     1327.735
1000      47.625
          ...   
9995       0.000
9996     252.670
9997     611.280
9998    -862.990
9999      -9.020
Name: avg_balance, Length: 13009, dtype: float64

In [23]:
features = pd.DataFrame(index=consumers['prism_consumer_id'].unique())
features = features.join([avg_monthly_income, income_std, avg_balance])

In [24]:
features['balance_income_ratio'] = features['avg_balance'] / (features['avg_monthly_income'])

In [25]:
features['prism_consumer_id'] = features.index

In [61]:
features[features.isna()] = 0

In [35]:
features

Unnamed: 0,avg_monthly_income,income_std,avg_balance,balance_income_ratio,prism_consumer_id
0,1331.508571,723.657216,160.185,0.120303,0
1,1915.941429,621.198567,1651.210,0.861827,1
2,550.0,636.396103,1402.680,2.550327,2
3,2155.562,2768.113331,3833.505,1.778425,3
4,1724.92,949.138545,197.275,0.114368,4
...,...,...,...,...,...
14995,2323.835,623.490298,0.000,0,14995
14996,3504.064444,3370.822540,3410.960,0.97343,14996
14997,2892.74625,556.164692,0.000,0,14997
14998,6847.975,1771.276101,0.000,0,14998


In [36]:
df = consumers.merge(features, on='prism_consumer_id')

In [37]:
df = df[~df['DQ_TARGET'].isnull()]

In [54]:
df

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,avg_monthly_income,income_std,avg_balance,balance_income_ratio
0,0,2021-09-01,726.0,0.0,1331.508571,723.657216,160.185000,0.120303
1,1,2021-07-01,626.0,0.0,1915.941429,621.198567,1651.210000,0.861827
2,2,2021-05-01,680.0,0.0,550.0,636.396103,1402.680000,2.550327
3,3,2021-03-01,734.0,0.0,2155.562,2768.113331,3833.505000,1.778425
4,4,2021-10-01,676.0,0.0,1724.92,949.138545,197.275000,0.114368
...,...,...,...,...,...,...,...,...
13995,13995,2022-01-22,802.0,0.0,0.27875,0.138918,342.933333,1230.254111
13996,13996,2022-02-01,652.0,0.0,1687.03,1880.778971,1642.252857,0.973458
13997,13997,2021-12-24,765.0,0.0,749.562857,1136.546514,1198.425000,1.598832
13998,13998,2022-01-30,685.0,0.0,4184.25875,4473.856723,2967.142000,0.70912


In [60]:
for col in features.drop(columns=['prism_consumer_id']):
    X = df[[col]]
    y = df['DQ_TARGET']
    
    model = LogisticRegression(class_weight='balanced')
    model.fit(X, y)
    
    preds = model.predict_proba(X)[:, 1]
    
    score = roc_auc_score(y, preds)
    
    print(col)
    print(score)

avg_monthly_income
0.5467276837429127
income_std
0.5265585855433164
avg_balance
0.6649332674138904
balance_income_ratio
0.638123415229923
