In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

In [5]:
# Directory that holds all four raw Prism files; named once so it can be
# changed in a single place.
DATA_DIR = "/uss/hdsi-prismdata"

# Consumer table: credit_score is dropped, then every row that still contains
# a missing value is removed.
consDF = (
    pd.read_parquet(f"{DATA_DIR}/q2-ucsd-consDF.pqt")
    .drop(columns=["credit_score"])
    .dropna()
)
acctDF = pd.read_parquet(f"{DATA_DIR}/q2-ucsd-acctDF.pqt")
trxnDF = pd.read_parquet(f"{DATA_DIR}/q2-ucsd-trxnDF.pqt")
catmap = pd.read_csv(f"{DATA_DIR}/q2-ucsd-cat-map.csv")

# The feature-engineering cells refer to these tables as `consumer`,
# `account`, `transaction` and `mapping`. Those names are not assigned anywhere
# else in the visible notebook, so a fresh kernel stopped with NameError.
# `consumer` and `transaction` get independent copies because later cells
# assign columns on them; `account` and `mapping` are only read, so a plain
# alias is enough.
# NOTE(review): assumes the cleaned consumer table (credit_score dropped, NaN
# rows removed) is the one the later cells intend to use — confirm.
consumer = consDF.copy()
account = acctDF
transaction = trxnDF.copy()
mapping = catmap

In [20]:
# Show the account row(s) whose prism_account_id equals the string "1000".
acctDF.loc[acctDF["prism_account_id"].eq("1000")]

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
1000,405,1000,CHECKING,2022-02-28,0.0


In [28]:
# Left join of the consumer table to the account table on prism_consumer_id.
pd.merge(consDF, acctDF, how="left", on="prism_consumer_id")

Unnamed: 0,prism_consumer_id,evaluation_date,DQ_TARGET,prism_account_id,account_type,balance_date,balance
0,0,2021-09-01,0.0,862,SAVINGS,2021-08-31,25.70
1,0,2021-09-01,0.0,863,CHECKING,2021-08-31,294.67
2,1,2021-07-01,0.0,7754,SAVINGS,2021-06-30,3211.18
3,1,2021-07-01,0.0,7755,CHECKING,2021-06-30,91.24
4,2,2021-05-01,0.0,4666,SAVINGS,2021-04-30,2561.43
...,...,...,...,...,...,...,...
21074,13998,2022-01-30,0.0,19885,CHECKING,2022-01-30,476.85
21075,13998,2022-01-30,0.0,19936,LOAN,2022-01-30,252.93
21076,13998,2022-01-30,0.0,19960,CREDIT CARD,2022-01-30,155.25
21077,13999,2022-01-26,0.0,24213,SAVINGS,2022-01-26,39.01


In [6]:
# Parse the date columns so they can be compared and shifted by a Timedelta.
consumer["evaluation_date"] = pd.to_datetime(consumer["evaluation_date"])
transaction["posted_date"] = pd.to_datetime(transaction["posted_date"])

# Attach each consumer's evaluation date and target to their transactions.
# The inner join keeps only transactions whose consumer appears in `consumer`.
tx = transaction.merge(
    consumer[["prism_consumer_id", "evaluation_date", "DQ_TARGET"]],
    on="prism_consumer_id",
    how="inner"
)

# Keep transactions posted in the 90 days before the evaluation date
# (the evaluation date itself is excluded, the day 90 days earlier is included).
window_start = tx["evaluation_date"] - pd.Timedelta(days=90)
in_window = (tx["posted_date"] < tx["evaluation_date"]) & (tx["posted_date"] >= window_start)
# .copy() makes the filtered frame independent of the merge result, so adding
# a column below no longer triggers pandas' SettingWithCopyWarning.
tx = tx[in_window].copy()

# Sign the amounts: rows marked "CREDIT" stay positive, every other row is negated.
tx["signed_amount"] = np.where(
    tx["credit_or_debit"] == "CREDIT",
    tx["amount"],
    -tx["amount"]
)

# Net cash flow per consumer over the window.
net_cf = (
    tx.groupby("prism_consumer_id")["signed_amount"]
    .sum()
    .reset_index(name="net_cash_flow_last_90d")
)

# Left join so consumers without any transaction in the window are kept
# (their feature is NaN here and is filled with 0 below).
model_df = consumer.merge(net_cf, on="prism_consumer_id", how="left")

# Training rows: consumers with a known target.
train = model_df.dropna(subset=["DQ_TARGET"])

X = train[["net_cash_flow_last_90d"]].fillna(0)
y = train["DQ_TARGET"]

In [7]:
#fit model
model = LogisticRegression()
model.fit(X, y)

# Probability of BAD (DQ_TARGET = 1)
pred_prob_bad = model.predict_proba(X)[:, 1]

# Binary prediction
pred_bad = (pred_prob_bad > 0.5).astype(int)

In [28]:
#auc score
auc = roc_auc_score(y, pred_prob_bad)
auc

0.5508340714309738

In [29]:
#accuracy score
accuracy = accuracy_score(y, pred_bad)
accuracy

0.9161666666666667

In [30]:
#coefficient (negative coefficient means higher cash flow reduces risk of default)
model.coef_

array([[-9.22411161e-06]])

In [8]:
# Display the predicted probabilities of DQ_TARGET = 1 for the rows in X.
pred_prob_bad

array([0.08568433, 0.08290525, 0.08335001, ..., 0.0849501 , 0.08450044,
       0.08404202])

In [14]:
# Attach the category mapping to the transactions.
# Both frames carry a `category` column, so the merge replaces it with
# `category_x` (transaction side) and `category_y` (mapping side). Re-running
# an unconditional merge would therefore fail on left_on="category"; the guard
# on the merged-in key column makes this cell safe to run more than once.
if "category_id" not in transaction.columns:
    transaction = transaction.merge(
        mapping,
        left_on="category",
        right_on="category_id",
        how="left"
    )

# Attach each consumer's evaluation date and target to their transactions.
tx = transaction.merge(
    consumer[["prism_consumer_id", "evaluation_date", "DQ_TARGET"]],
    on="prism_consumer_id",
    how="inner"
)

# Keep transactions posted in the 90 days before the evaluation date
# (the evaluation date itself is excluded).
window_start = tx["evaluation_date"] - pd.Timedelta(days=90)
tx = tx[
    (tx["posted_date"] < tx["evaluation_date"]) &
    (tx["posted_date"] >= window_start)
]

# Income is defined here as credits whose mapped category is one of these.
income_categories = ["PAYCHECK", "DEPOSIT"]

income_tx = tx[
    (tx["credit_or_debit"] == "CREDIT") &
    (tx["category_y"].isin(income_categories))
]

# Total income per consumer over the window.
income_90d = (
    income_tx.groupby("prism_consumer_id")["amount"]
    .sum()
    .reset_index(name="income_90d")
)

# Left join so consumers without income transactions are kept
# (their feature is NaN here and is filled with 0 below).
model_df = consumer.merge(income_90d, on="prism_consumer_id", how="left")

# Training rows: consumers with a known target.
train = model_df.dropna(subset=["DQ_TARGET"])

X = train[["income_90d"]].fillna(0)
y = train["DQ_TARGET"]  # 1 = BAD

In [15]:
# Fit a logistic regression on the feature matrix X against the target y.
model = LogisticRegression().fit(X, y)

# Second column of predict_proba is the probability of DQ_TARGET = 1;
# a 0.5 cut-off turns it into a 0/1 prediction.
class_probs = model.predict_proba(X)
pred_prob_bad = class_probs[:, 1]
is_predicted_bad = pred_prob_bad > 0.5
pred_bad = is_predicted_bad.astype(int)

In [16]:
# ROC AUC of the predicted probabilities, scored on the rows used for fitting.
auc = roc_auc_score(y_true=y, y_score=pred_prob_bad)
auc

0.52270531802816

In [17]:
# Accuracy of the 0/1 predictions, scored on the rows used for fitting.
accuracy = accuracy_score(y_true=y, y_pred=pred_bad)
accuracy

0.9161666666666667

In [18]:
# Fitted coefficient for income_90d. A negative value means higher income
# lowers the predicted probability of DQ_TARGET = 1.
model.coef_

array([[-2.86176975e-06]])

In [19]:
# Restrict the account table to savings accounts.
savings = account[account["account_type"] == "SAVINGS"]

# Attach each consumer's evaluation date and target. The inner join keeps only
# accounts whose consumer appears in `consumer`.
savings = savings.merge(
    consumer[["prism_consumer_id", "evaluation_date", "DQ_TARGET"]],
    on="prism_consumer_id",
    how="inner"
)

# Parse both date columns before comparing them. balance_date is never
# converted anywhere else in the notebook, and evaluation_date is only
# converted by an earlier cell; parsing here keeps this cell independent of
# the columns' stored dtype and of execution order.
savings = savings.assign(
    balance_date=pd.to_datetime(savings["balance_date"]),
    evaluation_date=pd.to_datetime(savings["evaluation_date"]),
)

# Keep balances dated on or before the evaluation date.
savings = savings[savings["balance_date"] <= savings["evaluation_date"]]

# Most recent balance row per savings account: sort by date, take the last
# row of each (consumer, account) group.
latest_balances = (
    savings.sort_values("balance_date")
    .groupby(["prism_consumer_id", "prism_account_id"])
    .tail(1)
)

# Total savings per consumer across all of their savings accounts.
savings_balance = (
    latest_balances.groupby("prism_consumer_id")["balance"]
    .sum()
    .reset_index(name="savings_balance")
)

# Left join back onto the consumer table so consumers without a savings
# balance are kept; they get a balance of 0.
analysis_df = consumer.merge(
    savings_balance,
    on="prism_consumer_id",
    how="left"
)

analysis_df["savings_balance"] = analysis_df["savings_balance"].fillna(0)
# Keep only consumers with a known target.
analysis_df = analysis_df.dropna(subset=["DQ_TARGET"])

In [31]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
# NOTE(review): the split is not stratified on DQ_TARGET; with a rare positive
# class that may be worth adding — it would change the reported scores.
train_df, test_df = train_test_split(
    analysis_df,
    test_size=0.3,
    random_state=42,
)

# Single-feature model: the column list is named once and reused for both splits.
feature_cols = ["savings_balance"]

X_train = train_df[feature_cols]
y_train = train_df["DQ_TARGET"]

X_test = test_df[feature_cols]
y_test = test_df["DQ_TARGET"]

# Fit on the training split only.
model = LogisticRegression()
model.fit(X_train, y_train)

In [32]:
# Score the held-out rows: second column of predict_proba is the probability
# of DQ_TARGET = 1, and a 0.5 cut-off turns it into a 0/1 prediction.
test_class_probs = model.predict_proba(X_test)
pred_prob_bad = test_class_probs[:, 1]
is_predicted_bad = pred_prob_bad > 0.5
pred_bad = is_predicted_bad.astype(int)

In [33]:
# ROC AUC of the predicted probabilities on the held-out test split.
auc = roc_auc_score(y_true=y_test, y_score=pred_prob_bad)
auc

0.6062628423592626