In [1]:
import pandas as pd
import numpy as np

In [6]:
accounts = pd.read_parquet('../data/q2-ucsd-acctDF.pqt')
accounts.head()

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
0,3023,0,SAVINGS,2021-08-31,90.57
1,3023,1,CHECKING,2021-08-31,225.95
2,4416,2,SAVINGS,2022-03-31,15157.17
3,4416,3,CHECKING,2022-03-31,66.42
4,4227,4,CHECKING,2021-07-31,7042.9


In [13]:
consumer = pd.read_parquet('../data/q2-ucsd-consDF.pqt')
consumer = consumer[consumer["DQ_TARGET"].notna()]
consumer.head()

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET
0,0,2021-09-01,726.0,0.0
1,1,2021-07-01,626.0,0.0
2,2,2021-05-01,680.0,0.0
3,3,2021-03-01,734.0,0.0
4,4,2021-10-01,676.0,0.0


In [21]:
# proportion of delinquent consumers

consumer["DQ_TARGET"].value_counts()

DQ_TARGET
0.0    10994
1.0     1006
Name: count, dtype: int64

In [24]:
categories = pd.read_csv('../data/q2-ucsd-cat-map.csv')
categories.head()

Unnamed: 0,category_id,category
0,0,SELF_TRANSFER
1,1,EXTERNAL_TRANSFER
2,2,DEPOSIT
3,3,PAYCHECK
4,4,MISCELLANEOUS


In [10]:
transactions = pd.read_parquet('../data/q2-ucsd-trxnDF.pqt')
transactions.head()

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16


In [26]:
total_trans = transactions.groupby("prism_consumer_id")[["amount"]].sum()
total_trans

Unnamed: 0_level_0,amount
prism_consumer_id,Unnamed: 1_level_1
0,29295.23
1,48002.17
10,42343.16
100,74979.45
1000,156268.06
...,...
9995,33973.23
9996,2390.92
9997,33909.03
9998,29965.84


In [27]:
total_acc = accounts.groupby("prism_consumer_id")[["balance"]].sum()
total_acc

Unnamed: 0_level_0,balance
prism_consumer_id,Unnamed: 1_level_1
0,320.37
1,3302.42
10,824.24
100,2655.47
1000,95.25
...,...
9995,0.00
9996,252.67
9997,611.28
9998,-862.99


In [37]:
spend_balance = total_trans.merge(total_acc, how="inner", on="prism_consumer_id")
spend_balance["spending_balance_ratio"] = spend_balance["amount"] / (spend_balance["balance"] + 1)
spend_balance_dq = spend_balance.merge(consumer, how="inner", on="prism_consumer_id")
spend_balance_dq.head()

Unnamed: 0,prism_consumer_id,amount,balance,spending_balance_ratio,evaluation_date,credit_score,DQ_TARGET
0,0,29295.23,320.37,91.157326,2021-09-01,726.0,0.0
1,1,48002.17,3302.42,14.531053,2021-07-01,626.0,0.0
2,10,42343.16,824.24,51.310116,2022-02-01,654.0,0.0
3,100,74979.45,2655.47,28.22522,2021-12-01,750.0,0.0
4,1000,156268.06,95.25,1623.56426,2021-03-01,756.0,0.0


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
import warnings

def run_classification(
    feature_column, target_column, dataset, test_size=0.2, random_state=42
):
    """
    Run Logistic Regression and Random Forest Classification on a dataset.

    Parameters:
        feature_column (list): List of columns that can be used as features.
        target_column (str): The name of the target column.
        dataset (pd.DataFrame): The dataset containing the features and target.
        test_size (float): Proportion of the dataset to include in the test split (default 0.3).
        random_state (int): Random seed for reproducibility (default 42).

    Returns:
        None: Prints the classification reports for both models.
    """
    warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

    # Drop NaN values and shuffle the dataset
    dataset = dataset.dropna()
    dataset = dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Define features and target
    X = dataset[feature_column]
    y = dataset[target_column]

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # balance the dataset
    sm = SMOTE(random_state=42)
    X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

    # Logistic Regression
    log_model = LogisticRegression(class_weight = 'balanced')
    log_model.fit(X_train_bal, y_train_bal)
    log_y_pred = log_model.predict(X_test)
    print(f"Logistic Regression for {feature_column}")
    print(classification_report(y_test, log_y_pred))

    # Random Forest Classification
    rfc_model = RandomForestClassifier(random_state=random_state)
    rfc_model.fit(X_train_bal, y_train_bal)
    rfc_y_pred = rfc_model.predict(X_test)
    print(f"\nRandom Forest Classification for {feature_column}")
    print(classification_report(y_test, rfc_y_pred))

    # Light GBM 
    lgb_model = lgb.LGBMClassifier()
    lgb_model.fit(X_train_bal, y_train_bal)
    lgb_y_pred = lgb_model.predict(X_test)
    print(f"\nLGB Model Classification for {feature_column}")
    print(classification_report(y_test, lgb_y_pred))

    # SVC Model
    svc_model = SVC(probability=True, random_state=random_state)
    svc_model.fit(X_train_bal, y_train_bal)
    svc_y_pred = svc_model.predict(X_test)
    print(f"\nSVM Classification for {feature_column}")
    print(classification_report(y_test, svc_y_pred))

    # XGB Model
    xgb_model = xgb.XGBClassifier(
        scale_pos_weight=len(y_train) / y_train.sum()
        )
    xgb_model.fit(X_train_bal, y_train_bal)
    xgb_y_pred = xgb_model.predict(X_test)
    print("\nXGBoost:")
    print(classification_report(y_test, xgb_y_pred))

In [38]:
run_classification(["spending_balance_ratio"], "DQ_TARGET", spend_balance_dq)

Logistic Regression for ['spending_balance_ratio']
              precision    recall  f1-score   support

         0.0       0.92      0.94      0.93      1850
         1.0       0.24      0.19      0.21       175

    accuracy                           0.88      2025
   macro avg       0.58      0.57      0.57      2025
weighted avg       0.87      0.88      0.87      2025


Random Forest Classification for ['spending_balance_ratio']
              precision    recall  f1-score   support

         0.0       0.93      0.66      0.77      1850
         1.0       0.12      0.50      0.19       175

    accuracy                           0.64      2025
   macro avg       0.53      0.58      0.48      2025
weighted avg       0.86      0.64      0.72      2025

[LightGBM] [Info] Number of positive: 7409, number of negative: 7409
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[Li

In [16]:
cons_acc = consumer.merge(accounts, how="inner", on="prism_consumer_id")
cons_acc

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,prism_account_id,account_type,balance_date,balance
0,0,2021-09-01,726.0,0.0,862,SAVINGS,2021-08-31,25.70
1,0,2021-09-01,726.0,0.0,863,CHECKING,2021-08-31,294.67
2,1,2021-07-01,626.0,0.0,7754,SAVINGS,2021-06-30,3211.18
3,1,2021-07-01,626.0,0.0,7755,CHECKING,2021-06-30,91.24
4,2,2021-05-01,680.0,0.0,4666,SAVINGS,2021-04-30,2561.43
...,...,...,...,...,...,...,...,...
19482,13998,2022-01-30,685.0,0.0,19885,CHECKING,2022-01-30,476.85
19483,13998,2022-01-30,685.0,0.0,19936,LOAN,2022-01-30,252.93
19484,13998,2022-01-30,685.0,0.0,19960,CREDIT CARD,2022-01-30,155.25
19485,13999,2022-01-26,653.0,0.0,24213,SAVINGS,2022-01-26,39.01
