In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the four raw inputs: the category lookup (CSV) and the consumer,
# account and transaction tables (Parquet).
categories = pd.read_csv("../../data/q2-ucsd-cat-map.csv")
consumer = pd.read_parquet("../../data/q2-ucsd-consDF.pqt")
acct = pd.read_parquet("../../data/q2-ucsd-acctDF.pqt")
transactions = pd.read_parquet("../../data/q2-ucsd-trxnDF.pqt")

In [3]:
# Preview the first five consumer rows.
consumer.head(n=5)

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET
0,0,2021-09-01,726.0,0.0
1,1,2021-07-01,626.0,0.0
2,2,2021-05-01,680.0,0.0
3,3,2021-03-01,734.0,0.0
4,4,2021-10-01,676.0,0.0


In [4]:
# Preview the first five transaction rows.
transactions.head(n=5)

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16


In [5]:
# All transactions for one example consumer; the id is matched against the string "3023".
transactions.loc[transactions['prism_consumer_id'].eq("3023")]

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16
...,...,...,...,...,...,...
200,3023,200,39,10.91,DEBIT,2021-09-17
201,3023,201,4,81.73,DEBIT,2021-09-18
202,3023,202,16,21.85,DEBIT,2021-09-20
203,3023,203,45,25.00,DEBIT,2021-09-20


In [6]:
# Preview the first five account rows.
acct.head(n=5)

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
0,3023,0,SAVINGS,2021-08-31,90.57
1,3023,1,CHECKING,2021-08-31,225.95
2,4416,2,SAVINGS,2022-03-31,15157.17
3,4416,3,CHECKING,2022-03-31,66.42
4,4227,4,CHECKING,2021-07-31,7042.9


In [7]:
# Number of distinct consumer ids appearing in the account table
# (dropna=False so a missing id, if any, is counted exactly as len(unique()) would).
acct['prism_consumer_id'].nunique(dropna=False)

13009

In [8]:
# Aggregate account balances into one total per consumer.
#
# Account types flagged as most important: ['CHECKING', 'SAVINGS', 'CREDIT CARD', 'LOAN'].
# The types actually present can be listed with acct['account_type'].unique();
# that call is not made here because its result was discarded (it was not the
# cell's last expression, so nothing was displayed or stored).
acctDF = acct.copy()  # separate name kept for compatibility; groupby/sum does not modify it
total_balance = acctDF.groupby('prism_consumer_id')['balance'].sum()

In [9]:
# Attach each consumer's total balance (outer join, so ids present on only one
# side are kept with NaN for the missing columns), then add z-scored copies of
# the credit score and the balance.
consumer_balance = consumer.merge(
    total_balance.to_frame(), how='outer', on='prism_consumer_id'
).assign(
    std_credit=lambda frame: (
        frame['credit_score'] - frame['credit_score'].mean()
    ) / frame['credit_score'].std(),
    std_balance=lambda frame: (
        frame['balance'] - frame['balance'].mean()
    ) / frame['balance'].std(),
)
consumer_balance

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027
2,10,2022-02-01,654.0,0.0,824.24,-0.094005,-0.136727
3,100,2021-12-01,750.0,0.0,2655.47,1.160470,-0.102219
4,1000,2021-03-01,756.0,0.0,95.25,1.238874,-0.150464
...,...,...,...,...,...,...,...
14995,9995,2023-03-02,578.0,,0.00,-1.087132,-0.152259
14996,9996,2023-10-11,610.0,,252.67,-0.668973,-0.147498
14997,9997,2023-05-25,675.0,,611.28,0.180411,-0.140740
14998,9998,2023-11-02,534.0,,-862.99,-1.662099,-0.168522


In [10]:
# Spending/balance ratio: per-consumer sum of transaction amounts divided by
# (per-consumer sum of account balances + 1).

total_trans = transactions.groupby("prism_consumer_id")[["amount"]].sum()
total_acc = acct.groupby("prism_consumer_id")[["balance"]].sum()

# Inner join: only consumers that have both transactions and accounts.
spend_balance = total_trans.merge(total_acc, how="inner", on="prism_consumer_id")

# The +1 presumably guards against a zero balance, but total balances can be
# negative, so balance + 1 can itself be 0 and the division then yields +/-inf.
# dropna() does not remove inf and scikit-learn estimators refuse it, so map
# inf to NaN here; run_classification's dropna then discards those rows.
spend_balance["spending_balance_ratio"] = (
    spend_balance["amount"] / (spend_balance["balance"] + 1)
).replace([np.inf, -np.inf], np.nan)

# NOTE(review): "amount" is summed over CREDIT and DEBIT rows alike - confirm
# that total transaction volume (not debits only) is the intended numerator.
spend_balance_dq = spend_balance.merge(consumer, how="inner", on="prism_consumer_id")
spend_balance_dq.head()

Unnamed: 0,prism_consumer_id,amount,balance,spending_balance_ratio,evaluation_date,credit_score,DQ_TARGET
0,0,29295.23,320.37,91.157326,2021-09-01,726.0,0.0
1,1,48002.17,3302.42,14.531053,2021-07-01,626.0,0.0
2,10,42343.16,824.24,51.310116,2022-02-01,654.0,0.0
3,100,74979.45,2655.47,28.22522,2021-12-01,750.0,0.0
4,1000,156268.06,95.25,1623.56426,2021-03-01,756.0,0.0


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from sklearn.svm import SVC
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import warnings

def run_classification(
    feature_column, target_column, dataset, test_size=0.2, random_state=42
):
    """
    Fit four classifiers on the given features and print a report for each.

    The models, in order, are Logistic Regression, Random Forest, LightGBM and
    Balanced Random Forest. Rows missing a feature or the target are dropped,
    the data is split into train/test, the training split is rebalanced with
    SMOTETomek, and each model is scored on the untouched test split.

    Parameters:
        feature_column (list | str): Column name(s) to use as features. A single
            name is accepted and treated as a one-element list.
        target_column (str): The name of the target column.
        dataset (pd.DataFrame): The dataset containing the features and target.
            It is not modified.
        test_size (float): Proportion of the rows held out for testing (default 0.2).
        random_state (int): Seed for the split, the resampler and every model
            that accepts one (default 42).

    Returns:
        None: Prints one classification report per model.
    """
    # Normalise to a list so X is always a 2-D frame, even for a single name.
    if isinstance(feature_column, str):
        features = [feature_column]
    else:
        features = list(feature_column)

    # Drop only rows missing a value the models actually use. A bare dropna()
    # would also discard rows whose NaN sits in an unrelated column.
    usable_rows = dataset.dropna(subset=features + [target_column])

    X = usable_rows[features]
    y = usable_rows[target_column]

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Rebalance the training split only; the test split keeps its original
    # class distribution so the reports reflect real-world imbalance.
    resampler = SMOTETomek(random_state=random_state)
    X_train, y_train = resampler.fit_resample(X_train, y_train)

    # (report heading, estimator) pairs, evaluated in this order. Headings are
    # kept exactly as before, including the leading blank line on all but the first.
    models = [
        (
            f"Logistic Regression for {feature_column}",
            LogisticRegression(class_weight='balanced'),
        ),
        (
            f"\nRandom Forest Classification for {feature_column}",
            RandomForestClassifier(random_state=random_state),
        ),
        (
            f"\nLGB Model Classification for {feature_column}",
            # Seeded like the other models so reruns are reproducible.
            lgb.LGBMClassifier(random_state=random_state),
        ),
        (
            f"\nBalanced Random Forest Classification for {feature_column}",
            BalancedRandomForestClassifier(random_state=random_state),
        ),
    ]

    # Silence UndefinedMetricWarning only while this function runs, instead of
    # permanently altering the process-wide warning filters.
    with warnings.catch_warnings():
        warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
        for heading, model in models:
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            print(heading)
            print(classification_report(y_test, predictions))

In [12]:
# Example usage: standardized balance as the only feature.
run_classification(
    feature_column=["std_balance"],
    target_column="DQ_TARGET",
    dataset=consumer_balance,
)


Logistic Regression for ['std_balance']
              precision    recall  f1-score   support

         0.0       0.97      0.37      0.54      1898
         1.0       0.12      0.89      0.21       184

    accuracy                           0.42      2082
   macro avg       0.55      0.63      0.37      2082
weighted avg       0.90      0.42      0.51      2082


Random Forest Classification for ['std_balance']
              precision    recall  f1-score   support

         0.0       0.93      0.68      0.79      1898
         1.0       0.13      0.50      0.21       184

    accuracy                           0.66      2082
   macro avg       0.53      0.59      0.50      2082
weighted avg       0.86      0.66      0.74      2082

[LightGBM] [Info] Number of positive: 5975, number of negative: 5975
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000336 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total B

In [13]:
# Standardized credit score and balance together.
run_classification(
    feature_column=["std_credit", "std_balance"],
    target_column="DQ_TARGET",
    dataset=consumer_balance,
)


Logistic Regression for ['std_credit', 'std_balance']
              precision    recall  f1-score   support

         0.0       0.96      0.72      0.82      1898
         1.0       0.19      0.70      0.30       184

    accuracy                           0.72      2082
   macro avg       0.58      0.71      0.56      2082
weighted avg       0.89      0.72      0.78      2082


Random Forest Classification for ['std_credit', 'std_balance']
              precision    recall  f1-score   support

         0.0       0.95      0.80      0.87      1898
         1.0       0.21      0.55      0.31       184

    accuracy                           0.78      2082
   macro avg       0.58      0.68      0.59      2082
weighted avg       0.88      0.78      0.82      2082

[LightGBM] [Info] Number of positive: 6906, number of negative: 6906
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_col_wise=true` to remove the overhea

In [14]:
# Spending/balance ratio as the only feature.
run_classification(
    feature_column=["spending_balance_ratio"],
    target_column="DQ_TARGET",
    dataset=spend_balance_dq,
)

Logistic Regression for ['spending_balance_ratio']
              precision    recall  f1-score   support

         0.0       0.93      0.94      0.93      1855
         1.0       0.21      0.17      0.19       170

    accuracy                           0.88      2025
   macro avg       0.57      0.56      0.56      2025
weighted avg       0.87      0.88      0.87      2025


Random Forest Classification for ['spending_balance_ratio']
              precision    recall  f1-score   support

         0.0       0.93      0.68      0.79      1855
         1.0       0.11      0.43      0.18       170

    accuracy                           0.66      2025
   macro avg       0.52      0.56      0.48      2025
weighted avg       0.86      0.66      0.74      2025

[LightGBM] [Info] Number of positive: 5697, number of negative: 5697
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[Li