In [1]:
import numpy as np
import pandas as pd

In [2]:
pd.set_option("display.max_columns", None)

In [3]:
categories = pd.read_csv("../../data_q2/q2-ucsd-cat-map.csv")
consumer = pd.read_parquet("../../data_q2/q2-ucsd-consDF.pqt")
acct = pd.read_parquet("../../data_q2/q2-ucsd-acctIDF.pqt")
transactions = pd.read_parquet("../../data_q2/q2-ucsd-trxnDF.pqt")

In [4]:
consumer.head()

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET
0,0,2021-09-01,726.0,0.0
1,1,2021-07-01,626.0,0.0
2,2,2021-05-01,680.0,0.0
3,3,2021-03-01,734.0,0.0
4,4,2021-10-01,676.0,0.0


In [5]:
transactions.head()

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16


In [6]:
transactions[transactions["prism_consumer_id"] == "3023"]

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16
...,...,...,...,...,...,...
200,3023,200,39,10.91,DEBIT,2021-09-17
201,3023,201,4,81.73,DEBIT,2021-09-18
202,3023,202,16,21.85,DEBIT,2021-09-20
203,3023,203,45,25.00,DEBIT,2021-09-20


In [7]:
acct.head()

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
0,3023,0,SAVINGS,2021-08-31,90.57
1,3023,1,CHECKING,2021-08-31,225.95
2,4416,2,SAVINGS,2022-03-31,15157.17
3,4416,3,CHECKING,2022-03-31,66.42
4,4227,4,CHECKING,2021-07-31,7042.9


In [8]:
len(acct["prism_consumer_id"].unique())

13009

In [9]:
# merge
acct["account_type"].unique()

# most important account_types: ['CHECKING', 'SAVINGS', 'CREDIT CARD', 'LOAN]
# most_important_accounts = ['CHECKING', 'SAVINGS', 'CREDIT CARD', 'LOAN']
acctDF = acct.copy()
total_balance = acctDF.groupby("prism_consumer_id")["balance"].sum()

In [10]:
consumer_balance = consumer.merge(
    pd.DataFrame(total_balance), on="prism_consumer_id", how="outer"
)
consumer_balance["std_credit"] = (
    consumer_balance["credit_score"] - consumer_balance["credit_score"].mean()
) / consumer_balance["credit_score"].std()
consumer_balance["std_balance"] = (
    consumer_balance["balance"] - consumer_balance["balance"].mean()
) / consumer_balance["balance"].std()
consumer_balance

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027
2,2,2021-05-01,680.0,0.0,2805.36,0.245748,-0.099394
3,3,2021-03-01,734.0,0.0,7667.01,0.951391,-0.007780
4,4,2021-10-01,676.0,0.0,394.55,0.193478,-0.144824
...,...,...,...,...,...,...,...
14995,14995,2022-03-08,655.0,,,-0.080938,
14996,14996,2022-01-15,625.0,,6821.92,-0.472962,-0.023705
14997,14997,2022-01-31,688.0,,,0.350288,
14998,14998,2022-03-08,722.0,,,0.794581,


In [11]:
# spending balance ratio

total_trans = transactions.groupby("prism_consumer_id")[["amount"]].sum()
total_acc = acct.groupby("prism_consumer_id")[["balance"]].sum()

spend_balance = total_trans.merge(total_acc, how="inner", on="prism_consumer_id")
spend_balance["spending_balance_ratio"] = spend_balance["amount"] / (
    spend_balance["balance"] + 1
)
spend_balance_dq = spend_balance.merge(consumer, how="inner", on="prism_consumer_id")
spend_balance_dq.head()

Unnamed: 0,prism_consumer_id,amount,balance,spending_balance_ratio,evaluation_date,credit_score,DQ_TARGET
0,0,29295.23,320.37,91.157326,2021-09-01,726.0,0.0
1,1,48002.17,3302.42,14.531053,2021-07-01,626.0,0.0
2,10,42343.16,824.24,51.310116,2022-02-01,654.0,0.0
3,100,74979.45,2655.47,28.22522,2021-12-01,750.0,0.0
4,1000,156268.06,95.25,1623.56426,2021-03-01,756.0,0.0


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from sklearn.svm import SVC
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import warnings


def run_classification(
    feature_column, target_column, dataset, test_size=0.2, random_state=42
):
    """
    Run Logistic Regression and Random Forest Classification on a dataset.

    Parameters:
        feature_column (list): List of columns that can be used as features.
        target_column (str): The name of the target column.
        dataset (pd.DataFrame): The dataset containing the features and target.
        test_size (float): Proportion of the dataset to include in the test split (default 0.3).
        random_state (int): Random seed for reproducibility (default 42).

    Returns:
        None: Prints the classification reports for both models.
    """
    warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)

    # Drop NaN values and shuffle the dataset
    dataset = dataset.dropna()

    # Define features and target
    X = dataset[feature_column]
    y = dataset[target_column]

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # balance the dataset
    resampler = SMOTETomek(random_state=random_state)
    X_train, y_train = resampler.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Logistic Regression
    log_model = LogisticRegression(class_weight="balanced", max_iter = 200)
    log_model.fit(X_train, y_train)
    log_y_pred = log_model.predict(X_test)
    log_y_proba = log_model.predict_proba(X_test)[:, 1]
    print(f"Logistic Regression for {feature_column}")
    print(classification_report(y_test, log_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, log_y_proba):.3f}")

    # Random Forest Classification
    rfc_model = RandomForestClassifier(random_state=random_state)
    rfc_model.fit(X_train, y_train)
    rfc_y_pred = rfc_model.predict(X_test)
    rfc_y_proba = rfc_model.predict_proba(X_test)[:, 1]
    print(f"\nRandom Forest Classification for {feature_column}")
    print(classification_report(y_test, rfc_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, rfc_y_proba):.3f}")

    # Light GBM
    lgb_model = lgb.LGBMClassifier()
    lgb_model.fit(X_train, y_train)
    lgb_y_pred = lgb_model.predict(X_test)
    lgb_y_proba = lgb_model.predict_proba(X_test)[:, 1]
    print(f"\nLGB Model Classification for {feature_column}")
    print(classification_report(y_test, lgb_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, lgb_y_proba):.3f}")


    # Balanced Random Forest
    brf_model = BalancedRandomForestClassifier(random_state=random_state)
    brf_model.fit(X_train, y_train)
    brf_y_pred = brf_model.predict(X_test)
    brf_y_proba = brf_model.predict_proba(X_test)[:, 1]
    print(f"\nBalanced Random Forest Classification for {feature_column}")
    print(classification_report(y_test, brf_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, brf_y_proba):.3f}")
    feature_importances = brf_model.feature_importances_
    # Assuming `feature_names` contains the column names


    feature_importance_df = pd.DataFrame(
        {"Feature": feature_column, "Importance": feature_importances}
    )

    # Sort by importance
    feature_importance_df = feature_importance_df.sort_values(
        by="Importance", ascending=False
    )

    # Display top 10 most important features
    print(feature_importance_df.head(10))


In [13]:
# Example usage:
run_classification(["std_balance"], "DQ_TARGET", consumer_balance)


Logistic Regression for ['std_balance']
              precision    recall  f1-score   support

         0.0       0.98      0.35      0.52      1896
         1.0       0.12      0.91      0.21       186

    accuracy                           0.40      2082
   macro avg       0.55      0.63      0.37      2082
weighted avg       0.90      0.40      0.49      2082

ROC-AUC Score: 0.720

Random Forest Classification for ['std_balance']
              precision    recall  f1-score   support

         0.0       0.93      0.68      0.79      1896
         1.0       0.12      0.44      0.19       186

    accuracy                           0.66      2082
   macro avg       0.52      0.56      0.49      2082
weighted avg       0.85      0.66      0.73      2082

ROC-AUC Score: 0.616
[LightGBM] [Info] Number of positive: 6004, number of negative: 6004
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000292 seconds.
You can set `force_row_wise=true` to remo

In [14]:
run_classification(["std_credit", "std_balance"], "DQ_TARGET", consumer_balance)


Logistic Regression for ['std_credit', 'std_balance']
              precision    recall  f1-score   support

         0.0       0.96      0.71      0.82      1896
         1.0       0.20      0.72      0.31       186

    accuracy                           0.72      2082
   macro avg       0.58      0.72      0.57      2082
weighted avg       0.89      0.72      0.77      2082

ROC-AUC Score: 0.796

Random Forest Classification for ['std_credit', 'std_balance']
              precision    recall  f1-score   support

         0.0       0.94      0.79      0.86      1896
         1.0       0.20      0.52      0.28       186

    accuracy                           0.77      2082
   macro avg       0.57      0.65      0.57      2082
weighted avg       0.88      0.77      0.81      2082

ROC-AUC Score: 0.748
[LightGBM] [Info] Number of positive: 6898, number of negative: 6898
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000092 seconds.
You can set `

In [15]:
run_classification(["spending_balance_ratio"], "DQ_TARGET", spend_balance_dq)

Logistic Regression for ['spending_balance_ratio']
              precision    recall  f1-score   support

         0.0       0.92      0.94      0.93      1855
         1.0       0.21      0.16      0.19       170

    accuracy                           0.88      2025
   macro avg       0.57      0.55      0.56      2025
weighted avg       0.87      0.88      0.87      2025

ROC-AUC Score: 0.646

Random Forest Classification for ['spending_balance_ratio']
              precision    recall  f1-score   support

         0.0       0.93      0.68      0.79      1855
         1.0       0.11      0.43      0.18       170

    accuracy                           0.66      2025
   macro avg       0.52      0.56      0.48      2025
weighted avg       0.86      0.66      0.74      2025

ROC-AUC Score: 0.587
[LightGBM] [Info] Number of positive: 5697, number of negative: 5697
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_

In [16]:
# feature creation

transaction_categories = transactions.merge(
    categories, how="left", left_on="category", right_on="category_id"
)
transaction_categories.head()

Unnamed: 0,prism_consumer_id,prism_transaction_id,category_x,amount,credit_or_debit,posted_date,category_id,category_y
0,3023,0,4,0.05,CREDIT,2021-04-16,4,MISCELLANEOUS
1,3023,1,12,481.56,CREDIT,2021-04-30,12,LOAN
2,3023,2,4,0.05,CREDIT,2021-05-16,4,MISCELLANEOUS
3,3023,3,4,0.07,CREDIT,2021-06-16,4,MISCELLANEOUS
4,3023,4,4,0.06,CREDIT,2021-07-16,4,MISCELLANEOUS


In [17]:
# create features based on the number of unique occurences in the transactions dataset
outflow_occurences = (
    transaction_categories[transaction_categories['credit_or_debit'] == "DEBIT"].groupby(["prism_consumer_id", "category_x"])
    .size()  # Count number of unique occurrences
    .unstack(fill_value=0)  # Create one column per category_x
    .reset_index()
)

# name columns for clarity
outflow_occurences = outflow_occurences.rename(
    columns=lambda col: f"outflow_occurrences_{col}" if col != "prism_consumer_id" else col
)
outflow_occurences.head()

category_x,prism_consumer_id,outflow_occurrences_0,outflow_occurrences_1,outflow_occurrences_4,outflow_occurrences_11,outflow_occurrences_12,outflow_occurrences_13,outflow_occurrences_14,outflow_occurrences_16,outflow_occurrences_17,outflow_occurrences_18,outflow_occurrences_19,outflow_occurrences_20,outflow_occurrences_21,outflow_occurrences_22,outflow_occurrences_23,outflow_occurrences_24,outflow_occurrences_25,outflow_occurrences_26,outflow_occurrences_27,outflow_occurrences_28,outflow_occurrences_29,outflow_occurrences_30,outflow_occurrences_31,outflow_occurrences_32,outflow_occurrences_33,outflow_occurrences_34,outflow_occurrences_35,outflow_occurrences_36,outflow_occurrences_37,outflow_occurrences_38,outflow_occurrences_39,outflow_occurrences_40,outflow_occurrences_41,outflow_occurrences_42,outflow_occurrences_43,outflow_occurrences_44,outflow_occurrences_45,outflow_occurrences_46,outflow_occurrences_47
0,0,15,9,30,0,0,0,214,26,21,25,3,6,2,0,0,0,0,0,2,5,0,0,0,0,0,0,0,0,5,0,6,1,0,0,0,0,0,0,0
1,1,39,1,14,0,0,0,18,54,7,24,35,22,0,0,0,0,0,0,3,10,0,0,0,0,0,0,14,0,0,0,0,2,0,0,0,0,0,0,0
2,10,8,67,10,0,0,0,73,33,23,10,18,2,0,1,0,0,0,1,2,0,0,1,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,4,0
3,100,23,17,8,0,0,7,0,0,0,1,1,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1000,68,19,9,0,8,0,0,0,6,0,0,0,0,0,0,0,0,34,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0


In [18]:
# create features based on the sum of occurences in the transactions dataset
outflow_sums = (
    transaction_categories[transaction_categories["credit_or_debit"] == "DEBIT"]
    .groupby(["prism_consumer_id", "category_x"])["amount"]
    .sum()  # Count number of unique occurrences
    .unstack(fill_value=0)  # Create one column per category_x
    .reset_index()
)

# name columns for clarity
outflow_sums = outflow_sums.rename(
    columns=lambda col: f"outflow_sums_{col}"
    if col != "prism_consumer_id"
    else col
)
outflow_sums.head()


category_x,prism_consumer_id,outflow_sums_0,outflow_sums_1,outflow_sums_4,outflow_sums_11,outflow_sums_12,outflow_sums_13,outflow_sums_14,outflow_sums_16,outflow_sums_17,outflow_sums_18,outflow_sums_19,outflow_sums_20,outflow_sums_21,outflow_sums_22,outflow_sums_23,outflow_sums_24,outflow_sums_25,outflow_sums_26,outflow_sums_27,outflow_sums_28,outflow_sums_29,outflow_sums_30,outflow_sums_31,outflow_sums_32,outflow_sums_33,outflow_sums_34,outflow_sums_35,outflow_sums_36,outflow_sums_37,outflow_sums_38,outflow_sums_39,outflow_sums_40,outflow_sums_41,outflow_sums_42,outflow_sums_43,outflow_sums_44,outflow_sums_45,outflow_sums_46,outflow_sums_47
0,0,471.37,1741.1,1665.51,0.0,0.0,0.0,4079.35,1974.0,527.15,997.63,540.0,151.71,108.75,0.0,0.0,0.0,0.0,0.0,193.38,325.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1980.0,0.0,150.6,2.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,9103.0,2.42,1773.26,0.0,0.0,0.0,337.0,3314.53,195.18,427.27,6999.13,275.25,0.0,0.0,0.0,0.0,0.0,0.0,57.76,310.34,0.0,0.0,0.0,0.0,0.0,0.0,251.43,0.0,0.0,0.0,0.0,51.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10,1900.55,7837.16,968.06,0.0,0.0,0.0,3348.14,1611.16,483.06,621.79,4112.0,107.64,0.0,99.0,0.0,0.0,0.0,180.0,144.0,0.0,0.0,50.0,0.0,0.0,0.0,103.0,0.0,0.0,0.0,0.0,0.0,35.04,0.0,0.0,0.0,0.0,0.0,166.0,0.0
3,100,11952.68,8264.0,791.5,0.0,0.0,1942.76,0.0,0.0,0.0,403.5,200.0,0.0,0.0,0.0,0.0,0.0,0.0,16188.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000,49384.91,2820.04,7261.82,0.0,2388.14,0.0,0.0,0.0,204.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14756.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1100.0,0.0,0.0


In [19]:
transaction_categories.category_id.unique()

array([ 4, 12,  1,  2,  0, 45,  3,  6, 20, 14, 18, 39, 16, 27, 46, 30, 26,
       17, 24, 19, 40, 49, 13, 11, 23, 34, 31, 22, 29, 21, 28, 32, 37, 38,
       36, 35, 42, 47, 41, 43,  8, 48,  9, 33, 44,  7, 25])

In [20]:
transaction_categories[transaction_categories['prism_consumer_id'] == "4"]

Unnamed: 0,prism_consumer_id,prism_transaction_id,category_x,amount,credit_or_debit,posted_date,category_id,category_y
525521,4,525148,4,0.57,CREDIT,2020-07-24,4,MISCELLANEOUS
525522,4,525149,0,60.00,CREDIT,2020-07-27,0,SELF_TRANSFER
525523,4,525150,2,15.00,CREDIT,2020-08-07,2,DEPOSIT
525524,4,525151,4,0.02,CREDIT,2020-12-25,4,MISCELLANEOUS
525525,4,525152,0,1000.00,CREDIT,2021-05-20,0,SELF_TRANSFER
...,...,...,...,...,...,...,...,...
525822,4,525449,14,50.00,DEBIT,2021-06-19,14,FOOD_AND_BEVERAGES
525823,4,525450,22,166.14,DEBIT,2021-06-19,22,ESSENTIAL_SERVICES
525824,4,525451,18,15.75,DEBIT,2021-06-20,18,GROCERIES
525825,4,525452,13,73.17,DEBIT,2021-06-21,13,INSURANCE


In [21]:
transaction_categories[transaction_categories['prism_consumer_id'] == "1100"]

Unnamed: 0,prism_consumer_id,prism_transaction_id,category_x,amount,credit_or_debit,posted_date,category_id,category_y
555588,1100,555215,0,1100.00,CREDIT,2020-12-17,0,SELF_TRANSFER
555589,1100,555216,3,1130.11,CREDIT,2020-12-17,3,PAYCHECK
555590,1100,555217,4,0.03,CREDIT,2020-12-18,4,MISCELLANEOUS
555591,1100,555218,0,1000.00,CREDIT,2020-12-24,0,SELF_TRANSFER
555592,1100,555219,0,700.00,CREDIT,2020-12-31,0,SELF_TRANSFER
...,...,...,...,...,...,...,...,...
555722,1100,555349,17,40.01,DEBIT,2021-04-29,17,AUTOMOTIVE
555723,1100,555350,17,49.95,DEBIT,2021-05-05,17,AUTOMOTIVE
555724,1100,555351,18,22.89,DEBIT,2021-05-06,18,GROCERIES
555725,1100,555352,0,760.00,DEBIT,2021-05-07,0,SELF_TRANSFER


In [22]:
transaction_categories.groupby(["prism_consumer_id", "category_x"])['amount'].sum()

prism_consumer_id  category_x
0                  0             2683.77
                   1             1969.85
                   2              500.00
                   3             8820.56
                   4             1667.14
                                  ...   
9999               35             275.36
                   39             176.97
                   40             132.73
                   42              30.00
                   45               6.00
Name: amount, Length: 280352, dtype: float64

In [23]:
transaction_categories.shape

(6407321, 8)

In [24]:
outflow_occurences.shape

(14196, 40)

In [25]:
# create features based on the number of unique occurences in the transactions dataset
inflow_occurences = (
    transaction_categories[transaction_categories["credit_or_debit"] == "CREDIT"]
    .groupby(["prism_consumer_id", "category_x"])
    .size()  # Count number of unique occurrences
    .unstack(fill_value=0)  # Create one column per category_x
    .reset_index()
)

# name columns for clarity
inflow_occurences = inflow_occurences.rename(
    columns=lambda col: f"inflow_occurrences_{col}"
    if col != "prism_consumer_id"
    else col
)
inflow_occurences.head()

category_x,prism_consumer_id,inflow_occurrences_0,inflow_occurrences_1,inflow_occurrences_2,inflow_occurrences_3,inflow_occurrences_4,inflow_occurrences_6,inflow_occurrences_7,inflow_occurrences_8,inflow_occurrences_9,inflow_occurrences_11,inflow_occurrences_12,inflow_occurrences_13,inflow_occurrences_45,inflow_occurrences_46,inflow_occurrences_47,inflow_occurrences_48,inflow_occurrences_49
0,0,8,4,1,9,12,1,0,0,0,3,0,0,0,0,0,0,0
1,1,39,0,2,14,13,1,0,0,0,2,0,0,0,0,0,0,0
2,10,8,41,2,15,14,5,0,0,0,0,0,0,0,0,0,0,0
3,100,13,2,0,23,4,8,0,0,0,0,0,0,0,0,0,0,0
4,1000,13,13,3,26,8,1,0,0,0,0,0,0,1,0,0,0,0


In [26]:
# create features based on the sum of occurences in the transactions dataset
inflow_sums = (
    transaction_categories[transaction_categories["credit_or_debit"] == "CREDIT"]
    .groupby(["prism_consumer_id", "category_x"])['amount']
    .sum()  # Count number of unique occurrences
    .unstack(fill_value=0)  # Create one column per category_x
    .reset_index()
)

# name columns for clarity
inflow_sums = inflow_sums.rename(
    columns=lambda col: f"inflow_sums_{col}"
    if col != "prism_consumer_id"
    else col
)
inflow_sums.head()


category_x,prism_consumer_id,inflow_sums_0,inflow_sums_1,inflow_sums_2,inflow_sums_3,inflow_sums_4,inflow_sums_6,inflow_sums_7,inflow_sums_8,inflow_sums_9,inflow_sums_11,inflow_sums_12,inflow_sums_13,inflow_sums_45,inflow_sums_46,inflow_sums_47,inflow_sums_48,inflow_sums_49
0,0,2212.4,228.75,500.0,8820.56,1.63,19.96,0.0,0.0,0.0,2603.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,9103.0,0.0,1492.95,11918.64,61.39,2.42,0.0,0.0,0.0,2325.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10,1900.55,3156.0,700.0,14720.74,6.94,92.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100,7750.0,3060.0,0.0,24411.78,3.31,11.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000,17757.64,1342.17,4720.0,43658.6,258.92,1.37,0.0,0.0,0.0,0.0,0.0,0.0,10614.37,0.0,0.0,0.0,0.0


In [27]:
features = outflow_occurences.merge(outflow_sums, how = "left", on = "prism_consumer_id").merge(inflow_occurences, how = "left", on = "prism_consumer_id").merge(inflow_sums, how = "left", on = "prism_consumer_id")
features.head()

category_x,prism_consumer_id,outflow_occurrences_0,outflow_occurrences_1,outflow_occurrences_4,outflow_occurrences_11,outflow_occurrences_12,outflow_occurrences_13,outflow_occurrences_14,outflow_occurrences_16,outflow_occurrences_17,outflow_occurrences_18,outflow_occurrences_19,outflow_occurrences_20,outflow_occurrences_21,outflow_occurrences_22,outflow_occurrences_23,outflow_occurrences_24,outflow_occurrences_25,outflow_occurrences_26,outflow_occurrences_27,outflow_occurrences_28,outflow_occurrences_29,outflow_occurrences_30,outflow_occurrences_31,outflow_occurrences_32,outflow_occurrences_33,outflow_occurrences_34,outflow_occurrences_35,outflow_occurrences_36,outflow_occurrences_37,outflow_occurrences_38,outflow_occurrences_39,outflow_occurrences_40,outflow_occurrences_41,outflow_occurrences_42,outflow_occurrences_43,outflow_occurrences_44,outflow_occurrences_45,outflow_occurrences_46,outflow_occurrences_47,outflow_sums_0,outflow_sums_1,outflow_sums_4,outflow_sums_11,outflow_sums_12,outflow_sums_13,outflow_sums_14,outflow_sums_16,outflow_sums_17,outflow_sums_18,outflow_sums_19,outflow_sums_20,outflow_sums_21,outflow_sums_22,outflow_sums_23,outflow_sums_24,outflow_sums_25,outflow_sums_26,outflow_sums_27,outflow_sums_28,outflow_sums_29,outflow_sums_30,outflow_sums_31,outflow_sums_32,outflow_sums_33,outflow_sums_34,outflow_sums_35,outflow_sums_36,outflow_sums_37,outflow_sums_38,outflow_sums_39,outflow_sums_40,outflow_sums_41,outflow_sums_42,outflow_sums_43,outflow_sums_44,outflow_sums_45,outflow_sums_46,outflow_sums_47,inflow_occurrences_0,inflow_occurrences_1,inflow_occurrences_2,inflow_occurrences_3,inflow_occurrences_4,inflow_occurrences_6,inflow_occurrences_7,inflow_occurrences_8,inflow_occurrences_9,inflow_occurrences_11,inflow_occurrences_12,inflow_occurrences_13,inflow_occurrences_45,inflow_occurrences_46,inflow_occurrences_47,inflow_occurrences_48,inflow_occurrences_49,inflow_sums_0,inflow_sums_1,inflow_sums_2,inflow_sums_3,inflow_sums_4,inflow_sums_6,inflow_sums_7,inflow_sums_8,inflow_sums_9,inflow_sums_11,inflow_sums_12,inflow_sums_13,inflow_sums_45,inflow_sums_46,inflow_sums_47,inflow_sums_48,inflow_sums_49
0,0,15,9,30,0,0,0,214,26,21,25,3,6,2,0,0,0,0,0,2,5,0,0,0,0,0,0,0,0,5,0,6,1,0,0,0,0,0,0,0,471.37,1741.1,1665.51,0.0,0.0,0.0,4079.35,1974.0,527.15,997.63,540.0,151.71,108.75,0.0,0.0,0.0,0.0,0.0,193.38,325.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1980.0,0.0,150.6,2.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,4.0,1.0,9.0,12.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2212.4,228.75,500.0,8820.56,1.63,19.96,0.0,0.0,0.0,2603.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,39,1,14,0,0,0,18,54,7,24,35,22,0,0,0,0,0,0,3,10,0,0,0,0,0,0,14,0,0,0,0,2,0,0,0,0,0,0,0,9103.0,2.42,1773.26,0.0,0.0,0.0,337.0,3314.53,195.18,427.27,6999.13,275.25,0.0,0.0,0.0,0.0,0.0,0.0,57.76,310.34,0.0,0.0,0.0,0.0,0.0,0.0,251.43,0.0,0.0,0.0,0.0,51.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39.0,0.0,2.0,14.0,13.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9103.0,0.0,1492.95,11918.64,61.39,2.42,0.0,0.0,0.0,2325.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10,8,67,10,0,0,0,73,33,23,10,18,2,0,1,0,0,0,1,2,0,0,1,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,4,0,1900.55,7837.16,968.06,0.0,0.0,0.0,3348.14,1611.16,483.06,621.79,4112.0,107.64,0.0,99.0,0.0,0.0,0.0,180.0,144.0,0.0,0.0,50.0,0.0,0.0,0.0,103.0,0.0,0.0,0.0,0.0,0.0,35.04,0.0,0.0,0.0,0.0,0.0,166.0,0.0,8.0,41.0,2.0,15.0,14.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1900.55,3156.0,700.0,14720.74,6.94,92.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100,23,17,8,0,0,7,0,0,0,1,1,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11952.68,8264.0,791.5,0.0,0.0,1942.76,0.0,0.0,0.0,403.5,200.0,0.0,0.0,0.0,0.0,0.0,0.0,16188.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,2.0,0.0,23.0,4.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7750.0,3060.0,0.0,24411.78,3.31,11.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000,68,19,9,0,8,0,0,0,6,0,0,0,0,0,0,0,0,34,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,49384.91,2820.04,7261.82,0.0,2388.14,0.0,0.0,0.0,204.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14756.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1100.0,0.0,0.0,13.0,13.0,3.0,26.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,17757.64,1342.17,4720.0,43658.6,258.92,1.37,0.0,0.0,0.0,0.0,0.0,0.0,10614.37,0.0,0.0,0.0,0.0


In [28]:
# merge on consumer_balance to create new features
consumer_features = consumer_balance.merge(features, how = "left", on = "prism_consumer_id")
consumer_features.head()

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance,outflow_occurrences_0,outflow_occurrences_1,outflow_occurrences_4,outflow_occurrences_11,outflow_occurrences_12,outflow_occurrences_13,outflow_occurrences_14,outflow_occurrences_16,outflow_occurrences_17,outflow_occurrences_18,outflow_occurrences_19,outflow_occurrences_20,outflow_occurrences_21,outflow_occurrences_22,outflow_occurrences_23,outflow_occurrences_24,outflow_occurrences_25,outflow_occurrences_26,outflow_occurrences_27,outflow_occurrences_28,outflow_occurrences_29,outflow_occurrences_30,outflow_occurrences_31,outflow_occurrences_32,outflow_occurrences_33,outflow_occurrences_34,outflow_occurrences_35,outflow_occurrences_36,outflow_occurrences_37,outflow_occurrences_38,outflow_occurrences_39,outflow_occurrences_40,outflow_occurrences_41,outflow_occurrences_42,outflow_occurrences_43,outflow_occurrences_44,outflow_occurrences_45,outflow_occurrences_46,outflow_occurrences_47,outflow_sums_0,outflow_sums_1,outflow_sums_4,outflow_sums_11,outflow_sums_12,outflow_sums_13,outflow_sums_14,outflow_sums_16,outflow_sums_17,outflow_sums_18,outflow_sums_19,outflow_sums_20,outflow_sums_21,outflow_sums_22,outflow_sums_23,outflow_sums_24,outflow_sums_25,outflow_sums_26,outflow_sums_27,outflow_sums_28,outflow_sums_29,outflow_sums_30,outflow_sums_31,outflow_sums_32,outflow_sums_33,outflow_sums_34,outflow_sums_35,outflow_sums_36,outflow_sums_37,outflow_sums_38,outflow_sums_39,outflow_sums_40,outflow_sums_41,outflow_sums_42,outflow_sums_43,outflow_sums_44,outflow_sums_45,outflow_sums_46,outflow_sums_47,inflow_occurrences_0,inflow_occurrences_1,inflow_occurrences_2,inflow_occurrences_3,inflow_occurrences_4,inflow_occurrences_6,inflow_occurrences_7,inflow_occurrences_8,inflow_occurrences_9,inflow_occurrences_11,inflow_occurrences_12,inflow_occurrences_13,inflow_occurrences_45,inflow_occurrences_46,inflow_occurrences_47,inflow_occurrences_48,inflow_occurrences_49,inflow_sums_0,inflow_sums_1,inflow_sums_2,inflow_sums_3,inflow_sums_4,inflow_sums_6,inflow_sums_7,inflow_sums_8,inflow_sums_9,inflow_sums_11,inflow_sums_12,inflow_sums_13,inflow_sums_45,inflow_sums_46,inflow_sums_47,inflow_sums_48,inflow_sums_49
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222,15.0,9.0,30.0,0.0,0.0,0.0,214.0,26.0,21.0,25.0,3.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,471.37,1741.1,1665.51,0.0,0.0,0.0,4079.35,1974.0,527.15,997.63,540.0,151.71,108.75,0.0,0.0,0.0,0.0,0.0,193.38,325.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1980.0,0.0,150.6,2.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,4.0,1.0,9.0,12.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2212.4,228.75,500.0,8820.56,1.63,19.96,0.0,0.0,0.0,2603.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027,39.0,1.0,14.0,0.0,0.0,0.0,18.0,54.0,7.0,24.0,35.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9103.0,2.42,1773.26,0.0,0.0,0.0,337.0,3314.53,195.18,427.27,6999.13,275.25,0.0,0.0,0.0,0.0,0.0,0.0,57.76,310.34,0.0,0.0,0.0,0.0,0.0,0.0,251.43,0.0,0.0,0.0,0.0,51.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39.0,0.0,2.0,14.0,13.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9103.0,0.0,1492.95,11918.64,61.39,2.42,0.0,0.0,0.0,2325.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2021-05-01,680.0,0.0,2805.36,0.245748,-0.099394,38.0,22.0,85.0,0.0,0.0,6.0,71.0,39.0,44.0,13.0,9.0,6.0,3.0,3.0,0.0,9.0,0.0,0.0,8.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,4426.75,785.55,3908.95,0.0,0.0,904.3,1317.26,1363.33,1005.74,211.36,6240.0,246.83,391.5,210.0,0.0,394.6,0.0,0.0,201.91,16.46,8.57,0.0,160.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.5,16.97,0.0,0.0,0.0,500.0,0.0,0.0,47.0,6.0,2.0,0.0,18.0,2.0,0.0,0.0,0.0,2.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,15439.16,64.0,1100.0,0.0,5.2,56.48,0.0,0.0,0.0,6094.48,0.0,4.87,0.52,0.0,0.0,0.0,0.0
3,3,2021-03-01,734.0,0.0,7667.01,0.951391,-0.00778,23.0,24.0,11.0,1.0,0.0,0.0,17.0,60.0,1.0,4.0,2.0,12.0,0.0,6.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,9782.19,4076.35,504.61,44.04,0.0,0.0,500.49,2487.61,53.85,106.84,113.5,214.12,0.0,335.85,0.0,24.13,0.0,0.0,105.0,0.0,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1286.71,0.0,0.0,0.0,0.0,150.72,0.0,0.0,23.0,2.0,2.0,3.0,12.0,2.0,0.0,0.0,3.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,9782.19,37.75,1030.0,4047.81,4.9,37.88,0.0,0.0,5700.0,2000.0,0.0,0.0,0.72,0.0,0.0,0.0,0.0
4,4,2021-10-01,676.0,0.0,394.55,0.193478,-0.144824,9.0,5.0,68.0,0.0,0.0,5.0,15.0,8.0,29.0,54.0,10.0,14.0,5.0,5.0,0.0,14.0,0.0,2.0,4.0,0.0,2.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,2802.27,612.67,2720.78,0.0,0.0,343.17,269.03,486.88,966.43,2274.83,3540.0,604.01,673.23,820.56,0.0,336.82,0.0,225.48,75.56,0.0,23.97,0.0,88.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,0.0,0.0,0.0,0.0,0.0,495.98,0.0,7.0,0.0,3.0,0.0,8.0,5.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2760.0,0.0,54.44,0.0,15.17,116.5,0.0,0.0,12020.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
consumer_features.columns[4:]

Index(['balance', 'std_credit', 'std_balance', 'outflow_occurrences_0',
       'outflow_occurrences_1', 'outflow_occurrences_4',
       'outflow_occurrences_11', 'outflow_occurrences_12',
       'outflow_occurrences_13', 'outflow_occurrences_14',
       ...
       'inflow_sums_8', 'inflow_sums_9', 'inflow_sums_11', 'inflow_sums_12',
       'inflow_sums_13', 'inflow_sums_45', 'inflow_sums_46', 'inflow_sums_47',
       'inflow_sums_48', 'inflow_sums_49'],
      dtype='object', length=115)

In [30]:
consumer_balance

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027
2,2,2021-05-01,680.0,0.0,2805.36,0.245748,-0.099394
3,3,2021-03-01,734.0,0.0,7667.01,0.951391,-0.007780
4,4,2021-10-01,676.0,0.0,394.55,0.193478,-0.144824
...,...,...,...,...,...,...,...
14995,14995,2022-03-08,655.0,,,-0.080938,
14996,14996,2022-01-15,625.0,,6821.92,-0.472962,-0.023705
14997,14997,2022-01-31,688.0,,,0.350288,
14998,14998,2022-03-08,722.0,,,0.794581,


In [31]:
spend_balance_dq

Unnamed: 0,prism_consumer_id,amount,balance,spending_balance_ratio,evaluation_date,credit_score,DQ_TARGET
0,0,29295.23,320.37,91.157326,2021-09-01,726.0,0.0
1,1,48002.17,3302.42,14.531053,2021-07-01,626.0,0.0
2,10,42343.16,824.24,51.310116,2022-02-01,654.0,0.0
3,100,74979.45,2655.47,28.225220,2021-12-01,750.0,0.0
4,1000,156268.06,95.25,1623.564260,2021-03-01,756.0,0.0
...,...,...,...,...,...,...,...
12642,9995,33973.23,0.00,33973.230000,2023-03-02,578.0,
12643,9996,2390.92,252.67,9.425316,2023-10-11,610.0,
12644,9997,33909.03,611.28,55.381574,2023-05-25,675.0,
12645,9998,29965.84,-862.99,-34.763559,2023-11-02,534.0,


In [34]:
classification_features = consumer_features.columns[4:].drop('std_credit')
run_classification(classification_features, "DQ_TARGET", consumer_features)

Logistic Regression for Index(['balance', 'std_balance', 'outflow_occurrences_0',
       'outflow_occurrences_1', 'outflow_occurrences_4',
       'outflow_occurrences_11', 'outflow_occurrences_12',
       'outflow_occurrences_13', 'outflow_occurrences_14',
       'outflow_occurrences_16',
       ...
       'inflow_sums_8', 'inflow_sums_9', 'inflow_sums_11', 'inflow_sums_12',
       'inflow_sums_13', 'inflow_sums_45', 'inflow_sums_46', 'inflow_sums_47',
       'inflow_sums_48', 'inflow_sums_49'],
      dtype='object', length=114)
              precision    recall  f1-score   support

         0.0       0.95      0.61      0.74      1802
         1.0       0.15      0.69      0.25       180

    accuracy                           0.62      1982
   macro avg       0.55      0.65      0.49      1982
weighted avg       0.88      0.62      0.70      1982

ROC-AUC Score: 0.700

Random Forest Classification for Index(['balance', 'std_balance', 'outflow_occurrences_0',
       'outflow_occurrenc

In [37]:
categories

Unnamed: 0,category_id,category
0,0,SELF_TRANSFER
1,1,EXTERNAL_TRANSFER
2,2,DEPOSIT
3,3,PAYCHECK
4,4,MISCELLANEOUS
5,5,PAYCHECK_PLACEHOLDER
6,6,REFUND
7,7,INVESTMENT_INCOME
8,8,OTHER_BENEFITS
9,9,UNEMPLOYMENT_BENEFITS
