In [116]:
import numpy as np
import pandas as pd

In [117]:
pd.set_option("display.max_columns", None)

In [118]:
categories = pd.read_csv("../../data_q2/q2-ucsd-cat-map.csv")
consumer = pd.read_parquet("../../data_q2/q2-ucsd-consDF.pqt")
acct = pd.read_parquet("../../data_q2/q2-ucsd-acctIDF.pqt")
transactions = pd.read_parquet("../../data_q2/q2-ucsd-trxnDF.pqt")

In [4]:
consumer.head()

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET
0,0,2021-09-01,726.0,0.0
1,1,2021-07-01,626.0,0.0
2,2,2021-05-01,680.0,0.0
3,3,2021-03-01,734.0,0.0
4,4,2021-10-01,676.0,0.0


In [5]:
transactions.head()

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16


In [6]:
transactions[transactions["prism_consumer_id"] == "3023"]

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16
...,...,...,...,...,...,...
200,3023,200,39,10.91,DEBIT,2021-09-17
201,3023,201,4,81.73,DEBIT,2021-09-18
202,3023,202,16,21.85,DEBIT,2021-09-20
203,3023,203,45,25.00,DEBIT,2021-09-20


In [7]:
acct.head()

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
0,3023,0,SAVINGS,2021-08-31,90.57
1,3023,1,CHECKING,2021-08-31,225.95
2,4416,2,SAVINGS,2022-03-31,15157.17
3,4416,3,CHECKING,2022-03-31,66.42
4,4227,4,CHECKING,2021-07-31,7042.9


In [8]:
len(acct["prism_consumer_id"].unique())

13009

In [119]:
# merge
acct["account_type"].unique()

# most important account_types: ['CHECKING', 'SAVINGS', 'CREDIT CARD', 'LOAN]
# most_important_accounts = ['CHECKING', 'SAVINGS', 'CREDIT CARD', 'LOAN']
acctDF = acct.copy()
total_balance = acctDF.groupby("prism_consumer_id")["balance"].sum()

In [120]:
consumer_balance = consumer.merge(
    pd.DataFrame(total_balance), on="prism_consumer_id", how="outer"
)
consumer_balance["std_credit"] = (
    consumer_balance["credit_score"] - consumer_balance["credit_score"].mean()
) / consumer_balance["credit_score"].std()
consumer_balance["std_balance"] = (
    consumer_balance["balance"] - consumer_balance["balance"].mean()
) / consumer_balance["balance"].std()
consumer_balance

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027
2,2,2021-05-01,680.0,0.0,2805.36,0.245748,-0.099394
3,3,2021-03-01,734.0,0.0,7667.01,0.951391,-0.007780
4,4,2021-10-01,676.0,0.0,394.55,0.193478,-0.144824
...,...,...,...,...,...,...,...
14995,14995,2022-03-08,655.0,,,-0.080938,
14996,14996,2022-01-15,625.0,,6821.92,-0.472962,-0.023705
14997,14997,2022-01-31,688.0,,,0.350288,
14998,14998,2022-03-08,722.0,,,0.794581,


In [121]:
# spending balance ratio

total_trans = transactions.groupby("prism_consumer_id")[["amount"]].sum()
total_acc = acct.groupby("prism_consumer_id")[["balance"]].sum()

spend_balance = total_trans.merge(total_acc, how="inner", on="prism_consumer_id")
spend_balance["spending_balance_ratio"] = spend_balance["amount"] / (
    spend_balance["balance"] + 1
)
spend_balance_dq = spend_balance.merge(consumer, how="inner", on="prism_consumer_id")
spend_balance_dq.head()

Unnamed: 0,prism_consumer_id,amount,balance,spending_balance_ratio,evaluation_date,credit_score,DQ_TARGET
0,0,29295.23,320.37,91.157326,2021-09-01,726.0,0.0
1,1,48002.17,3302.42,14.531053,2021-07-01,626.0,0.0
2,10,42343.16,824.24,51.310116,2022-02-01,654.0,0.0
3,100,74979.45,2655.47,28.22522,2021-12-01,750.0,0.0
4,1000,156268.06,95.25,1623.56426,2021-03-01,756.0,0.0


In [107]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from sklearn.svm import SVC
from imblearn.ensemble import BalancedRandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import warnings


def run_classification(
    feature_column, target_column, dataset, test_size=0.2, random_state=42
):
    """
    Run Logistic Regression and Random Forest Classification on a dataset.

    Parameters:
        feature_column (list): List of columns that can be used as features.
        target_column (str): The name of the target column.
        dataset (pd.DataFrame): The dataset containing the features and target.
        test_size (float): Proportion of the dataset to include in the test split (default 0.3).
        random_state (int): Random seed for reproducibility (default 42).

    Returns:
        None: Prints the classification reports for both models.
    """
    warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)

    # Drop NaN values and shuffle the dataset
    dataset = dataset.dropna()

    # Define features and target
    X = dataset[feature_column]
    y = dataset[target_column]

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # balance the dataset
    resampler = SMOTETomek(random_state=random_state)
    X_train, y_train = resampler.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Logistic Regression
    log_model = LogisticRegression(class_weight="balanced", max_iter = 200)
    log_model.fit(X_train, y_train)
    log_y_pred = log_model.predict(X_test)
    log_y_proba = log_model.predict_proba(X_test)[:, 1]
    print(f"Logistic Regression for {feature_column}")
    print(classification_report(y_test, log_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, log_y_proba):.3f}")

    # Random Forest Classification
    rfc_model = RandomForestClassifier(random_state=random_state)
    rfc_model.fit(X_train, y_train)
    rfc_y_pred = rfc_model.predict(X_test)
    rfc_y_proba = rfc_model.predict_proba(X_test)[:, 1]
    print(f"\nRandom Forest Classification for {feature_column}")
    print(classification_report(y_test, rfc_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, rfc_y_proba):.3f}")

    # Light GBM
    lgb_model = lgb.LGBMClassifier()
    lgb_model.fit(X_train, y_train)
    lgb_y_pred = lgb_model.predict(X_test)
    lgb_y_proba = lgb_model.predict_proba(X_test)[:, 1]
    print(f"\nLGB Model Classification for {feature_column}")
    print(classification_report(y_test, lgb_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, lgb_y_proba):.3f}")


    # Balanced Random Forest
    brf_model = BalancedRandomForestClassifier(random_state=random_state)
    brf_model.fit(X_train, y_train)
    brf_y_pred = brf_model.predict(X_test)
    brf_y_proba = brf_model.predict_proba(X_test)[:, 1]
    print(f"\nBalanced Random Forest Classification for {feature_column}")
    print(classification_report(y_test, brf_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, brf_y_proba):.3f}")
    feature_importances = brf_model.feature_importances_
    # Assuming `feature_names` contains the column names


    feature_importance_df = pd.DataFrame(
        {"Feature": feature_column, "Importance": feature_importances}
    )

    # Sort by importance
    feature_importance_df = feature_importance_df.sort_values(
        by="Importance", ascending=False
    )

    # Display top 10 most important features
    print(feature_importance_df.head(20))


In [63]:
def run_classification2(
    feature_column, target_column, dataset, test_size=0.2, random_state=42
):
    """
    Run classification models and print comprehensive performance metrics,
    feature importance, and correlations with target.
    """
    warnings.filterwarnings(action="ignore", category=UndefinedMetricWarning)
    dataset = dataset.dropna()

    # Define features and target
    X = dataset[feature_column]
    y = dataset[target_column]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Balance and scale data
    resampler = SMOTETomek(random_state=random_state)
    X_train, y_train = resampler.fit_resample(X_train, y_train)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # List to track model performance
    model_results = []

    def evaluate_model(model, name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        roc_auc = roc_auc_score(y_test, y_proba)
        acc = accuracy_score(y_test, y_pred)
        clf_report = classification_report(y_test, y_pred, output_dict=True)

        # Store results
        metrics = {
            "model": name,
            "roc_auc": roc_auc,
            "accuracy": acc,
            "precision": clf_report["weighted avg"]["precision"],
            "recall": clf_report["weighted avg"]["recall"],
            "f1": clf_report["weighted avg"]["f1-score"],
        }
        model_results.append(metrics)

        # Print detailed report
        print(f"\n{name} Classification Report:")
        print(classification_report(y_test, y_pred))
        print(f"ROC-AUC Score: {roc_auc:.3f}")

        return model

    # Evaluate models
    models = [
        (
            LogisticRegression(class_weight="balanced", max_iter=200),
            "Logistic Regression",
        ),
        (RandomForestClassifier(random_state=random_state), "Random Forest"),
        (lgb.LGBMClassifier(), "LightGBM"),
        (
            BalancedRandomForestClassifier(random_state=random_state),
            "Balanced Random Forest",
        ),
    ]

    for model, name in models:
        evaluate_model(model, name)

    # Determine best model
    best_model = max(model_results, key=lambda x: x["roc_auc"])
    print("\n=== BEST MODEL ===")
    print(f"Model: {best_model['model']}")
    print(f"ROC-AUC: {best_model['roc_auc']:.3f}")
    print(f"Accuracy: {best_model['accuracy']:.3f}")
    print(f"Precision: {best_model['precision']:.3f}")
    print(f"Recall: {best_model['recall']:.3f}")
    print(f"F1-Score: {best_model['f1']:.3f}")

    # Feature analysis
    feature_correlations = X.corrwith(y)
    brf_model = BalancedRandomForestClassifier(random_state=random_state).fit(
        X_train, y_train
    )

    feature_importance_df = (
        pd.DataFrame(
            {
                "Feature": feature_column,
                "Importance": brf_model.feature_importances_,
                "Correlation": feature_correlations,
            }
        )
        .sort_values("Importance", ascending=False)
        .head(20)
    )

    print("\nTop 20 Features with Correlation Direction:")
    print(feature_importance_df.to_string(index=False))


In [59]:
# Example usage:
run_classification2(["std_balance"], "DQ_TARGET", consumer_balance)


[LightGBM] [Info] Number of positive: 6004, number of negative: 6004
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 12008, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

=== BEST MODEL ===
Model: Logistic Regression
ROC-AUC: 0.720
Accuracy: 0.403

Top 20 Features with Correlation Direction:
    Feature  Importance  Correlation
std_balance         1.0    -0.039617


In [61]:
run_classification2(["std_credit", "std_balance"], "DQ_TARGET", consumer_balance)


[LightGBM] [Info] Number of positive: 6898, number of negative: 6898
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 13796, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

=== BEST MODEL ===
Model: Logistic Regression
ROC-AUC: 0.796
Accuracy: 0.715

Top 20 Features with Correlation Direction:
    Feature  Importance  Correlation
 std_credit    0.539985    -0.281688
std_balance    0.460015    -0.039617


In [15]:
run_classification(["spending_balance_ratio"], "DQ_TARGET", spend_balance_dq)

Logistic Regression for ['spending_balance_ratio']
              precision    recall  f1-score   support

         0.0       0.92      0.94      0.93      1855
         1.0       0.21      0.16      0.19       170

    accuracy                           0.88      2025
   macro avg       0.57      0.55      0.56      2025
weighted avg       0.87      0.88      0.87      2025

ROC-AUC Score: 0.646

Random Forest Classification for ['spending_balance_ratio']
              precision    recall  f1-score   support

         0.0       0.93      0.68      0.79      1855
         1.0       0.11      0.43      0.18       170

    accuracy                           0.66      2025
   macro avg       0.52      0.56      0.48      2025
weighted avg       0.86      0.66      0.74      2025

ROC-AUC Score: 0.587
[LightGBM] [Info] Number of positive: 5697, number of negative: 5697
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_

In [122]:
# feature creation

transaction_categories = transactions.merge(
    categories, how="left", left_on="category", right_on="category_id"
)
transaction_categories.head()

Unnamed: 0,prism_consumer_id,prism_transaction_id,category_x,amount,credit_or_debit,posted_date,category_id,category_y
0,3023,0,4,0.05,CREDIT,2021-04-16,4,MISCELLANEOUS
1,3023,1,12,481.56,CREDIT,2021-04-30,12,LOAN
2,3023,2,4,0.05,CREDIT,2021-05-16,4,MISCELLANEOUS
3,3023,3,4,0.07,CREDIT,2021-06-16,4,MISCELLANEOUS
4,3023,4,4,0.06,CREDIT,2021-07-16,4,MISCELLANEOUS


In [123]:
# create features based on the number of unique occurences in the transactions dataset
outflow_occurences = (
    transaction_categories[transaction_categories["credit_or_debit"] == "DEBIT"]
    .groupby(["prism_consumer_id", "category_y"])
    .size()  # Count number of unique occurrences
    .unstack(fill_value=0)  # Create one column per category_x
    .reset_index()
)
outflow_occurences

category_y,prism_consumer_id,ACCOUNT_FEES,ATM_CASH,AUTOMOTIVE,AUTO_LOAN,BANKING_CATCH_ALL,BILLS_UTILITIES,BNPL,CHILD_DEPENDENTS,CORPORATE_PAYMENTS,CREDIT_CARD_PAYMENT,DEBT,EDUCATION,ENTERTAINMENT,ESSENTIAL_SERVICES,EXTERNAL_TRANSFER,FITNESS,FOOD_AND_BEVERAGES,GAMBLING,GENERAL_MERCHANDISE,GIFTS_DONATIONS,GOVERNMENT_SERVICES,GROCERIES,HEALTHCARE_MEDICAL,HOME_IMPROVEMENT,INSURANCE,INVESTMENT,LEGAL,LOAN,MISCELLANEOUS,MORTGAGE,OVERDRAFT,PETS,RENT,RISK_CATCH_ALL,RTO_LTO,SELF_TRANSFER,TAX,TRANSPORATION,TRAVEL
0,0,0,3,21,0,5,0,0,0,0,0,0,0,6,0,9,6,214,0,26,0,0,25,2,0,0,0,0,0,30,0,0,5,0,0,0,15,0,1,2
1,1,0,35,7,0,0,0,14,0,0,0,0,0,22,0,1,0,18,0,54,0,0,24,3,0,0,0,0,0,14,0,0,10,0,0,0,39,0,2,0
2,10,0,18,23,0,0,0,0,0,0,1,0,0,2,1,67,0,73,4,33,1,0,10,2,0,0,0,0,0,10,0,0,0,1,0,0,8,0,2,0
3,100,0,1,0,0,0,0,0,0,0,15,0,0,0,0,17,0,0,0,0,0,0,1,0,0,7,0,0,0,8,0,0,0,0,0,0,23,0,0,0
4,1000,0,0,6,0,0,0,0,0,0,34,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,2,0,8,9,0,0,0,0,0,0,68,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14191,9995,0,3,5,2,0,1,11,0,0,7,0,0,1,1,106,2,23,0,15,0,0,7,3,0,0,0,0,43,3,0,0,0,0,0,0,0,0,0,0
14192,9996,0,0,4,0,4,0,0,0,0,0,0,0,1,0,0,0,4,0,8,0,0,3,0,0,0,0,0,2,6,0,0,0,0,0,0,0,0,0,1
14193,9997,2,4,8,6,0,1,20,0,0,13,0,3,3,3,58,6,5,0,10,0,0,0,5,0,6,0,0,13,27,0,28,0,0,0,0,0,0,0,3
14194,9998,4,3,6,1,0,0,0,0,0,0,0,0,3,4,4,0,7,0,53,0,0,3,0,1,0,0,0,34,13,0,27,0,0,0,0,2,0,1,0


In [124]:
# create features based on the number of unique occurences in the transactions dataset
outflow_occurences = (
    transaction_categories[transaction_categories['credit_or_debit'] == "DEBIT"].groupby(["prism_consumer_id", "category_y"])
    .size()  # Count number of unique occurrences
    .unstack(fill_value=0)  # Create one column per category_x
    .reset_index()
)

# name columns for clarity
outflow_occurences = outflow_occurences.rename(
    columns=lambda col: f"outflow_occurrences_{col}" if col != "prism_consumer_id" else col
)
outflow_occurences.head()

category_y,prism_consumer_id,outflow_occurrences_ACCOUNT_FEES,outflow_occurrences_ATM_CASH,outflow_occurrences_AUTOMOTIVE,outflow_occurrences_AUTO_LOAN,outflow_occurrences_BANKING_CATCH_ALL,outflow_occurrences_BILLS_UTILITIES,outflow_occurrences_BNPL,outflow_occurrences_CHILD_DEPENDENTS,outflow_occurrences_CORPORATE_PAYMENTS,outflow_occurrences_CREDIT_CARD_PAYMENT,outflow_occurrences_DEBT,outflow_occurrences_EDUCATION,outflow_occurrences_ENTERTAINMENT,outflow_occurrences_ESSENTIAL_SERVICES,outflow_occurrences_EXTERNAL_TRANSFER,outflow_occurrences_FITNESS,outflow_occurrences_FOOD_AND_BEVERAGES,outflow_occurrences_GAMBLING,outflow_occurrences_GENERAL_MERCHANDISE,outflow_occurrences_GIFTS_DONATIONS,outflow_occurrences_GOVERNMENT_SERVICES,outflow_occurrences_GROCERIES,outflow_occurrences_HEALTHCARE_MEDICAL,outflow_occurrences_HOME_IMPROVEMENT,outflow_occurrences_INSURANCE,outflow_occurrences_INVESTMENT,outflow_occurrences_LEGAL,outflow_occurrences_LOAN,outflow_occurrences_MISCELLANEOUS,outflow_occurrences_MORTGAGE,outflow_occurrences_OVERDRAFT,outflow_occurrences_PETS,outflow_occurrences_RENT,outflow_occurrences_RISK_CATCH_ALL,outflow_occurrences_RTO_LTO,outflow_occurrences_SELF_TRANSFER,outflow_occurrences_TAX,outflow_occurrences_TRANSPORATION,outflow_occurrences_TRAVEL
0,0,0,3,21,0,5,0,0,0,0,0,0,0,6,0,9,6,214,0,26,0,0,25,2,0,0,0,0,0,30,0,0,5,0,0,0,15,0,1,2
1,1,0,35,7,0,0,0,14,0,0,0,0,0,22,0,1,0,18,0,54,0,0,24,3,0,0,0,0,0,14,0,0,10,0,0,0,39,0,2,0
2,10,0,18,23,0,0,0,0,0,0,1,0,0,2,1,67,0,73,4,33,1,0,10,2,0,0,0,0,0,10,0,0,0,1,0,0,8,0,2,0
3,100,0,1,0,0,0,0,0,0,0,15,0,0,0,0,17,0,0,0,0,0,0,1,0,0,7,0,0,0,8,0,0,0,0,0,0,23,0,0,0
4,1000,0,0,6,0,0,0,0,0,0,34,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,2,0,8,9,0,0,0,0,0,0,68,0,0,0


In [125]:
# create features based on the sum of occurences in the transactions dataset
outflow_sums = (
    transaction_categories[transaction_categories["credit_or_debit"] == "DEBIT"]
    .groupby(["prism_consumer_id", "category_y"])["amount"]
    .sum()  # Count number of unique occurrences
    .unstack(fill_value=0)  # Create one column per category_x
    .reset_index()
)

# name columns for clarity
outflow_sums = outflow_sums.rename(
    columns=lambda col: f"outflow_sums_{col}"
    if col != "prism_consumer_id"
    else col
)
outflow_sums.head()


category_y,prism_consumer_id,outflow_sums_ACCOUNT_FEES,outflow_sums_ATM_CASH,outflow_sums_AUTOMOTIVE,outflow_sums_AUTO_LOAN,outflow_sums_BANKING_CATCH_ALL,outflow_sums_BILLS_UTILITIES,outflow_sums_BNPL,outflow_sums_CHILD_DEPENDENTS,outflow_sums_CORPORATE_PAYMENTS,outflow_sums_CREDIT_CARD_PAYMENT,outflow_sums_DEBT,outflow_sums_EDUCATION,outflow_sums_ENTERTAINMENT,outflow_sums_ESSENTIAL_SERVICES,outflow_sums_EXTERNAL_TRANSFER,outflow_sums_FITNESS,outflow_sums_FOOD_AND_BEVERAGES,outflow_sums_GAMBLING,outflow_sums_GENERAL_MERCHANDISE,outflow_sums_GIFTS_DONATIONS,outflow_sums_GOVERNMENT_SERVICES,outflow_sums_GROCERIES,outflow_sums_HEALTHCARE_MEDICAL,outflow_sums_HOME_IMPROVEMENT,outflow_sums_INSURANCE,outflow_sums_INVESTMENT,outflow_sums_LEGAL,outflow_sums_LOAN,outflow_sums_MISCELLANEOUS,outflow_sums_MORTGAGE,outflow_sums_OVERDRAFT,outflow_sums_PETS,outflow_sums_RENT,outflow_sums_RISK_CATCH_ALL,outflow_sums_RTO_LTO,outflow_sums_SELF_TRANSFER,outflow_sums_TAX,outflow_sums_TRANSPORATION,outflow_sums_TRAVEL
0,0,0.0,540.0,527.15,0.0,1980.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.71,0.0,1741.1,150.6,4079.35,0.0,1974.0,0.0,0.0,997.63,193.38,0.0,0.0,0.0,0.0,0.0,1665.51,0.0,0.0,325.38,0.0,0.0,0.0,471.37,0.0,2.48,108.75
1,1,0.0,6999.13,195.18,0.0,0.0,0.0,251.43,0.0,0.0,0.0,0.0,0.0,275.25,0.0,2.42,0.0,337.0,0.0,3314.53,0.0,0.0,427.27,57.76,0.0,0.0,0.0,0.0,0.0,1773.26,0.0,0.0,310.34,0.0,0.0,0.0,9103.0,0.0,51.8,0.0
2,10,0.0,4112.0,483.06,0.0,0.0,0.0,0.0,0.0,0.0,180.0,0.0,0.0,107.64,99.0,7837.16,0.0,3348.14,166.0,1611.16,50.0,0.0,621.79,144.0,0.0,0.0,0.0,0.0,0.0,968.06,0.0,0.0,0.0,103.0,0.0,0.0,1900.55,0.0,35.04,0.0
3,100,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16188.17,0.0,0.0,0.0,0.0,8264.0,0.0,0.0,0.0,0.0,0.0,0.0,403.5,0.0,0.0,1942.76,0.0,0.0,0.0,791.5,0.0,0.0,0.0,0.0,0.0,0.0,11952.68,0.0,0.0,0.0
4,1000,0.0,0.0,204.03,0.0,0.0,0.0,0.0,0.0,0.0,14756.05,0.0,0.0,0.0,0.0,2820.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1100.0,0.0,2388.14,7261.82,0.0,0.0,0.0,0.0,0.0,0.0,49384.91,0.0,0.0,0.0


In [126]:
transaction_categories.category_id.unique()

array([ 4, 12,  1,  2,  0, 45,  3,  6, 20, 14, 18, 39, 16, 27, 46, 30, 26,
       17, 24, 19, 40, 49, 13, 11, 23, 34, 31, 22, 29, 21, 28, 32, 37, 38,
       36, 35, 42, 47, 41, 43,  8, 48,  9, 33, 44,  7, 25])

In [127]:
transaction_categories[transaction_categories['prism_consumer_id'] == "4"]

Unnamed: 0,prism_consumer_id,prism_transaction_id,category_x,amount,credit_or_debit,posted_date,category_id,category_y
525521,4,525148,4,0.57,CREDIT,2020-07-24,4,MISCELLANEOUS
525522,4,525149,0,60.00,CREDIT,2020-07-27,0,SELF_TRANSFER
525523,4,525150,2,15.00,CREDIT,2020-08-07,2,DEPOSIT
525524,4,525151,4,0.02,CREDIT,2020-12-25,4,MISCELLANEOUS
525525,4,525152,0,1000.00,CREDIT,2021-05-20,0,SELF_TRANSFER
...,...,...,...,...,...,...,...,...
525822,4,525449,14,50.00,DEBIT,2021-06-19,14,FOOD_AND_BEVERAGES
525823,4,525450,22,166.14,DEBIT,2021-06-19,22,ESSENTIAL_SERVICES
525824,4,525451,18,15.75,DEBIT,2021-06-20,18,GROCERIES
525825,4,525452,13,73.17,DEBIT,2021-06-21,13,INSURANCE


In [128]:
transaction_categories[transaction_categories['prism_consumer_id'] == "1100"]

Unnamed: 0,prism_consumer_id,prism_transaction_id,category_x,amount,credit_or_debit,posted_date,category_id,category_y
555588,1100,555215,0,1100.00,CREDIT,2020-12-17,0,SELF_TRANSFER
555589,1100,555216,3,1130.11,CREDIT,2020-12-17,3,PAYCHECK
555590,1100,555217,4,0.03,CREDIT,2020-12-18,4,MISCELLANEOUS
555591,1100,555218,0,1000.00,CREDIT,2020-12-24,0,SELF_TRANSFER
555592,1100,555219,0,700.00,CREDIT,2020-12-31,0,SELF_TRANSFER
...,...,...,...,...,...,...,...,...
555722,1100,555349,17,40.01,DEBIT,2021-04-29,17,AUTOMOTIVE
555723,1100,555350,17,49.95,DEBIT,2021-05-05,17,AUTOMOTIVE
555724,1100,555351,18,22.89,DEBIT,2021-05-06,18,GROCERIES
555725,1100,555352,0,760.00,DEBIT,2021-05-07,0,SELF_TRANSFER


In [129]:
transaction_categories.groupby(["prism_consumer_id", "category_y"])['amount'].sum()

prism_consumer_id  category_y       
0                  ATM_CASH               540.00
                   AUTOMOTIVE             527.15
                   BANKING_CATCH_ALL     1980.00
                   DEPOSIT                500.00
                   ENTERTAINMENT          151.71
                                          ...   
9999               MISCELLANEOUS         1468.33
                   PAYCHECK             15378.88
                   REFUND                  47.52
                   TRANSPORATION          132.73
                   TRAVEL                  79.00
Name: amount, Length: 280352, dtype: float64

In [130]:
transaction_categories.shape

(6407321, 8)

In [131]:
outflow_occurences.shape

(14196, 40)

In [133]:
# create features based on the number of unique occurences in the transactions dataset
inflow_occurences = (
    transaction_categories[transaction_categories["credit_or_debit"] == "CREDIT"]
    .groupby(["prism_consumer_id", "category_y"])
    .size()  # Count number of unique occurrences
    .unstack(fill_value=0)  # Create one column per category_x
    .reset_index()
)

# name columns for clarity
inflow_occurences = inflow_occurences.rename(
    columns=lambda col: f"inflow_occurrences_{col}"
    if col != "prism_consumer_id"
    else col
)
inflow_occurences.head()

category_y,prism_consumer_id,inflow_occurrences_CORPORATE_PAYMENTS,inflow_occurrences_DEPOSIT,inflow_occurrences_EXTERNAL_TRANSFER,inflow_occurrences_GAMBLING,inflow_occurrences_INSURANCE,inflow_occurrences_INVESTMENT,inflow_occurrences_INVESTMENT_INCOME,inflow_occurrences_LOAN,inflow_occurrences_MISCELLANEOUS,inflow_occurrences_OTHER_BENEFITS,inflow_occurrences_PAYCHECK,inflow_occurrences_PENSION,inflow_occurrences_REFUND,inflow_occurrences_SELF_TRANSFER,inflow_occurrences_TAX,inflow_occurrences_TIME_OR_STUFF,inflow_occurrences_UNEMPLOYMENT_BENEFITS
0,0,0,1,4,0,0,0,0,0,12,0,9,0,1,8,3,0,0
1,1,0,2,0,0,0,0,0,0,13,0,14,0,1,39,2,0,0
2,10,0,2,41,0,0,0,0,0,14,0,15,0,5,8,0,0,0
3,100,0,0,2,0,0,0,0,0,4,0,23,0,8,13,0,0,0
4,1000,0,3,13,0,0,1,0,0,8,0,26,0,1,13,0,0,0


In [134]:
# create features based on the sum of occurences in the transactions dataset
inflow_sums = (
    transaction_categories[transaction_categories["credit_or_debit"] == "CREDIT"]
    .groupby(["prism_consumer_id", "category_y"])['amount']
    .sum()  # Count number of unique occurrences
    .unstack(fill_value=0)  # Create one column per category_x
    .reset_index()
)

# name columns for clarity
inflow_sums = inflow_sums.rename(
    columns=lambda col: f"inflow_sums_{col}"
    if col != "prism_consumer_id"
    else col
)
inflow_sums.head()


category_y,prism_consumer_id,inflow_sums_CORPORATE_PAYMENTS,inflow_sums_DEPOSIT,inflow_sums_EXTERNAL_TRANSFER,inflow_sums_GAMBLING,inflow_sums_INSURANCE,inflow_sums_INVESTMENT,inflow_sums_INVESTMENT_INCOME,inflow_sums_LOAN,inflow_sums_MISCELLANEOUS,inflow_sums_OTHER_BENEFITS,inflow_sums_PAYCHECK,inflow_sums_PENSION,inflow_sums_REFUND,inflow_sums_SELF_TRANSFER,inflow_sums_TAX,inflow_sums_TIME_OR_STUFF,inflow_sums_UNEMPLOYMENT_BENEFITS
0,0,0.0,500.0,228.75,0.0,0.0,0.0,0.0,0.0,1.63,0.0,8820.56,0.0,19.96,2212.4,2603.52,0.0,0.0
1,1,0.0,1492.95,0.0,0.0,0.0,0.0,0.0,0.0,61.39,0.0,11918.64,0.0,2.42,9103.0,2325.4,0.0,0.0
2,10,0.0,700.0,3156.0,0.0,0.0,0.0,0.0,0.0,6.94,0.0,14720.74,0.0,92.33,1900.55,0.0,0.0,0.0
3,100,0.0,0.0,3060.0,0.0,0.0,0.0,0.0,0.0,3.31,0.0,24411.78,0.0,11.75,7750.0,0.0,0.0,0.0
4,1000,0.0,4720.0,1342.17,0.0,0.0,10614.37,0.0,0.0,258.92,0.0,43658.6,0.0,1.37,17757.64,0.0,0.0,0.0


In [135]:
features = outflow_occurences.merge(outflow_sums, how = "left", on = "prism_consumer_id").merge(inflow_occurences, how = "left", on = "prism_consumer_id").merge(inflow_sums, how = "left", on = "prism_consumer_id")
features.head()

category_y,prism_consumer_id,outflow_occurrences_ACCOUNT_FEES,outflow_occurrences_ATM_CASH,outflow_occurrences_AUTOMOTIVE,outflow_occurrences_AUTO_LOAN,outflow_occurrences_BANKING_CATCH_ALL,outflow_occurrences_BILLS_UTILITIES,outflow_occurrences_BNPL,outflow_occurrences_CHILD_DEPENDENTS,outflow_occurrences_CORPORATE_PAYMENTS,outflow_occurrences_CREDIT_CARD_PAYMENT,outflow_occurrences_DEBT,outflow_occurrences_EDUCATION,outflow_occurrences_ENTERTAINMENT,outflow_occurrences_ESSENTIAL_SERVICES,outflow_occurrences_EXTERNAL_TRANSFER,outflow_occurrences_FITNESS,outflow_occurrences_FOOD_AND_BEVERAGES,outflow_occurrences_GAMBLING,outflow_occurrences_GENERAL_MERCHANDISE,outflow_occurrences_GIFTS_DONATIONS,outflow_occurrences_GOVERNMENT_SERVICES,outflow_occurrences_GROCERIES,outflow_occurrences_HEALTHCARE_MEDICAL,outflow_occurrences_HOME_IMPROVEMENT,outflow_occurrences_INSURANCE,outflow_occurrences_INVESTMENT,outflow_occurrences_LEGAL,outflow_occurrences_LOAN,outflow_occurrences_MISCELLANEOUS,outflow_occurrences_MORTGAGE,outflow_occurrences_OVERDRAFT,outflow_occurrences_PETS,outflow_occurrences_RENT,outflow_occurrences_RISK_CATCH_ALL,outflow_occurrences_RTO_LTO,outflow_occurrences_SELF_TRANSFER,outflow_occurrences_TAX,outflow_occurrences_TRANSPORATION,outflow_occurrences_TRAVEL,outflow_sums_ACCOUNT_FEES,outflow_sums_ATM_CASH,outflow_sums_AUTOMOTIVE,outflow_sums_AUTO_LOAN,outflow_sums_BANKING_CATCH_ALL,outflow_sums_BILLS_UTILITIES,outflow_sums_BNPL,outflow_sums_CHILD_DEPENDENTS,outflow_sums_CORPORATE_PAYMENTS,outflow_sums_CREDIT_CARD_PAYMENT,outflow_sums_DEBT,outflow_sums_EDUCATION,outflow_sums_ENTERTAINMENT,outflow_sums_ESSENTIAL_SERVICES,outflow_sums_EXTERNAL_TRANSFER,outflow_sums_FITNESS,outflow_sums_FOOD_AND_BEVERAGES,outflow_sums_GAMBLING,outflow_sums_GENERAL_MERCHANDISE,outflow_sums_GIFTS_DONATIONS,outflow_sums_GOVERNMENT_SERVICES,outflow_sums_GROCERIES,outflow_sums_HEALTHCARE_MEDICAL,outflow_sums_HOME_IMPROVEMENT,outflow_sums_INSURANCE,outflow_sums_INVESTMENT,outflow_sums_LEGAL,outflow_sums_LOAN,outflow_sums_MISCELLANEOUS,outflow_sums_MORTGAGE,outflow_sums_OVERDRAFT,outflow_sums_PETS,outflow_sums_RENT,outflow_sums_RISK_CATCH_ALL,outflow_sums_RTO_LTO,outflow_sums_SELF_TRANSFER,outflow_sums_TAX,outflow_sums_TRANSPORATION,outflow_sums_TRAVEL,inflow_occurrences_CORPORATE_PAYMENTS,inflow_occurrences_DEPOSIT,inflow_occurrences_EXTERNAL_TRANSFER,inflow_occurrences_GAMBLING,inflow_occurrences_INSURANCE,inflow_occurrences_INVESTMENT,inflow_occurrences_INVESTMENT_INCOME,inflow_occurrences_LOAN,inflow_occurrences_MISCELLANEOUS,inflow_occurrences_OTHER_BENEFITS,inflow_occurrences_PAYCHECK,inflow_occurrences_PENSION,inflow_occurrences_REFUND,inflow_occurrences_SELF_TRANSFER,inflow_occurrences_TAX,inflow_occurrences_TIME_OR_STUFF,inflow_occurrences_UNEMPLOYMENT_BENEFITS,inflow_sums_CORPORATE_PAYMENTS,inflow_sums_DEPOSIT,inflow_sums_EXTERNAL_TRANSFER,inflow_sums_GAMBLING,inflow_sums_INSURANCE,inflow_sums_INVESTMENT,inflow_sums_INVESTMENT_INCOME,inflow_sums_LOAN,inflow_sums_MISCELLANEOUS,inflow_sums_OTHER_BENEFITS,inflow_sums_PAYCHECK,inflow_sums_PENSION,inflow_sums_REFUND,inflow_sums_SELF_TRANSFER,inflow_sums_TAX,inflow_sums_TIME_OR_STUFF,inflow_sums_UNEMPLOYMENT_BENEFITS
0,0,0,3,21,0,5,0,0,0,0,0,0,0,6,0,9,6,214,0,26,0,0,25,2,0,0,0,0,0,30,0,0,5,0,0,0,15,0,1,2,0.0,540.0,527.15,0.0,1980.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.71,0.0,1741.1,150.6,4079.35,0.0,1974.0,0.0,0.0,997.63,193.38,0.0,0.0,0.0,0.0,0.0,1665.51,0.0,0.0,325.38,0.0,0.0,0.0,471.37,0.0,2.48,108.75,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,9.0,0.0,1.0,8.0,3.0,0.0,0.0,0.0,500.0,228.75,0.0,0.0,0.0,0.0,0.0,1.63,0.0,8820.56,0.0,19.96,2212.4,2603.52,0.0,0.0
1,1,0,35,7,0,0,0,14,0,0,0,0,0,22,0,1,0,18,0,54,0,0,24,3,0,0,0,0,0,14,0,0,10,0,0,0,39,0,2,0,0.0,6999.13,195.18,0.0,0.0,0.0,251.43,0.0,0.0,0.0,0.0,0.0,275.25,0.0,2.42,0.0,337.0,0.0,3314.53,0.0,0.0,427.27,57.76,0.0,0.0,0.0,0.0,0.0,1773.26,0.0,0.0,310.34,0.0,0.0,0.0,9103.0,0.0,51.8,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,14.0,0.0,1.0,39.0,2.0,0.0,0.0,0.0,1492.95,0.0,0.0,0.0,0.0,0.0,0.0,61.39,0.0,11918.64,0.0,2.42,9103.0,2325.4,0.0,0.0
2,10,0,18,23,0,0,0,0,0,0,1,0,0,2,1,67,0,73,4,33,1,0,10,2,0,0,0,0,0,10,0,0,0,1,0,0,8,0,2,0,0.0,4112.0,483.06,0.0,0.0,0.0,0.0,0.0,0.0,180.0,0.0,0.0,107.64,99.0,7837.16,0.0,3348.14,166.0,1611.16,50.0,0.0,621.79,144.0,0.0,0.0,0.0,0.0,0.0,968.06,0.0,0.0,0.0,103.0,0.0,0.0,1900.55,0.0,35.04,0.0,0.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,15.0,0.0,5.0,8.0,0.0,0.0,0.0,0.0,700.0,3156.0,0.0,0.0,0.0,0.0,0.0,6.94,0.0,14720.74,0.0,92.33,1900.55,0.0,0.0,0.0
3,100,0,1,0,0,0,0,0,0,0,15,0,0,0,0,17,0,0,0,0,0,0,1,0,0,7,0,0,0,8,0,0,0,0,0,0,23,0,0,0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16188.17,0.0,0.0,0.0,0.0,8264.0,0.0,0.0,0.0,0.0,0.0,0.0,403.5,0.0,0.0,1942.76,0.0,0.0,0.0,791.5,0.0,0.0,0.0,0.0,0.0,0.0,11952.68,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,23.0,0.0,8.0,13.0,0.0,0.0,0.0,0.0,0.0,3060.0,0.0,0.0,0.0,0.0,0.0,3.31,0.0,24411.78,0.0,11.75,7750.0,0.0,0.0,0.0
4,1000,0,0,6,0,0,0,0,0,0,34,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,2,0,8,9,0,0,0,0,0,0,68,0,0,0,0.0,0.0,204.03,0.0,0.0,0.0,0.0,0.0,0.0,14756.05,0.0,0.0,0.0,0.0,2820.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1100.0,0.0,2388.14,7261.82,0.0,0.0,0.0,0.0,0.0,0.0,49384.91,0.0,0.0,0.0,0.0,3.0,13.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0,26.0,0.0,1.0,13.0,0.0,0.0,0.0,0.0,4720.0,1342.17,0.0,0.0,10614.37,0.0,0.0,258.92,0.0,43658.6,0.0,1.37,17757.64,0.0,0.0,0.0


In [136]:
# merge on consumer_balance to create new features
consumer_features = consumer_balance.merge(features, how = "left", on = "prism_consumer_id")
consumer_features.head()

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance,outflow_occurrences_ACCOUNT_FEES,outflow_occurrences_ATM_CASH,outflow_occurrences_AUTOMOTIVE,outflow_occurrences_AUTO_LOAN,outflow_occurrences_BANKING_CATCH_ALL,outflow_occurrences_BILLS_UTILITIES,outflow_occurrences_BNPL,outflow_occurrences_CHILD_DEPENDENTS,outflow_occurrences_CORPORATE_PAYMENTS,outflow_occurrences_CREDIT_CARD_PAYMENT,outflow_occurrences_DEBT,outflow_occurrences_EDUCATION,outflow_occurrences_ENTERTAINMENT,outflow_occurrences_ESSENTIAL_SERVICES,outflow_occurrences_EXTERNAL_TRANSFER,outflow_occurrences_FITNESS,outflow_occurrences_FOOD_AND_BEVERAGES,outflow_occurrences_GAMBLING,outflow_occurrences_GENERAL_MERCHANDISE,outflow_occurrences_GIFTS_DONATIONS,outflow_occurrences_GOVERNMENT_SERVICES,outflow_occurrences_GROCERIES,outflow_occurrences_HEALTHCARE_MEDICAL,outflow_occurrences_HOME_IMPROVEMENT,outflow_occurrences_INSURANCE,outflow_occurrences_INVESTMENT,outflow_occurrences_LEGAL,outflow_occurrences_LOAN,outflow_occurrences_MISCELLANEOUS,outflow_occurrences_MORTGAGE,outflow_occurrences_OVERDRAFT,outflow_occurrences_PETS,outflow_occurrences_RENT,outflow_occurrences_RISK_CATCH_ALL,outflow_occurrences_RTO_LTO,outflow_occurrences_SELF_TRANSFER,outflow_occurrences_TAX,outflow_occurrences_TRANSPORATION,outflow_occurrences_TRAVEL,outflow_sums_ACCOUNT_FEES,outflow_sums_ATM_CASH,outflow_sums_AUTOMOTIVE,outflow_sums_AUTO_LOAN,outflow_sums_BANKING_CATCH_ALL,outflow_sums_BILLS_UTILITIES,outflow_sums_BNPL,outflow_sums_CHILD_DEPENDENTS,outflow_sums_CORPORATE_PAYMENTS,outflow_sums_CREDIT_CARD_PAYMENT,outflow_sums_DEBT,outflow_sums_EDUCATION,outflow_sums_ENTERTAINMENT,outflow_sums_ESSENTIAL_SERVICES,outflow_sums_EXTERNAL_TRANSFER,outflow_sums_FITNESS,outflow_sums_FOOD_AND_BEVERAGES,outflow_sums_GAMBLING,outflow_sums_GENERAL_MERCHANDISE,outflow_sums_GIFTS_DONATIONS,outflow_sums_GOVERNMENT_SERVICES,outflow_sums_GROCERIES,outflow_sums_HEALTHCARE_MEDICAL,outflow_sums_HOME_IMPROVEMENT,outflow_sums_INSURANCE,outflow_sums_INVESTMENT,outflow_sums_LEGAL,outflow_sums_LOAN,outflow_sums_MISCELLANEOUS,outflow_sums_MORTGAGE,outflow_sums_OVERDRAFT,outflow_sums_PETS,outflow_sums_RENT,outflow_sums_RISK_CATCH_ALL,outflow_sums_RTO_LTO,outflow_sums_SELF_TRANSFER,outflow_sums_TAX,outflow_sums_TRANSPORATION,outflow_sums_TRAVEL,inflow_occurrences_CORPORATE_PAYMENTS,inflow_occurrences_DEPOSIT,inflow_occurrences_EXTERNAL_TRANSFER,inflow_occurrences_GAMBLING,inflow_occurrences_INSURANCE,inflow_occurrences_INVESTMENT,inflow_occurrences_INVESTMENT_INCOME,inflow_occurrences_LOAN,inflow_occurrences_MISCELLANEOUS,inflow_occurrences_OTHER_BENEFITS,inflow_occurrences_PAYCHECK,inflow_occurrences_PENSION,inflow_occurrences_REFUND,inflow_occurrences_SELF_TRANSFER,inflow_occurrences_TAX,inflow_occurrences_TIME_OR_STUFF,inflow_occurrences_UNEMPLOYMENT_BENEFITS,inflow_sums_CORPORATE_PAYMENTS,inflow_sums_DEPOSIT,inflow_sums_EXTERNAL_TRANSFER,inflow_sums_GAMBLING,inflow_sums_INSURANCE,inflow_sums_INVESTMENT,inflow_sums_INVESTMENT_INCOME,inflow_sums_LOAN,inflow_sums_MISCELLANEOUS,inflow_sums_OTHER_BENEFITS,inflow_sums_PAYCHECK,inflow_sums_PENSION,inflow_sums_REFUND,inflow_sums_SELF_TRANSFER,inflow_sums_TAX,inflow_sums_TIME_OR_STUFF,inflow_sums_UNEMPLOYMENT_BENEFITS
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222,0.0,3.0,21.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,9.0,6.0,214.0,0.0,26.0,0.0,0.0,25.0,2.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,5.0,0.0,0.0,0.0,15.0,0.0,1.0,2.0,0.0,540.0,527.15,0.0,1980.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.71,0.0,1741.1,150.6,4079.35,0.0,1974.0,0.0,0.0,997.63,193.38,0.0,0.0,0.0,0.0,0.0,1665.51,0.0,0.0,325.38,0.0,0.0,0.0,471.37,0.0,2.48,108.75,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,9.0,0.0,1.0,8.0,3.0,0.0,0.0,0.0,500.0,228.75,0.0,0.0,0.0,0.0,0.0,1.63,0.0,8820.56,0.0,19.96,2212.4,2603.52,0.0,0.0
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027,0.0,35.0,7.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,1.0,0.0,18.0,0.0,54.0,0.0,0.0,24.0,3.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,10.0,0.0,0.0,0.0,39.0,0.0,2.0,0.0,0.0,6999.13,195.18,0.0,0.0,0.0,251.43,0.0,0.0,0.0,0.0,0.0,275.25,0.0,2.42,0.0,337.0,0.0,3314.53,0.0,0.0,427.27,57.76,0.0,0.0,0.0,0.0,0.0,1773.26,0.0,0.0,310.34,0.0,0.0,0.0,9103.0,0.0,51.8,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,14.0,0.0,1.0,39.0,2.0,0.0,0.0,0.0,1492.95,0.0,0.0,0.0,0.0,0.0,0.0,61.39,0.0,11918.64,0.0,2.42,9103.0,2325.4,0.0,0.0
2,2,2021-05-01,680.0,0.0,2805.36,0.245748,-0.099394,0.0,9.0,44.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,3.0,22.0,0.0,71.0,0.0,39.0,0.0,0.0,13.0,8.0,9.0,6.0,3.0,1.0,0.0,85.0,0.0,0.0,3.0,0.0,0.0,0.0,38.0,0.0,2.0,3.0,0.0,6240.0,1005.74,0.0,0.0,160.0,0.0,0.0,0.0,0.0,0.0,8.57,246.83,210.0,785.55,0.0,1317.26,0.0,1363.33,0.0,0.0,211.36,201.91,394.6,904.3,500.0,16.97,0.0,3908.95,0.0,0.0,16.46,0.0,0.0,0.0,4426.75,0.0,24.5,391.5,0.0,2.0,6.0,0.0,2.0,2.0,0.0,0.0,18.0,0.0,0.0,0.0,2.0,47.0,2.0,0.0,0.0,0.0,1100.0,64.0,0.0,4.87,0.52,0.0,0.0,5.2,0.0,0.0,0.0,56.48,15439.16,6094.48,0.0,0.0
3,3,2021-03-01,734.0,0.0,7667.01,0.951391,-0.00778,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,6.0,24.0,0.0,17.0,0.0,60.0,0.0,0.0,4.0,3.0,1.0,0.0,4.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,1.0,50.0,0.0,0.0,113.5,53.85,0.0,0.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,214.12,335.85,4076.35,0.0,500.49,0.0,2487.61,0.0,0.0,106.84,105.0,24.13,0.0,150.72,0.0,0.0,504.61,0.0,0.0,0.0,0.0,0.0,0.0,9782.19,44.04,1286.71,0.0,0.0,2.0,2.0,0.0,0.0,2.0,0.0,0.0,12.0,0.0,3.0,0.0,2.0,23.0,2.0,0.0,3.0,0.0,1030.0,37.75,0.0,0.0,0.72,0.0,0.0,4.9,0.0,4047.81,0.0,37.88,9782.19,2000.0,0.0,5700.0
4,4,2021-10-01,676.0,0.0,394.55,0.193478,-0.144824,0.0,10.0,29.0,0.0,0.0,10.0,0.0,0.0,0.0,2.0,0.0,2.0,14.0,5.0,5.0,0.0,15.0,5.0,8.0,0.0,0.0,54.0,4.0,14.0,5.0,0.0,0.0,0.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,2.0,5.0,0.0,3540.0,966.43,0.0,0.0,88.04,0.0,0.0,0.0,225.48,0.0,23.97,604.01,820.56,612.67,0.0,269.03,495.98,486.88,0.0,0.0,2274.83,75.56,336.82,343.17,0.0,0.0,0.0,2720.78,0.0,0.0,0.0,0.0,0.0,0.0,2802.27,0.0,150.0,673.23,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,5.0,7.0,0.0,0.0,17.0,0.0,54.44,0.0,0.0,0.0,0.0,0.0,0.0,15.17,0.0,0.0,0.0,116.5,2760.0,0.0,0.0,12020.0


In [137]:
consumer_balance

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027
2,2,2021-05-01,680.0,0.0,2805.36,0.245748,-0.099394
3,3,2021-03-01,734.0,0.0,7667.01,0.951391,-0.007780
4,4,2021-10-01,676.0,0.0,394.55,0.193478,-0.144824
...,...,...,...,...,...,...,...
14995,14995,2022-03-08,655.0,,,-0.080938,
14996,14996,2022-01-15,625.0,,6821.92,-0.472962,-0.023705
14997,14997,2022-01-31,688.0,,,0.350288,
14998,14998,2022-03-08,722.0,,,0.794581,


Testing the outflow/inflow occurences and sums of the different categories per person/user

In [69]:
classification_features = consumer_features.columns[4:]
run_classification2(classification_features, "DQ_TARGET", consumer_features)


Logistic Regression Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.72      0.83      1802
         1.0       0.20      0.71      0.32       180

    accuracy                           0.72      1982
   macro avg       0.58      0.72      0.57      1982
weighted avg       0.89      0.72      0.78      1982

ROC-AUC Score: 0.785

Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94      1802
         1.0       0.30      0.19      0.23       180

    accuracy                           0.89      1982
   macro avg       0.61      0.57      0.58      1982
weighted avg       0.87      0.89      0.87      1982

ROC-AUC Score: 0.796
[LightGBM] [Info] Number of positive: 7235, number of negative: 7235
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014334 seconds.
You can set `force_row_wise=true` to remove the o

Looking through these the most influential features (category-wise), they are "ACCOUNT_FEES", "OVERDRAFT", "MISCELLEANEOUS", "BNPL", INSURANCE", "EXTERNAL_TRANSFER", "PAYCHECK", "LOAN", "SELF TRANSFER"

In [138]:
consumer_features

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance,outflow_occurrences_ACCOUNT_FEES,outflow_occurrences_ATM_CASH,outflow_occurrences_AUTOMOTIVE,outflow_occurrences_AUTO_LOAN,outflow_occurrences_BANKING_CATCH_ALL,outflow_occurrences_BILLS_UTILITIES,outflow_occurrences_BNPL,outflow_occurrences_CHILD_DEPENDENTS,outflow_occurrences_CORPORATE_PAYMENTS,outflow_occurrences_CREDIT_CARD_PAYMENT,outflow_occurrences_DEBT,outflow_occurrences_EDUCATION,outflow_occurrences_ENTERTAINMENT,outflow_occurrences_ESSENTIAL_SERVICES,outflow_occurrences_EXTERNAL_TRANSFER,outflow_occurrences_FITNESS,outflow_occurrences_FOOD_AND_BEVERAGES,outflow_occurrences_GAMBLING,outflow_occurrences_GENERAL_MERCHANDISE,outflow_occurrences_GIFTS_DONATIONS,outflow_occurrences_GOVERNMENT_SERVICES,outflow_occurrences_GROCERIES,outflow_occurrences_HEALTHCARE_MEDICAL,outflow_occurrences_HOME_IMPROVEMENT,outflow_occurrences_INSURANCE,outflow_occurrences_INVESTMENT,outflow_occurrences_LEGAL,outflow_occurrences_LOAN,outflow_occurrences_MISCELLANEOUS,outflow_occurrences_MORTGAGE,outflow_occurrences_OVERDRAFT,outflow_occurrences_PETS,outflow_occurrences_RENT,outflow_occurrences_RISK_CATCH_ALL,outflow_occurrences_RTO_LTO,outflow_occurrences_SELF_TRANSFER,outflow_occurrences_TAX,outflow_occurrences_TRANSPORATION,outflow_occurrences_TRAVEL,outflow_sums_ACCOUNT_FEES,outflow_sums_ATM_CASH,outflow_sums_AUTOMOTIVE,outflow_sums_AUTO_LOAN,outflow_sums_BANKING_CATCH_ALL,outflow_sums_BILLS_UTILITIES,outflow_sums_BNPL,outflow_sums_CHILD_DEPENDENTS,outflow_sums_CORPORATE_PAYMENTS,outflow_sums_CREDIT_CARD_PAYMENT,outflow_sums_DEBT,outflow_sums_EDUCATION,outflow_sums_ENTERTAINMENT,outflow_sums_ESSENTIAL_SERVICES,outflow_sums_EXTERNAL_TRANSFER,outflow_sums_FITNESS,outflow_sums_FOOD_AND_BEVERAGES,outflow_sums_GAMBLING,outflow_sums_GENERAL_MERCHANDISE,outflow_sums_GIFTS_DONATIONS,outflow_sums_GOVERNMENT_SERVICES,outflow_sums_GROCERIES,outflow_sums_HEALTHCARE_MEDICAL,outflow_sums_HOME_IMPROVEMENT,outflow_sums_INSURANCE,outflow_sums_INVESTMENT,outflow_sums_LEGAL,outflow_sums_LOAN,outflow_sums_MISCELLANEOUS,outflow_sums_MORTGAGE,outflow_sums_OVERDRAFT,outflow_sums_PETS,outflow_sums_RENT,outflow_sums_RISK_CATCH_ALL,outflow_sums_RTO_LTO,outflow_sums_SELF_TRANSFER,outflow_sums_TAX,outflow_sums_TRANSPORATION,outflow_sums_TRAVEL,inflow_occurrences_CORPORATE_PAYMENTS,inflow_occurrences_DEPOSIT,inflow_occurrences_EXTERNAL_TRANSFER,inflow_occurrences_GAMBLING,inflow_occurrences_INSURANCE,inflow_occurrences_INVESTMENT,inflow_occurrences_INVESTMENT_INCOME,inflow_occurrences_LOAN,inflow_occurrences_MISCELLANEOUS,inflow_occurrences_OTHER_BENEFITS,inflow_occurrences_PAYCHECK,inflow_occurrences_PENSION,inflow_occurrences_REFUND,inflow_occurrences_SELF_TRANSFER,inflow_occurrences_TAX,inflow_occurrences_TIME_OR_STUFF,inflow_occurrences_UNEMPLOYMENT_BENEFITS,inflow_sums_CORPORATE_PAYMENTS,inflow_sums_DEPOSIT,inflow_sums_EXTERNAL_TRANSFER,inflow_sums_GAMBLING,inflow_sums_INSURANCE,inflow_sums_INVESTMENT,inflow_sums_INVESTMENT_INCOME,inflow_sums_LOAN,inflow_sums_MISCELLANEOUS,inflow_sums_OTHER_BENEFITS,inflow_sums_PAYCHECK,inflow_sums_PENSION,inflow_sums_REFUND,inflow_sums_SELF_TRANSFER,inflow_sums_TAX,inflow_sums_TIME_OR_STUFF,inflow_sums_UNEMPLOYMENT_BENEFITS
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222,0.0,3.0,21.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,9.0,6.0,214.0,0.0,26.0,0.0,0.0,25.0,2.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,5.0,0.0,0.0,0.0,15.0,0.0,1.0,2.0,0.0,540.00,527.15,0.0,1980.00,0.00,0.00,0.0,0.0,0.00,0.0,0.00,151.71,0.00,1741.10,150.6,4079.35,0.00,1974.00,0.0,0.0,997.63,193.38,0.00,0.00,0.00,0.00,0.00,1665.51,0.0,0.0,325.38,0.0,0.0,0.0,471.37,0.00,2.48,108.75,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,9.0,0.0,1.0,8.0,3.0,0.0,0.0,0.0,500.00,228.75,0.0,0.00,0.00,0.00,0.0,1.63,0.0,8820.56,0.0,19.96,2212.40,2603.52,0.00,0.0
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027,0.0,35.0,7.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,1.0,0.0,18.0,0.0,54.0,0.0,0.0,24.0,3.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,10.0,0.0,0.0,0.0,39.0,0.0,2.0,0.0,0.0,6999.13,195.18,0.0,0.00,0.00,251.43,0.0,0.0,0.00,0.0,0.00,275.25,0.00,2.42,0.0,337.00,0.00,3314.53,0.0,0.0,427.27,57.76,0.00,0.00,0.00,0.00,0.00,1773.26,0.0,0.0,310.34,0.0,0.0,0.0,9103.00,0.00,51.80,0.00,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,14.0,0.0,1.0,39.0,2.0,0.0,0.0,0.0,1492.95,0.00,0.0,0.00,0.00,0.00,0.0,61.39,0.0,11918.64,0.0,2.42,9103.00,2325.40,0.00,0.0
2,2,2021-05-01,680.0,0.0,2805.36,0.245748,-0.099394,0.0,9.0,44.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,3.0,22.0,0.0,71.0,0.0,39.0,0.0,0.0,13.0,8.0,9.0,6.0,3.0,1.0,0.0,85.0,0.0,0.0,3.0,0.0,0.0,0.0,38.0,0.0,2.0,3.0,0.0,6240.00,1005.74,0.0,0.00,160.00,0.00,0.0,0.0,0.00,0.0,8.57,246.83,210.00,785.55,0.0,1317.26,0.00,1363.33,0.0,0.0,211.36,201.91,394.60,904.30,500.00,16.97,0.00,3908.95,0.0,0.0,16.46,0.0,0.0,0.0,4426.75,0.00,24.50,391.50,0.0,2.0,6.0,0.0,2.0,2.0,0.0,0.0,18.0,0.0,0.0,0.0,2.0,47.0,2.0,0.0,0.0,0.0,1100.00,64.00,0.0,4.87,0.52,0.00,0.0,5.20,0.0,0.00,0.0,56.48,15439.16,6094.48,0.00,0.0
3,3,2021-03-01,734.0,0.0,7667.01,0.951391,-0.007780,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,6.0,24.0,0.0,17.0,0.0,60.0,0.0,0.0,4.0,3.0,1.0,0.0,4.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,1.0,50.0,0.0,0.0,113.50,53.85,0.0,0.00,60.00,0.00,0.0,0.0,0.00,0.0,0.00,214.12,335.85,4076.35,0.0,500.49,0.00,2487.61,0.0,0.0,106.84,105.00,24.13,0.00,150.72,0.00,0.00,504.61,0.0,0.0,0.00,0.0,0.0,0.0,9782.19,44.04,1286.71,0.00,0.0,2.0,2.0,0.0,0.0,2.0,0.0,0.0,12.0,0.0,3.0,0.0,2.0,23.0,2.0,0.0,3.0,0.0,1030.00,37.75,0.0,0.00,0.72,0.00,0.0,4.90,0.0,4047.81,0.0,37.88,9782.19,2000.00,0.00,5700.0
4,4,2021-10-01,676.0,0.0,394.55,0.193478,-0.144824,0.0,10.0,29.0,0.0,0.0,10.0,0.0,0.0,0.0,2.0,0.0,2.0,14.0,5.0,5.0,0.0,15.0,5.0,8.0,0.0,0.0,54.0,4.0,14.0,5.0,0.0,0.0,0.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,2.0,5.0,0.0,3540.00,966.43,0.0,0.00,88.04,0.00,0.0,0.0,225.48,0.0,23.97,604.01,820.56,612.67,0.0,269.03,495.98,486.88,0.0,0.0,2274.83,75.56,336.82,343.17,0.00,0.00,0.00,2720.78,0.0,0.0,0.00,0.0,0.0,0.0,2802.27,0.00,150.00,673.23,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,5.0,7.0,0.0,0.0,17.0,0.0,54.44,0.00,0.0,0.00,0.00,0.00,0.0,15.17,0.0,0.00,0.0,116.50,2760.00,0.00,0.00,12020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,14995,2022-03-08,655.0,,,-0.080938,,0.0,1.0,22.0,0.0,0.0,10.0,16.0,0.0,0.0,19.0,0.0,3.0,10.0,5.0,22.0,0.0,52.0,0.0,78.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,6.0,0.0,0.0,40.00,645.22,0.0,0.00,669.61,1435.47,0.0,0.0,1386.00,0.0,523.50,152.40,1504.06,1510.00,0.0,1067.05,0.00,2531.48,0.0,0.0,0.00,58.21,34.69,0.00,0.00,0.00,1083.51,804.56,0.0,0.0,0.00,0.0,0.0,0.0,1289.00,0.00,45.65,0.00,0.0,3.0,11.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,14.0,0.0,2.0,14.0,0.0,0.0,0.0,0.0,2320.00,1980.68,0.0,0.00,0.00,0.00,0.0,1327.57,0.0,6975.34,0.0,11.82,1621.20,0.00,0.00,0.0
14996,14996,2022-01-15,625.0,,6821.92,-0.472962,-0.023705,17.0,5.0,37.0,0.0,2.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,13.0,1.0,21.0,0.0,29.0,0.0,24.0,0.0,0.0,10.0,5.0,3.0,0.0,2.0,0.0,0.0,35.0,0.0,0.0,3.0,2.0,0.0,0.0,25.0,0.0,4.0,1.0,102.1,280.00,1798.06,0.0,87.37,0.00,0.00,445.0,0.0,0.00,0.0,0.00,410.32,54.99,2458.78,0.0,953.49,0.00,3474.03,0.0,0.0,240.65,1989.84,53.22,0.00,450.00,0.00,0.00,4575.33,0.0,0.0,163.10,1432.5,0.0,0.0,41887.19,0.00,15.25,119.60,0.0,28.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,21.0,0.0,3.0,20.0,0.0,0.0,3.0,0.0,12141.12,0.00,0.0,0.00,0.00,0.22,0.0,0.00,0.0,17721.24,0.0,765.06,29567.19,0.00,0.00,1674.0
14997,14997,2022-01-31,688.0,,,0.350288,,0.0,5.0,54.0,0.0,7.0,0.0,0.0,0.0,0.0,130.0,0.0,2.0,0.0,0.0,63.0,0.0,23.0,6.0,2.0,0.0,0.0,5.0,0.0,0.0,3.0,0.0,0.0,73.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000.00,738.83,0.0,1837.80,0.00,0.00,0.0,0.0,19752.67,0.0,44.32,0.00,0.00,6569.12,0.0,373.44,52.00,16.21,0.0,0.0,69.81,0.00,0.00,242.85,0.00,0.00,11627.02,371.75,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0,14.0,32.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,417.95,541.59,0.0,0.00,0.00,0.00,20056.0,120.00,0.0,22724.02,0.0,0.00,0.00,0.00,0.00,0.0
14998,14998,2022-03-08,722.0,,,0.794581,,14.0,14.0,116.0,0.0,0.0,16.0,57.0,0.0,0.0,13.0,0.0,1.0,44.0,2.0,71.0,0.0,139.0,12.0,316.0,0.0,0.0,25.0,37.0,34.0,16.0,16.0,0.0,4.0,107.0,8.0,5.0,0.0,1.0,0.0,0.0,300.0,0.0,9.0,2.0,9.8,38663.02,3131.61,0.0,0.00,2392.23,2657.23,0.0,0.0,1418.29,0.0,15.20,892.28,69.65,1789.72,0.0,3584.63,630.98,13232.64,0.0,0.0,1027.37,2702.36,913.35,2375.80,3769.42,0.00,1275.26,4347.25,12800.0,175.0,0.00,30.0,0.0,0.0,72046.01,0.00,406.02,60.00,0.0,19.0,3.0,0.0,0.0,5.0,0.0,0.0,3.0,1.0,91.0,0.0,16.0,285.0,8.0,1.0,0.0,0.0,8220.57,146.96,0.0,0.00,37193.37,0.00,0.0,70.76,1100.0,45463.23,0.0,436.24,71198.67,6826.03,173.09,0.0


In [139]:
df = acct.copy()

# Ensure balance_date is in datetime format
df["balance_date"] = pd.to_datetime(df["balance_date"])

# One-hot encode account_type
one_hot = pd.get_dummies(df["account_type"], prefix="account_type")
one_hot_aggregated = (
    pd.concat([df[["prism_consumer_id"]], one_hot], axis=1)
    .groupby("prism_consumer_id")
    .sum()
)
consumer_features = consumer_features.merge(one_hot_aggregated, on="prism_consumer_id")
consumer_features


Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance,outflow_occurrences_ACCOUNT_FEES,outflow_occurrences_ATM_CASH,outflow_occurrences_AUTOMOTIVE,outflow_occurrences_AUTO_LOAN,outflow_occurrences_BANKING_CATCH_ALL,outflow_occurrences_BILLS_UTILITIES,outflow_occurrences_BNPL,outflow_occurrences_CHILD_DEPENDENTS,outflow_occurrences_CORPORATE_PAYMENTS,outflow_occurrences_CREDIT_CARD_PAYMENT,outflow_occurrences_DEBT,outflow_occurrences_EDUCATION,outflow_occurrences_ENTERTAINMENT,outflow_occurrences_ESSENTIAL_SERVICES,outflow_occurrences_EXTERNAL_TRANSFER,outflow_occurrences_FITNESS,outflow_occurrences_FOOD_AND_BEVERAGES,outflow_occurrences_GAMBLING,outflow_occurrences_GENERAL_MERCHANDISE,outflow_occurrences_GIFTS_DONATIONS,outflow_occurrences_GOVERNMENT_SERVICES,outflow_occurrences_GROCERIES,outflow_occurrences_HEALTHCARE_MEDICAL,outflow_occurrences_HOME_IMPROVEMENT,outflow_occurrences_INSURANCE,outflow_occurrences_INVESTMENT,outflow_occurrences_LEGAL,outflow_occurrences_LOAN,outflow_occurrences_MISCELLANEOUS,outflow_occurrences_MORTGAGE,outflow_occurrences_OVERDRAFT,outflow_occurrences_PETS,outflow_occurrences_RENT,outflow_occurrences_RISK_CATCH_ALL,outflow_occurrences_RTO_LTO,outflow_occurrences_SELF_TRANSFER,outflow_occurrences_TAX,outflow_occurrences_TRANSPORATION,outflow_occurrences_TRAVEL,outflow_sums_ACCOUNT_FEES,outflow_sums_ATM_CASH,outflow_sums_AUTOMOTIVE,outflow_sums_AUTO_LOAN,outflow_sums_BANKING_CATCH_ALL,outflow_sums_BILLS_UTILITIES,outflow_sums_BNPL,outflow_sums_CHILD_DEPENDENTS,outflow_sums_CORPORATE_PAYMENTS,outflow_sums_CREDIT_CARD_PAYMENT,outflow_sums_DEBT,outflow_sums_EDUCATION,outflow_sums_ENTERTAINMENT,outflow_sums_ESSENTIAL_SERVICES,outflow_sums_EXTERNAL_TRANSFER,outflow_sums_FITNESS,outflow_sums_FOOD_AND_BEVERAGES,outflow_sums_GAMBLING,outflow_sums_GENERAL_MERCHANDISE,outflow_sums_GIFTS_DONATIONS,outflow_sums_GOVERNMENT_SERVICES,outflow_sums_GROCERIES,outflow_sums_HEALTHCARE_MEDICAL,outflow_sums_HOME_IMPROVEMENT,outflow_sums_INSURANCE,outflow_sums_INVESTMENT,outflow_sums_LEGAL,outflow_sums_LOAN,outflow_sums_MISCELLANEOUS,outflow_sums_MORTGAGE,outflow_sums_OVERDRAFT,outflow_sums_PETS,outflow_sums_RENT,outflow_sums_RISK_CATCH_ALL,outflow_sums_RTO_LTO,outflow_sums_SELF_TRANSFER,outflow_sums_TAX,outflow_sums_TRANSPORATION,outflow_sums_TRAVEL,inflow_occurrences_CORPORATE_PAYMENTS,inflow_occurrences_DEPOSIT,inflow_occurrences_EXTERNAL_TRANSFER,inflow_occurrences_GAMBLING,inflow_occurrences_INSURANCE,inflow_occurrences_INVESTMENT,inflow_occurrences_INVESTMENT_INCOME,inflow_occurrences_LOAN,inflow_occurrences_MISCELLANEOUS,inflow_occurrences_OTHER_BENEFITS,inflow_occurrences_PAYCHECK,inflow_occurrences_PENSION,inflow_occurrences_REFUND,inflow_occurrences_SELF_TRANSFER,inflow_occurrences_TAX,inflow_occurrences_TIME_OR_STUFF,inflow_occurrences_UNEMPLOYMENT_BENEFITS,inflow_sums_CORPORATE_PAYMENTS,inflow_sums_DEPOSIT,inflow_sums_EXTERNAL_TRANSFER,inflow_sums_GAMBLING,inflow_sums_INSURANCE,inflow_sums_INVESTMENT,inflow_sums_INVESTMENT_INCOME,inflow_sums_LOAN,inflow_sums_MISCELLANEOUS,inflow_sums_OTHER_BENEFITS,inflow_sums_PAYCHECK,inflow_sums_PENSION,inflow_sums_REFUND,inflow_sums_SELF_TRANSFER,inflow_sums_TAX,inflow_sums_TIME_OR_STUFF,inflow_sums_UNEMPLOYMENT_BENEFITS,account_type_401K,account_type_AUTO,account_type_BROKERAGE,account_type_CASH MANAGEMENT,account_type_CD,account_type_CHECKING,account_type_CONSUMER,account_type_CREDIT CARD,account_type_HOME EQUITY,account_type_HSA,account_type_IRA,account_type_LINE OF CREDIT,account_type_LOAN,account_type_MONEY MARKET,account_type_MONEYMARKET,account_type_MORTGAGE,account_type_OTHER,account_type_OVERDRAFT,account_type_PREPAID,account_type_RETIREMENT,account_type_ROTH,account_type_SAVINGS,account_type_STOCK PLAN,account_type_STUDENT
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222,0.0,3.0,21.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,9.0,6.0,214.0,0.0,26.0,0.0,0.0,25.0,2.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,5.0,0.0,0.0,0.0,15.0,0.0,1.0,2.0,0.00,540.00,527.15,0.0,1980.00,0.00,0.00,0.0,0.0,0.00,0.0,0.00,151.71,0.00,1741.10,150.6,4079.35,0.00,1974.00,0.0,0.0,997.63,193.38,0.00,0.00,0.00,0.00,0.00,1665.51,0.00,0.0,325.38,0.00,0.0,0.0,471.37,0.00,2.48,108.75,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,9.0,0.0,1.0,8.0,3.0,0.0,0.0,0.0,500.00,228.75,0.0,0.00,0.00,0.00,0.00,1.63,0.00,8820.56,0.0,19.96,2212.40,2603.52,0.00,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027,0.0,35.0,7.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,1.0,0.0,18.0,0.0,54.0,0.0,0.0,24.0,3.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,10.0,0.0,0.0,0.0,39.0,0.0,2.0,0.0,0.00,6999.13,195.18,0.0,0.00,0.00,251.43,0.0,0.0,0.00,0.0,0.00,275.25,0.00,2.42,0.0,337.00,0.00,3314.53,0.0,0.0,427.27,57.76,0.00,0.00,0.00,0.00,0.00,1773.26,0.00,0.0,310.34,0.00,0.0,0.0,9103.00,0.00,51.80,0.00,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,14.0,0.0,1.0,39.0,2.0,0.0,0.0,0.0,1492.95,0.00,0.0,0.00,0.00,0.00,0.00,61.39,0.00,11918.64,0.0,2.42,9103.00,2325.40,0.00,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,2,2021-05-01,680.0,0.0,2805.36,0.245748,-0.099394,0.0,9.0,44.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,3.0,22.0,0.0,71.0,0.0,39.0,0.0,0.0,13.0,8.0,9.0,6.0,3.0,1.0,0.0,85.0,0.0,0.0,3.0,0.0,0.0,0.0,38.0,0.0,2.0,3.0,0.00,6240.00,1005.74,0.0,0.00,160.00,0.00,0.0,0.0,0.00,0.0,8.57,246.83,210.00,785.55,0.0,1317.26,0.00,1363.33,0.0,0.0,211.36,201.91,394.60,904.30,500.00,16.97,0.00,3908.95,0.00,0.0,16.46,0.00,0.0,0.0,4426.75,0.00,24.50,391.50,0.0,2.0,6.0,0.0,2.0,2.0,0.0,0.0,18.0,0.0,0.0,0.0,2.0,47.0,2.0,0.0,0.0,0.0,1100.00,64.00,0.0,4.87,0.52,0.00,0.00,5.20,0.00,0.00,0.0,56.48,15439.16,6094.48,0.00,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,3,2021-03-01,734.0,0.0,7667.01,0.951391,-0.007780,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,6.0,24.0,0.0,17.0,0.0,60.0,0.0,0.0,4.0,3.0,1.0,0.0,4.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,1.0,50.0,0.0,0.00,113.50,53.85,0.0,0.00,60.00,0.00,0.0,0.0,0.00,0.0,0.00,214.12,335.85,4076.35,0.0,500.49,0.00,2487.61,0.0,0.0,106.84,105.00,24.13,0.00,150.72,0.00,0.00,504.61,0.00,0.0,0.00,0.00,0.0,0.0,9782.19,44.04,1286.71,0.00,0.0,2.0,2.0,0.0,0.0,2.0,0.0,0.0,12.0,0.0,3.0,0.0,2.0,23.0,2.0,0.0,3.0,0.0,1030.00,37.75,0.0,0.00,0.72,0.00,0.00,4.90,0.00,4047.81,0.0,37.88,9782.19,2000.00,0.00,5700.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,4,2021-10-01,676.0,0.0,394.55,0.193478,-0.144824,0.0,10.0,29.0,0.0,0.0,10.0,0.0,0.0,0.0,2.0,0.0,2.0,14.0,5.0,5.0,0.0,15.0,5.0,8.0,0.0,0.0,54.0,4.0,14.0,5.0,0.0,0.0,0.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,2.0,5.0,0.00,3540.00,966.43,0.0,0.00,88.04,0.00,0.0,0.0,225.48,0.0,23.97,604.01,820.56,612.67,0.0,269.03,495.98,486.88,0.0,0.0,2274.83,75.56,336.82,343.17,0.00,0.00,0.00,2720.78,0.00,0.0,0.00,0.00,0.0,0.0,2802.27,0.00,150.00,673.23,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,5.0,7.0,0.0,0.0,17.0,0.0,54.44,0.00,0.0,0.00,0.00,0.00,0.00,15.17,0.00,0.00,0.0,116.50,2760.00,0.00,0.00,12020.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13004,14991,2022-01-26,667.0,,1092.69,0.075871,-0.131668,5.0,3.0,27.0,0.0,0.0,6.0,2.0,0.0,0.0,7.0,0.0,0.0,62.0,12.0,32.0,0.0,45.0,0.0,306.0,0.0,0.0,17.0,12.0,4.0,13.0,0.0,0.0,3.0,28.0,0.0,1.0,2.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,1.01,160.00,239.12,0.0,0.00,640.19,118.54,0.0,0.0,1126.90,0.0,0.00,1052.31,885.61,3028.47,0.0,432.15,0.00,5016.07,0.0,0.0,258.88,151.11,94.30,401.76,0.00,0.00,368.42,603.61,0.00,36.0,50.42,0.00,0.0,0.0,220.00,0.00,46.37,0.00,0.0,10.0,21.0,0.0,2.0,0.0,0.0,0.0,4.0,0.0,5.0,8.0,21.0,10.0,0.0,1.0,0.0,0.0,656.00,596.85,0.0,22.58,0.00,0.00,0.00,105.15,0.00,170.00,12368.0,697.20,289.21,0.00,12.66,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
13005,14992,2022-03-14,709.0,,29164.00,0.624704,0.397316,4.0,5.0,167.0,0.0,11.0,22.0,24.0,0.0,0.0,29.0,0.0,25.0,8.0,20.0,148.0,0.0,148.0,0.0,219.0,0.0,0.0,35.0,19.0,7.0,37.0,0.0,1.0,91.0,114.0,7.0,10.0,32.0,9.0,0.0,0.0,106.0,0.0,63.0,0.0,47.80,4373.00,4425.45,0.0,2163.65,4041.09,2885.07,0.0,0.0,5864.29,0.0,8276.37,749.67,1774.54,22865.19,0.0,2985.82,0.00,20344.77,0.0,0.0,1854.51,296.46,307.82,3745.53,0.00,555.00,15865.05,18759.18,10206.84,350.0,2561.17,2818.87,0.0,0.0,2370.96,0.00,1305.38,0.00,0.0,14.0,73.0,0.0,1.0,1.0,0.0,36.0,2.0,6.0,24.0,0.0,16.0,0.0,1.0,0.0,0.0,0.0,10348.50,68631.24,0.0,74.78,142.87,0.00,16474.59,443.26,13319.31,56792.17,0.0,2300.58,0.00,1936.00,0.00,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13006,14994,2022-01-15,616.0,,5464.18,-0.590569,-0.049291,29.0,9.0,22.0,0.0,4.0,8.0,8.0,0.0,0.0,93.0,0.0,4.0,39.0,0.0,95.0,0.0,149.0,0.0,99.0,0.0,0.0,44.0,35.0,3.0,0.0,0.0,0.0,17.0,64.0,4.0,0.0,2.0,0.0,0.0,0.0,75.0,0.0,2.0,1.0,192.47,1148.00,624.31,0.0,157.40,164.87,139.33,0.0,0.0,23204.15,0.0,1344.00,708.72,0.00,30818.74,0.0,3262.85,0.00,5509.92,0.0,0.0,2753.91,1351.91,113.44,0.00,0.00,0.00,11347.72,11934.59,7609.16,0.0,98.89,0.00,0.0,0.0,23441.00,0.00,41.75,66.00,0.0,26.0,9.0,0.0,0.0,0.0,0.0,0.0,24.0,8.0,20.0,0.0,18.0,90.0,6.0,5.0,0.0,0.0,13727.13,655.55,0.0,0.00,0.00,0.00,0.00,55550.94,8282.00,1942.64,0.0,218.15,29321.00,4800.00,285.66,0.0,0,0,0,0,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
13007,14996,2022-01-15,625.0,,6821.92,-0.472962,-0.023705,17.0,5.0,37.0,0.0,2.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,13.0,1.0,21.0,0.0,29.0,0.0,24.0,0.0,0.0,10.0,5.0,3.0,0.0,2.0,0.0,0.0,35.0,0.0,0.0,3.0,2.0,0.0,0.0,25.0,0.0,4.0,1.0,102.10,280.00,1798.06,0.0,87.37,0.00,0.00,445.0,0.0,0.00,0.0,0.00,410.32,54.99,2458.78,0.0,953.49,0.00,3474.03,0.0,0.0,240.65,1989.84,53.22,0.00,450.00,0.00,0.00,4575.33,0.00,0.0,163.10,1432.50,0.0,0.0,41887.19,0.00,15.25,119.60,0.0,28.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,21.0,0.0,3.0,20.0,0.0,0.0,3.0,0.0,12141.12,0.00,0.0,0.00,0.00,0.22,0.00,0.00,0.00,17721.24,0.0,765.06,29567.19,0.00,0.00,1674.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [None]:
# # List of account types for borrowing money
# negative_balance_types = [
#     "CREDIT CARD",
#     "LOAN",
#     "LINE OF CREDIT",
#     "AUTO",
#     "MORTGAGE",
#     "STUDENT",
#     "OVERDRAFT",
#     "HOME EQUITY",
# ]


# # Function to adjust balances (negative for borrowing account types)
# def adjust_balance(row):
#     if row["account_type"] in negative_balance_types:
#         return -row["balance"]
#     return row["balance"]


# df["adjusted_balance"] = df.apply(adjust_balance, axis=1)

# # Pivot the data to create columns for each account type, grouping by prism_consumer_id
# grouped_df = df.pivot_table(
#     index="prism_consumer_id",
#     columns="account_type",
#     values="adjusted_balance",
#     aggfunc="sum",  # Sum balances if there are multiple rows for the same consumer and account type
#     fill_value=0,
# )

# consumer_features = consumer_features.merge(grouped_df, on="prism_consumer_id")
# consumer_features

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance,outflow_occurrences_ACCOUNT_FEES,outflow_occurrences_ATM_CASH,outflow_occurrences_AUTOMOTIVE,outflow_occurrences_AUTO_LOAN,outflow_occurrences_BANKING_CATCH_ALL,outflow_occurrences_BILLS_UTILITIES,outflow_occurrences_BNPL,outflow_occurrences_CHILD_DEPENDENTS,outflow_occurrences_CORPORATE_PAYMENTS,outflow_occurrences_CREDIT_CARD_PAYMENT,outflow_occurrences_DEBT,outflow_occurrences_EDUCATION,outflow_occurrences_ENTERTAINMENT,outflow_occurrences_ESSENTIAL_SERVICES,outflow_occurrences_EXTERNAL_TRANSFER,outflow_occurrences_FITNESS,outflow_occurrences_FOOD_AND_BEVERAGES,outflow_occurrences_GAMBLING,outflow_occurrences_GENERAL_MERCHANDISE,outflow_occurrences_GIFTS_DONATIONS,outflow_occurrences_GOVERNMENT_SERVICES,outflow_occurrences_GROCERIES,outflow_occurrences_HEALTHCARE_MEDICAL,outflow_occurrences_HOME_IMPROVEMENT,outflow_occurrences_INSURANCE,outflow_occurrences_INVESTMENT,outflow_occurrences_LEGAL,outflow_occurrences_LOAN,outflow_occurrences_MISCELLANEOUS,outflow_occurrences_MORTGAGE,outflow_occurrences_OVERDRAFT,outflow_occurrences_PETS,outflow_occurrences_RENT,outflow_occurrences_RISK_CATCH_ALL,outflow_occurrences_RTO_LTO,outflow_occurrences_SELF_TRANSFER,outflow_occurrences_TAX,outflow_occurrences_TRANSPORATION,outflow_occurrences_TRAVEL,outflow_sums_ACCOUNT_FEES,outflow_sums_ATM_CASH,outflow_sums_AUTOMOTIVE,outflow_sums_AUTO_LOAN,outflow_sums_BANKING_CATCH_ALL,outflow_sums_BILLS_UTILITIES,outflow_sums_BNPL,outflow_sums_CHILD_DEPENDENTS,outflow_sums_CORPORATE_PAYMENTS,outflow_sums_CREDIT_CARD_PAYMENT,outflow_sums_DEBT,outflow_sums_EDUCATION,outflow_sums_ENTERTAINMENT,outflow_sums_ESSENTIAL_SERVICES,outflow_sums_EXTERNAL_TRANSFER,outflow_sums_FITNESS,outflow_sums_FOOD_AND_BEVERAGES,outflow_sums_GAMBLING,outflow_sums_GENERAL_MERCHANDISE,outflow_sums_GIFTS_DONATIONS,outflow_sums_GOVERNMENT_SERVICES,outflow_sums_GROCERIES,outflow_sums_HEALTHCARE_MEDICAL,outflow_sums_HOME_IMPROVEMENT,outflow_sums_INSURANCE,outflow_sums_INVESTMENT,outflow_sums_LEGAL,outflow_sums_LOAN,outflow_sums_MISCELLANEOUS,outflow_sums_MORTGAGE,outflow_sums_OVERDRAFT,outflow_sums_PETS,outflow_sums_RENT,outflow_sums_RISK_CATCH_ALL,outflow_sums_RTO_LTO,outflow_sums_SELF_TRANSFER,outflow_sums_TAX,outflow_sums_TRANSPORATION,outflow_sums_TRAVEL,inflow_occurrences_CORPORATE_PAYMENTS,inflow_occurrences_DEPOSIT,inflow_occurrences_EXTERNAL_TRANSFER,inflow_occurrences_GAMBLING,inflow_occurrences_INSURANCE,inflow_occurrences_INVESTMENT,inflow_occurrences_INVESTMENT_INCOME,inflow_occurrences_LOAN,inflow_occurrences_MISCELLANEOUS,inflow_occurrences_OTHER_BENEFITS,inflow_occurrences_PAYCHECK,inflow_occurrences_PENSION,inflow_occurrences_REFUND,inflow_occurrences_SELF_TRANSFER,inflow_occurrences_TAX,inflow_occurrences_TIME_OR_STUFF,inflow_occurrences_UNEMPLOYMENT_BENEFITS,inflow_sums_CORPORATE_PAYMENTS,inflow_sums_DEPOSIT,inflow_sums_EXTERNAL_TRANSFER,inflow_sums_GAMBLING,inflow_sums_INSURANCE,inflow_sums_INVESTMENT,inflow_sums_INVESTMENT_INCOME,inflow_sums_LOAN,inflow_sums_MISCELLANEOUS,inflow_sums_OTHER_BENEFITS,inflow_sums_PAYCHECK,inflow_sums_PENSION,inflow_sums_REFUND,inflow_sums_SELF_TRANSFER,inflow_sums_TAX,inflow_sums_TIME_OR_STUFF,inflow_sums_UNEMPLOYMENT_BENEFITS,401K,AUTO,BROKERAGE,CASH MANAGEMENT,CD,CHECKING,CONSUMER,CREDIT CARD,HOME EQUITY,HSA,IRA,LINE OF CREDIT,LOAN,MONEY MARKET,MONEYMARKET,MORTGAGE,OTHER,OVERDRAFT,PREPAID,RETIREMENT,ROTH,SAVINGS,STOCK PLAN,STUDENT
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222,0.0,3.0,21.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,9.0,6.0,214.0,0.0,26.0,0.0,0.0,25.0,2.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,5.0,0.0,0.0,0.0,15.0,0.0,1.0,2.0,0.00,540.00,527.15,0.0,1980.00,0.00,0.00,0.0,0.0,0.00,0.0,0.00,151.71,0.00,1741.10,150.6,4079.35,0.00,1974.00,0.0,0.0,997.63,193.38,0.00,0.00,0.00,0.00,0.00,1665.51,0.00,0.0,325.38,0.00,0.0,0.0,471.37,0.00,2.48,108.75,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,9.0,0.0,1.0,8.0,3.0,0.0,0.0,0.0,500.00,228.75,0.0,0.00,0.00,0.00,0.00,1.63,0.00,8820.56,0.0,19.96,2212.40,2603.52,0.00,0.0,0.0,0.0,0.0,0.0,0.0,294.67,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.70,0.0,0.0
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027,0.0,35.0,7.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,1.0,0.0,18.0,0.0,54.0,0.0,0.0,24.0,3.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,10.0,0.0,0.0,0.0,39.0,0.0,2.0,0.0,0.00,6999.13,195.18,0.0,0.00,0.00,251.43,0.0,0.0,0.00,0.0,0.00,275.25,0.00,2.42,0.0,337.00,0.00,3314.53,0.0,0.0,427.27,57.76,0.00,0.00,0.00,0.00,0.00,1773.26,0.00,0.0,310.34,0.00,0.0,0.0,9103.00,0.00,51.80,0.00,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,14.0,0.0,1.0,39.0,2.0,0.0,0.0,0.0,1492.95,0.00,0.0,0.00,0.00,0.00,0.00,61.39,0.00,11918.64,0.0,2.42,9103.00,2325.40,0.00,0.0,0.0,0.0,0.0,0.0,0.0,91.24,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3211.18,0.0,0.0
2,2,2021-05-01,680.0,0.0,2805.36,0.245748,-0.099394,0.0,9.0,44.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,3.0,22.0,0.0,71.0,0.0,39.0,0.0,0.0,13.0,8.0,9.0,6.0,3.0,1.0,0.0,85.0,0.0,0.0,3.0,0.0,0.0,0.0,38.0,0.0,2.0,3.0,0.00,6240.00,1005.74,0.0,0.00,160.00,0.00,0.0,0.0,0.00,0.0,8.57,246.83,210.00,785.55,0.0,1317.26,0.00,1363.33,0.0,0.0,211.36,201.91,394.60,904.30,500.00,16.97,0.00,3908.95,0.00,0.0,16.46,0.00,0.0,0.0,4426.75,0.00,24.50,391.50,0.0,2.0,6.0,0.0,2.0,2.0,0.0,0.0,18.0,0.0,0.0,0.0,2.0,47.0,2.0,0.0,0.0,0.0,1100.00,64.00,0.0,4.87,0.52,0.00,0.00,5.20,0.00,0.00,0.0,56.48,15439.16,6094.48,0.00,0.0,0.0,0.0,0.0,0.0,0.0,243.93,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2561.43,0.0,0.0
3,3,2021-03-01,734.0,0.0,7667.01,0.951391,-0.007780,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,6.0,24.0,0.0,17.0,0.0,60.0,0.0,0.0,4.0,3.0,1.0,0.0,4.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,1.0,50.0,0.0,0.00,113.50,53.85,0.0,0.00,60.00,0.00,0.0,0.0,0.00,0.0,0.00,214.12,335.85,4076.35,0.0,500.49,0.00,2487.61,0.0,0.0,106.84,105.00,24.13,0.00,150.72,0.00,0.00,504.61,0.00,0.0,0.00,0.00,0.0,0.0,9782.19,44.04,1286.71,0.00,0.0,2.0,2.0,0.0,0.0,2.0,0.0,0.0,12.0,0.0,3.0,0.0,2.0,23.0,2.0,0.0,3.0,0.0,1030.00,37.75,0.0,0.00,0.72,0.00,0.00,4.90,0.00,4047.81,0.0,37.88,9782.19,2000.00,0.00,5700.0,0.0,0.0,0.0,0.0,0.0,976.82,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6690.19,0.0,0.0
4,4,2021-10-01,676.0,0.0,394.55,0.193478,-0.144824,0.0,10.0,29.0,0.0,0.0,10.0,0.0,0.0,0.0,2.0,0.0,2.0,14.0,5.0,5.0,0.0,15.0,5.0,8.0,0.0,0.0,54.0,4.0,14.0,5.0,0.0,0.0,0.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,2.0,5.0,0.00,3540.00,966.43,0.0,0.00,88.04,0.00,0.0,0.0,225.48,0.0,23.97,604.01,820.56,612.67,0.0,269.03,495.98,486.88,0.0,0.0,2274.83,75.56,336.82,343.17,0.00,0.00,0.00,2720.78,0.00,0.0,0.00,0.00,0.0,0.0,2802.27,0.00,150.00,673.23,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,5.0,7.0,0.0,0.0,17.0,0.0,54.44,0.00,0.0,0.00,0.00,0.00,0.00,15.17,0.00,0.00,0.0,116.50,2760.00,0.00,0.00,12020.0,0.0,0.0,0.0,0.0,0.0,391.62,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.93,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13004,14991,2022-01-26,667.0,,1092.69,0.075871,-0.131668,5.0,3.0,27.0,0.0,0.0,6.0,2.0,0.0,0.0,7.0,0.0,0.0,62.0,12.0,32.0,0.0,45.0,0.0,306.0,0.0,0.0,17.0,12.0,4.0,13.0,0.0,0.0,3.0,28.0,0.0,1.0,2.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,1.01,160.00,239.12,0.0,0.00,640.19,118.54,0.0,0.0,1126.90,0.0,0.00,1052.31,885.61,3028.47,0.0,432.15,0.00,5016.07,0.0,0.0,258.88,151.11,94.30,401.76,0.00,0.00,368.42,603.61,0.00,36.0,50.42,0.00,0.0,0.0,220.00,0.00,46.37,0.00,0.0,10.0,21.0,0.0,2.0,0.0,0.0,0.0,4.0,0.0,5.0,8.0,21.0,10.0,0.0,1.0,0.0,0.0,656.00,596.85,0.0,22.58,0.00,0.00,0.00,105.15,0.00,170.00,12368.0,697.20,289.21,0.00,12.66,0.0,0.0,0.0,0.0,0.0,0.0,1091.68,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.01,0.0,0.0
13005,14992,2022-03-14,709.0,,29164.00,0.624704,0.397316,4.0,5.0,167.0,0.0,11.0,22.0,24.0,0.0,0.0,29.0,0.0,25.0,8.0,20.0,148.0,0.0,148.0,0.0,219.0,0.0,0.0,35.0,19.0,7.0,37.0,0.0,1.0,91.0,114.0,7.0,10.0,32.0,9.0,0.0,0.0,106.0,0.0,63.0,0.0,47.80,4373.00,4425.45,0.0,2163.65,4041.09,2885.07,0.0,0.0,5864.29,0.0,8276.37,749.67,1774.54,22865.19,0.0,2985.82,0.00,20344.77,0.0,0.0,1854.51,296.46,307.82,3745.53,0.00,555.00,15865.05,18759.18,10206.84,350.0,2561.17,2818.87,0.0,0.0,2370.96,0.00,1305.38,0.00,0.0,14.0,73.0,0.0,1.0,1.0,0.0,36.0,2.0,6.0,24.0,0.0,16.0,0.0,1.0,0.0,0.0,0.0,10348.50,68631.24,0.0,74.78,142.87,0.00,16474.59,443.26,13319.31,56792.17,0.0,2300.58,0.00,1936.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,29164.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
13006,14994,2022-01-15,616.0,,5464.18,-0.590569,-0.049291,29.0,9.0,22.0,0.0,4.0,8.0,8.0,0.0,0.0,93.0,0.0,4.0,39.0,0.0,95.0,0.0,149.0,0.0,99.0,0.0,0.0,44.0,35.0,3.0,0.0,0.0,0.0,17.0,64.0,4.0,0.0,2.0,0.0,0.0,0.0,75.0,0.0,2.0,1.0,192.47,1148.00,624.31,0.0,157.40,164.87,139.33,0.0,0.0,23204.15,0.0,1344.00,708.72,0.00,30818.74,0.0,3262.85,0.00,5509.92,0.0,0.0,2753.91,1351.91,113.44,0.00,0.00,0.00,11347.72,11934.59,7609.16,0.0,98.89,0.00,0.0,0.0,23441.00,0.00,41.75,66.00,0.0,26.0,9.0,0.0,0.0,0.0,0.0,0.0,24.0,8.0,20.0,0.0,18.0,90.0,6.0,5.0,0.0,0.0,13727.13,655.55,0.0,0.00,0.00,0.00,0.00,55550.94,8282.00,1942.64,0.0,218.15,29321.00,4800.00,285.66,0.0,0.0,0.0,0.0,0.0,0.0,5058.99,0.0,-230.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,175.00,0.0,0.0
13007,14996,2022-01-15,625.0,,6821.92,-0.472962,-0.023705,17.0,5.0,37.0,0.0,2.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,13.0,1.0,21.0,0.0,29.0,0.0,24.0,0.0,0.0,10.0,5.0,3.0,0.0,2.0,0.0,0.0,35.0,0.0,0.0,3.0,2.0,0.0,0.0,25.0,0.0,4.0,1.0,102.10,280.00,1798.06,0.0,87.37,0.00,0.00,445.0,0.0,0.00,0.0,0.00,410.32,54.99,2458.78,0.0,953.49,0.00,3474.03,0.0,0.0,240.65,1989.84,53.22,0.00,450.00,0.00,0.00,4575.33,0.00,0.0,163.10,1432.50,0.0,0.0,41887.19,0.00,15.25,119.60,0.0,28.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,21.0,0.0,3.0,20.0,0.0,0.0,3.0,0.0,12141.12,0.00,0.0,0.00,0.00,0.22,0.00,0.00,0.00,17721.24,0.0,765.06,29567.19,0.00,0.00,1674.0,0.0,0.0,0.0,0.0,0.0,2084.04,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4737.88,0.0,0.0


Testing combined outflow/inflow occurences and sums as well as one hot encoded features for type of account

In [80]:
classification_features = consumer_features.columns[4:].drop('std_credit')
run_classification2(classification_features, "DQ_TARGET", consumer_features)


Logistic Regression Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78      1802
         1.0       0.16      0.66      0.26       180

    accuracy                           0.66      1982
   macro avg       0.56      0.66      0.52      1982
weighted avg       0.88      0.66      0.73      1982

ROC-AUC Score: 0.704

Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94      1802
         1.0       0.27      0.14      0.19       180

    accuracy                           0.89      1982
   macro avg       0.59      0.55      0.56      1982
weighted avg       0.86      0.89      0.87      1982

ROC-AUC Score: 0.761
[LightGBM] [Info] Number of positive: 7239, number of negative: 7239
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006546 seconds.
You can set `force_row_wise=true` to remove the o

  c /= stddev[:, None]
  c /= stddev[None, :]



Top 20 Features with Correlation Direction:
                               Feature  Importance  Correlation
                               balance    0.045148    -0.038697
      outflow_occurrences_ACCOUNT_FEES    0.038146     0.023709
                           std_balance    0.036229    -0.038697
                  account_type_SAVINGS    0.028457    -0.110817
             outflow_sums_ACCOUNT_FEES    0.028121     0.009541
                outflow_sums_OVERDRAFT    0.027683     0.025399
                              CHECKING    0.023124    -0.031861
         outflow_occurrences_OVERDRAFT    0.022937     0.075701
      inflow_occurrences_MISCELLANEOUS    0.018988    -0.041972
                     outflow_sums_BNPL    0.018800     0.036570
                     outflow_sums_LOAN    0.017230    -0.015584
                               SAVINGS    0.017055    -0.047150
  inflow_occurrences_EXTERNAL_TRANSFER    0.016210     0.054128
         outflow_occurrences_INSURANCE    0.015708    -0.04

In [144]:
import sys
import os

sys.path.append(os.path.abspath("../src/base"))
# from classify import run_classification
from classify import run_classification

In [145]:
classification_features = consumer_features.columns[4:]
run_classification(classification_features, "DQ_TARGET", consumer_features)

NameError: name 'UndefinedMetricWarning' is not defined