In [1]:
import numpy as np
import pandas as pd
import warnings
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import xgboost as xgb

In [2]:
categories = pd.read_csv('../../data_q2/q2-ucsd-cat-map.csv')
consumer = pd.read_parquet('../../data_q2/q2-ucsd-consDF.pqt')
acct = pd.read_parquet('../../data_q2/q2-ucsd-acctDF.pqt')
transactions = pd.read_parquet('../../data_q2/q2-ucsd-trxnDF.pqt')

In [3]:
consumer.head()

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET
0,0,2021-09-01,726.0,0.0
1,1,2021-07-01,626.0,0.0
2,2,2021-05-01,680.0,0.0
3,3,2021-03-01,734.0,0.0
4,4,2021-10-01,676.0,0.0


In [4]:
transactions.head()

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16


In [5]:
categories

Unnamed: 0,category_id,category
0,0,SELF_TRANSFER
1,1,EXTERNAL_TRANSFER
2,2,DEPOSIT
3,3,PAYCHECK
4,4,MISCELLANEOUS
5,5,PAYCHECK_PLACEHOLDER
6,6,REFUND
7,7,INVESTMENT_INCOME
8,8,OTHER_BENEFITS
9,9,UNEMPLOYMENT_BENEFITS


In [6]:
acct.head()

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
0,3023,0,SAVINGS,2021-08-31,90.57
1,3023,1,CHECKING,2021-08-31,225.95
2,4416,2,SAVINGS,2022-03-31,15157.17
3,4416,3,CHECKING,2022-03-31,66.42
4,4227,4,CHECKING,2021-07-31,7042.9


In [7]:
len(acct['prism_consumer_id'].unique())

13009

In [8]:
# merge 
acct['account_type'].unique()

# most important account_types: ['CHECKING', 'SAVINGS', 'CREDIT CARD', 'LOAN]
# most_important_accounts = ['CHECKING', 'SAVINGS', 'CREDIT CARD', 'LOAN']
acctDF = acct.copy()
total_balance = acctDF.groupby('prism_consumer_id')['balance'].sum()
total_balance

pd.DataFrame(total_balance) #get total balance for each consumer

Unnamed: 0_level_0,balance
prism_consumer_id,Unnamed: 1_level_1
0,320.37
1,3302.42
10,824.24
100,2655.47
1000,95.25
...,...
9995,0.00
9996,252.67
9997,611.28
9998,-862.99


In [9]:
consumer_balance = consumer.merge(pd.DataFrame(total_balance), on = 'prism_consumer_id', how = 'outer')
consumer_balance['std_credit'] = (consumer_balance['credit_score'] - consumer_balance['credit_score'].mean()) / consumer_balance['credit_score'].std()
consumer_balance["std_balance"] = (
    consumer_balance["balance"] - consumer_balance["balance"].mean()
    ) / consumer_balance["balance"].std()
consumer_balance

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027
2,10,2022-02-01,654.0,0.0,824.24,-0.094005,-0.136727
3,100,2021-12-01,750.0,0.0,2655.47,1.160470,-0.102219
4,1000,2021-03-01,756.0,0.0,95.25,1.238874,-0.150464
...,...,...,...,...,...,...,...
14995,9995,2023-03-02,578.0,,0.00,-1.087132,-0.152259
14996,9996,2023-10-11,610.0,,252.67,-0.668973,-0.147498
14997,9997,2023-05-25,675.0,,611.28,0.180411,-0.140740
14998,9998,2023-11-02,534.0,,-862.99,-1.662099,-0.168522


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
import warnings

def run_classification(
    feature_column, target_column, dataset, test_size=0.2, random_state=42
):
    """
    Run Logistic Regression and Random Forest Classification on a dataset.

    Parameters:
        feature_column (list): List of columns that can be used as features.
        target_column (str): The name of the target column.
        dataset (pd.DataFrame): The dataset containing the features and target.
        test_size (float): Proportion of the dataset to include in the test split (default 0.3).
        random_state (int): Random seed for reproducibility (default 42).

    Returns:
        None: Prints the classification reports for both models.
    """
    warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

    # Drop NaN values and shuffle the dataset
    dataset = dataset.dropna()
    dataset = dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Define features and target
    X = dataset[feature_column]
    y = dataset[target_column]

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # balance the dataset
    sm = SMOTE(random_state=42)
    X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

    # Logistic Regression
    log_model = LogisticRegression(class_weight = 'balanced')
    log_model.fit(X_train_bal, y_train_bal)
    log_y_pred = log_model.predict(X_test)
    print(f"Logistic Regression for {feature_column}")
    print(classification_report(y_test, log_y_pred))

    # Random Forest Classification
    rfc_model = RandomForestClassifier(random_state=random_state)
    rfc_model.fit(X_train_bal, y_train_bal)
    rfc_y_pred = rfc_model.predict(X_test)
    print(f"\nRandom Forest Classification for {feature_column}")
    print(classification_report(y_test, rfc_y_pred))

    # Light GBM 
    lgb_model = lgb.LGBMClassifier()
    lgb_model.fit(X_train_bal, y_train_bal)
    lgb_y_pred = lgb_model.predict(X_test)
    print(f"\nLGB Model Classification for {feature_column}")
    print(classification_report(y_test, lgb_y_pred))

    # SVC Model
    svc_model = SVC(probability=True, random_state=random_state)
    svc_model.fit(X_train_bal, y_train_bal)
    svc_y_pred = svc_model.predict(X_test)
    print(f"\nSVM Classification for {feature_column}")
    print(classification_report(y_test, svc_y_pred))

    # XGB Model
    xgb_model = xgb.XGBClassifier(
        scale_pos_weight=len(y_train) / y_train.sum()
        )
    xgb_model.fit(X_train_bal, y_train_bal)
    xgb_y_pred = xgb_model.predict(X_test)
    print("\nXGBoost:")
    print(classification_report(y_test, xgb_y_pred))

In [11]:
# Added ROC AUC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
import warnings

def run_classification2(
    feature_column, target_column, dataset, test_size=0.2, random_state=42
):
    """
    Run Logistic Regression, Random Forest, LightGBM, SVM, and XGBoost Classification on a dataset.
    Includes ROC-AUC evaluation for all models.

    Parameters:
        feature_column (list): List of columns that can be used as features.
        target_column (str): The name of the target column.
        dataset (pd.DataFrame): The dataset containing the features and target.
        test_size (float): Proportion of the dataset to include in the test split (default 0.2).
        random_state (int): Random seed for reproducibility (default 42).

    Returns:
        None: Prints the classification reports and ROC-AUC scores for all models.
    """
    warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

    # Drop NaN values and shuffle the dataset
    dataset = dataset.dropna()
    dataset = dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Define features and target
    X = dataset[feature_column]
    y = dataset[target_column]

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Balance the dataset using SMOTE
    sm = SMOTE(random_state=42)
    X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

    # Logistic Regression
    log_model = LogisticRegression(class_weight='balanced')
    log_model.fit(X_train_bal, y_train_bal)
    log_y_pred = log_model.predict(X_test)
    log_y_proba = log_model.predict_proba(X_test)[:, 1]
    print(f"Logistic Regression for {feature_column}")
    print(classification_report(y_test, log_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, log_y_proba):.3f}")

    # Random Forest Classification
    rfc_model = RandomForestClassifier(random_state=random_state)
    rfc_model.fit(X_train_bal, y_train_bal)
    rfc_y_pred = rfc_model.predict(X_test)
    rfc_y_proba = rfc_model.predict_proba(X_test)[:, 1]
    print(f"\nRandom Forest Classification for {feature_column}")
    print(classification_report(y_test, rfc_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, rfc_y_proba):.3f}")

    # LightGBM
    lgb_model = lgb.LGBMClassifier()
    lgb_model.fit(X_train_bal, y_train_bal)
    lgb_y_pred = lgb_model.predict(X_test)
    lgb_y_proba = lgb_model.predict_proba(X_test)[:, 1]
    print(f"\nLightGBM Classification for {feature_column}")
    print(classification_report(y_test, lgb_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, lgb_y_proba):.3f}")

    # SVM Classification
    svc_model = SVC(probability=True, random_state=random_state)
    svc_model.fit(X_train_bal, y_train_bal)
    svc_y_pred = svc_model.predict(X_test)
    svc_y_proba = svc_model.predict_proba(X_test)[:, 1]
    print(f"\nSVM Classification for {feature_column}")
    print(classification_report(y_test, svc_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, svc_y_proba):.3f}")

    # XGBoost Classification
    xgb_model = xgb.XGBClassifier(scale_pos_weight=len(y_train) / y_train.sum())
    xgb_model.fit(X_train_bal, y_train_bal)
    xgb_y_pred = xgb_model.predict(X_test)
    xgb_y_proba = xgb_model.predict_proba(X_test)[:, 1]
    print(f"\nXGBoost Classification for {feature_column}")
    print(classification_report(y_test, xgb_y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, xgb_y_proba):.3f}")


In [12]:
run_classification(["std_credit"], "DQ_TARGET", consumer_balance)

Logistic Regression for ['std_credit']
              precision    recall  f1-score   support

         0.0       0.95      0.74      0.83      1897
         1.0       0.19      0.63      0.29       185

    accuracy                           0.73      2082
   macro avg       0.57      0.68      0.56      2082
weighted avg       0.89      0.73      0.78      2082


Random Forest Classification for ['std_credit']
              precision    recall  f1-score   support

         0.0       0.93      0.81      0.87      1897
         1.0       0.16      0.38      0.23       185

    accuracy                           0.77      2082
   macro avg       0.55      0.60      0.55      2082
weighted avg       0.86      0.77      0.81      2082

[LightGBM] [Info] Number of positive: 7596, number of negative: 7596
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bin

In [13]:
# Example usage:
run_classification2(["std_credit"], "DQ_TARGET", consumer_balance)

Logistic Regression for ['std_credit']
              precision    recall  f1-score   support

         0.0       0.95      0.74      0.83      1897
         1.0       0.19      0.63      0.29       185

    accuracy                           0.73      2082
   macro avg       0.57      0.68      0.56      2082
weighted avg       0.89      0.73      0.78      2082

ROC-AUC Score: 0.750

Random Forest Classification for ['std_credit']
              precision    recall  f1-score   support

         0.0       0.93      0.81      0.87      1897
         1.0       0.16      0.38      0.23       185

    accuracy                           0.77      2082
   macro avg       0.55      0.60      0.55      2082
weighted avg       0.86      0.77      0.81      2082

ROC-AUC Score: 0.662
[LightGBM] [Info] Number of positive: 7596, number of negative: 7596
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000127 seconds.
You can set `force_col_wise=true` to remove

In [14]:
# Example usage:  #features, target, dataframe
run_classification(["std_balance"], "DQ_TARGET", consumer_balance)

Logistic Regression for ['std_balance']
              precision    recall  f1-score   support

         0.0       0.96      0.36      0.52      1897
         1.0       0.11      0.85      0.20       185

    accuracy                           0.40      2082
   macro avg       0.54      0.61      0.36      2082
weighted avg       0.89      0.40      0.49      2082


Random Forest Classification for ['std_balance']
              precision    recall  f1-score   support

         0.0       0.92      0.68      0.79      1897
         1.0       0.11      0.41      0.18       185

    accuracy                           0.66      2082
   macro avg       0.52      0.55      0.48      2082
weighted avg       0.85      0.66      0.73      2082

[LightGBM] [Info] Number of positive: 7596, number of negative: 7596
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total B

In [15]:
run_classification2(["std_balance"], "DQ_TARGET", consumer_balance)

Logistic Regression for ['std_balance']
              precision    recall  f1-score   support

         0.0       0.96      0.36      0.52      1897
         1.0       0.11      0.85      0.20       185

    accuracy                           0.40      2082
   macro avg       0.54      0.61      0.36      2082
weighted avg       0.89      0.40      0.49      2082

ROC-AUC Score: 0.701

Random Forest Classification for ['std_balance']
              precision    recall  f1-score   support

         0.0       0.92      0.68      0.79      1897
         1.0       0.11      0.41      0.18       185

    accuracy                           0.66      2082
   macro avg       0.52      0.55      0.48      2082
weighted avg       0.85      0.66      0.73      2082

ROC-AUC Score: 0.589
[LightGBM] [Info] Number of positive: 7596, number of negative: 7596
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000138 seconds.
You can set `force_col_wise=true` to remo

In [16]:
#NEW FEATURES: credit and debit

credit_transactions = transactions[transactions['credit_or_debit'] == 'CREDIT']
debit_transactions = transactions[transactions['credit_or_debit'] == 'DEBIT']

credit_features = credit_transactions.groupby('prism_consumer_id').agg(
    total_credit=('amount', 'sum')
).reset_index()

debit_features = debit_transactions.groupby('prism_consumer_id').agg(
    total_debit=('amount', 'sum')
).reset_index()

credit_debit_features = credit_features.merge(debit_features, on='prism_consumer_id', how='outer').fillna(0)
consumer_features = consumer.merge(credit_debit_features, on='prism_consumer_id', how='left').fillna(0)

run_classification2(['total_credit', 'total_debit'], 'DQ_TARGET', consumer_features)

Logistic Regression for ['total_credit', 'total_debit']
              precision    recall  f1-score   support

         0.0       0.96      0.36      0.52      2784
         1.0       0.09      0.79      0.16       216

    accuracy                           0.39      3000
   macro avg       0.52      0.57      0.34      3000
weighted avg       0.89      0.39      0.49      3000

ROC-AUC Score: 0.604

Random Forest Classification for ['total_credit', 'total_debit']
              precision    recall  f1-score   support

         0.0       0.94      0.76      0.84      2784
         1.0       0.11      0.39      0.17       216

    accuracy                           0.73      3000
   macro avg       0.53      0.57      0.51      3000
weighted avg       0.88      0.73      0.79      3000

ROC-AUC Score: 0.600
[LightGBM] [Info] Number of positive: 11210, number of negative: 11210
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can

In [17]:
credit_features

Unnamed: 0,prism_consumer_id,total_credit
0,0,14386.82
1,1,24903.80
2,10,20576.56
3,100,35236.84
4,1000,78353.07
...,...,...
14471,9995,16925.84
14472,9996,1200.03
14473,9997,17206.11
14474,9998,14566.37


In [18]:
consumer_balance['DQ_TARGET'].value_counts()

DQ_TARGET
0.0    10994
1.0     1006
Name: count, dtype: int64

In [19]:
consumer_balance.isna().sum()

prism_consumer_id       0
evaluation_date         0
credit_score            0
DQ_TARGET            3000
balance              1991
std_credit              0
std_balance          1991
dtype: int64

In [20]:
new_data = consumer_balance.dropna()
new_data['DQ_TARGET'].value_counts()

DQ_TARGET
0.0    9493
1.0     915
Name: count, dtype: int64

In [21]:
transactions['posted_date'] = pd.to_datetime(transactions['posted_date'])

# sort transactions by consumer ID and date
transactions = transactions.sort_values(by=['prism_consumer_id', 'posted_date'])

def calculate_balances(group):
    balance = 0
    balances = []  # cumulative balances
    for _, row in group.iterrows():
        if row['credit_or_debit'] == 'CREDIT':
            balance += row['amount']  # add for CREDIT
        elif row['credit_or_debit'] == 'DEBIT':
            balance -= row['amount']  # subtract for DEBIT
        balances.append(balance)
    group['calculated_balance'] = balances
    return group

transactions_with_balances = transactions.groupby('prism_consumer_id').apply(calculate_balances)

transactions_with_balances


  transactions_with_balances = transactions.groupby('prism_consumer_id').apply(calculate_balances)


Unnamed: 0_level_0,Unnamed: 1_level_0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date,calculated_balance
prism_consumer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,136802,0,136738,14,27.62,DEBIT,2021-03-16,-27.62
0,136767,0,136703,11,1400.00,CREDIT,2021-03-17,1372.38
0,136803,0,136739,39,25.10,DEBIT,2021-03-17,1347.28
0,136804,0,136740,37,500.00,DEBIT,2021-03-17,847.28
0,136805,0,136741,14,25.00,DEBIT,2021-03-18,822.28
...,...,...,...,...,...,...,...,...
9999,1524647,9999,1522635,16,66.63,DEBIT,2023-08-08,-274.02
9999,1524648,9999,1522636,14,16.91,DEBIT,2023-08-08,-290.93
9999,1524649,9999,1522637,14,3.52,DEBIT,2023-08-08,-294.45
9999,1524650,9999,1522638,16,7.99,DEBIT,2023-08-08,-302.44


In [22]:
# create new features

transactions['days_since'] = (transactions['posted_date'].max() - transactions['posted_date']).dt.days

time_windows = [0, 350, 700, 1050, 1401]

transaction_features = transactions.groupby('prism_consumer_id').apply(lambda group: {
    # total inflow (CREDIT) and outflow (DEBIT)
    'total_credit': group[group['credit_or_debit'] == 'CREDIT']['amount'].sum(),
    'total_debit': group[group['credit_or_debit'] == 'DEBIT']['amount'].sum(),
    # net transaction balance
    'net_transaction_balance': group[group['credit_or_debit'] == 'CREDIT']['amount'].sum() - group[group['credit_or_debit'] == 'DEBIT']['amount'].sum(),
    # spending in different time windows
    **{f'spending_last_{tw}_days': group[group['days_since'] <= tw]['amount'].sum() for tw in time_windows},
    # spending grouped by categories
    **{f'category_{cat}_spending': group[group['category'] == cat]['amount'].sum() for cat in transactions['category'].unique()},
}).apply(pd.Series).reset_index()

consumer_features = consumer.merge(transaction_features, on='prism_consumer_id', how='left')

consumer_features.fillna(0, inplace=True)

print(consumer_features.head())

grouped_balances = acct.groupby('prism_consumer_id').agg({
    # list of all balances
    'balance': lambda x: list(x),
    # count of negative balances
    'balance': lambda x: (np.array(x) < 0).sum(),
    # average balance
    'balance': 'mean',
    # max balance
    'balance': 'max',
    # min balance
    'balance': 'min'
}).reset_index()

grouped_balances.rename(columns={
    'balance': 'all_balances',
    '<lambda_0>': 'negative_balance_count',
    '<lambda_1>': 'average_balance',
    '<lambda_2>': 'max_balance',
    '<lambda_3>': 'min_balance'
}, inplace=True)

print(grouped_balances.head())

# calculate how long each account has been open
acct['balance_date'] = pd.to_datetime(acct['balance_date'], errors='coerce')
acct['days_account_open'] = (acct['balance_date'].max() - acct['balance_date']).dt.days

# add ratio of negative balance count to account age
negative_balance_ratio = acct.groupby('prism_consumer_id').apply(lambda group: {
    'negative_balance_days': (group['balance'] < 0).sum(),
    'days_account_open': (group['balance_date'].max() - group['balance_date'].min()).days,
    'negative_balance_ratio': (group['balance'] < 0).sum() / ((group['balance_date'].max() - group['balance_date'].min()).days + 1)
}).apply(pd.Series).reset_index()

print(negative_balance_ratio.head())

# merge grouped balances and negative balance ratio with consumer features
final_consumer_features = consumer_features.merge(grouped_balances, on='prism_consumer_id', how='left')
final_consumer_features = final_consumer_features.merge(negative_balance_ratio, on='prism_consumer_id', how='left')

final_consumer_features.fillna(0, inplace=True)

print(final_consumer_features.head())

  transaction_features = transactions.groupby('prism_consumer_id').apply(lambda group: {


  prism_consumer_id evaluation_date  credit_score  DQ_TARGET  total_credit  \
0                 0      2021-09-01         726.0        0.0      14386.82   
1                 1      2021-07-01         626.0        0.0      24903.80   
2                 2      2021-05-01         680.0        0.0      22764.71   
3                 3      2021-03-01         734.0        0.0      22641.25   
4                 4      2021-10-01         676.0        0.0      14966.11   

   total_debit  net_transaction_balance  spending_last_0_days  \
0     14908.41                  -521.59                   0.0   
1     23098.37                  1805.43                   0.0   
2     22334.58                   430.13                   0.0   
3     19846.01                  2795.24                   0.0   
4     17509.71                 -2543.60                   0.0   

   spending_last_350_days  spending_last_700_days  ...  category_32_spending  \
0                     0.0                     0.0  ...      

  negative_balance_ratio = acct.groupby('prism_consumer_id').apply(lambda group: {


  prism_consumer_id  negative_balance_days  days_account_open  \
0                 0                    0.0                0.0   
1                 1                    0.0                0.0   
2                10                    0.0                0.0   
3               100                    0.0              610.0   
4              1000                    0.0                0.0   

   negative_balance_ratio  
0                     0.0  
1                     0.0  
2                     0.0  
3                     0.0  
4                     0.0  
  prism_consumer_id evaluation_date  credit_score  DQ_TARGET  total_credit  \
0                 0      2021-09-01         726.0        0.0      14386.82   
1                 1      2021-07-01         626.0        0.0      24903.80   
2                 2      2021-05-01         680.0        0.0      22764.71   
3                 3      2021-03-01         734.0        0.0      22641.25   
4                 4      2021-10-01         676.0  

In [23]:
# suppress all warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# suppress LightGBM logs
os.environ['OMP_NUM_THREADS'] = '1'

feature_columns = [
    'negative_balance_ratio', 'total_credit', 'total_debit',
    'max_balance', 'min_balance', 'mean_balance', 'std_balance', 
    'negative_balance_count', 'account_duration_days'
] + [f'category_{cat}_spending' for cat in transactions_with_balances['category'].unique()] \
  + [f'spending_{tw}d' for tw in time_windows]

existing_feature_columns = [col for col in feature_columns if col in consumer_features.columns]

target_column = 'DQ_TARGET'
dataset = consumer_features.copy()
dataset.dropna(subset=existing_feature_columns + [target_column], inplace=True)

print("Using these feature columns:", existing_feature_columns)

results = []

for feature in existing_feature_columns:
    print(f"--- Testing Single Feature: {feature} ---")

    X = dataset[[feature]]
    y = dataset[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    sm = SMOTE(random_state=42)
    X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

    # Logistic Regression
    log_model = LogisticRegression(class_weight='balanced', max_iter=1000)
    log_model.fit(X_train_bal, y_train_bal)
    log_y_proba = log_model.predict_proba(X_test)[:, 1]
    log_auc = roc_auc_score(y_test, log_y_proba)

    # Random Forest
    rfc_model = RandomForestClassifier(random_state=42)
    rfc_model.fit(X_train_bal, y_train_bal)
    rfc_y_proba = rfc_model.predict_proba(X_test)[:, 1]
    rfc_auc = roc_auc_score(y_test, rfc_y_proba)

    # LightGBM (Suppressing logs with verbose=-1)
    lgb_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
    lgb_model.fit(X_train_bal, y_train_bal)
    lgb_y_proba = lgb_model.predict_proba(X_test)[:, 1]
    lgb_auc = roc_auc_score(y_test, lgb_y_proba)

    # XGBoost (Suppressing logs)
    xgb_model = xgb.XGBClassifier(scale_pos_weight=len(y_train) / y_train.sum(), random_state=42, verbosity=0)
    xgb_model.fit(X_train_bal, y_train_bal)
    xgb_y_proba = xgb_model.predict_proba(X_test)[:, 1]
    xgb_auc = roc_auc_score(y_test, xgb_y_proba)

    results.append({
        'feature': feature,
        'logistic_auc': log_auc,
        'random_forest_auc': rfc_auc,
        'lightgbm_auc': lgb_auc,
        'xgboost_auc': xgb_auc
    })

    print("\n")

results_df = pd.DataFrame(results)
results_df['best_auc'] = results_df[['logistic_auc', 'random_forest_auc', 'lightgbm_auc', 'xgboost_auc']].max(axis=1)
results_df = results_df.sort_values(by='best_auc', ascending=False)

# display top features by ROC-AUC
print("Top Features by ROC-AUC:")
print(results_df[['feature', 'best_auc']])


Using these feature columns: ['total_credit', 'total_debit', 'category_14_spending', 'category_11_spending', 'category_39_spending', 'category_37_spending', 'category_17_spending', 'category_4_spending', 'category_3_spending', 'category_28_spending', 'category_16_spending', 'category_18_spending', 'category_0_spending', 'category_27_spending', 'category_20_spending', 'category_1_spending', 'category_21_spending', 'category_19_spending', 'category_40_spending', 'category_6_spending', 'category_2_spending', 'category_35_spending', 'category_34_spending', 'category_22_spending', 'category_46_spending', 'category_30_spending', 'category_26_spending', 'category_13_spending', 'category_45_spending', 'category_12_spending', 'category_7_spending', 'category_42_spending', 'category_24_spending', 'category_31_spending', 'category_23_spending', 'category_25_spending', 'category_36_spending', 'category_41_spending', 'category_8_spending', 'category_32_spending', 'category_48_spending', 'category_2

In [24]:
dataset

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,total_credit,total_debit,net_transaction_balance,spending_last_0_days,spending_last_350_days,spending_last_700_days,...,category_32_spending,category_48_spending,category_29_spending,category_38_spending,category_9_spending,category_33_spending,category_43_spending,category_49_spending,category_44_spending,category_47_spending
0,0,2021-09-01,726.0,0.0,14386.82,14908.41,-521.59,0.0,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2021-07-01,626.0,0.0,24903.80,23098.37,1805.43,0.0,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2021-05-01,680.0,0.0,22764.71,22334.58,430.13,0.0,0.0,0.0,...,0.0,0.00,8.57,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,2021-03-01,734.0,0.0,22641.25,19846.01,2795.24,0.0,0.0,0.0,...,0.0,0.00,0.00,0.0,5700.0,0.0,0.0,0.0,0.0,0.0
4,4,2021-10-01,676.0,0.0,14966.11,17509.71,-2543.60,0.0,0.0,0.0,...,0.0,0.00,23.97,0.0,12020.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,14995,2022-03-08,655.0,0.0,14236.61,14780.41,-543.80,0.0,0.0,0.0,...,0.0,0.00,523.50,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14996,14996,2022-01-15,625.0,0.0,61868.83,60990.82,878.01,0.0,0.0,0.0,...,0.0,0.00,0.00,0.0,1674.0,445.0,0.0,0.0,0.0,0.0
14997,14997,2022-01-31,688.0,0.0,43859.56,43695.82,163.74,0.0,0.0,0.0,...,0.0,0.00,44.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14998,14998,2022-03-08,722.0,0.0,170828.92,170415.12,413.80,0.0,0.0,0.0,...,12800.0,173.09,15.20,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# count number of users where spending is not zero for each category
category_columns = [col for col in dataset.columns if col.startswith("category_") and col.endswith("_spending")]

nonzero_counts = (dataset[category_columns] != 0).sum()
nonzero_counts_df = pd.DataFrame({"category": category_columns, "num_users_with_spending": nonzero_counts.values})
print(nonzero_counts_df)


                category  num_users_with_spending
0   category_14_spending                    12690
1   category_11_spending                     5163
2   category_39_spending                     3095
3   category_37_spending                     4525
4   category_17_spending                    12300
5    category_4_spending                    13923
6    category_3_spending                    10771
7   category_28_spending                     3665
8   category_16_spending                    13022
9   category_18_spending                    12183
10   category_0_spending                    11275
11  category_27_spending                     9998
12  category_20_spending                    11517
13   category_1_spending                    13269
14  category_21_spending                     6652
15  category_19_spending                    10612
16  category_40_spending                    10013
17   category_6_spending                     9933
18   category_2_spending                    11640


In [26]:
# count number of users where time window spending is not zero
time_window_columns = [col for col in dataset.columns if col.startswith('spending_last_')]

nonzero_spending_counts = dataset[time_window_columns].ne(0).sum()
nonzero_spending_counts_df = nonzero_spending_counts.reset_index()
nonzero_spending_counts_df.columns = ['Time Window', 'Non-Zero Users']
print(nonzero_spending_counts_df)

               Time Window  Non-Zero Users
0     spending_last_0_days               1
1   spending_last_350_days            3047
2   spending_last_700_days            4722
3  spending_last_1050_days           13151
4  spending_last_1401_days           14487


In [27]:
transactions_with_balances

Unnamed: 0_level_0,Unnamed: 1_level_0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date,calculated_balance
prism_consumer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,136802,0,136738,14,27.62,DEBIT,2021-03-16,-27.62
0,136767,0,136703,11,1400.00,CREDIT,2021-03-17,1372.38
0,136803,0,136739,39,25.10,DEBIT,2021-03-17,1347.28
0,136804,0,136740,37,500.00,DEBIT,2021-03-17,847.28
0,136805,0,136741,14,25.00,DEBIT,2021-03-18,822.28
...,...,...,...,...,...,...,...,...
9999,1524647,9999,1522635,16,66.63,DEBIT,2023-08-08,-274.02
9999,1524648,9999,1522636,14,16.91,DEBIT,2023-08-08,-290.93
9999,1524649,9999,1522637,14,3.52,DEBIT,2023-08-08,-294.45
9999,1524650,9999,1522638,16,7.99,DEBIT,2023-08-08,-302.44


In [28]:
transactions_with_balances

Unnamed: 0_level_0,Unnamed: 1_level_0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date,calculated_balance
prism_consumer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,136802,0,136738,14,27.62,DEBIT,2021-03-16,-27.62
0,136767,0,136703,11,1400.00,CREDIT,2021-03-17,1372.38
0,136803,0,136739,39,25.10,DEBIT,2021-03-17,1347.28
0,136804,0,136740,37,500.00,DEBIT,2021-03-17,847.28
0,136805,0,136741,14,25.00,DEBIT,2021-03-18,822.28
...,...,...,...,...,...,...,...,...
9999,1524647,9999,1522635,16,66.63,DEBIT,2023-08-08,-274.02
9999,1524648,9999,1522636,14,16.91,DEBIT,2023-08-08,-290.93
9999,1524649,9999,1522637,14,3.52,DEBIT,2023-08-08,-294.45
9999,1524650,9999,1522638,16,7.99,DEBIT,2023-08-08,-302.44


In [29]:
transactions_with_balances = transactions_with_balances.drop(columns=['prism_consumer_id'])
transactions_with_balances = transactions_with_balances.reset_index()
transactions_with_balances['negative_flag'] = transactions_with_balances['calculated_balance'] < 0

negative_balance_ratio = (
    transactions_with_balances
    .groupby('prism_consumer_id', as_index=False)['negative_flag']
    .mean()
    .rename(columns={'negative_flag': 'negative_balance_ratio'})
)

In [30]:
negative_balance_ratio

Unnamed: 0,prism_consumer_id,negative_balance_ratio
0,0,0.580882
1,1,0.085987
2,10,0.944282
3,100,0.975410
4,1000,0.037915
...,...,...
14487,9995,0.305466
14488,9996,0.833333
14489,9997,0.272031
14490,9998,0.268182


In [31]:
# How many times the balance went negative within span of how long account has been created for
transactions_with_balances['posted_date'] = pd.to_datetime(transactions_with_balances['posted_date'])

account_age = transactions_with_balances.groupby('prism_consumer_id')['posted_date'].agg(
    lambda x: (x.max() - x.min()).days
).reset_index().rename(columns={'posted_date': 'account_duration_days'})

negative_balance_days = transactions_with_balances.groupby('prism_consumer_id').apply(
    lambda group: group['negative_flag'].sum()
).reset_index().rename(columns={0: 'negative_balance_days'})

negative_balance_frequency = negative_balance_days.merge(account_age, on='prism_consumer_id')
negative_balance_frequency['negative_balance_per_day'] = (
    negative_balance_frequency['negative_balance_days'] / (negative_balance_frequency['account_duration_days'] + 1)
)

negative_balance_frequency


Unnamed: 0,prism_consumer_id,negative_balance_days,account_duration_days,negative_balance_per_day
0,0,237,179,1.316667
1,1,27,180,0.149171
2,10,322,180,1.779006
3,100,119,179,0.661111
4,1000,8,180,0.044199
...,...,...,...,...
14487,9995,95,86,1.091954
14488,9996,35,78,0.443038
14489,9997,71,87,0.806818
14490,9998,59,85,0.686047


In [32]:
negative_balance_frequency

Unnamed: 0,prism_consumer_id,negative_balance_days,account_duration_days,negative_balance_per_day
0,0,237,179,1.316667
1,1,27,180,0.149171
2,10,322,180,1.779006
3,100,119,179,0.661111
4,1000,8,180,0.044199
...,...,...,...,...
14487,9995,95,86,1.091954
14488,9996,35,78,0.443038
14489,9997,71,87,0.806818
14490,9998,59,85,0.686047


In [33]:
# each category (how much was spent on each category -> one's for credit, debit, total transactions) 
category_spending = transactions_with_balances.pivot_table(
    index='prism_consumer_id',
    columns=['credit_or_debit', 'category'],
    values='amount',
    aggfunc='sum',
    fill_value=0
).reset_index()

# Flatten MultiIndex Columns
category_spending.columns = [
    f'category_{col[1]}_{col[0].lower()}_spending' if col[1] else col[0] for col in category_spending.columns
]

category_spending


Unnamed: 0,prism_consumer_id,CREDIT,category_1_credit_spending,category_2_credit_spending,category_3_credit_spending,category_4_credit_spending,category_6_credit_spending,category_7_credit_spending,category_8_credit_spending,category_9_credit_spending,...,category_38_debit_spending,category_39_debit_spending,category_40_debit_spending,category_41_debit_spending,category_42_debit_spending,category_43_debit_spending,category_44_debit_spending,category_45_debit_spending,category_46_debit_spending,category_47_debit_spending
0,0,2212.40,228.75,500.00,8820.56,1.63,19.96,0.00,0.0,0.0,...,0.0,150.60,2.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,9103.00,0.00,1492.95,11918.64,61.39,2.42,0.00,0.0,0.0,...,0.0,0.00,51.80,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10,1900.55,3156.00,700.00,14720.74,6.94,92.33,0.00,0.0,0.0,...,0.0,0.00,35.04,0.0,0.0,0.0,0.0,0.0,166.0,0.0
3,100,7750.00,3060.00,0.00,24411.78,3.31,11.75,0.00,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000,17757.64,1342.17,4720.00,43658.60,258.92,1.37,0.00,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,0.0,1100.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14487,9995,0.00,1526.00,0.00,11226.84,0.00,0.00,0.00,0.0,0.0,...,0.0,19.98,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14488,9996,0.00,0.00,0.00,0.00,1200.00,0.00,0.03,0.0,0.0,...,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14489,9997,0.00,198.25,40.56,16632.30,0.00,335.00,0.00,0.0,0.0,...,0.0,217.94,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14490,9998,26.00,2686.02,7203.23,0.00,3722.49,928.63,0.00,0.0,0.0,...,0.0,0.00,51.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# multiple features: get number of times balance falls below certain amounts/threshold ($25, $50, $100) divided by the number of transactions there are

thresholds = [25, 50, 100]

for threshold in thresholds:
    transactions_with_balances[f'below_{threshold}'] = transactions_with_balances['calculated_balance'] < -threshold

balance_below_thresholds = transactions_with_balances.groupby('prism_consumer_id').apply(
    lambda group: {
        f'below_{threshold}_ratio': group[f'below_{threshold}'].sum() / len(group) for threshold in thresholds
    }
).apply(pd.Series).reset_index()

balance_below_thresholds


Unnamed: 0,prism_consumer_id,below_25_ratio,below_50_ratio,below_100_ratio
0,0,0.573529,0.568627,0.553922
1,1,0.070064,0.054140,0.022293
2,10,0.929619,0.920821,0.903226
3,100,0.975410,0.975410,0.975410
4,1000,0.033175,0.014218,0.009479
...,...,...,...,...
14487,9995,0.237942,0.192926,0.070740
14488,9996,0.714286,0.595238,0.333333
14489,9997,0.252874,0.237548,0.199234
14490,9998,0.259091,0.250000,0.222727


In [35]:

feature_columns = [
    'negative_balance_ratio', 'negative_balance_per_day', 'negative_balance_days',
    'total_credit', 'total_debit', 'account_duration_days'
] + [f'category_{cat}_credit_spending' for cat in transactions_with_balances['category'].unique()] \
  + [f'category_{cat}_debit_spending' for cat in transactions_with_balances['category'].unique()] \
  + ['below_25_ratio', 'below_50_ratio', 'below_100_ratio']

existing_feature_columns = [col for col in feature_columns if col in consumer_features.columns]

target_column = 'DQ_TARGET'
dataset = consumer_features.copy()
dataset.dropna(subset=existing_feature_columns + [target_column], inplace=True)

print("Using these feature columns:", existing_feature_columns)

results = []

for feature in existing_feature_columns:
    print(f"--- Testing Single Feature: {feature} ---")

    X = dataset[[feature]]
    y = dataset[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    sm = SMOTE(random_state=42)
    X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

    # Logistic Regression
    log_model = LogisticRegression(class_weight='balanced', max_iter=1000)
    log_model.fit(X_train_bal, y_train_bal)
    log_y_proba = log_model.predict_proba(X_test)[:, 1]
    log_auc = roc_auc_score(y_test, log_y_proba)

    # Random Forest
    rfc_model = RandomForestClassifier(random_state=42)
    rfc_model.fit(X_train_bal, y_train_bal)
    rfc_y_proba = rfc_model.predict_proba(X_test)[:, 1]
    rfc_auc = roc_auc_score(y_test, rfc_y_proba)

    # LightGBM (Suppressing logs with verbose=-1)
    lgb_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
    lgb_model.fit(X_train_bal, y_train_bal)
    lgb_y_proba = lgb_model.predict_proba(X_test)[:, 1]
    lgb_auc = roc_auc_score(y_test, lgb_y_proba)

    # XGBoost (Suppressing logs)
    xgb_model = xgb.XGBClassifier(scale_pos_weight=len(y_train) / y_train.sum(), random_state=42, verbosity=0)
    xgb_model.fit(X_train_bal, y_train_bal)
    xgb_y_proba = xgb_model.predict_proba(X_test)[:, 1]
    xgb_auc = roc_auc_score(y_test, xgb_y_proba)

    results.append({
        'feature': feature,
        'logistic_auc': log_auc,
        'random_forest_auc': rfc_auc,
        'lightgbm_auc': lgb_auc,
        'xgboost_auc': xgb_auc
    })

    print("\n")

results_df = pd.DataFrame(results)
results_df['best_auc'] = results_df[['logistic_auc', 'random_forest_auc', 'lightgbm_auc', 'xgboost_auc']].max(axis=1)
results_df = results_df.sort_values(by='best_auc', ascending=False)

print("Top Features by ROC-AUC:")
print(results_df[['feature', 'best_auc']])


Using these feature columns: ['total_credit', 'total_debit']
--- Testing Single Feature: total_credit ---


--- Testing Single Feature: total_debit ---


Top Features by ROC-AUC:
        feature  best_auc
0  total_credit  0.602917
1   total_debit  0.588642


EDA

In [36]:
# group balances as a list for each consumer
transactions_with_balances
# transactions_with_balances = transactions_with_balances.drop(columns=['prism_consumer_id'])
# transactions_with_balances = transactions_with_balances.reset_index()
# transactions_with_balances

# if 'prism_consumer_id' in transactions_with_balances.columns:
#     transactions_with_balances = transactions_with_balances.drop(columns=['prism_consumer_id'])

# if transactions_with_balances.index.name == 'prism_consumer_id':
#     transactions_with_balances = transactions_with_balances.reset_index()

# print(transactions_with_balances.columns)
grouped_balances = transactions_with_balances.groupby('prism_consumer_id')['calculated_balance'].apply(list).reset_index()

grouped_balances.rename(columns={'calculated_balance': 'all_balances'}, inplace=True)

print(grouped_balances)

      prism_consumer_id                                       all_balances
0                     0  [-27.62, 1372.38, 1347.2800000000002, 847.2800...
1                     1  [200.0, 289.0, 1096.28, 1007.28, 973.4, 773.4,...
2                    10  [-18.15, -77.72999999999999, -191.49, -208.0, ...
3                   100  [200.0, 0.0, -901.7, -1387.42, -1803.580000000...
4                  1000  [500.0, 1200.0, 700.0, 381.92, -118.0799999999...
...                 ...                                                ...
14487              9995  [100.0, 200.0, 188.0, -20.0, -70.0, -120.0, -2...
14488              9996  [-150.0, -176.36, -218.94, -18.939999999999998...
14489              9997  [98.25, 63.26, 47.769999999999996, 12.77999999...
14490              9998  [100.0, 200.0, 300.0, 200.0, 177.96, 107.96000...
14491              9999  [-100.0, -159.74, 2647.63, 2727.63, 2708.9, 26...

[14492 rows x 2 columns]


In [37]:
# Count how many times the calculated balance was negative for each consumer
negative_balance_count = transactions_with_balances.groupby('prism_consumer_id').apply(
    lambda group: (group['calculated_balance'] < 0).sum()
).reset_index()

negative_balance_count.columns = ['prism_consumer_id', 'negative_balance_count']
print(negative_balance_count.head())

  prism_consumer_id  negative_balance_count
0                 0                     237
1                 1                      27
2                10                     322
3               100                     119
4              1000                       8


In [38]:
# calculate account age in days

acct['balance_date'] = pd.to_datetime(acct['balance_date'])
acct['account_age'] = (acct['balance_date'].max() - acct['balance_date']).dt.days
account_age = acct.groupby('prism_consumer_id')['account_age'].max().reset_index()
account_age.columns = ['prism_consumer_id', 'account_age_days']
print(account_age)


      prism_consumer_id  account_age_days
0                     0               979
1                     1              1041
2                    10               826
3                   100              1102
4                  1000               461
...                 ...               ...
13004              9995               431
13005              9996               208
13006              9997               347
13007              9998               186
13008              9999               271

[13009 rows x 2 columns]


In [39]:
# look at all negative balances
acct['balance'] = pd.to_numeric(acct['balance'], errors='coerce')

negative_balances = acct[acct['balance'] < 0]
if negative_balances.empty:
    print("No negative balances found in the dataset.")
else:
    print(negative_balances)


      prism_consumer_id prism_account_id account_type balance_date  balance  \
82                 2246               82     CHECKING   2021-04-30  -168.90   
206                2751              206     CHECKING   2022-03-30  -474.41   
506                2242              506     CHECKING   2022-03-31   -49.52   
616                3280              616     CHECKING   2021-11-30    -7.23   
904                1056              904     CHECKING   2021-03-31  -313.61   
...                 ...              ...          ...          ...      ...   
24239             12997            24239  CREDIT CARD   2021-12-14   -25.00   
24279             13976            24279  CREDIT CARD   2022-02-15 -1025.27   
24296             11678            24296     CHECKING   2021-12-19  -145.00   
24357             10353            24357      SAVINGS   2021-12-16   -10.00   
24426             10580            24426     CHECKING   2022-03-10   -48.69   

       days_account_open  account_age  
82         

In [40]:
transactions_with_balances['is_negative'] = transactions_with_balances['calculated_balance'] < 0

# groupby prism_consumer_id and calculate:
negative_balance_ratio = transactions_with_balances.groupby('prism_consumer_id').agg(
    negative_count=('is_negative', 'sum'),  # count of negative balances
    total_transactions=('calculated_balance', 'size')  # total number of transactions
).reset_index()

# calculate the ratio of negative balances to total transactions
negative_balance_ratio['negative_balance_ratio'] = (
    negative_balance_ratio['negative_count'] / negative_balance_ratio['total_transactions']
)

negative_balance_ratio.fillna(0, inplace=True)
print(negative_balance_ratio)

      prism_consumer_id  negative_count  total_transactions  \
0                     0             237                 408   
1                     1              27                 314   
2                    10             322                 341   
3                   100             119                 122   
4                  1000               8                 211   
...                 ...             ...                 ...   
14487              9995              95                 311   
14488              9996              35                  42   
14489              9997              71                 261   
14490              9998              59                 220   
14491              9999              25                 381   

       negative_balance_ratio  
0                    0.580882  
1                    0.085987  
2                    0.944282  
3                    0.975410  
4                    0.037915  
...                       ...  
14487              

In [41]:
# feature_column = 'negative_balance_ratio'

# X = consumer_features[[feature_column]]
# y = consumer_features['DQ_TARGET']

# run_classification2([feature_column], 'DQ_TARGET', consumer_features)


In [42]:
# display outflow features
debit_transactions = transactions[transactions['credit_or_debit'] == 'DEBIT']

# calculate monthly and yearly outflows per consumer
outflow_features = debit_transactions.groupby('prism_consumer_id').apply(lambda group: {
    'avg_monthly_outflow': group['amount'].sum() / max(1, group['posted_date'].dt.to_period('M').nunique()),
    'avg_yearly_outflow': group['amount'].sum() / max(1, group['posted_date'].dt.to_period('Y').nunique())
}).apply(pd.Series).reset_index()

outflow_features.columns = ['prism_consumer_id', 'avg_monthly_outflow', 'avg_yearly_outflow']
print(outflow_features)



      prism_consumer_id  avg_monthly_outflow  avg_yearly_outflow
0                     0          2129.772857           14908.410
1                     1          3299.767143           23098.370
2                    10          3109.514286           10883.300
3                   100          6623.768333           19871.305
4                  1000         11130.712857           38957.495
...                 ...                  ...                 ...
14191              9995          4261.847500            8523.695
14192              9996           297.722500            1190.890
14193              9997          4175.730000           16702.920
14194              9998          5133.156667           15399.470
14195              9999          8024.312500           32097.250

[14196 rows x 3 columns]


In [43]:
category_features = transactions.groupby(['prism_consumer_id', 'category']).apply(lambda group: {
    'total_transactions': len(group),
    'total_credit_amount': group[group['credit_or_debit'] == 'CREDIT']['amount'].sum(),
    'total_debit_amount': group[group['credit_or_debit'] == 'DEBIT']['amount'].sum(),
    'total_transaction_amount': group['amount'].sum()
}).apply(pd.Series).reset_index()


In [44]:
acct['balance_date'] = pd.to_datetime(acct['balance_date'], errors='coerce')

acct['days_since'] = (acct['balance_date'].max() - acct['balance_date']).dt.days

time_windows = [30, 90, 180, 365]
thresholds = [25, 50, 100]

balance_features = acct.groupby('prism_consumer_id').apply(lambda group: {
    **{f'avg_balance_{tw}d': group[group['days_since'] <= tw]['balance'].mean() for tw in time_windows},
    **{f'below_{threshold}_{tw}d': (group[group['days_since'] <= tw]['balance'] < threshold).sum()
       for threshold in thresholds for tw in time_windows},
    'days_negative_balance': (group['balance'] < 0).sum()
}).apply(pd.Series).reset_index()

balance_features.columns = [
    'prism_consumer_id', 
    *[
        f'avg_balance_{tw}d' for tw in time_windows
    ],
    *[
        f'below_{threshold}_{tw}d' for threshold in thresholds for tw in time_windows
    ],
    'days_negative_balance'
]

print(balance_features)


      prism_consumer_id  avg_balance_30d  avg_balance_90d  avg_balance_180d  \
0                     0              NaN              NaN               NaN   
1                     1              NaN              NaN               NaN   
2                    10              NaN              NaN               NaN   
3                   100              NaN              NaN               NaN   
4                  1000              NaN              NaN               NaN   
...                 ...              ...              ...               ...   
13004              9995              NaN              NaN               NaN   
13005              9996              NaN              NaN               NaN   
13006              9997              NaN              NaN               NaN   
13007              9998              NaN              NaN               NaN   
13008              9999              NaN              NaN               NaN   

       avg_balance_365d  below_25_30d  below_25_90d

In [45]:
acct

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance,days_account_open,account_age,days_since
0,3023,0,SAVINGS,2021-08-31,90.57,979,979,979
1,3023,1,CHECKING,2021-08-31,225.95,979,979,979
2,4416,2,SAVINGS,2022-03-31,15157.17,767,767,767
3,4416,3,CHECKING,2022-03-31,66.42,767,767,767
4,4227,4,CHECKING,2021-07-31,7042.90,1010,1010,1010
...,...,...,...,...,...,...,...,...
24461,11500,24461,CHECKING,2022-03-27,732.75,771,771,771
24462,11615,24462,SAVINGS,2022-03-30,5.00,768,768,768
24463,11615,24463,CHECKING,2022-03-30,1956.46,768,768,768
24464,12210,24464,CHECKING,2022-03-28,2701.51,770,770,770


In [46]:
balance_features

Unnamed: 0,prism_consumer_id,avg_balance_30d,avg_balance_90d,avg_balance_180d,avg_balance_365d,below_25_30d,below_25_90d,below_25_180d,below_25_365d,below_50_30d,below_50_90d,below_50_180d,below_50_365d,below_100_30d,below_100_90d,below_100_180d,below_100_365d,days_negative_balance
0,0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13004,9995,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13005,9996,,,,252.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13006,9997,,,,611.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13007,9998,,,,-862.99,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0


In [47]:
negative_ratio

NameError: name 'negative_ratio' is not defined