In [3]:
import numpy as np
import pandas as pd

In [5]:
categories = pd.read_csv('../../data_q2/q2-ucsd-cat-map.csv')
consumer = pd.read_parquet('../../data_q2/q2-ucsd-consDF.pqt')
acct = pd.read_parquet('../../data_q2/q2-ucsd-acctDF.pqt')
transactions = pd.read_parquet('../../data_q2/q2-ucsd-trxnDF.pqt')

In [7]:
# categories = pd.read_csv('../../data_q2/q2-ucsd-cat-map.csv')
# consumer = pd.read_parquet('../../data_q2/q2-ucsd-consDF.pqt')
# acct = pd.read_parquet("../../data_q2/q2-ucsd-acctIDF.pqt")
# transactions = pd.read_parquet("../../data_q2/q2-ucsd-trxnDF.pqt")

In [8]:
consumer.head()

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET
0,0,2021-09-01,726.0,0.0
1,1,2021-07-01,626.0,0.0
2,2,2021-05-01,680.0,0.0
3,3,2021-03-01,734.0,0.0
4,4,2021-10-01,676.0,0.0


In [9]:
transactions.head()

Unnamed: 0,prism_consumer_id,prism_transaction_id,category,amount,credit_or_debit,posted_date
0,3023,0,4,0.05,CREDIT,2021-04-16
1,3023,1,12,481.56,CREDIT,2021-04-30
2,3023,2,4,0.05,CREDIT,2021-05-16
3,3023,3,4,0.07,CREDIT,2021-06-16
4,3023,4,4,0.06,CREDIT,2021-07-16


In [14]:
categories.head()

Unnamed: 0,category_id,category
0,0,SELF_TRANSFER
1,1,EXTERNAL_TRANSFER
2,2,DEPOSIT
3,3,PAYCHECK
4,4,MISCELLANEOUS


In [10]:
acct.head()

Unnamed: 0,prism_consumer_id,prism_account_id,account_type,balance_date,balance
0,3023,0,SAVINGS,2021-08-31,90.57
1,3023,1,CHECKING,2021-08-31,225.95
2,4416,2,SAVINGS,2022-03-31,15157.17
3,4416,3,CHECKING,2022-03-31,66.42
4,4227,4,CHECKING,2021-07-31,7042.9


In [11]:
len(acct['prism_consumer_id'].unique())

13009

In [12]:
# merge 
acct['account_type'].unique()

# most important account_types: ['CHECKING', 'SAVINGS', 'CREDIT CARD', 'LOAN]
# most_important_accounts = ['CHECKING', 'SAVINGS', 'CREDIT CARD', 'LOAN']
acctDF = acct.copy()
total_balance = acctDF.groupby('prism_consumer_id')['balance'].sum()

In [13]:
consumer_balance = consumer.merge(pd.DataFrame(total_balance), on = 'prism_consumer_id', how = 'outer')
consumer_balance['std_credit'] = (consumer_balance['credit_score'] - consumer_balance['credit_score'].mean()) / consumer_balance['credit_score'].std()
consumer_balance["std_balance"] = (
    consumer_balance["balance"] - consumer_balance["balance"].mean()
    ) / consumer_balance["balance"].std()
consumer_balance

Unnamed: 0,prism_consumer_id,evaluation_date,credit_score,DQ_TARGET,balance,std_credit,std_balance
0,0,2021-09-01,726.0,0.0,320.37,0.846851,-0.146222
1,1,2021-07-01,626.0,0.0,3302.42,-0.459894,-0.090027
2,10,2022-02-01,654.0,0.0,824.24,-0.094005,-0.136727
3,100,2021-12-01,750.0,0.0,2655.47,1.160470,-0.102219
4,1000,2021-03-01,756.0,0.0,95.25,1.238874,-0.150464
...,...,...,...,...,...,...,...
14995,9995,2023-03-02,578.0,,0.00,-1.087132,-0.152259
14996,9996,2023-10-11,610.0,,252.67,-0.668973,-0.147498
14997,9997,2023-05-25,675.0,,611.28,0.180411,-0.140740
14998,9998,2023-11-02,534.0,,-862.99,-1.662099,-0.168522


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
import warnings

def run_classification(
    feature_column, target_column, dataset, test_size=0.2, random_state=42
):
    """
    Run Logistic Regression and Random Forest Classification on a dataset.

    Parameters:
        feature_column (list): List of columns that can be used as features.
        target_column (str): The name of the target column.
        dataset (pd.DataFrame): The dataset containing the features and target.
        test_size (float): Proportion of the dataset to include in the test split (default 0.3).
        random_state (int): Random seed for reproducibility (default 42).

    Returns:
        None: Prints the classification reports for both models.
    """
    warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

    # Drop NaN values and shuffle the dataset
    dataset = dataset.dropna()
    dataset = dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Define features and target
    X = dataset[feature_column]
    y = dataset[target_column]

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # balance the dataset
    sm = SMOTE(random_state=42)
    X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

    # Logistic Regression
    log_model = LogisticRegression(class_weight = 'balanced')
    log_model.fit(X_train_bal, y_train_bal)
    log_y_pred = log_model.predict(X_test)
    print(f"Logistic Regression for {feature_column}")
    print(classification_report(y_test, log_y_pred))

    # Random Forest Classification
    rfc_model = RandomForestClassifier(random_state=random_state)
    rfc_model.fit(X_train_bal, y_train_bal)
    rfc_y_pred = rfc_model.predict(X_test)
    print(f"\nRandom Forest Classification for {feature_column}")
    print(classification_report(y_test, rfc_y_pred))

    # Light GBM 
    lgb_model = lgb.LGBMClassifier()
    lgb_model.fit(X_train_bal, y_train_bal)
    lgb_y_pred = lgb_model.predict(X_test)
    print(f"\nLGB Model Classification for {feature_column}")
    print(classification_report(y_test, lgb_y_pred))

    # SVC Model
    svc_model = SVC(probability=True, random_state=random_state)
    svc_model.fit(X_train_bal, y_train_bal)
    svc_y_pred = svc_model.predict(X_test)
    print(f"\nSVM Classification for {feature_column}")
    print(classification_report(y_test, svc_y_pred))

    # XGB Model
    xgb_model = xgb.XGBClassifier(
        scale_pos_weight=len(y_train) / y_train.sum()
        )
    xgb_model.fit(X_train_bal, y_train_bal)
    xgb_y_pred = xgb_model.predict(X_test)
    print("\nXGBoost:")
    print(classification_report(y_test, xgb_y_pred))

In [None]:
# Example usage:
run_classification(["std_credit"], "DQ_TARGET", consumer_balance)

Logistic Regression for ['std_credit']
              precision    recall  f1-score   support

         0.0       0.97      0.74      0.84      1914
         1.0       0.20      0.74      0.31       168

    accuracy                           0.74      2082
   macro avg       0.58      0.74      0.58      2082
weighted avg       0.91      0.74      0.80      2082


Random Forest Classification for ['std_credit']
              precision    recall  f1-score   support

         0.0       0.94      0.85      0.89      1914
         1.0       0.20      0.43      0.27       168

    accuracy                           0.82      2082
   macro avg       0.57      0.64      0.58      2082
weighted avg       0.88      0.82      0.84      2082

[LightGBM] [Info] Number of positive: 7579, number of negative: 7579
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bin

In [None]:
# Example usage:
run_classification(["std_balance"], "DQ_TARGET", consumer_balance)


Logistic Regression for ['std_balance']
              precision    recall  f1-score   support

         0.0       0.98      0.34      0.51      1914
         1.0       0.11      0.91      0.19       168

    accuracy                           0.39      2082
   macro avg       0.54      0.63      0.35      2082
weighted avg       0.91      0.39      0.48      2082


Random Forest Classification for ['std_balance']
              precision    recall  f1-score   support

         0.0       0.93      0.68      0.78      1914
         1.0       0.10      0.43      0.17       168

    accuracy                           0.66      2082
   macro avg       0.52      0.55      0.48      2082
weighted avg       0.86      0.66      0.73      2082

[LightGBM] [Info] Number of positive: 7579, number of negative: 7579
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total B

In [None]:
consumer_balance['DQ_TARGET'].value_counts()

DQ_TARGET
0.0    10994
1.0     1006
Name: count, dtype: int64

In [None]:
consumer_balance.isna().sum()

prism_consumer_id       0
evaluation_date         0
credit_score            0
DQ_TARGET            3000
balance              1991
std_credit              0
std_balance          1991
dtype: int64

In [None]:
new_data = consumer_balance.dropna()
new_data['DQ_TARGET'].value_counts()

DQ_TARGET
0.0    9493
1.0     915
Name: count, dtype: int64