In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the inflow and outflow tables from their parquet files.
# Both paths are relative, so they resolve against the current working directory.
inflows = pd.read_parquet("Data/ucsd-inflows.pqt")
outflows = pd.read_parquet("Data/ucsd-outflows.pqt")
import re

def clean_text(data):
    """Return a copy of ``data`` whose ``memo`` column has been normalised.

    Each memo string goes through these steps, in order:

    1. remove date-like fragments (``MM/DD``, ``MM-DD``, ``MM/DD/YY[YY]``),
       including an optional leading ``CA`` token;
    2. remove runs of two or more ``X`` characters, with or without a
       leading ``#``;
    3. remove every character that is not an ASCII letter, a digit,
       whitespace, ``.`` or ``/``;
    4. collapse whitespace runs to single spaces and trim the ends;
    5. remove a trailing two-letter uppercase token (meant for state
       abbreviations);
    6. lower-case the text and remove the phrases ``pos withdrawal``,
       ``debit card withdrawal`` and ``purchase``;
    7. collapse whitespace again, because step 6 can leave leading or doubled
       spaces behind (e.g. ``"POS WITHDRAWAL WALMART"`` used to end up as
       ``" walmart"``).

    Memo values that are not strings (``None``/``NaN``) are returned unchanged
    instead of raising ``TypeError`` inside ``re.sub``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``memo`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the cleaned ``memo`` column.
    """
    # Compile once per call instead of re-resolving the pattern for every row.
    date_re = re.compile(
        r'\b(?:CA\s+)?(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12]\d|3[01])(?:/\d{2,4})?\b'
    )
    masked_re = re.compile(r'(?<!#)XX+|#XX+')
    punctuation_re = re.compile(r"[^a-zA-Z0-9\s./]")
    trailing_state_re = re.compile(r'\s[A-Z]{2}$')
    withdrawal_re = re.compile(r'(pos withdrawal|debit card withdrawal)')
    purchase_re = re.compile(r'(purchase)')

    def _clean_memo(memo):
        """Apply the full cleaning pipeline to a single memo value."""
        if not isinstance(memo, str):
            # Missing memo: leave it as-is rather than crashing the whole pass.
            return memo
        text = date_re.sub('', memo)
        text = masked_re.sub('', text)
        text = punctuation_re.sub('', text)
        # The state pattern is anchored at the end of the string, so the text
        # must be trimmed before it runs.
        text = " ".join(text.split())
        # Must run before lower-casing: the pattern only matches uppercase.
        text = trailing_state_re.sub('', text)
        text = text.lower()
        text = withdrawal_re.sub('', text)
        text = purchase_re.sub('', text)
        # Phrase removal can leave stray spaces; normalise once more.
        return " ".join(text.split())

    df = data.copy()
    df['memo'] = df['memo'].apply(_clean_memo)
    return df
# Keep only the outflow rows whose memo text differs from the category label,
# then run the memo cleaning on that subset.
memo_differs = outflows['memo'].ne(outflows['category'])
not_matching = outflows.loc[memo_differs]
cleaned_not_matching = clean_text(not_matching)
# Disabled alternatives that clean the full tables instead of the subset:
#cleaned_inflows = clean_text(inflows)
#cleaned_outflows = clean_text(outflows)

In [2]:
# Show the cleaned frame as the cell output to eyeball the result of clean_text.
cleaned_not_matching

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
2,0,acc_0,tst casa del rio exp fairlawn,18.42,2022-09-26,FOOD_AND_BEVERAGES
4,0,acc_0,buffalo wild wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
6,0,acc_0,oculus,11.73,2022-04-18,GENERAL_MERCHANDISE
7,0,acc_0,los girasoles stow,30.04,2022-03-09,FOOD_AND_BEVERAGES
8,0,acc_0,buzzis laundry 1,4.16,2022-03-29,GENERAL_MERCHANDISE
...,...,...,...,...,...,...
2597457,5941,acc_9524,amazon primeti40l27r3 amzn.com/bill wa date p...,15.93,2023-01-16,GENERAL_MERCHANDISE
2597462,5941,acc_9524,az lot quiktrip e indian school rd phoenix az ...,25.00,2023-01-18,EDUCATION
2597465,5941,acc_9524,walmart e mckellips rd mesa az card 15 mcc,3.68,2023-01-18,FOOD_AND_BEVERAGES
2597468,5941,acc_9524,withdrawal salt river projetype online pmt cos...,90.00,2023-01-20,FOOD_AND_BEVERAGES


In [3]:
# Work on a copy so the tokens appended to `memo` further down in this cell
# do not modify `cleaned_not_matching`.
# The commented-out lines are disabled alternatives built from other frames.
#w_cleaned_inflows = cleaned_inflows.copy()
w_cleaned_outflows = cleaned_not_matching.copy()
#w_cleaned_outflows = cleaned_outflows.copy()

def add_text(memo, added_tokens):
    """Return ``memo`` followed by every string in ``added_tokens``.

    The tokens are concatenated in order with no separator, so each token
    must carry its own leading space if one is wanted.
    """
    suffix = ''.join(added_tokens)
    return memo + suffix

def whole_dollar_amount(amount):
    """Return the ``' <W_D>'`` token when ``amount`` has no fractional part.

    Any other amount yields an empty string, so the result can always be
    appended to a memo.
    """
    has_no_cents = amount % 1 == 0
    return ' <W_D>' if has_no_cents else ''

def day(date):
    """Return the day-of-month token for ``date``, e.g. ``' <D_26>'``.

    ``date`` only needs to expose a ``day`` attribute.
    """
    return " <D_{}>".format(date.day)

def month(date):
    """Return the month token for ``date``, e.g. ``' <M_9>'``.

    ``date`` only needs to expose a ``month`` attribute.
    """
    return " <M_{}>".format(date.month)


def _tag_memo(row):
    """Build the memo for one row with its amount and date tokens appended."""
    posted = row['posted_date']
    tokens = [
        whole_dollar_amount(row['amount']),
        day(posted),
        month(posted),
    ]
    return add_text(row['memo'], tokens)


# Append the whole-dollar, day and month tokens to every memo, row by row.
w_cleaned_outflows['memo'] = w_cleaned_outflows.apply(_tag_memo, axis=1)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
import numpy as np

# One seed shared by the split and the solver. The 'saga' solver is
# stochastic, so without a fixed random_state on the model the fitted
# coefficients (and therefore the metrics printed below) can change from
# run to run even though the split itself is seeded.
RANDOM_STATE = 42

# Splitting the data into train and test sets (80% train / 20% test)
X_train_text, X_test_text, y_train, y_test = train_test_split(
    w_cleaned_outflows['memo'],
    w_cleaned_outflows['category'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)
print('Data split complete')

# Vectorizing the text data - the vocabulary and IDF weights are fit on the
# training memos only, and the test memos are only transformed.
vectorizer = TfidfVectorizer(max_features=5000, max_df=0.95, min_df=5)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)
print('Vectorization complete')

# Encoding the labels - the encoder is fit on the training labels, so a
# category that appears only in the test split would make transform() raise.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)
print('Label encoding complete')

# Fitting the Logistic Regression model
log_reg = LogisticRegression(
    solver='saga',
    max_iter=200,
    n_jobs=-1,
    random_state=RANDOM_STATE,  # makes the saga fit reproducible
)
log_reg.fit(X_train, y_train)
print('Logistic Regression model trained')

# Making predictions and calculating accuracy
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Getting predicted probabilities (one column per class)
y_prob = log_reg.predict_proba(X_test)

# Calculating a one-vs-rest ROC-AUC score for each category individually.
# Column i of y_prob is paired with encoded label i; this relies on the model
# having been trained on the labels produced by label_encoder above.
roc_auc_scores = {}
for i, category in enumerate(label_encoder.classes_):
    y_test_binary = np.where(y_test == i, 1, 0)  # Binary label for the current category

    # ROC-AUC is only defined when the test split contains both positive and
    # negative samples for this category.
    if len(np.unique(y_test_binary)) == 2:
        roc_auc_scores[category] = roc_auc_score(y_test_binary, y_prob[:, i])
    else:
        roc_auc_scores[category] = "Undefined (only one class in y_test)"

# Displaying the ROC-AUC score for each category
print("ROC-AUC Scores per Category:")
for category, score in roc_auc_scores.items():
    print(f"{category}: {score}")

Data split complete
Vectorization complete
Label encoding complete
Logistic Regression model trained
Accuracy: 0.9614950380992839
ROC-AUC Scores per Category:
EDUCATION: 0.9937495928333644
FOOD_AND_BEVERAGES: 0.9956160161953359
GENERAL_MERCHANDISE: 0.9955593609414578
GROCERIES: 0.9981393125521718
MORTGAGE: 1.0
OVERDRAFT: 0.9999371706305867
PETS: 0.9986474182987582
RENT: 0.9968897677086999
TRAVEL: 0.9982747864673119
