In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind

import re
from typing import List, Tuple, Iterable, Optional
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, PrecisionRecallDisplay
)
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Week 2 (Train Test Split)

In [2]:
inflow = pd.read_parquet('/uss/hdsi-prismdata/q1-ucsd-inflows.pqt')
outflow = pd.read_parquet('/uss/hdsi-prismdata/q1-ucsd-outflows.pqt')

# remove rows where memo = category
outflow = outflow[outflow['memo'] != outflow['category']]

outflow_ids = set(outflow["prism_consumer_id"].unique())
inflow_ids = set(inflow["prism_consumer_id"].unique())

in_not_out = inflow_ids - outflow_ids
out_not_in = outflow_ids - inflow_ids

#consumers in both inflow and outflow
consumers_both = sorted(set(inflow["prism_consumer_id"]).intersection(outflow["prism_consumer_id"]))

#80-20 train test split
train_ids, test_ids = train_test_split(consumers_both, test_size=0.2, random_state=42)

inflow_train = inflow[inflow["prism_consumer_id"].isin(train_ids)]
inflow_test  = inflow[inflow["prism_consumer_id"].isin(test_ids)]

outflow_train = outflow[outflow["prism_consumer_id"].isin(train_ids)]
outflow_test  = outflow[outflow["prism_consumer_id"].isin(test_ids)]

# print(f'Inflow_train: {inflow_train["amount"].median()}\nInflow_test: {inflow_test["amount"].median()}\nOutflow_train: {outflow_train["amount"].median()}\nOutflow_test: {outflow_test["amount"].median()}')

## Week 3 (Cleaning Memos)

### Remove Observed Patterns

In [3]:
## use train_memos.txt to observe patterns from original outflow_train
# unique_memos = np.sort(outflow_train[outflow_train['memo']!=outflow_train['category']]['memo'].str.lower().unique())
# np.savetxt("train_memos.txt", unique_memos, fmt="%s")

In [4]:
m = outflow_train[outflow_train['memo']!=outflow_train['category']]['memo'].str.lower()
clean = m.copy(deep = True)

websites = re.compile(r'\b(?:www\.)?([a-wy-z0-9]+)[.\s](com|com|net|org|gov)\b', flags=re.IGNORECASE)

# extract website domain
def extract_domain(text):
    m = websites.search(text)
    if m:
        return m.group(1)
    else:
        return text

clean = clean.apply(extract_domain)

In [5]:
patterns = [r'^#\d*x*\s', #starting number signs with following 0 or more digits inlcuding x then space
            r'\(?(?:\d|x){3}\)?(?:\s|-)(?:\d|x){3}(?:\s|-)(?:\d|x){4}', # phone numbers in various formats
            r'(?:\s|^)[^A-Za-z0-9]?x+(?:[^A-Za-z0-9]+x+)*(?:\s|$)', #various sequences of trailing x
            r'\b[^A-Za-z0-9](al|ak|az|ar|ca|co|ct|de|fl|ga|hi|id|il|ia|ks|ky|me|md|ma|mi|mn|ms|mo|mt|ne|nv|nh|nj|nm|ny|nc|nd|oh|ok|or|pa|ri|sc|sd|tn|tx|ut|vt|va|wa|wv|wi|wy|dc)(?![A-Za-z0-9])\b',
            #state abbreviations besides 'in' bc ex: in n out
            r'\bin$', #ending 'in' state and la
            r'(?:in)?\scard\s\d{2}\s?',
            # format ex: card 20 unless in card 20
            r'\b\d{2}[/-]\d{2}(?:[/-]\d{2,4})?\b', #dates
            r'\d{2}\:\d{2}(?:\:\d{2})?[ap]?m?', #time stamps
            r'(?:date\s)?\%\%(?:\smcc)?', #format ex: %% mcc
            r'#[A-Za-z0-9]+$' #ending #_____
           ]

for p in patterns:
    clean = clean.str.replace(p, ' ', regex=True)
    clean = clean.str.replace(r'\s+', ' ', regex=True).str.strip()

changed_mask = m != clean
changed_items = m[changed_mask]
changed_count = changed_mask.sum()

print("Changed count:", changed_count)

Changed count: 605120


In [6]:
clean = clean.reindex(outflow_train.index)
outflow_train['cleaned_memo'] = clean.fillna(outflow_train['memo'])
outflow_train = outflow_train[['prism_consumer_id', 'prism_account_id', 'memo', 'cleaned_memo', 'amount', 'posted_date', 'category']]  
outflow_train[outflow_train['cleaned_memo'] != outflow_train['memo']][['memo','cleaned_memo']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outflow_train['cleaned_memo'] = clean.fillna(outflow_train['memo'])


Unnamed: 0,memo,cleaned_memo
2,TST* Casa Del Rio - Exp Fairlawn OH 09/24,tst* casa del rio - exp fairlawn
4,Buffalo Wild Wings,buffalo wild wings
6,Oculus CA 04/16,oculus
7,LOS GIRASOLES STOW OH 03/08,los girasoles stow
8,BUZZIS LAUNDRY 1 OH 03/28,buzzis laundry 1
...,...,...
2595705,PURCHASE 01-14 PUREKANA XXX-XXXXXXX NJ VNT XXXX,purchase purekana vnt
2595706,PURCHASE 01-18 WAL-MART #XXXX HUTCHINSON KS S...,purchase wal-mart hutchinson srf
2595708,PURCHASE 01-27 CLKBANK*MoonReading XXX-XXX-XXX...,purchase clkbank*moonreading vnt
2595709,PURCHASE 01-26 CASH APP*DIANA STIE XXXXXXXXXX ...,purchase cash app*diana stie vnt


### Build n-gram

In [7]:
# n-gram
def tokenize(text: str, lowercase: bool = True) -> List[str]:
    if lowercase:
        text = text.lower()
    # Keep words, numbers, and apostrophes; split by comma later
    return re.findall(r"[a-z0-9']+", text)

def generate_ngrams(tokens: List[str], n: int) -> Iterable[Tuple[str, ...]]:
    """Generate n-grams of size n from tokens."""
    if n <= 0:
        raise ValueError("n must be >= 1")
    if len(tokens) < n:
        return []
    return zip(*[tokens[i:] for i in range(n)])

def most_common_ngrams(
    text: str,
    n: int = 1,
    top_k: int = 10,
    stopwords: Optional[Iterable[str]] = None,
    min_token_len: int = 1,
) -> List[Tuple[str, int]]:
    # Split the text into comma-separated segments
    segments = [seg.strip() for seg in text.split(",") if seg.strip()]

    all_tokens = []
    for seg in segments:
        tokens = tokenize(seg)
        if stopwords:
            sw = set(stopwords)
            tokens = [t for t in tokens if t not in sw]
        if min_token_len > 1:
            tokens = [t for t in tokens if len(t) >= min_token_len]
        all_tokens.append(tokens)

    # Build all n-grams up to length n within each segment
    grams = []
    for tokens in all_tokens:
        for i in range(1, n + 1):
            grams.extend(" ".join(g) for g in generate_ngrams(tokens, i))

    counts = Counter(grams)
    return counts.most_common(top_k)

In [8]:
# perform n-gram to find most common patterns with <= n tokens

# create corpus using all unique cleaned_memos in outflow_train
memos = outflow_train[outflow_train['memo']!=outflow_train['category']]['cleaned_memo'].unique()
memos = ', '.join(memos)
memos[:1000]

"tst* casa del rio - exp fairlawn, buffalo wild wings, oculus, los girasoles stow, buzzis laundry 1, tgi fridays stow, tst* the basement sp cuyahoga fall, lowe's, piada - 39, grubhub, hardees akron, market di state cuyahoga fall, swensons - montrose akron, great clips, apple, wing warehouse cuyah cuyahoga fall, winking lizard - 30, longhorn steak cuyahoga fall, on tap - cuyahoga fa cuyahoga fall, home depot, falls discount tobacc cuyahoga fls, burger king, o'charley's, homedepot, dairy queen, east of chicago - cu cuyahoga fall, giant-eag corpora uniontown, fin's bar & chill pigeon forge, acme no. 12 bai cuyahoga fall ohxxxxxx, taco bell, chick-fil-a, walmart, moe's sw grill cuyahoga fall, texas roadhouse, 39 piada cuyahoga fall, cleveland gaming fairview park, circle k, iah cnbc smartshop houston, bob evans rest stow, kohl's, moe's sw grill akron, wendy's, rays place of fairlawn fairlawn, get go st cuyahoga fall ohxxxxxx, dd doordash dashmart, chipotle mexican grill, swensons - north a

In [9]:
n_grams = most_common_ngrams(memos, n=10, top_k=50)
# longer patterns may include shorter patterns --> remove longer ones first
sorted_by_len = sorted(n_grams, key=lambda x: len(x[0]), reverse = True)
sorted_by_len[:10]

[('pos debit visa check card', 6856),
 ('pxxxxxxxxxxxxxxxxx card', 4003),
 ('purchase authorized on', 33710),
 ('sxxxxxxxxxxxxxxx card', 26726),
 ('debit visa check card', 6856),
 ('pxxxxxxxxxxxxxxx card', 5666),
 ('pos debit visa check', 6856),
 ('purchase authorized', 33710),
 ('debit card purchase', 6352),
 ('pxxxxxxxxxxxxxxxxx', 4003)]

In [10]:
m = outflow_train[outflow_train['memo']!=outflow_train['category']]['cleaned_memo'].str.lower()
clean = m.copy(deep = True)

patterns = [r'\b(?:in)?\s[sp]xxxxxxxxxxxxxxx\scard',
            r'\b[sp]?x{3,}',
            r'(?:purchase\s)?authorized(?:\son)?',
            r'debit(?:\scard)?',
            r'visa\s(?:check\s)?card',
            r'withdrawal',
            r'recurring',
            r'checkcard',
            r'purchase',
            r'^[^A-Za-z0-9]|\-\s',
            r'\#[a-zA-Z0-9]+\s'
           ]

for p in patterns:
    clean = clean.str.replace(p, ' ', regex=True)
    clean = clean.str.replace(r'\s+', ' ', regex=True).str.strip()

changed_mask = m != clean
changed_items = m[changed_mask]
changed_count = changed_mask.sum()

print("Changed count:", changed_count)

Changed count: 367753


### check if any other common patterns can be removed

In [11]:
memos = ', '.join(clean)

max_len = int(outflow_train['cleaned_memo'].str.split().str.len().max())
# find top 50 patterns with <= max_len tokens
n_grams = most_common_ngrams(memos, n=max_len, top_k=50)
# longer patterns may include shorter patterns --> remove longer ones first
sorted_by_len = sorted(n_grams, key=lambda x: len(x[0]), reverse = True)
sorted_by_len[:10]

[('dollar general', 10898),
 ("mcdonald's", 29782),
 ('starbucks', 15166),
 ('cash app', 37056),
 ('doordash', 22096),
 ('7 eleven', 16599),
 ('wal mart', 16281),
 ('circle k', 12553),
 ('afterpay', 10472),
 ('point of', 10160)]

### Use txt file to observe again

In [12]:
# np.savetxt("after_ngram_memos.txt", np.sort(clean.unique()), fmt="%s")

patterns = [r'^\d{2}\-\d{2}(?:\-\d{2,4})?\b', #leading dates
            r'c\#\sdbt\scrd', #format ex: c# dbt crd
            r'\bc\#?$', #ending c or c#
            r'\bcard#$', #ending card#
            r'\s[a-z]{0,2}x{3,}\s', #ex: fxxxx, cxxxx
           ]

for p in patterns:
    clean = clean.str.replace(p, ' ', regex=True)
    clean = clean.str.replace(r'\s+', ' ', regex=True).str.strip()

changed_mask = m != clean
changed_items = m[changed_mask]
changed_count = changed_mask.sum()

print("Changed count:", changed_count)

Changed count: 380312


In [13]:
clean = clean.reindex(outflow_train.index)
outflow_train['cleaned_memo'] = clean.fillna(outflow_train['memo'])
outflow_train = outflow_train[['prism_consumer_id', 'prism_account_id', 'memo', 'cleaned_memo', 'amount', 'posted_date', 'category']]  
outflow_train.head(5)

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,tst* casa del rio exp fairlawn,18.42,2022-09-26,FOOD_AND_BEVERAGES
4,0,acc_0,Buffalo Wild Wings,buffalo wild wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
6,0,acc_0,Oculus CA 04/16,oculus,11.73,2022-04-18,GENERAL_MERCHANDISE
7,0,acc_0,LOS GIRASOLES STOW OH 03/08,los girasoles stow,30.04,2022-03-09,FOOD_AND_BEVERAGES
8,0,acc_0,BUZZIS LAUNDRY 1 OH 03/28,buzzis laundry 1,4.16,2022-03-29,GENERAL_MERCHANDISE


In [14]:
outflow_train.sample(50).groupby(['memo','cleaned_memo'])[['amount']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,amount
memo,cleaned_memo,Unnamed: 2_level_1
365 MARKET 888,365 market 888,1
APPLE.COM/BILL XXXX 866-712-77,apple,1
Amazon,amazon,1
CHECKCARD XXXX Gringos Mexican G JEFFERSON GA XXXXXXXXXXXXXXXXXXXXXXX,gringos mexican g jefferson,1
CHECKCARD XXXX LUPITA'S RESTAURANT Bullard TX XXXXXXXXXXXXXXXXXXXXXXX,lupita's restaurant bullard,1
CHECKCARD XXXX MGM*BETMGM CR & DB JERSEY CITY NJ XXXXXXXXXXXXXXXXXXXXXXX,mgm*betmgm cr & db jersey city,1
CHECKCARD XXXX PRIVACY ASSIST VA XXXXXXXXXXXXXXXXXXXXXXX RECURRING,privacy assist,1
CLEATS CLUB SEAT GCLEVELAND O,cleats club seat gcleveland o,1
Cash App*Oriana Rag,cash app*oriana rag,1
Cook Out,cook out,1


In [15]:
# clean_memos = np.sort(outflow_train[outflow_train['memo']!=outflow_train['category']]['cleaned_memo'].unique())
# np.savetxt("cleaned_train_memos.txt", clean_memos, fmt="%s")

In [16]:
outflow_train.sort_values(by='cleaned_memo')

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
763744,1617,acc_4488,XXX-XXX-XXXX,,633.42,2021-09-17,FOOD_AND_BEVERAGES
306986,672,acc_1961,# XXXX,,150.00,2022-01-19,FOOD_AND_BEVERAGES
306988,672,acc_1961,# XXXX,,250.00,2022-05-26,FOOD_AND_BEVERAGES
307058,672,acc_1961,# XXXX,,33.00,2022-03-18,FOOD_AND_BEVERAGES
307066,672,acc_1961,# XXXX,,10.00,2022-05-27,FOOD_AND_BEVERAGES
...,...,...,...,...,...,...,...
5128,12,acc_28,DEBIT CARD PURCHASE / | ON 10/03 AT TST SUNFLO...,| on at tst sunflower drive in| fair oaks |,35.59,2020-10-05,FOOD_AND_BEVERAGES
4931,12,acc_28,DEBIT CARD PURCHASE / | ON 09/15 AT TUG BOAT F...,| on at tug boat fish n chips 2| fair oaks |,45.75,2021-09-17,FOOD_AND_BEVERAGES
5089,12,acc_28,DEBIT CARD PURCHASE / | ON 05/15 AT VINTAGE CH...,| on at vintage charm antiques| loomis |,158.02,2021-05-17,GENERAL_MERCHANDISE
5207,12,acc_28,DEBIT CARD PURCHASE / | ON 01/06 AT WAYSIDE LU...,| on at wayside lumber inc| |,262.18,2021-01-07,GENERAL_MERCHANDISE


In [17]:
outflow_train.loc[307802]

prism_consumer_id                   672
prism_account_id               acc_1961
memo                             # XXXX
cleaned_memo                           
amount                            150.0
posted_date                  2022-04-28
category             FOOD_AND_BEVERAGES
Name: 307802, dtype: object

In [18]:
outflow_train

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,tst* casa del rio exp fairlawn,18.42,2022-09-26,FOOD_AND_BEVERAGES
4,0,acc_0,Buffalo Wild Wings,buffalo wild wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
6,0,acc_0,Oculus CA 04/16,oculus,11.73,2022-04-18,GENERAL_MERCHANDISE
7,0,acc_0,LOS GIRASOLES STOW OH 03/08,los girasoles stow,30.04,2022-03-09,FOOD_AND_BEVERAGES
8,0,acc_0,BUZZIS LAUNDRY 1 OH 03/28,buzzis laundry 1,4.16,2022-03-29,GENERAL_MERCHANDISE
...,...,...,...,...,...,...,...
2595705,5939,acc_9522,PURCHASE 01-14 PUREKANA XXX-XXXXXXX NJ VNT XXXX,purekana vnt,116.58,2023-01-17,GENERAL_MERCHANDISE
2595706,5939,acc_9522,PURCHASE 01-18 WAL-MART #XXXX HUTCHINSON KS S...,wal-mart hutchinson srf,12.54,2023-01-18,GROCERIES
2595708,5939,acc_9522,PURCHASE 01-27 CLKBANK*MoonReading XXX-XXX-XXX...,clkbank*moonreading vnt,11.00,2023-01-27,GENERAL_MERCHANDISE
2595709,5939,acc_9522,PURCHASE 01-26 CASH APP*DIANA STIE XXXXXXXXXX ...,cash app*diana stie vnt,16.00,2023-01-27,GENERAL_MERCHANDISE


## Week 4 (Feature Creation and Baseline Testing)

### Date Attributes

In [20]:
outflow_train_add = outflow_train.copy(deep=True)
outflow_train_add['posted_date'] = pd.to_datetime(outflow_train_add['posted_date'])

# day of the week
outflow_train_add['day_of_week'] = outflow_train_add['posted_date'].dt.strftime('%a')

# month
outflow_train_add['month'] = outflow_train_add['posted_date'].dt.month_name()

# quarter
outflow_train_add['quarter'] = outflow_train_add['posted_date'].dt.quarter
q_map = {1: 'q1', 2: 'q2', 3: 'q3', 4: 'q4'}
outflow_train_add['quarter'] = outflow_train_add['quarter'].map(q_map)

# year
outflow_train_add['year'] = outflow_train_add['posted_date'].dt.year

# average time btwn transactions
df = outflow_train_add.copy()
df = df.sort_values(['prism_consumer_id', 'posted_date'])

df['days_since_prev'] = df.groupby('prism_consumer_id')['posted_date'].diff().dt.days.fillna(0)
df['avg_days_between_txn'] = df.groupby('prism_consumer_id')['days_since_prev'].transform('mean')

df = df.sort_index() # Restore original order
outflow_train_add = df

# rolling avg time btwn transactions (window = 5) --> can be helpful for determining financial stability
df = outflow_train_add.sort_values(['prism_consumer_id', 'posted_date']).copy()

df['rolling_avg_days_between_txn'] = (
    df.groupby('prism_consumer_id')['days_since_prev']
      .transform(lambda x: x.rolling(window=5, min_periods=1).mean())
)

outflow_train_add = df.sort_index() # restore to original order

# time since first transaction
df = outflow_train_add.sort_values(['prism_consumer_id', 'posted_date']).copy()
df['posted_date'] = pd.to_datetime(df['posted_date'])

first_txn = ( # find first transaction date per customer
    df.groupby('prism_consumer_id')['posted_date']
      .transform('min')
)

df['days_since_first_txn'] = (df['posted_date'] - first_txn).dt.days # compute days since first transaction

outflow_train_add = df.sort_index() # restore to original order

### Amount Attributes

In [21]:
# difference from median amount of transactions per month per customer
outflow_train_add['month_med_amnt'] = (outflow_train_add.groupby(['prism_consumer_id', 'year', 'month'])['amount']
                                       .transform('median')
                                      )
outflow_train_add['month_med_amnt_diff'] = outflow_train_add['amount']-outflow_train_add['month_med_amnt']

# Standard deviation of amounts per consumer
group_stats = outflow_train_add.groupby('prism_consumer_id')['amount'].agg(['mean', 'std'])
mean,std = group_stats['mean'],group_stats['std']
outflow_train_add['amnt_zscore'] = (outflow_train_add['amount'] - mean) / std # compute z-score
outflow_train_add['amnt_zscore'] = outflow_train_add['amnt_zscore'].fillna(0) # optional: fill NaN z-scores (e.g. if std = 0 or only one transaction)

# Log-transformed amount --> fix skewness of amounts
outflow_train_add['log_amnt'] = np.log1p(outflow_train_add['amount'])
outflow_train_add

Unnamed: 0,prism_consumer_id,prism_account_id,memo,cleaned_memo,amount,posted_date,category,day_of_week,month,quarter,year,days_since_prev,avg_days_between_txn,rolling_avg_days_between_txn,days_since_first_txn,month_med_amnt,month_med_amnt_diff,amnt_zscore,log_amnt
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,tst* casa del rio exp fairlawn,18.42,2022-09-26,FOOD_AND_BEVERAGES,Mon,September,q3,2022,3.0,1.100917,1.0,224,22.66,-4.24,-0.705843,2.966303
4,0,acc_0,Buffalo Wild Wings,buffalo wild wings,26.47,2022-09-12,FOOD_AND_BEVERAGES,Mon,September,q3,2022,6.0,1.100917,1.2,210,22.66,3.81,-0.288038,3.313095
6,0,acc_0,Oculus CA 04/16,oculus,11.73,2022-04-18,GENERAL_MERCHANDISE,Mon,April,q2,2022,3.0,1.100917,1.4,63,26.68,-14.95,0.000000,2.543961
7,0,acc_0,LOS GIRASOLES STOW OH 03/08,los girasoles stow,30.04,2022-03-09,FOOD_AND_BEVERAGES,Wed,March,q1,2022,1.0,1.100917,0.4,23,26.19,3.85,-0.328565,3.435277
8,0,acc_0,BUZZIS LAUNDRY 1 OH 03/28,buzzis laundry 1,4.16,2022-03-29,GENERAL_MERCHANDISE,Tue,March,q1,2022,1.0,1.100917,1.0,43,26.19,-22.03,0.000000,1.640937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595705,5939,acc_9522,PURCHASE 01-14 PUREKANA XXX-XXXXXXX NJ VNT XXXX,purekana vnt,116.58,2023-01-17,GENERAL_MERCHANDISE,Tue,January,q1,2023,8.0,2.002770,5.8,713,19.48,97.10,0.000000,4.767119
2595706,5939,acc_9522,PURCHASE 01-18 WAL-MART #XXXX HUTCHINSON KS S...,wal-mart hutchinson srf,12.54,2023-01-18,GROCERIES,Wed,January,q1,2023,1.0,2.002770,5.4,714,19.48,-6.94,0.000000,2.605648
2595708,5939,acc_9522,PURCHASE 01-27 CLKBANK*MoonReading XXX-XXX-XXX...,clkbank*moonreading vnt,11.00,2023-01-27,GENERAL_MERCHANDISE,Fri,January,q1,2023,9.0,2.002770,4.8,723,19.48,-8.48,0.000000,2.484907
2595709,5939,acc_9522,PURCHASE 01-26 CASH APP*DIANA STIE XXXXXXXXXX ...,cash app*diana stie vnt,16.00,2023-01-27,GENERAL_MERCHANDISE,Fri,January,q1,2023,0.0,2.002770,4.8,723,19.48,-3.48,0.000000,2.833213


### TFIDF 

In [22]:
tfidf = TfidfVectorizer(
    max_features=100,
    ngram_range=(1, 5)
)
tfidf_matrix = tfidf.fit_transform(outflow_train['cleaned_memo'])
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out()
)

In [23]:
tfidf_df.columns

Index(['365', '365 market', '888', 'ach', 'afterpay', 'amazon', 'amzn', 'and',
       'app', 'apple', 'bell', 'burger', 'burger king', 'ca', 'cafe', 'card',
       'cash', 'cash app', 'chick', 'chick fil', 'circle', 'city', 'costco',
       'crd', 'date', 'dbt', 'dd', 'dd doordash', 'dda', 'depot', 'dollar',
       'dollar general', 'donuts', 'doordash', 'dunkin', 'eats', 'eleven',
       'family', 'fil', 'food', 'from', 'general', 'google', 'home',
       'home depot', 'in', 'inc', 'king', 'kroger', 'liquor', 'lyft', 'market',
       'mart', 'mart super', 'mcdonald', 'mobile', 'new', 'of', 'of sale',
       'on', 'payment', 'paypal', 'pin', 'pizza', 'point', 'point of',
       'point of sale', 'pos', 'pos transaction', 'publix', 'pur', 'safeway',
       'sale', 'san', 'signature', 'sq', 'st', 'starbucks', 'store', 'super',
       'taco', 'taco bell', 'target', 'the', 'transaction', 'tst', 'uber',
       'uber eats', 'us', 'usa', 'vending', 'visa', 'wal', 'wal mart',
       'wal mart s

In [24]:
mean_tfidf = tfidf_matrix.mean(axis=0).A1
feature_names = tfidf.get_feature_names_out()

tfidf_importance = (
    pd.DataFrame({'term': feature_names, 'mean_tfidf': mean_tfidf})
    .sort_values(by='mean_tfidf', ascending=False)
)

In [25]:
tfidf_importance.head(20)

Unnamed: 0,term,mean_tfidf
5,amazon,0.049251
9,apple,0.042388
67,pos,0.03954
54,mcdonald,0.030863
95,walmart,0.0266
16,cash,0.024033
15,card,0.02374
6,amzn,0.021223
8,app,0.020763
17,cash app,0.019698


In [19]:
X = outflow_train_add['cleaned_memo']
y = outflow_train_add['category']

max_features_list = [10, 50, 100, 500, 1000]
scores = []

for n in max_features_list:
    pipe = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=n, ngram_range=(1, 5))),
        ('clf', LogisticRegression(max_iter=200))
    ])
    
    score = cross_val_score(pipe, X, y, cv=min(3, len(outflow_train)), scoring='accuracy').mean()
    scores.append(score)
    print(f"max_features={n}, accuracy={score:.3f}")

max_features=10, accuracy=0.517
max_features=50, accuracy=0.706
max_features=100, accuracy=0.736
max_features=500, accuracy=0.823
max_features=1000, accuracy=0.859


### Logistic Regression

In [28]:
X = outflow_train_add['cleaned_memo']
y = outflow_train_add['category']
X_test = outflow_test["memo"]
y_test = outflow_test["category"]
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1, 5))),  # TF-IDF features
    ('clf', LogisticRegression(max_iter=300, n_jobs=-1))                 # Logistic regression classifier
])

#Train the Model
pipe.fit(X, y)
y_pred = pipe.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8679975678196863

Classification Report:
                     precision    recall  f1-score   support

          EDUCATION       0.71      0.03      0.06       999
 FOOD_AND_BEVERAGES       0.85      0.88      0.86     99154
GENERAL_MERCHANDISE       0.85      0.89      0.87    101953
          GROCERIES       0.95      0.86      0.90     45523
           MORTGAGE       0.80      0.18      0.30       404
          OVERDRAFT       0.98      0.96      0.97       737
               PETS       0.99      0.72      0.84      2454
               RENT       0.71      0.53      0.61       479
             TRAVEL       0.88      0.74      0.80     13080

           accuracy                           0.87    264783
          macro avg       0.86      0.64      0.69    264783
       weighted avg       0.87      0.87      0.87    264783



In [29]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 3))),  # TF-IDF features
    ('clf', LogisticRegression(max_iter=300, n_jobs=-1))                 # Logistic regression classifier
])

#Train the Model
pipe.fit(X, y)
y_pred = pipe.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# report train and test accuracy

Accuracy: 0.9232050395984637

Classification Report:
                     precision    recall  f1-score   support

          EDUCATION       0.73      0.39      0.51       999
 FOOD_AND_BEVERAGES       0.91      0.93      0.92     99154
GENERAL_MERCHANDISE       0.91      0.93      0.92    101953
          GROCERIES       0.96      0.93      0.94     45523
           MORTGAGE       1.00      0.46      0.63       404
          OVERDRAFT       0.99      0.97      0.98       737
               PETS       1.00      0.84      0.91      2454
               RENT       0.83      0.67      0.74       479
             TRAVEL       0.98      0.88      0.93     13080

           accuracy                           0.92    264783
          macro avg       0.92      0.78      0.83    264783
       weighted avg       0.92      0.92      0.92    264783



In [29]:
text_col = 'cleaned_memo'
num_cols = ['amount', 'days_since_prev', 'avg_days_between_txn',
            'rolling_avg_days_between_txn']
cat_cols = ['day_of_week', 'month', 'quarter', 'year']

# Combine all features
X = outflow_train_add[[text_col] + num_cols + cat_cols]
y = outflow_train_add['category']
X_test = outflow_test['memo']
y_test = outflow_test['category']

# Preprocessing for each column type
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=1000, ngram_range=(1, 5)), text_col),
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ]
)

# Create the pipeline
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=300, n_jobs=-1))
])

# Train the model
pipe.fit(X, y)

# Evaluate
y_pred = pipe.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: X does not contain any features, but ColumnTransformer is expecting 9 features