In [1]:
#thanks this is ripped from https://github.com/thu-vu92/local-llms-analyse-finance/blob/main/categorize_expenses_with_validation.ipynb
from langchain_community.llms import Ollama
from langchain.chains.conversation.memory import ConversationBufferMemory
import pandas as pd
import numpy as np
import os

file_path = 'out.txt'

if os.path.exists(file_path):
    os.remove(file_path)

In [2]:
from os import listdir
from os.path import isfile, join
directory = './transactions/'
transfiles = [directory+f for f in listdir(directory) if isfile(join(directory, f))]
skip_list=['checking','savings']
cc_files=[f for f in transfiles if all(skip not in f for skip in skip_list)]

acc_files = [join(directory, f) for f in listdir(directory) 
         if os.path.isfile(os.path.join(directory, f)) and (f.startswith('checking') or f.startswith('savings'))]

print(cc_files)
print(acc_files)


['./transactions/August2024_9400.csv', './transactions/May2024_9400.csv', './transactions/August2024_1696.csv', './transactions/currentTransaction_9400.csv', './transactions/September2024_1696.csv', './transactions/July2024_2741.csv', './transactions/September2024_2741.csv', './transactions/July2024_1696.csv', './transactions/September2024_9400.csv', './transactions/June2024_2741.csv', './transactions/May2024_2741.csv', './transactions/currentTransaction_2741.csv', './transactions/July2024_9400.csv', './transactions/August2024_2741.csv', './transactions/June2024_9400.csv']
['./transactions/savings_0083.csv', './transactions/checking_9443.csv', './transactions/checking_4538.csv']


In [3]:
def read_cc_transactions(transfile):
    df = pd.read_csv(transfile)
    df['account']=transfile.split('_')[1].split('.')[0]
    df['fname']=transfile.split('/')[2].split('.')[0]

    #https://www.statology.org/pandas-remove-special-characters/
    pattern = r'[^\w\s]'
    df['Payee'] = df['Payee'].str.replace(pattern, '', regex=True)
    df['Payee'] = df['Payee'].str.replace(r'\s+', ' ', regex=True)

    
    return df

In [4]:
transframes=[]
for cc_file in cc_files[1:]:
    transframes.append(read_cc_transactions(cc_file))
all_trans=pd.concat(transframes, ignore_index=True)

In [5]:
#all_trans = all_trans[0:50]
all_trans.to_csv('all_trans_start.csv') #goood

In [6]:
# Get unique transactions in the Name / Description column
unique_transactions = all_trans["Payee"].unique()
unique_transactions_df = pd.DataFrame(unique_transactions,columns=['col1'])
unique_transactions_df.to_csv('unique_transactions_df.csv')
#print(type(unique_transactions))
#unique_transactions[1:10]

In [7]:
# Get index list
#https://stackoverflow.com/questions/47518609/for-loop-range-and-interval-how-to-include-last-step
def hop(start, stop, step):
    for i in range(start, stop, step):
        yield i
    yield stop

#index_list = list(hop(0, len(unique_transactions), 10))
index_list = list(hop(0, len(all_trans), 10))

In [8]:
# Output validation
from pydantic import BaseModel, field_validator
from typing import List

# Validate response format - check if it actually contains hyphen ("-")
class ResponseChecks(BaseModel):
    data: List[str]

    @field_validator("data")
    def check(cls, value):
        for item in value:
            if len(item) > 0:
                assert "-" in item, "String does not contain hyphen."

# Test validation
ResponseChecks(data = ['Hello - World', 'Hello - there!'])

ResponseChecks(data=None)

In [9]:
def check_trans(categories_df,all_trans,cnt_trans):
    print('check_trans')
    
    common_trans= pd.DataFrame()
    common_trans = categories_df['Transaction'].isin(all_trans['Payee'])

    if np.any(~common_trans):
        print("fail on cat vs all trans")
        return False

    
    if cnt_trans != len(categories_df.index):
        print("fail on len")
        return False

    print("passing true")
    return True
    


In [10]:
def categorize_transactions(transaction_names):
    print("categorzing...")
    llm = Ollama(model="llama3test:8b")
    
    llm.__init__
    response = llm.invoke("""
    Can you provide an appropriate category to the following expenses for my personal budget. 
    some expenses include just the business name, some include the name and part or all of their address.
    For example I will privde :
    CVSPHARMACY 123 Somehwer NJ, Spotify AB by Adyen, Beta Boulders Ams Amsterdam Nl
    Categories should be less than 4 words.
    Do not prepend the answer, no further explanation or any other text outside of the transactions and their corresponding categories. 
    Do not edit the transaction text in any way. Do not add spacing if there are words together. Do not add or remove any letters. Do not spell check or alter the text in anyway. 
    The response should be formatted as follows, with one entry and category per line:
    CVSPHARMACY 123 Somehwer NJ - Pharmacy
    Spotify AB by Adyen - Subscription/Music/Entertainment
    Beta Boulders Ams Amsterdam Nld - Gym/Sports
    Here is my list of transactions:""" + transaction_names)
    response = response.split('\n')
    ResponseChecks(data = response)

    
    # Put in dataframe
    categories_df = pd.DataFrame({'Transaction vs category': response})
    categories_df[['Transaction', 'Category']] = categories_df['Transaction vs category'].str.strip().str.split(' - ', expand=True)
    #print(categories_df)    
    return categories_df, response

In [11]:
#categorize_transactions('INTEREST CHARGED ON PURCHASES,LATE FEE FOR PAYMENT DUE,RITAS 064 ROCKLEDGE PA,FUEL 1 MILTOWN MILTOWN NJ,WAWA 276 PHILADELPHIA PA,DOLLARTREE PHILADELPHIA PA,FELI RESTAURANT ELKINS PARK PA,MORE FOOD COURT ELKINS PARK PA,NBPA  PARKMOBILE parkmobilecomNJ,FIVE POINTS  CONVENIENCEPHILADELPHIA PA')
#print('!!!!!!!!!!!!!!!!!!')
#categorize_transactions('LUKOIL 69257 PHILADELPHIA PA - Gas Station, THE FRANKLIN INSTITUTE PHILADELPHIA PA - Education/Culture, TST THANAL INDIAN TAVERNPHILADELPHIA PA - Dining/Restaurants, GRANT PLAZA 9 PHILADELPHIA PA - Dry Cleaning, WHOLEFDS JEN 10100 JENKINTOWN PA - Grocery Store, DUNKIN 342380 PHILADELPHIA PA - Coffee Shop, SQ PELICANA Philadelphia PA - Pet Supplies, SQ JEONGS NOODLE Philadelphia PA - Dining/Restaurants, SQ MIRIM Philadelphia PA - Dining/Restaurants, SQ OH K DOG Philadelphia PA - Food/Grocery')

In [12]:
# Intialise the categories_df_all dataframe
categories_df_all = pd.DataFrame()

#with open('out.txt', 'w') as f:
    
    
# Loop through the index_list
for i in range(0, len(index_list)-1):
    fcnt=0
    passed=0
    cnt=0
    while(not passed and fcnt <3 and cnt < 4):
        categories_df= pd.DataFrame()
    
        transaction_names_lst = unique_transactions[index_list[i]:index_list[i+1]]
        #print(transaction_names)
        transaction_names = ','.join(transaction_names_lst)
        
        #print("transaction_names", file=f)
        #print(transaction_names, file=f) 
        #print("!!!!!!!!!!", file=f)  # Python 3.x
        cnt = cnt +1
        
        if len(transaction_names_lst):
            categories_df,response = categorize_transactions(transaction_names)
            if check_trans(categories_df,all_trans,len(transaction_names_lst)):
               passed = 1
            else:
                fcnt= fcnt + 1
                
            
        print(f"passed: {passed}")
        print(f"fcnt: {fcnt}")
        
        #print(categories_df.to_string())
        #if check_trans(categories_df,all_trans):
        #    print('good')
        #else
        #    print('bad wtf is wron with you ai')
        
    categories_df_all = pd.concat([categories_df_all, categories_df], ignore_index=True)

categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categorzing...
check_trans
fail on cat vs all trans
passed: 0
fcnt: 1
categorzing...
check_trans
passing true
passed: 1
fcnt: 1
categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categorzing...
check_trans
fail on len
passed: 0
fcnt: 1
categorzing...
check_trans
fail on len
passed: 0
fcnt: 2
categorzing...
check_trans
fail on len
passed: 0
fcnt: 3
categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categorzing...
check_trans
passing true
passed: 1
fcnt: 0
categ

In [13]:
categories_df_all.to_csv('categories_df_all.csv')

In [14]:
# Get unique categories in categories_df_all
unique_categories = categories_df_all["Category"].unique()
print(unique_categories)
#      unique_categoriesdf = pd.DataFrame( unique_categories )

['Finance Charges' 'Overdue Payment' 'Restaurant' 'Gas Station'
 'Convenience Store' 'Retail/Store' 'Food/Court' 'Parking/Tolls'
 'Fast Food' 'Pet Supplies' 'Grocery' 'Restaurant/Coffee'
 'Restaurant/Asian' 'Pharmacy' 'Steakhouse' 'Subscription/Media'
 'Medical/Bills' 'Grocery Store' 'Auto Parts' 'Department Store'
 'Shoe Repair' 'Laundry' 'Discount Store' 'Travel/Transportation'
 'Payment Processing' 'Online Shopping' 'Shipping/Postage' 'Donations'
 'Gasoline' 'Museum/Arts' 'Car Payment' 'Groceries' 'Coffee/Breakfast'
 'Entertainment/Travel' 'Dining/Coffee' 'Gas/Oil' 'Coffee/Beverages'
 'Clothing/Shoes' 'Dining/Luxury' 'Food/Delivery' 'Baking/Sweets'
 'Fast/Food/Childhood' 'Pizza/Takeout' 'Gas/Petrol' 'Rewards/Points'
 'Dining' 'Online/Shoppin' 'Transfer/Other' 'Online Retail' 'Insurance'
 'Electronics' 'Parking/Transportation' 'Grocery/Snacks' 'Bakery/Food'
 'Entertainment' 'Dining Out' 'Pet Store' 'Flower Shop' 'Dry Cleaning'
 'Subscription/Design' 'Printing Services' 'Hardware Supp

In [15]:
# Drop NA values
categories_df_all_no_na = categories_df_all.dropna()
categories_df_all_no_na.to_csv('categories_df_all_nona.csv')

In [16]:
# Merge the categories_df_all with the transactions_2022_2023.csv dataframe (df)
all_trans.to_csv('all_transpremerge.csv')
all_trans = pd.merge(all_trans, categories_df_all_no_na, left_on='Payee', right_on='Transaction', how='left')
all_trans['Income/Expense'] = 'Expense'


In [17]:
all_trans.to_csv('all_trans_end.csv', index=False)