In [1]:
import pandas as pd

source_file = "C:\\users\\Sean Jackman\\downloads\\transactions_ai_categorization.csv"

def read_transactions_csv(filepath: str) -> pd.DataFrame:
    """
    Reads transaction data from CSV file into a pandas DataFrame
    
    Args:
        filepath: Path to the CSV file
        
    Returns:
        DataFrame containing transaction data with expected columns
    """
    expected_columns = [
        "Date", 
        "Description",
        "Category",
        "Amount", 
        "Labels",
        "Notes",
        "Account",
        "Account #",
        "Institution",
        "Month",
        "Week",
        "Transaction ID",
        "Account ID", 
        "Check Number",
        "Full Description",
        "Date Added"
    ]
    
    df = pd.read_csv(
        filepath,
        parse_dates=["Date", "Date Added"],
        dtype={
            "Account #": str,
            "Transaction ID": str,
            "Account ID": str,
            "Check Number": str
        }
    )
    
    # Ensure all expected columns exist
    for col in expected_columns:
        if col not in df.columns:
            df[col] = None
            
    return df[expected_columns]

transactions_df = read_transactions_csv(source_file)
# print(transactions)

In [2]:
from pydantic import BaseModel, Field
from datetime import datetime
from typing import Optional, List

class Transaction(BaseModel):
    date: datetime = Field(alias="Date")
    description: str = Field(alias="Description") 
    category: Optional[str] = Field(alias="Category")
    amount: float = Field(alias="Amount")
    labels: Optional[str] = Field(alias="Labels")
    notes: Optional[str] = Field(alias="Notes")
    account: Optional[str] = Field(alias="Account")
    account_number: Optional[str] = Field(alias="Account #")
    institution: Optional[str] = Field(alias="Institution")
    month: Optional[str] = Field(alias="Month")
    week: Optional[str] = Field(alias="Week")
    transaction_id: Optional[str] = Field(alias="Transaction ID")
    account_id: Optional[str] = Field(alias="Account ID")
    check_number: Optional[str] = Field(alias="Check Number")
    full_description: Optional[str] = Field(alias="Full Description")
    date_added: Optional[datetime] = Field(alias="Date Added")


def convert_df_to_transactions(df: pd.DataFrame) -> List[Transaction]:
    """
    Converts DataFrame rows to Transaction objects
    Raises ValueError if any row fails validation
    
    Returns:
        List of validated Transaction objects
    """
    transactions = []
    
    for index, row in df.iterrows():
        try:
            transaction = Transaction(**row.to_dict())
            transactions.append(transaction)
        except Exception as e:
            raise ValueError(f"Row {index} failed validation: {str(e)}\nData: {row.to_dict()}")
            
    return transactions


transactions = convert_df_to_transactions(transactions_df)
print(transactions)


ValueError: Row 0 failed validation: 5 validation errors for Transaction
Category
  Input should be a valid string [type=string_type, input_value=nan, input_type=float]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Amount
  Input should be a valid number, unable to parse string as a number [type=float_parsing, input_value='-$55.13', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/float_parsing
Labels
  Input should be a valid string [type=string_type, input_value=nan, input_type=float]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Notes
  Input should be a valid string [type=string_type, input_value=nan, input_type=float]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Check Number
  Input should be a valid string [type=string_type, input_value=nan, input_type=float]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
Data: {'Date': Timestamp('2024-12-05 00:00:00'), 'Description': 'Paypal * r.c purdys x8739 bc', 'Category': nan, 'Amount': '-$55.13', 'Labels': nan, 'Notes': nan, 'Account': 'Scotia Momentum VISA Infinite', 'Account #': 'xxxx0028', 'Institution': 'Scotiabank', 'Month': '2024-12-01', 'Week': '2024-12-02', 'Transaction ID': '67549c1f246e21bae546ae52', 'Account ID': '63bd8dee44761f0033422444', 'Check Number': nan, 'Full Description': 'paypal * r.c purdys XXXXXXX8739 bc', 'Date Added': Timestamp('2024-12-07 00:00:00')}

In [None]:
from toolkit.language_models.model_connection import ChatModelsSetup
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel


class Comments(BaseModel):
    student_id: str
    comments: str


template = """
Human: You are an expert personal finance assistant. 
You will help me categorize my spending and income transactions.

Here are my categories:
{categories}

Here are examples of my past transactions and how I categorized them
{examples} 

Here is the transaction I need you to categorize:
{transaction}

Think step by step and explain your reasoning for what category the transaction should fall into. If you are not confident what category the transaction should fall into, you should simply assign the category as "Unknown". It is very important that you recognize when you are not confident, as it is much better to assign Unknown than to guess and risk being wrong.
Assistant:
"""

prompt_template = PromptTemplate.from_template(template)

formatted_prompt = prompt_template.format(
    examples="",
    id="",
    notes="",
    structure=Comments.model_json_schema()
)

chat_models = ChatModelsSetup()
response = chat_models.claude_35_v2_sonnet_chat.invoke(formatted_prompt)
opening_comments = Comments.model_validate_json(response)
