# Personal Finance Analysis

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import hashlib

%matplotlib inline

## Reading in the Data

Here, we expect some consistent format from the download button within Fidelity. 
In the event that we download more files for more up to date reflections on spending, I want to be able to easily add them and use the same code.
Here, we read in multiple csv files and provide unique hashes to reference them again in the future if we need to.

If we assume the average life of a human is 80 years x 365 days = 29,200 days, and we upper bound our spending by saying we have 100 transactions a day, we will have 2,920,000 transactions in a lifetime.

`log_36(2920000) = 4.166`, so if we round up to a hash of 5, we should have more than enough numbers to track all of our spending.

In [139]:
def read_transaction_csv():
    # Get list of CSV files
    csv_files = glob.glob('../data/transactions_*.csv')

    # Read each file and add source column
    dfs = []
    for file in csv_files:
        df = pd.read_csv(file)
        df['SourceFile'] = file.split('/')[-1]  # Just the filename, not full path
        dfs.append(df)

    # Combine all DataFrames
    return pd.concat(dfs, ignore_index=True)

def create_row_hash(row):
    # Get column names to exclude
    exclude_cols = ['SourceFile']

    # Filter out excluded columns before hashing
    row_values = [str(row[col]) for col in row.index if col not in exclude_cols]
    row_str = ''.join(row_values)

    # Create hash
    full_hash = hashlib.sha256(row_str.encode()).hexdigest()
    return full_hash[:5]

def add_row_hash(df):
    # Apply to each row
    df['RowHashID'] = df.apply(create_row_hash, axis=1)
    # Avoid duplicates with count as suffix
    df['RowHashID'] = df.groupby('RowHashID').cumcount().add(1).astype(str).radd(df['RowHashID'] + '_')
    df.set_index('RowHashID', inplace=True)
    return df

# Preview data
add_row_hash(read_transaction_csv())

Unnamed: 0_level_0,Date,Description,Institution,Account,Category,Is Hidden,Is Pending,Amount,SourceFile
RowHashID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
e3637_1,6/11/2025,ALLY BANK DES TRANSFER INDN LEN G HUANG CO WEB,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Transfers,No,No,($400),transactions_2025_06_13.csv
94c5e_1,6/11/2025,VENMO DES PAYMENT INDN LEN HUANG CO WEB,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Online Payment,No,No,($2500),transactions_2025_06_13.csv
72b02_1,6/11/2025,ALLY BANK DES ID LEN G HUANG INDN LEN G HUANG ...,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Income,No,No,$5000,transactions_2025_06_13.csv
4871c_1,6/10/2025,OZZIES FRESH MARKET NY,Bank of America - via Direct Web API,Customized Cash Rewards Visa Signature - 0966,Groceries,No,No,($6.99),transactions_2025_06_13.csv
a1c2b_1,6/10/2025,SECURITY DEPOSIT UNION AVE,Venmo - via Direct Web Access,Venmo,Income,No,No,($2500),transactions_2025_06_13.csv
...,...,...,...,...,...,...,...,...,...
2d51e_1,1/2/2024,ONLINE BANKING TRANSFER FROM CHK CONFIRMATION,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Transfers,No,No,$300,transactions_2025_06_13.csv
92d6d_1,1/1/2024,COURT SQUARE DINER,Chase,Other Spending,Restaurants/Dining,No,No,($77.65),transactions_2025_06_13.csv
ee82d_1,1/1/2024,NJT RAIL MYTIX,Chase,Other Spending,Public Transport,No,No,($8.5),transactions_2025_06_13.csv
f370a_1,1/1/2024,TARGET,Chase,Other Spending,Shopping,No,No,($14.34),transactions_2025_06_13.csv


Now we can query for specific fields as such, in case we want to add logic overrides to specific rows.

In [142]:
def find_row_from_hash(hash='e3637'):
    df = add_row_hash(read_transaction_csv())
    return df[df.index.str.startswith(hash)]

find_row_from_hash()

Unnamed: 0_level_0,Date,Description,Institution,Account,Category,Is Hidden,Is Pending,Amount,SourceFile
RowHashID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
e3637_1,6/11/2025,ALLY BANK DES TRANSFER INDN LEN G HUANG CO WEB,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Transfers,No,No,($400),transactions_2025_06_13.csv


# Dedupe the Data

We'll also be able to detect duplicates across files potentially like this.

In [149]:
def find_duplicate_groups():
    df = add_row_hash(read_transaction_csv())

    # Extract base hash and create group IDs
    df['BaseHash'] = df.index.str.replace(r'_\d+$', '', regex=True)
    duplicates = df[df.duplicated('BaseHash', keep=False)]

    return duplicates

find_duplicate_groups()

Unnamed: 0_level_0,Date,Description,Institution,Account,Category,Is Hidden,Is Pending,Amount,SourceFile,BaseHash
RowHashID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4b67d_1,5/17/2025,PER PERSON,Venmo - via Direct Web Access,Venmo,Income,No,No,$47.25,transactions_2025_06_13.csv,4b67d
4b67d_2,5/17/2025,PER PERSON,Venmo - via Direct Web Access,Venmo,Income,No,No,$47.25,transactions_2025_06_13.csv,4b67d
b7006_1,4/15/2025,OLD CHELSEA SNY,Wells Fargo Online Banking - via Direct Web API,BILT WORLD ELITE MASTERCARD ...7685,Restaurants/Dining,No,No,($93.01),transactions_2025_06_13.csv,b7006
b7006_2,4/15/2025,OLD CHELSEA SNY,Wells Fargo Online Banking - via Direct Web API,BILT WORLD ELITE MASTERCARD ...7685,Restaurants/Dining,No,No,($93.01),transactions_2025_06_13.csv,b7006
7cb4f_1,4/1/2025,OLD CHELSEA SNY,Wells Fargo Online Banking - via Direct Web API,BILT WORLD ELITE MASTERCARD ...7685,Restaurants/Dining,No,No,($83.6),transactions_2025_06_13.csv,7cb4f
...,...,...,...,...,...,...,...,...,...,...
c685a_3,2/19/2024,LUNCH,Venmo - via Direct Web Access,Venmo,Income,No,No,$20,transactions_2025_06_13.csv,c685a
9debd_1,1/21/2024,MOMOFUKU,Venmo - via Direct Web Access,Venmo,Income,No,No,$24.49,transactions_2025_06_13.csv,9debd
9debd_2,1/21/2024,MOMOFUKU,Venmo - via Direct Web Access,Venmo,Income,No,No,$24.49,transactions_2025_06_13.csv,9debd
5e2ce_1,1/13/2024,UBER,Venmo - via Direct Web Access,Venmo,Rideshare,No,No,$15.47,transactions_2025_06_13.csv,5e2ce


How might we solve duplicate within a file though? Many of these are Venmo statements that some of the naming/description gets cut out from the description. Namely, should we add better descriptions before passing in the data?

## Reformatting the Data

Here, we want to convert some of the values to things that will be easier to work with.
When reading it in, everything is a string. We want:
- the string `Date` to be datetime objects
- `Is Hidden` / `Is Pending` to bools
- `Amount` to be positive/negative numbers rather than accounting style strings.

In [103]:
def convert_types(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Is Hidden'] = df['Is Hidden'].replace({'yes': True, 'no': False})
    df['Is Pending'] = df['Is Pending'].replace({'yes': True, 'no': False})

    return df

convert_types(add_row_hash(read_transaction_csv()))

Unnamed: 0_level_0,Date,Description,Institution,Account,Category,Is Hidden,Is Pending,Amount,SourceFile
RowHashID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
f35f80a5580758314f7e81c403c96f3a88372409022f7cc5d9acc779a5562438,2025-06-11,ALLY BANK DES TRANSFER INDN LEN G HUANG CO WEB,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Transfers,No,No,($400),transactions_2025_06_13.csv
97ae98dce283d0803bf021ccfc33a5cac5c9f55e09486c70ab11c831ffa0716b,2025-06-11,VENMO DES PAYMENT INDN LEN HUANG CO WEB,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Online Payment,No,No,($2500),transactions_2025_06_13.csv
1befc714d8fdd198dbf448d5a5fc60f65290a9a9f266cce9b47c33a706b21051,2025-06-11,ALLY BANK DES ID LEN G HUANG INDN LEN G HUANG ...,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Income,No,No,$5000,transactions_2025_06_13.csv
d59059c9936f5432f346f30a1e7eb1fb2d54bae871f11a9016c7c920b9a4aae0,2025-06-10,OZZIES FRESH MARKET NY,Bank of America - via Direct Web API,Customized Cash Rewards Visa Signature - 0966,Groceries,No,No,($6.99),transactions_2025_06_13.csv
33bf93e9bcc20a6ee5927cbf6874ebd2fa744854e65386261cbb6daef3f3a39b,2025-06-10,SECURITY DEPOSIT UNION AVE,Venmo - via Direct Web Access,Venmo,Income,No,No,($2500),transactions_2025_06_13.csv
...,...,...,...,...,...,...,...,...,...
03c76a846b3e17045fbea5b5d044af3b3879c481a9b066634628d35759c7caca,2024-01-02,ONLINE BANKING TRANSFER FROM CHK CONFIRMATION,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Transfers,No,No,$300,transactions_2025_06_13 copy.csv
741065bc7f441776aa1f35cea1566943f0f43f89ed53ebff4bbf45bb0cc153bf,2024-01-01,COURT SQUARE DINER,Chase,Other Spending,Restaurants/Dining,No,No,($77.65),transactions_2025_06_13 copy.csv
4681d08e3149d82f0883d0358d6b189f3f2df6aab0f61410beed9f5a0efbdea2,2024-01-01,NJT RAIL MYTIX,Chase,Other Spending,Public Transport,No,No,($8.5),transactions_2025_06_13 copy.csv
bda7d3eb104c4f57cdaf40fb59dc3e06dcdc984f8b9276f4dd7852dd01e2481b,2024-01-01,TARGET,Chase,Other Spending,Shopping,No,No,($14.34),transactions_2025_06_13 copy.csv
