# Personal Finance Analysis

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import hashlib

%matplotlib inline

## Reading in the Data

Here, we expect some consistent format from the download button within Fidelity. 
In the event that we download more files for more up to date reflections on spending, I want to be able to easily add them and use the same code.
Here, we read in multiple csv files and provide unique hashes to reference them again in the future if we need to.

If we assume the average life of a human is 80 years x 365 days = 29,200 days, and we upper bound our spending by saying we have 100 transactions a day, we will have 2,920,000 transactions in a lifetime.

`log_36(2920000) = 4.166`, so if we round up to a hash of 5, we should have more than enough numbers to track all of our spending.

In [155]:
def read_transaction_csv():
    # Get list of CSV files
    csv_files = glob.glob('../data/transactions_*.csv')

    # Read each file and add source column
    dfs = []
    for file in csv_files:
        df = pd.read_csv(file)
        df['SourceFile'] = file.split('/')[-1]  # Just the filename, not full path
        dfs.append(df)

    # Combine all DataFrames
    return pd.concat(dfs, ignore_index=True)

def create_row_hash(row):
    # Get column names to exclude
    exclude_cols = ['SourceFile']

    # Filter out excluded columns before hashing
    row_values = [str(row[col]) for col in row.index if col not in exclude_cols]
    row_str = ''.join(row_values)

    # Create hash
    full_hash = hashlib.sha256(row_str.encode()).hexdigest()
    return full_hash[:5]

def add_row_hash(df):
    # Apply to each row
    df['GroupingKey'] = df.apply(create_row_hash, axis=1)
    # Avoid duplicates with count as suffix
    df['RowHashID'] = df.groupby('GroupingKey').cumcount().add(1).astype(str).radd(df['GroupingKey'] + '_')
    df.set_index('RowHashID', inplace=True)
    return df

# Preview data
add_row_hash(read_transaction_csv())

Unnamed: 0_level_0,Date,Description,Institution,Account,Category,Is Hidden,Is Pending,Amount,SourceFile,GroupingKey
RowHashID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
e3637_1,6/11/2025,ALLY BANK DES TRANSFER INDN LEN G HUANG CO WEB,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Transfers,No,No,($400),transactions_2025_06_13.csv,e3637
94c5e_1,6/11/2025,VENMO DES PAYMENT INDN LEN HUANG CO WEB,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Online Payment,No,No,($2500),transactions_2025_06_13.csv,94c5e
72b02_1,6/11/2025,ALLY BANK DES ID LEN G HUANG INDN LEN G HUANG ...,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Income,No,No,$5000,transactions_2025_06_13.csv,72b02
4871c_1,6/10/2025,OZZIES FRESH MARKET NY,Bank of America - via Direct Web API,Customized Cash Rewards Visa Signature - 0966,Groceries,No,No,($6.99),transactions_2025_06_13.csv,4871c
a1c2b_1,6/10/2025,SECURITY DEPOSIT UNION AVE,Venmo - via Direct Web Access,Venmo,Income,No,No,($2500),transactions_2025_06_13.csv,a1c2b
...,...,...,...,...,...,...,...,...,...,...
2d51e_1,1/2/2024,ONLINE BANKING TRANSFER FROM CHK CONFIRMATION,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Transfers,No,No,$300,transactions_2025_06_13.csv,2d51e
92d6d_1,1/1/2024,COURT SQUARE DINER,Chase,Other Spending,Restaurants/Dining,No,No,($77.65),transactions_2025_06_13.csv,92d6d
ee82d_1,1/1/2024,NJT RAIL MYTIX,Chase,Other Spending,Public Transport,No,No,($8.5),transactions_2025_06_13.csv,ee82d
f370a_1,1/1/2024,TARGET,Chase,Other Spending,Shopping,No,No,($14.34),transactions_2025_06_13.csv,f370a


Now we can query for specific fields as such, in case we want to add logic overrides to specific rows.

In [157]:
def find_row_from_hash(hash='e3637'):
    df = add_row_hash(read_transaction_csv())
    return df[df.index.str.startswith(hash)]

find_row_from_hash()

Unnamed: 0_level_0,Date,Description,Institution,Account,Category,Is Hidden,Is Pending,Amount,SourceFile,GroupingKey
RowHashID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
e3637_1,6/11/2025,ALLY BANK DES TRANSFER INDN LEN G HUANG CO WEB,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Transfers,No,No,($400),transactions_2025_06_13.csv,e3637


# Dedupe the Data

We'll also be able to detect duplicates across files potentially like this.

In [171]:
def find_duplicate_groups():
    df = add_row_hash(read_transaction_csv())
    return df[df.duplicated('GroupingKey', keep=False)]

find_duplicate_groups()

Unnamed: 0_level_0,Date,Description,Institution,Account,Category,Is Hidden,Is Pending,Amount,SourceFile,GroupingKey
RowHashID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4b67d_1,5/17/2025,PER PERSON,Venmo - via Direct Web Access,Venmo,Income,No,No,$47.25,transactions_2025_06_13.csv,4b67d
4b67d_2,5/17/2025,PER PERSON,Venmo - via Direct Web Access,Venmo,Income,No,No,$47.25,transactions_2025_06_13.csv,4b67d
b7006_1,4/15/2025,OLD CHELSEA SNY,Wells Fargo Online Banking - via Direct Web API,BILT WORLD ELITE MASTERCARD ...7685,Restaurants/Dining,No,No,($93.01),transactions_2025_06_13.csv,b7006
b7006_2,4/15/2025,OLD CHELSEA SNY,Wells Fargo Online Banking - via Direct Web API,BILT WORLD ELITE MASTERCARD ...7685,Restaurants/Dining,No,No,($93.01),transactions_2025_06_13.csv,b7006
7cb4f_1,4/1/2025,OLD CHELSEA SNY,Wells Fargo Online Banking - via Direct Web API,BILT WORLD ELITE MASTERCARD ...7685,Restaurants/Dining,No,No,($83.6),transactions_2025_06_13.csv,7cb4f
...,...,...,...,...,...,...,...,...,...,...
c685a_3,2/19/2024,LUNCH,Venmo - via Direct Web Access,Venmo,Income,No,No,$20,transactions_2025_06_13.csv,c685a
9debd_1,1/21/2024,MOMOFUKU,Venmo - via Direct Web Access,Venmo,Income,No,No,$24.49,transactions_2025_06_13.csv,9debd
9debd_2,1/21/2024,MOMOFUKU,Venmo - via Direct Web Access,Venmo,Income,No,No,$24.49,transactions_2025_06_13.csv,9debd
5e2ce_1,1/13/2024,UBER,Venmo - via Direct Web Access,Venmo,Rideshare,No,No,$15.47,transactions_2025_06_13.csv,5e2ce


But it looks like these are actually not duplicates, they are venmo requests of the same kind to different people! Let's understand this more.

In [172]:
def duplicates_summary():
    # Read and process data
    df = add_row_hash(read_transaction_csv())

    # Group and aggregate
    grouped = df.groupby('GroupingKey').agg({
        'Description': 'first',
        'Date': ['min', 'max', 'count'],
        'Amount': 'first',
        'SourceFile': lambda x: ', '.join(set(x))  # Unique source files
    })

    # df.set_index('RowHashID', inplace=True)

    # Flatten multi-index columns
    grouped.columns = ['_'.join(col).strip() for col in grouped.columns.values]

    # Rename columns
    grouped.rename(columns={
        'GroupingKey_': 'GroupID',
        'Description_first': 'Description',
        'Date_count': 'DuplicateCount',
        'Date_min': 'FirstDate',
        'Date_max': 'LastDate',
        'Amount_first': 'Amount',
        'SourceFile_<lambda>': 'SourceFiles'
    }, inplace=True)

    # Filter for actual duplicates (count > 1)
    duplicates = grouped[grouped['DuplicateCount'] > 1]

    # Sort by most frequent duplicates first
    return duplicates.sort_values('DuplicateCount', ascending=False)

duplicates_summary()

Unnamed: 0_level_0,Description,FirstDate,LastDate,DuplicateCount,Amount,SourceFiles
GroupingKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fed3c,MESSED UP SOME MATH WILL PENN PT,4/14/2024,4/14/2024,4,$1.46,transactions_2025_06_13.csv
839bd,WILL PENN TAVERN,4/14/2024,4/14/2024,4,$26,transactions_2025_06_13.csv
28855,RAJ S FRIDAY DINNER,4/21/2024,4/21/2024,4,$21.71,transactions_2025_06_13.csv
c685a,LUNCH,2/19/2024,2/19/2024,3,$20,transactions_2025_06_13.csv
9075a,BKOFAMERICA ATM WITHDRWL LAGUARDIA AIRPORT LAG...,1/21/2025,1/21/2025,3,($300),transactions_2025_06_13.csv
0d08b,SHAVED ICE FRIDAY EVENING,4/21/2024,4/21/2024,3,$3.67,transactions_2025_06_13.csv
39ed8,ADDA SPOT,12/23/2024,12/23/2024,2,$61.62,transactions_2025_06_13.csv
9debd,MOMOFUKU,1/21/2024,1/21/2024,2,$24.49,transactions_2025_06_13.csv
f664c,BKOFAMERICA ATM WITHDRWL LONG ISLAND CITYJ LON...,6/20/2024,6/20/2024,2,($100),transactions_2025_06_13.csv
ed8cb,HAPPY BIRTHDAY,7/21/2024,7/21/2024,2,$5,transactions_2025_06_13.csv


In [None]:
def find_duplicate_groups():
    # Read and process data
    df = add_row_hash(read_transaction_csv())

    # Group and aggregate
    grouped = df.groupby('GroupingKey').agg({
        'Description': 'first',
        'Date': ['min', 'max', 'count'],
        'Amount': 'first',
        'SourceFile': lambda x: ', '.join(set(x))  # Unique source files
    })

    # df.set_index('RowHashID', inplace=True)

    # Flatten multi-index columns
    grouped.columns = ['_'.join(col).strip() for col in grouped.columns.values]

    # Rename columns
    grouped.rename(columns={
        'GroupingKey_': 'GroupID',
        'Description_first': 'Description',
        'Date_count': 'DuplicateCount',
        'Date_min': 'FirstDate',
        'Date_max': 'LastDate',
        'Amount_first': 'Amount',
        'SourceFile_<lambda>': 'SourceFiles'
    }, inplace=True)

    # Filter for actual duplicates (count > 1)
    duplicates = grouped[grouped['DuplicateCount'] > 1]

    # Sort by most frequent duplicates first
    return duplicates.sort_values('DuplicateCount', ascending=False)

find_duplicate_groups()

Unnamed: 0_level_0,Description,FirstDate,LastDate,DuplicateCount,Amount,SourceFiles
GroupingKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fed3c,MESSED UP SOME MATH WILL PENN PT,4/14/2024,4/14/2024,4,$1.46,transactions_2025_06_13.csv
839bd,WILL PENN TAVERN,4/14/2024,4/14/2024,4,$26,transactions_2025_06_13.csv
28855,RAJ S FRIDAY DINNER,4/21/2024,4/21/2024,4,$21.71,transactions_2025_06_13.csv
c685a,LUNCH,2/19/2024,2/19/2024,3,$20,transactions_2025_06_13.csv
9075a,BKOFAMERICA ATM WITHDRWL LAGUARDIA AIRPORT LAG...,1/21/2025,1/21/2025,3,($300),transactions_2025_06_13.csv
0d08b,SHAVED ICE FRIDAY EVENING,4/21/2024,4/21/2024,3,$3.67,transactions_2025_06_13.csv
39ed8,ADDA SPOT,12/23/2024,12/23/2024,2,$61.62,transactions_2025_06_13.csv
9debd,MOMOFUKU,1/21/2024,1/21/2024,2,$24.49,transactions_2025_06_13.csv
f664c,BKOFAMERICA ATM WITHDRWL LONG ISLAND CITYJ LON...,6/20/2024,6/20/2024,2,($100),transactions_2025_06_13.csv
ed8cb,HAPPY BIRTHDAY,7/21/2024,7/21/2024,2,$5,transactions_2025_06_13.csv


How might we solve duplicate within a file though? Many of these are Venmo statements that some of the naming/description gets cut out from the description. Namely, should we add better descriptions before passing in the data?

## Reformatting the Data

Here, we want to convert some of the values to things that will be easier to work with.
When reading it in, everything is a string. We want:
- the string `Date` to be datetime objects
- `Is Hidden` / `Is Pending` to bools
- `Amount` to be positive/negative numbers rather than accounting style strings.

In [159]:
def convert_types(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Is Hidden'] = df['Is Hidden'].replace({'yes': True, 'no': False})
    df['Is Pending'] = df['Is Pending'].replace({'yes': True, 'no': False})

    return df

convert_types(add_row_hash(read_transaction_csv()))

Unnamed: 0_level_0,Date,Description,Institution,Account,Category,Is Hidden,Is Pending,Amount,SourceFile,GroupingKey
RowHashID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
e3637_1,2025-06-11,ALLY BANK DES TRANSFER INDN LEN G HUANG CO WEB,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Transfers,No,No,($400),transactions_2025_06_13.csv,e3637
94c5e_1,2025-06-11,VENMO DES PAYMENT INDN LEN HUANG CO WEB,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Online Payment,No,No,($2500),transactions_2025_06_13.csv,94c5e
72b02_1,2025-06-11,ALLY BANK DES ID LEN G HUANG INDN LEN G HUANG ...,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Income,No,No,$5000,transactions_2025_06_13.csv,72b02
4871c_1,2025-06-10,OZZIES FRESH MARKET NY,Bank of America - via Direct Web API,Customized Cash Rewards Visa Signature - 0966,Groceries,No,No,($6.99),transactions_2025_06_13.csv,4871c
a1c2b_1,2025-06-10,SECURITY DEPOSIT UNION AVE,Venmo - via Direct Web Access,Venmo,Income,No,No,($2500),transactions_2025_06_13.csv,a1c2b
...,...,...,...,...,...,...,...,...,...,...
2d51e_1,2024-01-02,ONLINE BANKING TRANSFER FROM CHK CONFIRMATION,Bank of America - via Direct Web API,Adv Plus Banking - 2082,Transfers,No,No,$300,transactions_2025_06_13.csv,2d51e
92d6d_1,2024-01-01,COURT SQUARE DINER,Chase,Other Spending,Restaurants/Dining,No,No,($77.65),transactions_2025_06_13.csv,92d6d
ee82d_1,2024-01-01,NJT RAIL MYTIX,Chase,Other Spending,Public Transport,No,No,($8.5),transactions_2025_06_13.csv,ee82d
f370a_1,2024-01-01,TARGET,Chase,Other Spending,Shopping,No,No,($14.34),transactions_2025_06_13.csv,f370a
