In [None]:
# import packages
import pandas as pd
from datetime import datetime
import os, dotenv, sys

from utils.google_api_utils import gsheet_upload

# Statement Data Cleaning

In [None]:
# set the folder path for the statement datasets
folder_path = "../personal_envs/household-cashflow-analyzer/private/"

# get the account names: every visible sub-folder of folder_path is one account.
# os.listdir returns entries in arbitrary order, so the names are sorted to make the
# order in which accounts are loaded and concatenated reproducible between runs.
# Hidden folders (names starting with ".", e.g. .ipynb_checkpoints) are skipped
# because they are tooling artefacts, not accounts.
account_list = sorted(
    name
    for name in os.listdir(folder_path)
    if not name.startswith(".") and os.path.isdir(os.path.join(folder_path, name))
)

In [None]:
# cleaner for the Discover credit card statements
def get_dis_cc(file_path):
    """Load one Discover statement CSV and reduce it to the common layout.

    The 'Trans. Date' column is renamed to 'Date', the sign of 'Amount' is
    flipped, and an 'Id' is built for every row as
    ``<first 7 characters of the file name>_<YYYYMMDD><NNN>`` where ``NNN`` is
    the 1-based, zero-padded position of the row among the rows of the same
    date (in file order).

    Args:
        file_path: path of the CSV file; dates are expected as MM/DD/YYYY.

    Returns:
        DataFrame with the columns 'Date', 'Id', 'Description', 'Amount'.
    """
    statement = pd.read_csv(file_path)
    statement = statement.rename(columns={'Trans. Date': 'Date'})

    # flip the sign of every amount
    statement['Amount'] = -statement['Amount']

    # the Id prefix comes from the file name (without extension), cut to 7 characters
    file_stem = os.path.splitext(os.path.basename(file_path))[0]
    id_prefix = file_stem[0: 7]

    # running number of each row within its calendar day, e.g. 001, 002, ...
    day_key = pd.to_datetime(statement["Date"], format="%m/%d/%Y").dt.strftime("%Y%m%d")
    day_seq = day_key.groupby(day_key).cumcount().add(1).astype(str).str.zfill(3)

    statement["Id"] = id_prefix + '_' + day_key + day_seq
    return statement[['Date', 'Id', 'Description', 'Amount']]


# cleaner for the BOA credit card statements
def get_boa_cc(file_path):
    """Load one BOA credit card statement CSV and reduce it to the common layout.

    'Posted Date' becomes 'Date' and 'Payee' becomes 'Description'; amounts are
    kept as they are in the file. Every row gets an 'Id' of the form
    ``<first 7 characters of the file name>_<YYYYMMDD><NNN>`` where ``NNN`` is
    the 1-based, zero-padded position of the row among the rows of the same
    date (in file order).

    Args:
        file_path: path of the CSV file; dates are expected as MM/DD/YYYY.

    Returns:
        DataFrame with the columns 'Date', 'Id', 'Description', 'Amount'.
    """
    column_names = {'Posted Date': 'Date', 'Payee': 'Description'}
    card_df = pd.read_csv(file_path, dtype={"Id": str}).rename(columns=column_names)

    # file name without directory and extension; its first 7 characters prefix the Id
    file_stem, _ = os.path.splitext(os.path.basename(file_path))

    posted_day = pd.to_datetime(card_df["Date"], format="%m/%d/%Y")
    card_df["date_str"] = posted_day.dt.strftime("%Y%m%d")

    # 1-based counter of the rows that share the same date, padded to 3 digits
    within_day = card_df.groupby("date_str").cumcount() + 1
    card_df["row_num"] = within_day.astype(str).str.zfill(3)

    card_df["Id"] = file_stem[0: 7] + '_' + card_df["date_str"] + card_df["row_num"]
    return card_df[['Date', 'Id', 'Description', 'Amount']]


# define the function of cleaning the BOA debit card
def get_boa_dc(file_path):
    """Load one BOA debit/savings statement CSV and its summary balances.

    The file is read twice: once skipping the first 6 lines to get the
    transaction table, and once taking only the first 4 rows below the first
    header line to get the summary block, whose third column holds the
    beginning balance (first row) and the ending balance (fourth row).

    Every transaction row gets an 'Id' of the form
    ``<first 7 characters of the file name>_<YYYYMMDD><NNN>`` where ``NNN`` is
    the 1-based, zero-padded position of the row among the rows of the same
    date. Rows without an amount are dropped only AFTER the Ids are numbered,
    so such rows still consume a number; this order is kept on purpose so that
    the generated Ids do not change.

    Args:
        file_path: path of the CSV file; dates are expected as MM/DD/YYYY.

    Returns:
        Tuple ``(df, beginning_bal, ending_bal)`` where ``df`` has the columns
        'Date', 'Id', 'Description', 'Amount' (Amount as float) and the two
        balances are floats.
    """
    def _to_float(value):
        # Amounts arrive either as text with thousands separators ("1,234.56")
        # or, when no value in the column has a separator, already parsed as
        # numbers by pandas; going through str() handles both cases.
        return float(str(value).replace(",", ""))

    df = pd.read_csv(file_path, skiprows = 6)
    df["date_str"] = pd.to_datetime(df["Date"], format="%m/%d/%Y").dt.strftime("%Y%m%d")
    df["row_num"] =  (df.groupby("date_str").cumcount() + 1).astype(str).str.zfill(3)
    df["Id"] = os.path.splitext(os.path.basename(file_path))[0][0: 7] + '_' + df["date_str"] + df["row_num"]

    # missing amounts become NaN (float("nan")) and are filtered out right after
    df['Amount'] = df['Amount'].apply(_to_float).astype(float)
    df = df[df['Amount'].notna()]
    df = df[['Date', 'Id', 'Description', 'Amount']]

    # summary block at the top of the file
    raw_df = pd.read_csv(file_path, nrows = 4)
    beginning_bal = _to_float(raw_df.iloc[0, 2])
    ending_bal = _to_float(raw_df.iloc[3, 2])

    return df, beginning_bal, ending_bal

In [None]:
# define the function of combining all the datasets to one
def get_data(head, folder_path):
    """Read every statement CSV of one account folder into a single DataFrame.

    The reader is chosen from the folder name: 'CC-5257' uses get_dis_cc, any
    other name starting with 'CC' uses get_boa_cc, names starting with 'DC' or
    'SA' use get_boa_dc. Files are processed in the order of their names
    (without '.csv') left-padded with zeros to 20 characters, so purely
    numeric names sort numerically.

    For 'DC'/'SA' accounts the beginning balance of each file is compared with
    the ending balance of the previous file; a mismatch is only reported with
    print(), the data is loaded regardless.

    Args:
        head: name of the account folder inside ``folder_path``.
        folder_path: directory that contains the account folders.

    Returns:
        DataFrame sorted by 'Date' with the reader's columns plus 'Card'
        (= ``head``) and 'User' (looked up from the card; NaN for unknown
        cards). 'Date' holds ``datetime.date`` objects.

    Raises:
        ValueError: if ``head`` does not start with 'CC', 'DC' or 'SA', or if
            the account folder contains no '.csv' file.
    """
    prefix = head[0: 2]
    has_balances = prefix in ('DC', 'SA')

    if head == 'CC-5257':
        used_func = get_dis_cc
    elif prefix == 'CC':
        used_func = get_boa_cc
    elif has_balances:
        used_func = get_boa_dc
    else:
        # previously this fell through and failed later with an unbound-variable error
        raise ValueError(f"Unknown account folder '{head}': expected a name starting with CC, DC or SA.")

    df_folder_path = os.path.join(folder_path, head)
    csv_files = [f for f in os.listdir(df_folder_path) if f.endswith('.csv')]
    csv_files.sort(key = lambda x: x.replace('.csv', '').zfill(20))
    if not csv_files:
        raise ValueError(f"No .csv statement files found in '{df_folder_path}'.")

    df_list = []
    last_ending_bal = None
    for file in csv_files:
        file_path = os.path.join(df_folder_path, file)
        if has_balances:
            # parse each file exactly once and unpack all three results
            statement_df, beginning_bal, ending_bal = used_func(file_path)
            if last_ending_bal is not None and beginning_bal != last_ending_bal:
                print('Wrong beginning balance:', file, beginning_bal, last_ending_bal)
            # always continue the chain from this file's ending balance, so that a
            # single gap is reported once instead of for every following file
            last_ending_bal = ending_bal
        else:
            statement_df = used_func(file_path)
        df_list.append(statement_df)

    combined_df = pd.concat(df_list)
    combined_df['Card'] = head
    combined_df["Date"] = pd.to_datetime(combined_df["Date"], format="%m/%d/%Y").dt.date

    user_map = {'CC-5257': 'User1', 'CC-4253': 'User1', 'DC-9084': 'User1', 'CC-2853': 'User1',
                'DC-8540': 'User2', 'CC-0401': 'User2',
                'SA-7913': 'Savings'}
    combined_df["User"] = combined_df['Card'].map(user_map)

    combined_df = combined_df.sort_values('Date').reset_index(drop = True)

    return combined_df

In [None]:
# first day (inclusive) of the transactions to keep, as YYYY-MM-DD
start_date = '2025-01-01'
first_day = datetime.strptime(start_date, "%Y-%m-%d").date()

# load every account and drop the transactions before the start date
account_df_dict = {}

for account_name in account_list:
    account_df = get_data(account_name, folder_path)
    account_df_dict[account_name] = account_df[account_df['Date'] >= first_day]

# stack the per-account frames (in account_list order) and order them by date
account_frames = [account_df_dict[account_name] for account_name in account_list]
combined_df = pd.concat(account_frames)
combined_df = combined_df.sort_values('Date').reset_index(drop = True)

# Transaction Categorization

In [None]:
# Add the private project folder to the module search path so that the import below
# can be resolved from it (presumably description_map.py lives there — confirm).
sys.path.append("../personal_envs/household-cashflow-analyzer")
# description_map is iterated with .items() below as keyword -> transaction type
from description_map import description_map

In [None]:
# map the description to the type
# Keywords are matched as case-insensitive literal substrings of the description, in
# the iteration order of description_map; when several keywords match the same row,
# the keyword that comes last wins because it overwrites the earlier assignment.
for keyword, mapped_value in description_map.items():
    keyword_hit = combined_df["Description"].str.contains(keyword, case=False, regex=False, na=False)
    combined_df.loc[keyword_hit, "Type"] = mapped_value

# The "Type" column is only created by the assignments inside the loop. With an empty
# description_map the loop body never runs, so create the column here to keep the next
# line from failing with a KeyError.
if "Type" not in combined_df.columns:
    combined_df["Type"] = None

# rows without a matching keyword carry an explicit None as type
combined_df.loc[combined_df['Type'].isna(), 'Type'] = None
# keep the untouched statement amount next to the amount column
combined_df['Original_Amount'] = combined_df['Amount']

# Transaction Offset

In [None]:
# drop the transaction types that are expected to cancel out, then tidy the columns
remove_list = ['Income: Reimbursement',
               'CC Payback: Discover',
               'CC Payback: BOA',
               'Internal: from SA to DC',
               'Internal: from DC to SA',
               'Internal: from User1 to User2',
               'Internal: from User2 to User1' ]

for remove_item in remove_list:
    is_item = combined_df['Type'] == remove_item
    item_total = round(combined_df.loc[is_item, 'Amount'].sum(), 2)
    # the message tells whether the amounts of this type net to zero;
    # the rows are removed in either case
    print('remove: ' if item_total == 0 else 'Not =0: ', remove_item)
    combined_df = combined_df[~is_item]

# keep only the non-zero transactions and the columns needed downstream
output_columns = ['Date', 'Id', 'Description', 'Original_Amount', 'Amount', 'Card', 'User', 'Type']
has_amount = combined_df['Amount'] != 0
combined_df = combined_df[has_amount][output_columns]

# Mapping Old File Information

In [None]:
# define the function of mapping the old Excel file information onto the new data
def sub_old_data(new_df, old_file_name, folder_path):
    """Carry 'Type' and 'Amount' values of a previous Excel export over to new_df.

    The old workbook (sheet 'Sheet1') is joined to ``new_df`` on 'Id'. Then:

    * rows whose 'Type' is missing or 'Other: Mobile Check' take the old 'Type';
    * rows whose old 'Type' is 'Omit: others' become 'Omit: others';
    * rows whose old 'Amount' exists and differs from the new one take the old
      'Amount' (presumably amounts that were adjusted by hand in the old
      file — confirm).

    Args:
        new_df: DataFrame with the columns 'Date', 'Id', 'Description',
            'Original_Amount', 'Amount', 'Card', 'User', 'Type'.
        old_file_name: file name of the old Excel export.
        folder_path: directory that contains the old Excel export.

    Returns:
        DataFrame with one row per row of ``new_df`` and the columns 'Date',
        'Id', 'Description', 'Original_Amount', 'Amount', 'Card', 'User',
        'Type', 'General_Type' ('General_Type' is the value from the old file).
    """
    old_df = pd.read_excel(os.path.join(folder_path, old_file_name), sheet_name = 'Sheet1')[['Id', 'Type', 'General_Type', 'Amount']]
    # Keep a single old row per Id: a repeated Id in the old file would otherwise
    # multiply the matching new rows in the left join and double-count their amounts.
    old_df = old_df.drop_duplicates(subset = 'Id')

    comb_df = pd.merge(new_df, old_df, on = "Id", how = "left", suffixes = ("", "_old"))

    # isna() catches a missing type no matter whether it is stored as None or NaN
    needs_old_type = comb_df['Type'].isna() | (comb_df['Type'] == 'Other: Mobile Check')
    comb_df.loc[needs_old_type, 'Type'] = comb_df.loc[needs_old_type, 'Type_old']

    # an omission recorded in the old file always wins over the new type
    comb_df.loc[comb_df['Type_old'] == 'Omit: others', 'Type'] = 'Omit: others'

    # take over the old amount wherever one exists and differs from the new amount
    amount_differs = comb_df['Amount_old'].notna() & (comb_df['Amount'] != comb_df['Amount_old'])
    comb_df.loc[amount_differs, 'Amount'] = comb_df.loc[amount_differs, 'Amount_old']

    return comb_df[['Date', 'Id', 'Description', 'Original_Amount', 'Amount', 'Card', 'User', 'Type', 'General_Type']]

# carry the information of the old Excel export over to the new data
old_file_name = 'combined_20251023142629.xlsx'
combined_df_new = sub_old_data(combined_df, old_file_name, folder_path)

# derive the general type: the part of "Type" before the first ":"
type_prefix = combined_df_new["Type"].str.split(":", n = 1).str.get(0)
combined_df_new["General_Type"] = type_prefix

In [None]:
# validate the dataset: omitted rows must cancel out, and replacing amounts with the
# values of the old file must not change the overall total
is_omitted = combined_df_new["Type"] == 'Omit: others'
omitted_total = round(combined_df_new.loc[is_omitted, 'Original_Amount'].sum(), 2)
original_total = round(combined_df_new['Original_Amount'].sum(), 2)
adjusted_total = round(combined_df_new['Amount'].sum(), 2)

if omitted_total != 0:
    raise ValueError('The Type Omit: others do not sum up to 0.')
if original_total != adjusted_total:
    raise ValueError('The Original_Amount and Amount totals are not the same.')

# Export Results

In [None]:
# write the result to a time-stamped Excel file next to the statement folders
export_stamp = datetime.now().strftime('%Y%m%d%H%M%S')
excel_name = f"combined_{export_stamp}.xlsx"
excel_path = os.path.join(folder_path, excel_name)

combined_df_new.to_excel(excel_path, index = False)

In [None]:
# read the Google Sheet id and the worksheet name from the private .env file
dotenv.load_dotenv("../personal_envs/household-cashflow-analyzer/.env", override = True)
spreadsheet_id = os.environ.get("gsheet_id")
worksheet_name = os.environ.get("sheet_name")

# path of the Google credentials file
gsheet_credentials = "../personal_envs/household-cashflow-analyzer/gsheet_credentials.json"

# turn the dates into YYYY-MM-DD text for the upload
upload_dates = pd.to_datetime(combined_df_new["Date"])
combined_df_new["Date"] = upload_dates.dt.strftime("%Y-%m-%d")

# push the data to the Google Sheet
gsheet_upload(gsheet_credentials, spreadsheet_id, worksheet_name, combined_df_new)