In [None]:
# import packages
import pandas as pd
from datetime import datetime
import os, dotenv

from oauth2client.service_account import ServiceAccountCredentials
import gspread

# turn off the pandas chained-assignment check for the whole notebook
pd.set_option("mode.chained_assignment", None)

# Statement Data Cleaning

In [None]:
# set the folder path for the statement datasets
folder_path = "../personal_envs/household-cashflow-analyzer/private/"

# get the account names: every sub-directory of folder_path is treated as one account
account_list = [
    entry
    for entry in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, entry))
]

In [None]:
# define the function of cleaning the Discover credit card
def get_dis_cc(file_path):
    """Load one Discover credit card statement CSV into the common layout.

    The 'Trans. Date' column is renamed to 'Date', the sign of 'Amount' is
    flipped, and an 'Id' is built for every row as
    ``<first 7 characters of the file name>_<YYYYMMDD><NNN>``, where NNN is the
    1-based, zero-padded position of the row among the rows of the same date.

    Parameters
    ----------
    file_path : str
        Path of the statement CSV. Dates must be formatted as MM/DD/YYYY.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'Id', 'Description', 'Amount'.
    """
    # the Id prefix comes from the file name, not from the file content
    account_tag = os.path.splitext(os.path.basename(file_path))[0][0: 7]

    statement = pd.read_csv(file_path)
    statement = statement.rename(columns = {'Trans. Date': 'Date'})
    statement['Amount'] = -statement['Amount']

    # day key (YYYYMMDD) and per-day running counter used to build the Id
    parsed_dates = pd.to_datetime(statement["Date"], format="%m/%d/%Y")
    statement["date_str"] = parsed_dates.dt.strftime("%Y%m%d")
    per_day_position = statement.groupby("date_str").cumcount() + 1
    statement["row_num"] = per_day_position.astype(str).str.zfill(3)

    statement["Id"] = account_tag + '_' + statement["date_str"] + statement["row_num"]

    return statement[['Date', 'Id', 'Description', 'Amount']]


# define the function of cleaning the BOA credit card
def get_boa_cc(file_path):
    """Load one BOA credit card statement CSV into the common layout.

    'Posted Date' is renamed to 'Date' and 'Payee' to 'Description'. An 'Id' is
    built for every row as ``<first 7 characters of the file name>_<YYYYMMDD><NNN>``,
    where NNN is the 1-based, zero-padded position of the row among the rows of
    the same date. Any 'Id' column already present in the file is overwritten.

    Parameters
    ----------
    file_path : str
        Path of the statement CSV. Dates must be formatted as MM/DD/YYYY.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'Id', 'Description', 'Amount'.
    """
    column_names = {'Posted Date': 'Date', 'Payee': 'Description'}
    account_tag = os.path.splitext(os.path.basename(file_path))[0][0: 7]

    statement = pd.read_csv(file_path, dtype={"Id": str})
    statement = statement.rename(columns = column_names)

    # day key (YYYYMMDD) and per-day running counter used to build the Id
    parsed_dates = pd.to_datetime(statement["Date"], format="%m/%d/%Y")
    statement["date_str"] = parsed_dates.dt.strftime("%Y%m%d")
    per_day_position = statement.groupby("date_str").cumcount() + 1
    statement["row_num"] = per_day_position.astype(str).str.zfill(3)

    statement["Id"] = account_tag + '_' + statement["date_str"] + statement["row_num"]

    return statement[['Date', 'Id', 'Description', 'Amount']]


# define the function of cleaning the BOA debit card
def get_boa_dc(file_path):
    """Load one BOA debit/savings statement CSV into the common layout.

    The first 6 lines of the file are skipped before the header is read. An
    'Id' is built for every row as ``<first 7 characters of the file name>_<YYYYMMDD><NNN>``,
    where NNN is the 1-based, zero-padded position of the row among the rows of
    the same date. 'Amount' is converted to float after stripping thousands
    separators, and rows without an amount are dropped.

    Parameters
    ----------
    file_path : str
        Path of the statement CSV. Dates must be formatted as MM/DD/YYYY.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'Id', 'Description', 'Amount' (original row index kept).
    """
    account_tag = os.path.splitext(os.path.basename(file_path))[0][0: 7]

    statement = pd.read_csv(file_path, skiprows = 6)
    statement = statement.rename(columns = {'Running Bal.': 'Running_balance'})

    # the per-day counter is assigned BEFORE rows without an amount are dropped,
    # so those rows still consume a sequence number
    parsed_dates = pd.to_datetime(statement["Date"], format="%m/%d/%Y")
    statement["date_str"] = parsed_dates.dt.strftime("%Y%m%d")
    per_day_position = statement.groupby("date_str").cumcount() + 1
    statement["row_num"] = per_day_position.astype(str).str.zfill(3)
    statement["Id"] = account_tag + '_' + statement["date_str"] + statement["row_num"]

    # "1,234.56" -> 1234.56 ; a missing amount becomes the text "nan" and then float NaN
    amount_text = statement['Amount'].map(str).str.replace(",", "", regex=False)
    statement['Amount'] = amount_text.astype(float)

    has_amount = statement['Amount'].notna()
    return statement[has_amount][['Date', 'Id', 'Description', 'Amount']]

In [None]:
# define the function of combining all the datasets to one
def get_data(head, folder_path):
    """Read every statement CSV of one account and combine them into one frame.

    The account name selects the statement reader: 'CC-5257' uses
    ``get_dis_cc``, any other 'CC' account uses ``get_boa_cc``, and 'DC' / 'SA'
    accounts use ``get_boa_dc``.

    Parameters
    ----------
    head : str
        Account name; it is also the name of the sub-folder of ``folder_path``
        that holds the account's CSV files.
    folder_path : str
        Folder containing one sub-folder per account.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date' (``datetime.date``), 'Id', 'Description', 'Amount',
        'Card' (the account name) and 'User' (looked up from the account name,
        missing for accounts that are not in the lookup), sorted by 'Date'.
        An empty frame with these columns is returned when the account folder
        holds no CSV file.

    Raises
    ------
    ValueError
        If the account name does not start with 'CC', 'DC' or 'SA'.
    """
    # fail early with a clear message instead of hitting an unbound reader later
    account_kind = head[0: 2]
    if account_kind not in ('CC', 'DC', 'SA'):
        raise ValueError(
            f"Unknown account '{head}': the name must start with 'CC', 'DC' or 'SA'"
        )

    df_folder_path = os.path.join(folder_path, head)
    # os.listdir order is arbitrary; sort so every run reads the files in the same order
    csv_files = sorted(f for f in os.listdir(df_folder_path) if f.endswith('.csv'))

    # pd.concat refuses an empty list, so an account without statements yields an empty frame
    if not csv_files:
        return pd.DataFrame(columns = ['Date', 'Id', 'Description', 'Amount', 'Card', 'User'])

    if head == 'CC-5257':
        used_func = get_dis_cc
    elif account_kind == 'CC':
        used_func = get_boa_cc
    else:
        used_func = get_boa_dc

    df_list = [used_func(os.path.join(df_folder_path, file)) for file in csv_files]

    combined_df = pd.concat(df_list)
    combined_df['Card'] = head
    combined_df["Date"] = pd.to_datetime(combined_df["Date"], format="%m/%d/%Y").dt.date

    user_map = {'DC-8540': 'Wei', 'CC-0401': 'Wei', 'CC-5257': 'Leo', 'SA-7913': 'saving', 'CC-4253': 'Leo', 'DC-9084': 'Leo'}
    combined_df["User"] = combined_df['Card'].map(user_map)

    combined_df = combined_df.sort_values('Date').reset_index(drop = True)

    return combined_df

In [None]:
# define the start date of the file reading
start_date = '2025-01-01'
first_day = datetime.strptime(start_date, "%Y-%m-%d").date()

# read the datasets, keeping only the transactions dated on or after the start date
account_df_dict = {}

for account in account_list:
    account_df = get_data(account, folder_path)
    account_df_dict[account] = account_df[account_df['Date'] >= first_day]

# combine all the datasets to one dataset
per_account_frames = [account_df_dict[account] for account in account_list]
combined_df = pd.concat(per_account_frames)
combined_df = combined_df.sort_values('Date').reset_index(drop = True)

# Transaction Categorization

In [None]:
# create the description-type matching map
# key   = keyword looked up inside the transaction description
# value = "<General type>: <detail>" category assigned to the matching rows
# The order of the entries is part of the mapping: it is the order in which the
# keywords are visited when the map is applied.
description_map = {
    # Auto
    'GEICO': 'Auto: Insurance',
    'AAA CA MEMBERSHIP': 'Auto: Insurance',
    'FARMERS INS': 'Auto: Insurance',
    'CHEVRON': 'Auto: Gas',
    'MISSION FUEL': 'Auto: Gas',
    'COSTCO GAS': 'Auto: Gas',
    'CONSERV FUEL': 'Auto: Gas',
    'ARCO': 'Auto: Gas',
    'NEW CENTURY MAZDA': 'Auto: Maintainence/DMV',
    # no trailing space here: the value must be identical to the one above,
    # otherwise the two keywords end up in two different categories
    'DMV': 'Auto: Maintainence/DMV',
    'BELLAGIO EXPRESS': 'Auto: Wash/Parking/Toll',
    'PARKING': 'Auto: Wash/Parking/Toll',
    'TOLL ROADS': 'Auto: Wash/Parking/Toll',

    # Grocery
    'GOOD FORTUNE SUPERMARKET': 'Grocery: GFM',
    'GF MARKET': 'Grocery: GFM',
    '99 RANCH': 'Grocery: 99 Ranch',
    '7-ELEVEN': 'Grocery: others',
    'TARGET': 'Grocery: Target',
    'CVS/PHARMACY': 'Grocery: CVS',
    'COSTCO WHSE': 'Grocery: Costco',
    'COSTCO *ANNUAL RENEWAL': 'Grocery: Costco',
    'H MART': 'Grocery: HMart',
    'WHOLEFDS': 'Grocery: Whole Foods',
    'LITTLE PEACH MEAT': 'Grocery: others',
    'VONS': 'Grocery: others',
    'GINSENG': 'Grocery: others',
    'DAISO': 'Grocery: others',
    'HOME DEPOT': 'Grocery: others',
    'SAN GABRIEL SPRSTR': 'Grocery: others',

    # Study
    'LinkedInPre': 'Study: LinkedIn',
    'UDEMY': 'Study: Udemy',
    'OPENAI': 'Study: ChatGPT',
    'GITHUB': 'Study: GitHub',
    'Google': 'Study: Google',
    'ADOBE': 'Study: Adobe',
    'CLAUDE.AI SUBSCRIPTION': 'Study: Claude AI',
    'WWW.FREEPIK.CDE': 'Study: Freepik',
    'WWW.GLOS.AC.UK': 'Study: Prize Application',
    'PAYPAL': 'Study: Prize Application',
    'WWW.AIGANY.ORNY': 'Study: Prize Application',
    'DEEPL* SUB': 'Study: DeepL',
    'DALLAS BAPTIST UNIVERSIT': 'Study: Dallas Baptist',
    'Motion Array': 'Study: Motion Array',

    # Logistic
    'LYFT': 'Logistic: Lyft',
    'The UPS Store': 'Logistic: UPS/USPS/Fedex',
    'USPS': 'Logistic: UPS/USPS/Fedex',
    'FEDEX': 'Logistic: UPS/USPS/Fedex',

    # Utility
    'CITY OF ARCADIA': 'Utility: Water',
    'Spectrum': 'Utility: Spectrum',
    'SO CAL EDISON': 'Utility: Edison',
    'SO CAL GAS': 'Utility: SoCal Gas',
    'SoCalGas': 'Utility: SoCal Gas',
    'LA Co TTC Paymnt': 'Utility: Property Tax',
    'TMOBILE': 'Utility: T-Mobile',
    'Zelle payment to LZ COMFORT HOME': 'Utility: others',

    # Restaurant
    'Chun La Hao': 'Restaurant: Hotpot',
    'HAIDILAO': 'Restaurant: Hotpot',
    'CHI HUO': 'Restaurant: Hotpot',
    '101 POT': 'Restaurant: Hotpot',
    'ERWA COLD POT': 'Restaurant: Sichuan Dish',
    'KUAN ZHAI ALLEY': 'Restaurant: Sichuan Dish',
    'SICHUAN IMPRESSION': 'Restaurant: Sichuan Dish',
    'WANG LA YA INC': 'Restaurant: Sichuan Dish',
    'SHANGHAILANDER': 'Restaurant: Shanghai Dish',
    'SINBALA': 'Restaurant: Taiwan Dish',
    'IN-N-OUT': 'Restaurant: Fast Food',
    'HABIT': 'Restaurant: Fast Food',
    'RAISING CANES': 'Restaurant: Fast Food',
    'POPEYES': 'Restaurant: Fast Food',
    'TOFU HOUSE': 'Restaurant: Korean Dish',
    'SUSHI': 'Restaurant: Japanese Dish',
    'CURRY FLURRY': 'Restaurant: Japanese Dish',
    'RAMEN': 'Restaurant: Japanese Dish',
    'RUEN PAIR THAI RESTAURAN': 'Restaurant: Thai Dish',
    'LADY M': 'Restaurant: Dessert',
    '85C': 'Restaurant: Dessert',
    'VANILLA BAKE': 'Restaurant: Dessert',
    'SUNRIGHT': 'Restaurant: Dessert',
    'YOGURTLAND': 'Restaurant: Dessert',
    'MELOMELO': 'Restaurant: Dessert',
    'AUNTIE ANNES': 'Restaurant: Dessert',
    'STARBUCKS': 'Restaurant: Dessert',
    'GELATO': 'Restaurant: Dessert',
    'PRESSED': 'Restaurant: Dessert',
    'PATISSERIE BLUEJAY': 'Restaurant: Dessert',
    'SQ *DOSE': 'Restaurant: Dessert',
    'MARU COFFEE': 'Restaurant: Dessert',

    # Online
    'DD *DOORDASH': 'Online: DoorDash',
    'AMAZON': 'Online: Amazon',
    'WEEE': 'Online: Weee',
    'UBER *EATS': 'Online: Uber Eats',
    'YAMIBUY': 'Online: Yami',
    'HUNGRYPANDA': 'Online: Hungry Panda',
    'WWW.PETFIESTACO': 'Online: others',
    'COS WEB': 'Online: others',

    # Other
    'APPLE': 'Other: Apple',
    'HOSPITAL': 'Other: Healthcare',
    'ROSE WOMENS HEALTH': 'Other: Healthcare',
    'QUEST DIAGNOSTICS': 'Other: Healthcare',
    'PRIMROSE PSYCHIATRY': 'Other: Healthcare',
    'RADIANT IMAGING': 'Other: Healthcare',
    'AMERICAN PEDIATRICS': 'Other: Healthcare',
    'CA DEPT OF PUBLIC HEALTH': 'Other: Healthcare',
    'VCN*LOSANGELESCODPH': 'Other: Baby related',
    'CA SOS BPD LOS ANGELES': 'Other: Baby related',
    'BELLA BABY PHOTOGRAPHY': 'Other: Baby related',
    'Zelle payment to SHUHUI QIAN': 'Other: Baby related',
    'BKOFAMERICA MOBILE': 'Other: Mobile Check',
    'Wire Transfer Fee': 'Other: Bank fee',
    'LATE FEE': 'Other: Bank fee',
    'FOREIGN TRANSACTION FEE': 'Other: Bank fee',
    'INTEREST CHARGED': 'Other: Bank fee',
    'OVERDRAFT ITEM FEE': 'Other: Bank fee',
    'Zelle payment to THE CHURCH OF GOD': 'Other: Church',

    # Credit card payback and investment
    'Online payment': 'CC Payback: BOA',
    'Online Banking payment to CRD 4253': 'CC Payback: BOA',
    'Online Banking payment to CRD 0401': 'CC Payback: BOA',
    'DISCOVER DES': 'CC Payback: Discover',
    'INTERNET PAYMENT': 'CC Payback: Discover',
    'FID BKG SVC LLC': 'Investment: Fidelity',

    # Income
    'DES:PAYROLL ID:XXXXX716960': 'Income: Luminys Payroll',
    'C185529 LUMINYS': 'Income: Luminys Payroll',
    'Money Network DES:': 'Income: Luminys Payroll',
    'LUMINYS SYSTEMS DES:PAYMENT': 'Income: Luminys Freelance',
    'ISSI INC. DES:PAYROLL': 'Income: ISSI',
    'CERTIFY- LUMINYS': 'Income: Reimbursement',
    'Zelle payment to QU WU': 'Income: Reimbursement',
    'Interest Earned': 'Income: Credit & Interest',
    'CASH REWARDS STATEMENT CREDIT': 'Income: Credit & Interest',
    'CASHREWARD': 'Income: Credit & Interest',
    'CASHBACK BONUS REDEMPTION': 'Income: Credit & Interest',
    'WIRE TYPE': 'Income: Wire',

    # Tax
    'IRS': 'Tax: IRS',
    'FRANCHISE TAX BD DES': 'Tax: CA',
    "Zelle payment to ROGER'S TAX SERVICES LLC": 'Tax: Roger Service',

    # Internal transfers
    'Zelle payment from CHENWEI XU': 'Internal: from Chenwei to Leo',
    'Zelle payment to CHENWEI XU': 'Internal: from Leo to Chenwei',
    'Zelle payment from NINGCHUAN PENG': 'Internal: from Leo to Chenwei',
    'Zelle payment to NINGCHUAN PENG': 'Internal: from Chenwei to Leo',
    'Online Banking transfer from SAV 7913': 'Internal: from SA to DC',
    'Online Banking transfer to SAV 7913': 'Internal: from DC to SA',
    'Online Banking transfer from CHK 9084': 'Internal: from DC to SA',
    'Online Banking transfer to CHK 9084': 'Internal: from SA to DC',
    'Online Banking transfer from CHK 8540': 'Internal: from DC to SA',
    'Online Banking transfer to CHK 8540': 'Internal: from SA to DC',

    # Rent
    'Zelle payment to YUKAI GAO': 'Rent: CTHD',
    'Zelle payment from YUKAI GAO': 'Rent: CTHD',
    'Zelle payment to XUE SHIMING': 'Rent: Avlon',
    'Zelle payment to QINGMING ZENG': 'Rent: LA Fire Villa',
    'Zelle payment from GUOYUAN WU': 'Rent: LA Fire Villa',

    # Entertainment
    'AMC': 'Entertainment: AMC',
    'LA ARBORETUM': 'Entertainment: Arboretum'
}

In [None]:
# map the description to the type
# (case-insensitive plain-substring match; when several keywords match the same
#  row, the keyword visited last decides the Type)
for search_text, category in description_map.items():
    is_match = combined_df["Description"].str.contains(search_text, case=False, regex=False, na=False)
    combined_df.loc[is_match, "Type"] = category

# Transaction Offset

In [None]:
# clean the columns of the dataset and add new columns
# General_Type is the part of Type in front of the first ":" ("Auto: Gas" -> "Auto")
combined_df["General_Type"] = combined_df["Type"].str.split(":", n=1).str[0]
# rows without a category get None in both columns
combined_df.loc[combined_df['Type'].isna(), 'Type'] = None
combined_df.loc[combined_df['General_Type'].isna(), 'General_Type'] = None

# remove unnecessary rows and columns
# A Type whose amounts add up to zero (rounded to cents) offsets itself and is
# dropped entirely. groupby skips rows whose Type is missing, so uncategorized
# rows are never treated as a removable category; sort=False keeps the
# categories in order of first appearance.
type_totals = combined_df.groupby('Type', sort=False)['Amount'].sum()
offset_types = [type_name for type_name, total in type_totals.items() if round(total, 2) == 0]

for type_name in offset_types:
    print('remove: ', type_name)

combined_df = combined_df[~combined_df['Type'].isin(offset_types)]

# zero-amount rows carry no cash flow; keep only the reporting columns
combined_df = combined_df[combined_df['Amount'] != 0][['Date', 'Id', 'Description', 'Amount', 'Card', 'User', 'Type', 'General_Type']]

# Mapping Old File Information

In [None]:
# define the function of mapping the old Excel file information onto the new dataset
def sub_old_data(new_df, old_file_name):
    """Carry the categories stored in a previous export over to the new data.

    Rows are matched on 'Id'. Wherever the old file holds a non-missing 'Type'
    (or 'General_Type') for an Id, that value replaces the one in ``new_df``;
    rows without a match, or whose old value is missing, keep their own value.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Needs the columns 'Date', 'Id', 'Description', 'Amount', 'Card',
        'User', 'Type' and 'General_Type'.
    old_file_name : str
        Path of the previous Excel export; sheet 'Sheet1' must contain the
        columns 'Id', 'Type' and 'General_Type'.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``new_df``, with the same eight columns.
    """
    old_df = pd.read_excel(old_file_name, sheet_name = 'Sheet1')[['Id', 'Type', 'General_Type']]

    # An Id that appears more than once in the old file would make the left
    # merge emit one output row per old row and duplicate the transaction;
    # keep only the first old row of each Id.
    old_df = old_df.drop_duplicates(subset = 'Id', keep = 'first')

    comb_df = pd.merge(new_df, old_df, on = "Id", how = "left", suffixes = ("", "_old"))

    # the old value wins wherever it exists; all other cells are left untouched
    for column in ('Type', 'General_Type'):
        old_column = column + '_old'
        has_old_value = comb_df[old_column].notna()
        comb_df.loc[has_old_value, column] = comb_df.loc[has_old_value, old_column]

    return comb_df[['Date', 'Id', 'Description', 'Amount', 'Card', 'User', 'Type', 'General_Type']]

# map the old Excel file information onto the new dataset
old_file_name = 'combined_20250827114849.xlsx'
combined_df_new = sub_old_data(new_df = combined_df, old_file_name = old_file_name)

# Export Results

In [None]:
# export the file to EXCEL (the file name carries the export timestamp)
export_stamp = datetime.now().strftime('%Y%m%d%H%M%S')
combined_df_new.to_excel(f"combined_{export_stamp}.xlsx", index=False)

In [None]:
# export the file to Google Sheet

# import the gsheet_id, sheet_name and credentials
dotenv.load_dotenv("../personal_envs/household-cashflow-analyzer/.env", override=True)
gsheet_id, sheet_name = os.getenv("gsheet_id"), os.getenv("sheet_name")
if not gsheet_id or not sheet_name:
    # stop before touching the sheet when the configuration is incomplete
    raise ValueError("gsheet_id and sheet_name must both be set in the .env file")
creds = ServiceAccountCredentials.from_json_keyfile_name("../personal_envs/household-cashflow-analyzer/gsheet_credentials.json", 
                                                         "https://www.googleapis.com/auth/spreadsheets")

# connect to the Google Sheet
client = gspread.authorize(creds)

spreadsheet = client.open_by_key(gsheet_id)
worksheet = spreadsheet.worksheet(sheet_name)

# clean the data for exporting
# work on a copy so that re-running this cell never changes combined_df_new
export_df = combined_df_new.copy()
export_df["Date"] = pd.to_datetime(export_df["Date"]).dt.strftime("%Y-%m-%d")

# NaN is not valid JSON, so missing cells are sent as None instead
export_rows = [
    [None if pd.isna(cell) else cell for cell in row]
    for row in export_df.values.tolist()
]
export_values = [export_df.columns.values.tolist()] + export_rows

# export the data
# the payload is fully built before the sheet is cleared, so a data-preparation
# error cannot leave the sheet empty
worksheet.clear()
worksheet.update(export_values)