In [None]:
import os, time
import pandas as pd
from datetime import datetime

pd.options.mode.chained_assignment = None

In [None]:
folder_path = "statement/"

# Every sub-directory found directly under the statement folder is collected
# as an account name; plain files at that level are ignored.
account_list = [
    entry
    for entry in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, entry))
]

account_list

In [None]:
# Reader for Discover credit card statement exports.
def get_dis_cc(file_path):
    """Load a Discover credit card CSV and return its normalized columns.

    The 'Trans. Date' column is exposed as 'Date' and the sign of 'Amount'
    is flipped.

    Parameters
    ----------
    file_path : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame with the columns 'Date', 'Description' and 'Amount'.
    """
    statement = pd.read_csv(file_path)
    statement = statement.rename(columns = {'Trans. Date': 'Date'})
    # Negate the amounts so they follow the opposite sign convention of the export.
    statement = statement.assign(Amount = -statement['Amount'])
    wanted_columns = ['Date', 'Description', 'Amount']
    return statement[wanted_columns]


# Reader for BOA credit card statement exports.
def get_boa_cc(file_path):
    """Load a BOA credit card CSV and return its normalized columns.

    'Posted Date' is exposed as 'Date' and 'Payee' as 'Description'. When the
    file carries a 'remove' column, only rows where that column is empty are
    kept.

    Parameters
    ----------
    file_path : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame with the columns 'Date', 'Description' and 'Amount'.
    """
    column_aliases = {'Posted Date': 'Date', 'Payee': 'Description'}
    statement = pd.read_csv(file_path).rename(columns = column_aliases)

    has_remove_flag = 'remove' in statement.columns
    if has_remove_flag:
        # Any non-empty value in 'remove' excludes the row.
        keep_row = statement['remove'].isna()
        statement = statement[keep_row]

    return statement[['Date', 'Description', 'Amount']]


# Reader for BOA debit card (checking / savings) statement exports.
def get_boa_dc(file_path):
    """Load a BOA debit-account CSV and return numeric amounts and balances.

    The first six lines of the file are skipped before the header row is
    read. 'Running Bal.' is exposed as 'Running_balance'. Thousands
    separators (",") are stripped from 'Amount' and 'Running_balance' and
    both columns are converted to float; empty cells become NaN.

    Parameters
    ----------
    file_path : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame with the columns 'Date', 'Description', 'Amount' and
    'Running_balance'.
    """
    raw = pd.read_csv(file_path, skiprows = 6).rename(columns = {'Running Bal.': 'Running_balance'})
    # Work on an explicit copy: writing into a column-selected slice only stayed
    # quiet because SettingWithCopyWarning is disabled globally in this notebook.
    df = raw[['Date', 'Description', 'Amount', 'Running_balance']].copy()
    for money_column in ('Amount', 'Running_balance'):
        # Going through str first makes the clean-up work whether pandas parsed
        # the column as text ("1,234.56") or already as numbers.
        df[money_column] = (
            df[money_column].astype(str).str.replace(",", "", regex=False).astype(float)
        )
    return df

In [None]:
def get_data(head, folder_path="statement/"):
    """Load, merge and validate every CSV statement of one account.

    The reader is chosen from the account name: 'CC-5257' uses the Discover
    reader, any other 'CC' account the BOA credit card reader, and 'DC' / 'SA'
    accounts the BOA debit reader. All '.csv' files in
    ``<folder_path>/<head>`` are read, concatenated, tagged with 'Card' and
    'User', parsed to real dates and sorted chronologically.

    Parameters
    ----------
    head : account folder name, e.g. 'CC-4253', 'DC-9084' or 'SA-7913'.
    folder_path : directory that contains one sub-folder per account
        (defaults to "statement/").

    Returns
    -------
    pandas.DataFrame sorted by 'Date' with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the account prefix is not recognized, if the folder holds no CSV
        file, or if a 'DC' / 'SA' account fails the running-balance check.
    """
    account_kind = head[0: 2]

    # Pick the reader first so an unknown account fails with a clear message
    # instead of an unbound-variable error further down.
    if head == 'CC-5257':
        used_func = get_dis_cc
    elif account_kind == 'CC':
        used_func = get_boa_cc
    elif account_kind in ('DC', 'SA'):
        used_func = get_boa_dc
    else:
        raise ValueError(f"Unknown account type for {head!r}: expected a 'CC', 'DC' or 'SA' prefix.")

    df_folder_path = os.path.join(folder_path, head)
    # os.listdir returns names in arbitrary order; sort them so the result does
    # not depend on the file system.
    csv_files = sorted(f for f in os.listdir(df_folder_path) if f.endswith('.csv'))
    if not csv_files:
        raise ValueError(f"No .csv statement found in {df_folder_path!r}.")

    df_list = [used_func(os.path.join(df_folder_path, file)) for file in csv_files]

    combined_df = pd.concat(df_list)
    combined_df['Card'] = head
    combined_df["Date"] = pd.to_datetime(combined_df["Date"], format="%m/%d/%Y")
    user_map = {'DC-8540': 'Wei', 'CC-0401': 'Wei', 'CC-5257': 'Leo', 'SA-7913': 'saving', 'CC-4253': 'Leo', 'DC-9084': 'Leo'}
    combined_df["User"] = combined_df['Card'].map(user_map)

    # A stable sort keeps same-day rows in their statement order, which matters
    # because the first and last Running_balance values are read below and by
    # later cells.
    combined_df = combined_df.sort_values('Date', kind='mergesort').reset_index(drop = True)

    if account_kind in ('DC', 'SA'):
        # Opening balance plus all movements must reproduce the closing balance.
        # NOTE(review): this presumably relies on the first row being a
        # beginning-balance row with an empty Amount - confirm against the export.
        opening_balance = combined_df['Running_balance'].iloc[0]
        closing_balance = combined_df['Running_balance'].iloc[-1]
        if round(opening_balance + combined_df['Amount'].sum(), 2) != round(closing_balance, 2):
            raise ValueError(f"{head} data is not validate.")

    return combined_df

In [None]:
# Load every account once, one frame per account folder.
# Credit cards:
CC_dis_df = get_data('CC-5257')
CC_Leo_df = get_data('CC-4253')
# Leo's checking account:
DC_Leo_df = get_data('DC-9084')
# Wei's credit card and checking account:
CC_Wei_df = get_data('CC-0401')
DC_Wei_df = get_data('DC-8540')
# Savings account:
SA_df = get_data('SA-7913')

In [None]:
start_date = '2025-01-01'

start_balance, end_balance = 0, 0

# Starting balance: last running balance recorded before start_date, summed over
# the checking and savings accounts. Ending balance: last running balance overall.
for DC in [DC_Leo_df, DC_Wei_df, SA_df]:
    balances_before_start = DC.loc[DC['Date'] < start_date, 'Running_balance']
    if balances_before_start.empty:
        # Also happens when this cell is run twice: the frames are trimmed to
        # start_date below, so the loading cell must be re-run first.
        raise ValueError(
            f"No rows before {start_date} to take a starting balance from; "
            "re-run the loading cell if this cell was already executed."
        )
    start_balance += balances_before_start.iloc[-1]
    end_balance += DC['Running_balance'].iloc[-1]

# The totals are float sums; round them to cents so later equality checks are
# not defeated by binary floating-point residue.
start_balance, end_balance = round(start_balance, 2), round(end_balance, 2)

# Keep only the reporting period; copy so later cells can add columns safely.
DC_Leo_df = DC_Leo_df[DC_Leo_df['Date'] >= start_date].copy()
DC_Wei_df = DC_Wei_df[DC_Wei_df['Date'] >= start_date].copy()
SA_df = SA_df[SA_df['Date'] >= start_date].copy()

In [None]:
# Keyword -> category table used to classify statement rows.
#
# Each key is looked up as a plain, case-insensitive substring of a row's
# 'Description'; the value is written to the row's 'Type' column and has the
# form "<General type>: <detail>" (the part before ":" later becomes
# 'General_Type').
#
# Entries are applied in the order written here, so when several keys match the
# same description the LAST matching entry wins.
#
# NOTE(review): very short keys such as 'IRS', 'AMC', 'ARCO' or 'DMV' match
# anywhere inside a description (e.g. "irs" inside "FIRST") - verify that no
# unrelated merchant is captured.
# NOTE(review): some labels look misspelled ('Maintainence', 'otheres',
# 'Desert' presumably meaning "Dessert"); they are emitted verbatim into the
# export, so they are left unchanged here.
description_map = {
    # --- Auto ---
    'GEICO': 'Auto: Insurance',
    'AAA CA MEMBERSHIP': 'Auto: Insurance',
    'FARMERS INS': 'Auto: Insurance',
    'CHEVRON': 'Auto: Gas',
    'MISSION FUEL': 'Auto: Gas',
    'COSTCO GAS': 'Auto: Gas',
    'CONSERV FUEL': 'Auto: Gas',
    'ARCO': 'Auto: Gas',
    'NEW CENTURY MAZDA': 'Auto: Maintainence',
    'BELLAGIO EXPRESS': 'Auto: Car Wash',
    'DMV': 'Auto: DMV fee',
    'PARKING': 'Auto: Parking',
    'TOLL ROADS': 'Auto: Toll',
    
    # --- Grocery ---
    'GOOD FORTUNE SUPERMARKET': 'Grocery: GFM',
    'GF MARKET': 'Grocery: GFM',
    '99 RANCH': 'Grocery: 99 Ranch',
    '7-ELEVEN': 'Grocery: 7-ELEVEN',
    'TARGET': 'Grocery: Target',
    'CVS/PHARMACY': 'Grocery: CVS',
    'COSTCO WHSE': 'Grocery: Costco',
    'COSTCO *ANNUAL RENEWAL': 'Grocery: Costco',
    'H MART': 'Grocery: HMart',
    'LITTLE PEACH MEAT': 'Grocery: Meat Shop',
    'VONS': 'Grocery: others',
    'GINSENG': 'Grocery: others',
    'DAISO': 'Grocery: others',
    'HOME DEPOT': 'Grocery: others',
    'SAN GABRIEL SPRSTR': 'Grocery: others',

    # --- Online ---
    'DD *DOORDASH': 'Online: DoorDash',
    'AMAZON': 'Online: Amazon',
    'WEEE': 'Online: Weee',
    'UBER *EATS': 'Online: Uber Eats',
    'YAMIBUY': 'Online: Yami',
    'HUNGRYPANDA': 'Online: Hungry Panda',
    'WWW.PETFIESTACO': 'Online: others', 
    'COS WEB': 'Online: others',

    # --- Study ---
    'LinkedInPre': 'Study: LinkedIn',
    'UDEMY': 'Study: Udemy',
    'OPENAI': 'Study: ChatGPT',
    'GITHUB': 'Study: GitHub',
    'Google': 'Study: Google',
    'ADOBE': 'Study: Adobe',
    'CLAUDE.AI SUBSCRIPTION': 'Study: Claude AI',
    'WWW.FREEPIK.CDE': 'Study: Freepik',
    'WWW.GLOS.AC.UK': 'Study: Prize Application',
    'PAYPAL': 'Study: Prize Application',
    'DEEPL* SUB': 'Study: DeepL',

    # --- Logistic ---
    'LYFT': 'Logistic: Lyft',
    'The UPS Store': 'Logistic: UPS',
    'USPS': 'Logistic: USPS',
    'FEDEX': 'Logistic: Fedex',

    # --- Utility ---
    'CITY OF ARCADIA': 'Utility: Water',
    'Spectrum': 'Utility: Spectrum',
    'SO CAL EDISON': 'Utility: Edison',
    'SO CAL GAS': 'Utility: SoCal Gas',
    'LA Co TTC Paymnt': 'Utility: Property Tax',
    'TMOBILE': 'Utility: T-Mobile',
    'Zelle payment to LZ COMFORT HOME': 'Utility: otheres',
    
    # --- Restaurant ---
    'Chun La Hao': 'Restaurant: Hotpot',
    'CHI HUO': 'Restaurant: Hotpot',
    '101 POT': 'Restaurant: Hotpot',
    'ERWA COLD POT': 'Restaurant: Sichuan Dish',
    'KUAN ZHAI ALLEY': 'Restaurant: Sichuan Dish',
    'SICHUAN IMPRESSION': 'Restaurant: Sichuan Dish',
    'SHANGHAILANDER': 'Restaurant: Shanghai Dish',
    'SINBALA': 'Restaurant: Taiwan Dish',
    'IN-N-OUT': 'Restaurant: Fast Food',
    'HABIT': 'Restaurant: Fast Food',
    'RAISING CANES': 'Restaurant: Fast Food',
    'TOFU HOUSE': 'Restaurant: Korean Dish',
    'SUSHI': 'Restaurant: Japanese Dish',
    'CURRY FLURRY': 'Restaurant: Japanese Dish',
    'RAMEN': 'Restaurant: Japanese Dish',
    'LADY M': 'Restaurant: Desert',
    '85C': 'Restaurant: Desert',
    'VANILLA BAKE': 'Restaurant: Desert',
    'SUNRIGHT': 'Restaurant: Desert',
    'YOGURTLAND': 'Restaurant: Desert',
    'MELOMELO': 'Restaurant: Desert',
    'AUNTIE ANNES': 'Restaurant: Desert',
    'STARBUCKS': 'Restaurant: Desert',
    'GELATO': 'Restaurant: Desert',
    'PRESSED': 'Restaurant: Desert',
    'PATISSERIE BLUEJAY': 'Restaurant: Desert',
    
    # --- Other ---
    'APPLE': 'Other: Apple',
    'HOSPITAL': 'Other: Healthcare',
    'ROSE WOMENS HEALTH': 'Other: Healthcare',
    'QUEST DIAGNOSTICS': 'Other: Healthcare',
    'PRIMROSE PSYCHIATRY': 'Other: Healthcare',
    'RADIANT IMAGING': 'Other: Healthcare',
    'AMERICAN PEDIATRICS': 'Other: Healthcare',
    'CA DEPT OF PUBLIC HEALTH': 'Other: Healthcare',
    'VCN*LOSANGELESCODPH': 'Other: Baby related',
    'CA SOS BPD LOS ANGELES': 'Other: Baby related',
    'BELLA BABY PHOTOGRAPHY': 'Other: Baby related',
    'Zelle payment to SHUHUI QIAN': 'Other: Baby related',
    'Money Network DES:': 'Other: Baby related',
    'BKOFAMERICA MOBILE': 'Other: Mobile Check',
    'Wire Transfer Fee': 'Other: Bank fee',
    'LATE FEE': 'Other: Bank fee',
    'FOREIGN TRANSACTION FEE': 'Other: Bank fee',
    'INTEREST CHARGED': 'Other: Bank fee',
    'OVERDRAFT ITEM FEE': 'Other: Bank fee',

    # --- CC Payback / Investment ---
    # The two 'CC Payback' labels are also what CC_get looks for when it
    # locates the last payment row of a credit card.
    'Online payment': 'CC Payback: Money',
    'Online Banking payment to CRD 4253': 'CC Payback: Money',
    'Online Banking payment to CRD 0401': 'CC Payback: Money',
    'DISCOVER DES': 'CC Payback: Discover',
    'INTERNET PAYMENT': 'CC Payback: Discover',
    'Interest Earned': 'Investment: Interest',
    'FID BKG SVC LLC': 'Investment: Fidelity',
    'CASH REWARDS STATEMENT CREDIT': 'Investment: CC Credit',
    'CASHREWARD': 'Investment: CC Credit',
    'CASHBACK BONUS REDEMPTION': 'Investment: CC Credit',

    # --- Income ---
    'DES:PAYROLL ID:XXXXX716960': 'Income: Luminys Payroll',
    'C185529 LUMINYS': 'Income: Luminys Payroll',
    'LUMINYS SYSTEMS DES:PAYMENT': 'Income: Luminys Freelance',
    'ISSI INC. DES:PAYROLL': 'Income: ISSI',
    'CERTIFY- LUMINYS': 'Income: Reimbursement',
    'Zelle payment to QU WU': 'Income: Reimbursement',
    
    # --- Tax ---
    'IRS': 'Tax: IRS',
    'FRANCHISE TAX BD DES': 'Tax: CA',
    'FRANCHISE TAX BO DES': 'Tax: CA',
    "Zelle payment to ROGER'S TAX SERVICES LLC": 'Tax: Roger Service',

    # --- Internal transfers ---
    # The 'Leo/Chenwei' labels are generic placeholders; the tagging cell
    # rewrites them per checking account to name the actual owner.
    'Zelle payment from CHENWEI XU': 'Internal: from Chenwei to Leo',
    'Zelle payment to CHENWEI XU': 'Internal: from Leo to Chenwei',
    'Zelle payment from NINGCHUAN PENG': 'Internal: from Leo to Chenwei',
    'Zelle payment to NINGCHUAN PENG': 'Internal: from Chenwei to Leo',
    'Online Banking transfer from SAV 7913': 'Internal: from SA to Leo/Chenwei',
    'Online Banking transfer to SAV 7913': 'Internal: from Leo/Chenwei to SA',
    'Online Banking transfer from CHK 9084': 'Internal: from Leo to SA',
    'Online Banking transfer to CHK 9084': 'Internal: from SA to Leo',
    'Online Banking transfer from CHK 8540': 'Internal: from Chenwei to SA',
    'Online Banking transfer to CHK 8540': 'Internal: from SA to Chenwei',
    'WIRE TYPE': 'Internal: Wire',
    'Zelle payment to THE CHURCH OF GOD': 'Internal: Church',


    # --- Rent ---
    'Zelle payment to YUKAI GAO': 'Rent: CTHD',
    'Zelle payment from YUKAI GAO': 'Rent: CTHD',
    'Zelle payment to XUE SHIMING': 'Rent: Avlon',
    'Zelle payment to QINGMING ZENG': 'Rent: LA Fire Villa',
    'Zelle payment from GUOYUAN WU': 'Rent: LA Fire Villa',

    # --- Entertainment ---
    'AMC': 'Entertainment: AMC',
    'LA ARBORETUM': 'Entertainment: Arboretum'
}

In [None]:
# Tag each row of every account frame with a category in a 'Type' column.
# Keywords are tried in dictionary order as case-insensitive plain substrings of
# 'Description', so a later keyword overwrites an earlier match on the same row.
for frame in (CC_dis_df, CC_Leo_df, DC_Leo_df, CC_Wei_df, DC_Wei_df, SA_df):
    descriptions = frame["Description"]
    for keyword, mapped_value in description_map.items():
        is_match = descriptions.str.contains(keyword, case=False, regex=False, na=False)
        frame.loc[is_match, "Type"] = mapped_value

# Savings transfers are tagged with a generic "Leo/Chenwei" label above; replace
# it with the owner of the checking account the row came from.
owner_specific_labels = (
    (DC_Leo_df, 'Internal: from SA to Leo', 'Internal: from Leo to SA'),
    (DC_Wei_df, 'Internal: from SA to Chenwei', 'Internal: from Chenwei to SA'),
)
for frame, from_savings_label, to_savings_label in owner_specific_labels:
    frame.loc[frame['Type'] == 'Internal: from SA to Leo/Chenwei', 'Type'] = from_savings_label
    frame.loc[frame['Type'] == 'Internal: from Leo/Chenwei to SA', 'Type'] = to_savings_label

In [None]:
def CC_get(cc_dateset):
    """Return the settled window of a credit card frame.

    The window ends at the last row typed 'CC Payback: Money' (or, when the
    frame has no such row, the last row typed 'CC Payback: Discover') and
    starts at the earliest row from which the amounts up to that payment sum
    to zero when rounded to cents.

    Rows are addressed by position, so the frame does not need a 0..n-1 index;
    the returned slice keeps the original index labels.

    Parameters
    ----------
    cc_dateset : pandas.DataFrame with at least the columns 'Type' and 'Amount'.

    Returns
    -------
    pandas.DataFrame containing the rows of the settled window.

    Raises
    ------
    ValueError
        If the frame has no payback row, or if no starting row makes the
        window sum to zero.
    """
    row_types = cc_dateset['Type'].tolist()

    # 'CC Payback: Money' takes priority; Discover payments are the fallback.
    payback_positions = []
    for payback_label in ('CC Payback: Money', 'CC Payback: Discover'):
        payback_positions = [pos for pos, row_type in enumerate(row_types) if row_type == payback_label]
        if payback_positions:
            break
    if not payback_positions:
        raise ValueError("No 'CC Payback' row found, so the settled window cannot be located.")

    window_end = payback_positions[-1] + 1  # exclusive end position
    amounts = cc_dateset['Amount']

    # Move the start forward until charges and payments cancel out exactly.
    for window_start in range(window_end):
        if round(amounts.iloc[window_start: window_end].sum(), 2) == 0:
            return cc_dateset.iloc[window_start: window_end]

    raise ValueError("The data is not validate.")

# Checking and savings frames are used as they are; each credit card frame is
# first reduced to its settled window by CC_get.
combined_data_list = [
    DC_Leo_df,
    CC_get(CC_Leo_df),
    CC_get(CC_dis_df),
    DC_Wei_df,
    CC_get(CC_Wei_df),
    SA_df,
]

combined_df = pd.concat(combined_data_list)
combined_df = combined_df.sort_values('Date')
combined_df = combined_df.reset_index(drop = True)

# Keep only the columns that go into the report.
combined_df = combined_df.loc[:, ['Date', 'Description', 'Amount', 'Card', 'User', 'Type']]

# Discard rows that carry no money movement (missing or zero amount).
moves_money = combined_df['Amount'].notna() & combined_df['Amount'].ne(0)
combined_df = combined_df[moves_money]

In [None]:
# Drop every category whose amounts cancel out to zero (rounded to cents).
# Rows without a Type are skipped here: comparing against NaN never matches, so
# they previously produced a misleading "remove:  nan" message and a no-op filter.
for remove_item in combined_df['Type'].dropna().unique():
    if round(combined_df.loc[combined_df['Type'] == remove_item, 'Amount'].sum(), 2) == 0:
        print('remove: ', remove_item)
        combined_df = combined_df[combined_df['Type'] != remove_item]

# General_Type is the part of Type before the first ":" (e.g. "Auto").
# assign() returns a new frame instead of writing into the filtered slice above.
combined_df = combined_df.assign(
    General_Type=combined_df["Type"].str.split(":", n=1).str[0]
)

In [None]:
# Reconciliation: starting balance plus every remaining movement must equal the
# ending balance. Both sides are rounded to cents because end_balance is itself
# a sum of floats and an exact == against an unrounded float can fail spuriously.
if round(start_balance + combined_df['Amount'].sum(), 2) == round(end_balance, 2):
    print('Starting Balance:', start_balance)
    print('Ending Balance:', end_balance)
    print('Changing Amount:', round(combined_df['Amount'].sum(),2))
else:
    raise ValueError("Changing Amount is not right.")

In [None]:
# Write the combined transactions to a workbook named with the current
# timestamp, so each run produces a new file.
export_stamp = datetime.now().strftime('%Y%m%d%H%M%S')
combined_df.to_excel(f"combined_test_{export_stamp}.xlsx", index=False)