## Create the cleaned transaction workbook

In [2]:
import re
import pandas as pd
import numpy as np
import os
import string

In [3]:
# Directory holding the exported statement CSVs. Set the
# BANK_STATEMENT_DATA_DIR environment variable to point the notebook at a
# different folder; when it is unset the original development path is used.
dir_path = os.environ.get(
    "BANK_STATEMENT_DATA_DIR",
    "C:/Users/Daniel/Documents/Git/bank-statement-cleaner/synthetic_data",
)

In [4]:
# One CSV export per account, all located directly under dir_path.
savings_csv_path, chequing_csv_path, credit_csv_path = (
    os.path.join(dir_path, csv_name)
    for csv_name in ("savings.csv", "chequing.csv", "credit.csv")
)

In [5]:
class CIBCTransactionDescription:
    """Pieces parsed out of a debit-account transaction description.

    Stores the transaction ``method``, ``type`` and ``party`` as given; each
    defaults to an empty string.
    """

    def __init__(self, method="", type="", party=""):
        self.method, self.type, self.party = method, type, party

    def to_pd_series(self):
        """Return ``[method, type, party]`` as a ``pd.Series``.

        Values are lower-cased to make queries easier; falsy values (such as
        the default empty string) are replaced by ``np.nan``.
        """
        return pd.Series([
            value.lower() if value else np.nan
            for value in (self.method, self.type, self.party)
        ])

In [6]:
def parse_debit_description(description):
    """Split a debit-account description into method, type and party.

    The first run of text that starts and ends with an upper-case letter and
    contains no lower-case letters or digits is taken as the transaction type;
    the text before it is the method. The type is cut off after the first
    keyword (from a fixed list) that it contains. Whatever follows the type has
    its asterisks removed and, if it holds a reference token (letters+digits,
    or digits only), that token is deleted and the rest becomes the party.

    Args:
        description: Raw description string from the statement.

    Returns:
        A ``pd.Series`` of ``[method, type, party]`` produced by
        ``CIBCTransactionDescription.to_pd_series`` (lower-cased, with missing
        parts as NaN). Non-string input, or text without an upper-case run,
        yields an all-NaN series.
    """
    # A blank CSV cell arrives as NaN (a float); there is nothing to parse.
    if not isinstance(description, str):
        return CIBCTransactionDescription().to_pd_series()
    tx_type_match = re.search(r'[A-Z][^a-z0-9]*[A-Z]', description) # get transaction type
    if not tx_type_match:
        return CIBCTransactionDescription().to_pd_series()
    tx_type_start = tx_type_match.start()
    tx_type = tx_type_match.group()
    # Drop the single separator character before the type. Clamp at 0: when
    # the type starts the description, `start - 1` would be -1 and the slice
    # would wrongly return everything except the last character.
    tx_method = description[:max(tx_type_start - 1, 0)]
    for word in (
        "CHARGE",
        "CORRECTION",
        "DEPOSIT",
        "FEE",
        "INTEREST",
        "MEMO",
        "PAY",
        "PURCHASE",
        "TRANSFER",
    ):
        word_start_index = tx_type.rfind(word) # scan from right
        if word_start_index == -1:
            continue
        # keep the type only up to and including the keyword
        tx_type = tx_type[:word_start_index + len(word)]
        # special case for service charges: never look for a party
        if word == "CHARGE":
            return CIBCTransactionDescription(
                method=tx_method,
                type=tx_type
            ).to_pd_series()
        break
    tx_type_end_index = tx_type_start + len(tx_type) - 1
    if tx_type_end_index == len(description) - 1:
        # the type runs to the end of the description; no remainder to inspect
        return CIBCTransactionDescription(
            method=tx_method,
            type=tx_type
        ).to_pd_series()
    # skip the separator character that follows the type
    remainder = description[tx_type_end_index + 2:]
    remainder = remainder.replace("*", "") # delete asterisks
    # find a token with only letters and numbers with at least 1 letter and 1 number
    for token in remainder.split(" "):
        if re.search("^[0-9A-Z]+$", token) and re.search("[0-9]", token) and re.search("[A-Z]", token):
            return CIBCTransactionDescription(
                method=tx_method,
                type=tx_type,
                party=remainder.replace(token, "").strip()
            ).to_pd_series()
    # find a token with only numbers (delimited by a space on at least one side)
    token_match = re.search("^[0-9]+(?= )|(?<= )[0-9]+$|(?<= )[0-9]+(?= )", remainder)
    if token_match:
        return CIBCTransactionDescription(
            method=tx_method,
            type=tx_type,
            party=remainder.replace(token_match.group(), "").strip()
        ).to_pd_series()
    return CIBCTransactionDescription(
        method=tx_method,
        type=tx_type
    ).to_pd_series()

In [7]:
def expand_debit(df):
    """Add ``method``, ``type`` and ``party`` columns to ``df`` in place.

    Every value of ``df["description"]`` is passed through
    ``parse_debit_description``. Nothing is returned.
    """
    parsed_parts = df["description"].apply(parse_debit_description)
    df[["method", "type", "party"]] = parsed_parts

In [8]:
def parse_credit_description(description):
    """Extract the party from a credit-card description.

    The trailing location (a token followed by a comma, a space and the rest
    of the text, e.g. ``"TORONTO, ON"``) is removed; what remains is stripped
    and lower-cased.

    Args:
        description: Raw description string from the statement.

    Returns:
        The lower-cased party, or ``np.nan`` when the description is not a
        string, has no trailing location, or has nothing left once the
        location is removed.
    """
    # A blank CSV cell arrives as NaN (a float); re.search would raise on it.
    if not isinstance(description, str):
        return np.nan
    tx_location_match = re.search(r'[^ ]+, .+$', description) # get transaction location
    if not tx_location_match:
        return np.nan
    party = description.replace(tx_location_match.group(), "").strip().lower()
    # Empty party -> NaN, matching how the debit parser reports missing parts.
    return party if party else np.nan

In [9]:
def expand_credit(df):
    """Add a ``party`` column to ``df`` in place.

    Every value of ``df["description"]`` is passed through
    ``parse_credit_description``. Nothing is returned.
    """
    parties = df["description"].apply(parse_credit_description)
    df["party"] = parties

In [10]:
def expand_account_df(account, account_df, expand_fn=None, duplicate_index=True):
    """Return a normalised copy of one account's statement.

    The input frame is not modified. In the copy:

    * ``date`` is reformatted as ``YYYY-MM-DD`` and split into ``year``,
      ``month`` and ``day`` columns;
    * ``account`` is set to the given label on every row;
    * ``debit`` / ``credit`` are collapsed into one signed ``amount`` column
      (``credit`` where ``debit`` is null, otherwise ``-debit``) and dropped.

    Args:
        account: Label stored in the ``account`` column.
        account_df: Frame with ``date``, ``debit`` and ``credit`` columns.
        expand_fn: Optional callable invoked as ``expand_fn(df)``; it is
            expected to add columns to ``df`` in place.
        duplicate_index: When true, copy the index into ``index_copy``.

    Returns:
        The expanded ``pd.DataFrame``.
    """
    df = account_df.copy()
    date = pd.to_datetime(df["date"])
    df["date"] = date.dt.strftime("%Y-%m-%d")
    df["year"] = date.dt.year
    df["month"] = date.dt.month
    df["day"] = date.dt.day
    df["account"] = account
    # Vectorised on purpose: a row-wise apply(axis=1) returns a DataFrame
    # instead of a Series for an empty statement, which breaks the assignment.
    df["amount"] = df["credit"].where(df["debit"].isnull(), -1 * df["debit"])
    df = df.drop(columns=["debit", "credit"])
    if expand_fn:
        expand_fn(df)
    if duplicate_index:
        # added last so it reflects the index after any expand_fn changes
        df["index_copy"] = df.index
    return df

In [11]:
# Column names are supplied explicitly, so the first CSV line is read as data.
savings_df = pd.read_csv(
    savings_csv_path,
    names=["date", "description", "debit", "credit"],
)
savings_df

Unnamed: 0,date,description,debit,credit
0,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,,2000.0
1,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,,2000.0
2,2025-01-31,Internet Banking E-TRANSFER 891379219227 LANDLORD,1500.0,
3,2025-01-31,Internet Banking E-TRANSFER 141960766495 LANDLORD,256.0,
4,2025-01-31,Internet Banking INTERNET TRANSFER 178752671254,,50000.0
5,2025-02-28,Internet Banking E-TRANSFER 899912012729 LANDLORD,1500.0,
6,2025-02-28,Internet Banking E-TRANSFER 506104578856 LANDLORD,259.0,
7,2025-02-28,Internet Banking INTERNET TRANSFER 853155311571,,50000.0
8,2025-03-31,Internet Banking INTERNET TRANSFER 193040394515,,50000.0
9,2025-03-31,Internet Banking E-TRANSFER 204002784579 LANDLORD,1500.0,


In [12]:
# Normalise the savings statement and split its descriptions into
# method / type / party columns.
expanded_savings_df = expand_account_df(
    account="savings",
    account_df=savings_df,
    expand_fn=expand_debit,
)
expanded_savings_df

Unnamed: 0,date,description,year,month,day,account,amount,method,type,party,index_copy
0,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,savings,2000.0,automated banking machine,atm deposit,,0
1,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,savings,2000.0,automated banking machine,atm deposit,,1
2,2025-01-31,Internet Banking E-TRANSFER 891379219227 LANDLORD,2025,1,31,savings,-1500.0,internet banking,e-transfer,landlord,2
3,2025-01-31,Internet Banking E-TRANSFER 141960766495 LANDLORD,2025,1,31,savings,-256.0,internet banking,e-transfer,landlord,3
4,2025-01-31,Internet Banking INTERNET TRANSFER 178752671254,2025,1,31,savings,50000.0,internet banking,internet transfer,,4
5,2025-02-28,Internet Banking E-TRANSFER 899912012729 LANDLORD,2025,2,28,savings,-1500.0,internet banking,e-transfer,landlord,5
6,2025-02-28,Internet Banking E-TRANSFER 506104578856 LANDLORD,2025,2,28,savings,-259.0,internet banking,e-transfer,landlord,6
7,2025-02-28,Internet Banking INTERNET TRANSFER 853155311571,2025,2,28,savings,50000.0,internet banking,internet transfer,,7
8,2025-03-31,Internet Banking INTERNET TRANSFER 193040394515,2025,3,31,savings,50000.0,internet banking,internet transfer,,8
9,2025-03-31,Internet Banking E-TRANSFER 204002784579 LANDLORD,2025,3,31,savings,-1500.0,internet banking,e-transfer,landlord,9


In [13]:
# Column names are supplied explicitly, so the first CSV line is read as data.
chequing_df = pd.read_csv(
    chequing_csv_path,
    names=["date", "description", "debit", "credit"],
)
chequing_df

Unnamed: 0,date,description,debit,credit
0,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,,2000.0
1,2025-01-01,Internet Banking E-TRANSFER 349282506970 BLUE ...,50.0,
2,2025-01-02,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,3.0,
3,2025-01-03,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,9.0,
4,2025-01-04,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,9.0,
...,...,...,...,...
416,2025-12-01,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,4.0,
417,2025-12-01,Internet Banking E-TRANSFER 917263704293 BLUE ...,50.0,
418,2025-12-02,Internet Banking INTERNET TRANSFER 532904682993,93.0,
419,2025-12-31,Internet Banking INTERNET TRANSFER 790268759218,50000.0,


In [14]:
# Normalise the chequing statement and split its descriptions into
# method / type / party columns.
expanded_chequing_df = expand_account_df(
    account="chequing",
    account_df=chequing_df,
    expand_fn=expand_debit,
)
expanded_chequing_df

Unnamed: 0,date,description,year,month,day,account,amount,method,type,party,index_copy
0,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,chequing,2000.0,automated banking machine,atm deposit,,0
1,2025-01-01,Internet Banking E-TRANSFER 349282506970 BLUE ...,2025,1,1,chequing,-50.0,internet banking,e-transfer,blue hockey club,1
2,2025-01-02,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,1,2,chequing,-3.0,point of sale - visa debit,visa debit retail purchase,presto,2
3,2025-01-03,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,1,3,chequing,-9.0,point of sale - visa debit,visa debit retail purchase,presto,3
4,2025-01-04,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,1,4,chequing,-9.0,point of sale - visa debit,visa debit retail purchase,presto,4
...,...,...,...,...,...,...,...,...,...,...,...
416,2025-12-01,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,12,1,chequing,-4.0,point of sale - visa debit,visa debit retail purchase,presto,416
417,2025-12-01,Internet Banking E-TRANSFER 917263704293 BLUE ...,2025,12,1,chequing,-50.0,internet banking,e-transfer,blue hockey club,417
418,2025-12-02,Internet Banking INTERNET TRANSFER 532904682993,2025,12,2,chequing,-93.0,internet banking,internet transfer,,418
419,2025-12-31,Internet Banking INTERNET TRANSFER 790268759218,2025,12,31,chequing,-50000.0,internet banking,internet transfer,,419


In [15]:
def load_cibc_credit_csv(csv_path):
    """Read a credit-card CSV export, keeping only its first four columns.

    The columns are named ``date``, ``description``, ``debit`` and ``credit``;
    any further columns in the file are ignored.
    """
    column_names = ["date", "description", "debit", "credit"]
    return pd.read_csv(
        csv_path,
        usecols=range(len(column_names)),
        names=column_names,
    )

In [16]:
# Load the credit-card statement; the loader keeps only the first four CSV
# columns (date, description, debit, credit).
credit_df = load_cibc_credit_csv(credit_csv_path)
credit_df

Unnamed: 0,date,description,debit,credit
0,2025-01-07,"WALMART TORONTO, ON",64.0,
1,2025-01-07,PAYMENT THANK YOU/PAIEMEN T MERCI,,64.0
2,2025-01-14,"WALMART TORONTO, ON",80.0,
3,2025-01-14,PAYMENT THANK YOU/PAIEMEN T MERCI,,80.0
4,2025-01-21,"WALMART TORONTO, ON",51.0,
...,...,...,...,...
91,2025-11-18,PAYMENT THANK YOU/PAIEMEN T MERCI,,82.0
92,2025-11-25,"WALMART TORONTO, ON",51.0,
93,2025-11-25,PAYMENT THANK YOU/PAIEMEN T MERCI,,51.0
94,2025-12-02,"WALMART TORONTO, ON",93.0,


In [17]:
# Normalise the credit-card statement and derive the party column; no
# index_copy column is added for this frame.
expanded_credit_df = expand_account_df(
    account="credit",
    account_df=credit_df,
    expand_fn=expand_credit,
    duplicate_index=False,
)
expanded_credit_df

Unnamed: 0,date,description,year,month,day,account,amount,party
0,2025-01-07,"WALMART TORONTO, ON",2025,1,7,credit,-64.0,walmart
1,2025-01-07,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,1,7,credit,64.0,
2,2025-01-14,"WALMART TORONTO, ON",2025,1,14,credit,-80.0,walmart
3,2025-01-14,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,1,14,credit,80.0,
4,2025-01-21,"WALMART TORONTO, ON",2025,1,21,credit,-51.0,walmart
...,...,...,...,...,...,...,...,...
91,2025-11-18,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,11,18,credit,82.0,
92,2025-11-25,"WALMART TORONTO, ON",2025,11,25,credit,-51.0,walmart
93,2025-11-25,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,11,25,credit,51.0,
94,2025-12-02,"WALMART TORONTO, ON",2025,12,2,credit,-93.0,walmart


In [18]:
# Show the characters that create_uid strips out of descriptions.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [19]:
# Shared occurrence counter, used by create_uid when no counter is passed in.
uid_dict = {}

# Translation table that deletes every ASCII punctuation character; built once
# rather than on every create_uid call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def create_uid(date, description, account, amount, counts=None):
    """Build a lower-cased identifier for one transaction.

    The identifier is ``date_description_account_amount_n`` where the
    description has its punctuation removed and ``n`` is how many times this
    exact combination has been seen so far in ``counts`` (starting at 1).

    Args:
        date: Transaction date as a string.
        description: Transaction description string.
        account: Account label string.
        amount: Transaction amount; rendered with ``str()``.
        counts: Dict that tracks occurrences and is updated in place. Defaults
            to the module-level ``uid_dict``, which persists across calls.

    Returns:
        The identifier string.
    """
    if counts is None:
        counts = uid_dict
    separator = "_"
    description = description.translate(_PUNCTUATION_TABLE)
    uid = separator.join([date, description, account, str(amount)])
    counts[uid] = counts.get(uid, 0) + 1
    return separator.join([uid, str(counts[uid])]).lower()


def get_uid_series(df):
    """Return one identifier per row of ``df`` (see ``create_uid``).

    A fresh occurrence counter is used for every call, so the result depends
    only on ``df``: re-running the cell that assigns the uid column yields the
    same identifiers instead of ever-increasing suffixes.

    Args:
        df: Frame with ``date``, ``description``, ``account`` and ``amount``
            columns.

    Returns:
        A ``pd.Series`` of identifier strings aligned with ``df``'s index.
    """
    counts = {}
    return df.apply(lambda row: create_uid(
        row["date"],
        row["description"],
        row["account"],
        row["amount"],
        counts
    ), axis=1)

In [20]:
# Attach a uid column to the savings transactions.
expanded_savings_df = expanded_savings_df.assign(
    uid=get_uid_series(expanded_savings_df)
)
expanded_savings_df

Unnamed: 0,date,description,year,month,day,account,amount,method,type,party,index_copy,uid
0,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,savings,2000.0,automated banking machine,atm deposit,,0,2025-01-01_automated banking machine atm depos...
1,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,savings,2000.0,automated banking machine,atm deposit,,1,2025-01-01_automated banking machine atm depos...
2,2025-01-31,Internet Banking E-TRANSFER 891379219227 LANDLORD,2025,1,31,savings,-1500.0,internet banking,e-transfer,landlord,2,2025-01-31_internet banking etransfer 89137921...
3,2025-01-31,Internet Banking E-TRANSFER 141960766495 LANDLORD,2025,1,31,savings,-256.0,internet banking,e-transfer,landlord,3,2025-01-31_internet banking etransfer 14196076...
4,2025-01-31,Internet Banking INTERNET TRANSFER 178752671254,2025,1,31,savings,50000.0,internet banking,internet transfer,,4,2025-01-31_internet banking internet transfer ...
5,2025-02-28,Internet Banking E-TRANSFER 899912012729 LANDLORD,2025,2,28,savings,-1500.0,internet banking,e-transfer,landlord,5,2025-02-28_internet banking etransfer 89991201...
6,2025-02-28,Internet Banking E-TRANSFER 506104578856 LANDLORD,2025,2,28,savings,-259.0,internet banking,e-transfer,landlord,6,2025-02-28_internet banking etransfer 50610457...
7,2025-02-28,Internet Banking INTERNET TRANSFER 853155311571,2025,2,28,savings,50000.0,internet banking,internet transfer,,7,2025-02-28_internet banking internet transfer ...
8,2025-03-31,Internet Banking INTERNET TRANSFER 193040394515,2025,3,31,savings,50000.0,internet banking,internet transfer,,8,2025-03-31_internet banking internet transfer ...
9,2025-03-31,Internet Banking E-TRANSFER 204002784579 LANDLORD,2025,3,31,savings,-1500.0,internet banking,e-transfer,landlord,9,2025-03-31_internet banking etransfer 20400278...


In [21]:
# Attach a uid column to the chequing transactions.
expanded_chequing_df = expanded_chequing_df.assign(
    uid=get_uid_series(expanded_chequing_df)
)
expanded_chequing_df

Unnamed: 0,date,description,year,month,day,account,amount,method,type,party,index_copy,uid
0,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,chequing,2000.0,automated banking machine,atm deposit,,0,2025-01-01_automated banking machine atm depos...
1,2025-01-01,Internet Banking E-TRANSFER 349282506970 BLUE ...,2025,1,1,chequing,-50.0,internet banking,e-transfer,blue hockey club,1,2025-01-01_internet banking etransfer 34928250...
2,2025-01-02,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,1,2,chequing,-3.0,point of sale - visa debit,visa debit retail purchase,presto,2,2025-01-02_point of sale visa debit visa debi...
3,2025-01-03,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,1,3,chequing,-9.0,point of sale - visa debit,visa debit retail purchase,presto,3,2025-01-03_point of sale visa debit visa debi...
4,2025-01-04,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,1,4,chequing,-9.0,point of sale - visa debit,visa debit retail purchase,presto,4,2025-01-04_point of sale visa debit visa debi...
...,...,...,...,...,...,...,...,...,...,...,...,...
416,2025-12-01,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,12,1,chequing,-4.0,point of sale - visa debit,visa debit retail purchase,presto,416,2025-12-01_point of sale visa debit visa debi...
417,2025-12-01,Internet Banking E-TRANSFER 917263704293 BLUE ...,2025,12,1,chequing,-50.0,internet banking,e-transfer,blue hockey club,417,2025-12-01_internet banking etransfer 91726370...
418,2025-12-02,Internet Banking INTERNET TRANSFER 532904682993,2025,12,2,chequing,-93.0,internet banking,internet transfer,,418,2025-12-02_internet banking internet transfer ...
419,2025-12-31,Internet Banking INTERNET TRANSFER 790268759218,2025,12,31,chequing,-50000.0,internet banking,internet transfer,,419,2025-12-31_internet banking internet transfer ...


In [22]:
# Attach a uid column to the credit-card transactions.
expanded_credit_df = expanded_credit_df.assign(
    uid=get_uid_series(expanded_credit_df)
)
expanded_credit_df

Unnamed: 0,date,description,year,month,day,account,amount,party,uid
0,2025-01-07,"WALMART TORONTO, ON",2025,1,7,credit,-64.0,walmart,2025-01-07_walmart toronto on_credit_-64.0_1
1,2025-01-07,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,1,7,credit,64.0,,2025-01-07_payment thank youpaiemen t merci_cr...
2,2025-01-14,"WALMART TORONTO, ON",2025,1,14,credit,-80.0,walmart,2025-01-14_walmart toronto on_credit_-80.0_1
3,2025-01-14,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,1,14,credit,80.0,,2025-01-14_payment thank youpaiemen t merci_cr...
4,2025-01-21,"WALMART TORONTO, ON",2025,1,21,credit,-51.0,walmart,2025-01-21_walmart toronto on_credit_-51.0_1
...,...,...,...,...,...,...,...,...,...
91,2025-11-18,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,11,18,credit,82.0,,2025-11-18_payment thank youpaiemen t merci_cr...
92,2025-11-25,"WALMART TORONTO, ON",2025,11,25,credit,-51.0,walmart,2025-11-25_walmart toronto on_credit_-51.0_1
93,2025-11-25,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,11,25,credit,51.0,,2025-11-25_payment thank youpaiemen t merci_cr...
94,2025-12-02,"WALMART TORONTO, ON",2025,12,2,credit,-93.0,walmart,2025-12-02_walmart toronto on_credit_-93.0_1


In [23]:
# Pair savings and chequing rows that share a description, then keep only the
# pairs whose amounts are exact opposites.
merged_df = (
    expanded_savings_df
    .merge(expanded_chequing_df, left_on="description", right_on="description")
    .loc[lambda pairs: pairs["amount_x"] == -1 * pairs["amount_y"]]
)
merged_df

Unnamed: 0,date_x,description,year_x,month_x,day_x,account_x,amount_x,method_x,type_x,party_x,...,year_y,month_y,day_y,account_y,amount_y,method_y,type_y,party_y,index_copy_y,uid_y
2,2025-01-31,Internet Banking INTERNET TRANSFER 178752671254,2025,1,31,savings,50000.0,internet banking,internet transfer,,...,2025,1,31,chequing,-50000.0,internet banking,internet transfer,,37,2025-01-31_internet banking internet transfer ...
3,2025-02-28,Internet Banking INTERNET TRANSFER 853155311571,2025,2,28,savings,50000.0,internet banking,internet transfer,,...,2025,2,28,chequing,-50000.0,internet banking,internet transfer,,72,2025-02-28_internet banking internet transfer ...
4,2025-03-31,Internet Banking INTERNET TRANSFER 193040394515,2025,3,31,savings,50000.0,internet banking,internet transfer,,...,2025,3,31,chequing,-50000.0,internet banking,internet transfer,,109,2025-03-31_internet banking internet transfer ...
5,2025-04-30,Internet Banking INTERNET TRANSFER 825950711483,2025,4,30,savings,50000.0,internet banking,internet transfer,,...,2025,4,30,chequing,-50000.0,internet banking,internet transfer,,147,2025-04-30_internet banking internet transfer ...
6,2025-05-31,Internet Banking INTERNET TRANSFER 354554250700,2025,5,31,savings,50000.0,internet banking,internet transfer,,...,2025,5,31,chequing,-50000.0,internet banking,internet transfer,,187,2025-05-31_internet banking internet transfer ...
7,2025-06-30,Internet Banking INTERNET TRANSFER 617707081506,2025,6,30,savings,50000.0,internet banking,internet transfer,,...,2025,6,30,chequing,-50000.0,internet banking,internet transfer,,223,2025-06-30_internet banking internet transfer ...
8,2025-07-31,Internet Banking INTERNET TRANSFER 757710935231,2025,7,31,savings,50000.0,internet banking,internet transfer,,...,2025,7,31,chequing,-50000.0,internet banking,internet transfer,,263,2025-07-31_internet banking internet transfer ...
9,2025-08-31,Internet Banking INTERNET TRANSFER 464886862147,2025,8,31,savings,50000.0,internet banking,internet transfer,,...,2025,8,31,chequing,-50000.0,internet banking,internet transfer,,301,2025-08-31_internet banking internet transfer ...
10,2025-09-30,Internet Banking INTERNET TRANSFER 786873531098,2025,9,30,savings,50000.0,internet banking,internet transfer,,...,2025,9,30,chequing,-50000.0,internet banking,internet transfer,,337,2025-09-30_internet banking internet transfer ...
11,2025-10-31,Internet Banking INTERNET TRANSFER 127344273120,2025,10,31,savings,50000.0,internet banking,internet transfer,,...,2025,10,31,chequing,-50000.0,internet banking,internet transfer,,376,2025-10-31_internet banking internet transfer ...


In [24]:
# Collect both sides of every matched pair, looked up through the index_copy
# columns carried along by the merge.
internal_transfer_df = (
    pd.concat([
        expanded_savings_df.loc[merged_df["index_copy_x"]],
        expanded_chequing_df.loc[merged_df["index_copy_y"]],
    ])
    .drop(columns=["index_copy", "party"])
    .sort_values(by=["uid"])
    .reset_index(drop=True)
)
internal_transfer_df

Unnamed: 0,date,description,year,month,day,account,amount,method,type,uid
0,2025-01-31,Internet Banking INTERNET TRANSFER 178752671254,2025,1,31,chequing,-50000.0,internet banking,internet transfer,2025-01-31_internet banking internet transfer ...
1,2025-01-31,Internet Banking INTERNET TRANSFER 178752671254,2025,1,31,savings,50000.0,internet banking,internet transfer,2025-01-31_internet banking internet transfer ...
2,2025-02-28,Internet Banking INTERNET TRANSFER 853155311571,2025,2,28,chequing,-50000.0,internet banking,internet transfer,2025-02-28_internet banking internet transfer ...
3,2025-02-28,Internet Banking INTERNET TRANSFER 853155311571,2025,2,28,savings,50000.0,internet banking,internet transfer,2025-02-28_internet banking internet transfer ...
4,2025-03-31,Internet Banking INTERNET TRANSFER 193040394515,2025,3,31,chequing,-50000.0,internet banking,internet transfer,2025-03-31_internet banking internet transfer ...
5,2025-03-31,Internet Banking INTERNET TRANSFER 193040394515,2025,3,31,savings,50000.0,internet banking,internet transfer,2025-03-31_internet banking internet transfer ...
6,2025-04-30,Internet Banking INTERNET TRANSFER 825950711483,2025,4,30,chequing,-50000.0,internet banking,internet transfer,2025-04-30_internet banking internet transfer ...
7,2025-04-30,Internet Banking INTERNET TRANSFER 825950711483,2025,4,30,savings,50000.0,internet banking,internet transfer,2025-04-30_internet banking internet transfer ...
8,2025-05-31,Internet Banking INTERNET TRANSFER 354554250700,2025,5,31,chequing,-50000.0,internet banking,internet transfer,2025-05-31_internet banking internet transfer ...
9,2025-05-31,Internet Banking INTERNET TRANSFER 354554250700,2025,5,31,savings,50000.0,internet banking,internet transfer,2025-05-31_internet banking internet transfer ...


In [25]:
# Savings and chequing rows that were not matched as a pair in merged_df.
raw_debit_df = (
    pd.concat([
        expanded_savings_df.drop(index=merged_df["index_copy_x"]),
        expanded_chequing_df.drop(index=merged_df["index_copy_y"]),
    ])
    .drop(columns=["index_copy"])
    .reset_index(drop=True)
)
raw_debit_df

Unnamed: 0,date,description,year,month,day,account,amount,method,type,party,uid
0,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,savings,2000.0,automated banking machine,atm deposit,,2025-01-01_automated banking machine atm depos...
1,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,savings,2000.0,automated banking machine,atm deposit,,2025-01-01_automated banking machine atm depos...
2,2025-01-31,Internet Banking E-TRANSFER 891379219227 LANDLORD,2025,1,31,savings,-1500.0,internet banking,e-transfer,landlord,2025-01-31_internet banking etransfer 89137921...
3,2025-01-31,Internet Banking E-TRANSFER 141960766495 LANDLORD,2025,1,31,savings,-256.0,internet banking,e-transfer,landlord,2025-01-31_internet banking etransfer 14196076...
4,2025-02-28,Internet Banking E-TRANSFER 899912012729 LANDLORD,2025,2,28,savings,-1500.0,internet banking,e-transfer,landlord,2025-02-28_internet banking etransfer 89991201...
...,...,...,...,...,...,...,...,...,...,...,...
430,2025-11-30,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,11,30,chequing,-8.0,point of sale - visa debit,visa debit retail purchase,presto,2025-11-30_point of sale visa debit visa debi...
431,2025-12-01,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,12,1,chequing,-4.0,point of sale - visa debit,visa debit retail purchase,presto,2025-12-01_point of sale visa debit visa debi...
432,2025-12-01,Internet Banking E-TRANSFER 917263704293 BLUE ...,2025,12,1,chequing,-50.0,internet banking,e-transfer,blue hockey club,2025-12-01_internet banking etransfer 91726370...
433,2025-12-02,Internet Banking INTERNET TRANSFER 532904682993,2025,12,2,chequing,-93.0,internet banking,internet transfer,,2025-12-02_internet banking internet transfer ...


In [26]:
# Remaining "internet banking" / "internet transfer" rows; the original index
# labels are kept because a later cell drops these rows from raw_debit_df.
internal_payment_from_debit_df = (
    raw_debit_df
    .loc[lambda rows: rows["method"].eq("internet banking")
         & rows["type"].eq("internet transfer")]
    .drop(columns=["party"])
)
internal_payment_from_debit_df

Unnamed: 0,date,description,year,month,day,account,amount,method,type,uid
35,2025-01-07,Internet Banking INTERNET TRANSFER 755558011330,2025,1,7,chequing,-64.0,internet banking,internet transfer,2025-01-07_internet banking internet transfer ...
43,2025-01-14,Internet Banking INTERNET TRANSFER 996991902804,2025,1,14,chequing,-80.0,internet banking,internet transfer,2025-01-14_internet banking internet transfer ...
51,2025-01-21,Internet Banking INTERNET TRANSFER 986463128183,2025,1,21,chequing,-51.0,internet banking,internet transfer,2025-01-21_internet banking internet transfer ...
59,2025-01-28,Internet Banking INTERNET TRANSFER 089849535459,2025,1,28,chequing,-80.0,internet banking,internet transfer,2025-01-28_internet banking internet transfer ...
69,2025-02-04,Internet Banking INTERNET TRANSFER 858247631127,2025,2,4,chequing,-74.0,internet banking,internet transfer,2025-02-04_internet banking internet transfer ...
77,2025-02-11,Internet Banking INTERNET TRANSFER 701548196466,2025,2,11,chequing,-75.0,internet banking,internet transfer,2025-02-11_internet banking internet transfer ...
84,2025-02-18,Internet Banking INTERNET TRANSFER 985272724725,2025,2,18,chequing,-51.0,internet banking,internet transfer,2025-02-18_internet banking internet transfer ...
92,2025-02-25,Internet Banking INTERNET TRANSFER 722097679043,2025,2,25,chequing,-90.0,internet banking,internet transfer,2025-02-25_internet banking internet transfer ...
102,2025-03-04,Internet Banking INTERNET TRANSFER 761892470662,2025,3,4,chequing,-67.0,internet banking,internet transfer,2025-03-04_internet banking internet transfer ...
111,2025-03-11,Internet Banking INTERNET TRANSFER 120658481851,2025,3,11,chequing,-83.0,internet banking,internet transfer,2025-03-11_internet banking internet transfer ...


In [27]:
# Boolean mask over expanded_credit_df: True where the description contains
# "PAYMENT THANK YOU". str.contains treats the pattern as a regular expression
# by default, which makes no difference for this plain literal.
credit_payment_bool_series = expanded_credit_df["description"].str.contains("PAYMENT THANK YOU")

In [28]:
# Internet transfers from the debit accounts together with the credit-card
# rows flagged by credit_payment_bool_series.
internal_payment_df = (
    pd.concat([
        internal_payment_from_debit_df,
        expanded_credit_df.loc[credit_payment_bool_series],
    ])
    .drop(columns=["party"])
    .sort_values(by=["uid"])
    .reset_index(drop=True)
)
internal_payment_df

Unnamed: 0,date,description,year,month,day,account,amount,method,type,uid
0,2025-01-07,Internet Banking INTERNET TRANSFER 755558011330,2025,1,7,chequing,-64.0,internet banking,internet transfer,2025-01-07_internet banking internet transfer ...
1,2025-01-07,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,1,7,credit,64.0,,,2025-01-07_payment thank youpaiemen t merci_cr...
2,2025-01-14,Internet Banking INTERNET TRANSFER 996991902804,2025,1,14,chequing,-80.0,internet banking,internet transfer,2025-01-14_internet banking internet transfer ...
3,2025-01-14,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,1,14,credit,80.0,,,2025-01-14_payment thank youpaiemen t merci_cr...
4,2025-01-21,Internet Banking INTERNET TRANSFER 986463128183,2025,1,21,chequing,-51.0,internet banking,internet transfer,2025-01-21_internet banking internet transfer ...
...,...,...,...,...,...,...,...,...,...,...
91,2025-11-18,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,11,18,credit,82.0,,,2025-11-18_payment thank youpaiemen t merci_cr...
92,2025-11-25,Internet Banking INTERNET TRANSFER 321821919755,2025,11,25,chequing,-51.0,internet banking,internet transfer,2025-11-25_internet banking internet transfer ...
93,2025-11-25,PAYMENT THANK YOU/PAIEMEN T MERCI,2025,11,25,credit,51.0,,,2025-11-25_payment thank youpaiemen t merci_cr...
94,2025-12-02,Internet Banking INTERNET TRANSFER 532904682993,2025,12,2,chequing,-93.0,internet banking,internet transfer,2025-12-02_internet banking internet transfer ...


In [29]:
# Debit rows left once the internet-transfer rows are removed.
debit_df = (
    raw_debit_df
    .drop(index=internal_payment_from_debit_df.index)
    .reset_index(drop=True)
)
debit_df

Unnamed: 0,date,description,year,month,day,account,amount,method,type,party,uid
0,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,savings,2000.0,automated banking machine,atm deposit,,2025-01-01_automated banking machine atm depos...
1,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,savings,2000.0,automated banking machine,atm deposit,,2025-01-01_automated banking machine atm depos...
2,2025-01-31,Internet Banking E-TRANSFER 891379219227 LANDLORD,2025,1,31,savings,-1500.0,internet banking,e-transfer,landlord,2025-01-31_internet banking etransfer 89137921...
3,2025-01-31,Internet Banking E-TRANSFER 141960766495 LANDLORD,2025,1,31,savings,-256.0,internet banking,e-transfer,landlord,2025-01-31_internet banking etransfer 14196076...
4,2025-02-28,Internet Banking E-TRANSFER 899912012729 LANDLORD,2025,2,28,savings,-1500.0,internet banking,e-transfer,landlord,2025-02-28_internet banking etransfer 89991201...
...,...,...,...,...,...,...,...,...,...,...,...
382,2025-11-30,Branch Transaction CREDIT MEMO,2025,11,30,chequing,7000.0,branch transaction,credit memo,,2025-11-30_branch transaction credit memo _che...
383,2025-11-30,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,11,30,chequing,-8.0,point of sale - visa debit,visa debit retail purchase,presto,2025-11-30_point of sale visa debit visa debi...
384,2025-12-01,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,12,1,chequing,-4.0,point of sale - visa debit,visa debit retail purchase,presto,2025-12-01_point of sale visa debit visa debi...
385,2025-12-01,Internet Banking E-TRANSFER 917263704293 BLUE ...,2025,12,1,chequing,-50.0,internet banking,e-transfer,blue hockey club,2025-12-01_internet banking etransfer 91726370...


In [30]:
def get_sign(amount):
    """Classify an amount by its sign.

    Returns ``"expense"`` for negative amounts, ``"income"`` for positive
    amounts, and ``"zero-value"`` for anything that is neither.
    """
    if amount < 0:
        return "expense"
    if amount > 0:
        return "income"
    return "zero-value"

In [31]:
# Debit rows plus the credit-card rows that are not payments, each labelled
# with get_sign.
cash_flow_df = (
    pd.concat([
        debit_df,
        expanded_credit_df.loc[~credit_payment_bool_series],
    ])
    .sort_values(by=["uid"])
    .reset_index(drop=True)
    .assign(sign=lambda rows: rows["amount"].apply(get_sign))
)
cash_flow_df

Unnamed: 0,date,description,year,month,day,account,amount,method,type,party,uid,sign
0,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,chequing,2000.0,automated banking machine,atm deposit,,2025-01-01_automated banking machine atm depos...,income
1,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,savings,2000.0,automated banking machine,atm deposit,,2025-01-01_automated banking machine atm depos...,income
2,2025-01-01,Automated Banking Machine ATM DEPOSIT TORONTO,2025,1,1,savings,2000.0,automated banking machine,atm deposit,,2025-01-01_automated banking machine atm depos...,income
3,2025-01-01,Internet Banking E-TRANSFER 349282506970 BLUE ...,2025,1,1,chequing,-50.0,internet banking,e-transfer,blue hockey club,2025-01-01_internet banking etransfer 34928250...,expense
4,2025-01-02,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,1,2,chequing,-3.0,point of sale - visa debit,visa debit retail purchase,presto,2025-01-02_point of sale visa debit visa debi...,expense
...,...,...,...,...,...,...,...,...,...,...,...,...
430,2025-12-01,Point of Sale - Visa Debit VISA DEBIT RETAIL P...,2025,12,1,chequing,-4.0,point of sale - visa debit,visa debit retail purchase,presto,2025-12-01_point of sale visa debit visa debi...,expense
431,2025-12-02,"WALMART TORONTO, ON",2025,12,2,credit,-93.0,,,walmart,2025-12-02_walmart toronto on_credit_-93.0_1,expense
432,2025-12-31,Branch Transaction CREDIT MEMO,2025,12,31,chequing,7000.0,branch transaction,credit memo,,2025-12-31_branch transaction credit memo _che...,income
433,2025-12-31,Internet Banking E-TRANSFER 149174661918 LANDLORD,2025,12,31,savings,-255.0,internet banking,e-transfer,landlord,2025-12-31_internet banking etransfer 14917466...,expense


In [32]:
# Write the three result frames to one workbook, one sheet each, without the
# index column.
with pd.ExcelWriter('output.xlsx') as writer:
    for sheet_name, sheet_df in (
        ("cash_flow", cash_flow_df),
        ("internal_transfer", internal_transfer_df),
        ("internal_payment", internal_payment_df),
    ):
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

## Command Test

In [34]:
# db_df = pd.read_excel("C:/Users/Daniel/Documents/Git/bank-statement-cleaner/outdated.xlsx", sheet_name=None)
# db_df

In [35]:
# internal_payment_df.loc[~internal_payment_df["uid"].isin(db_df["internal_payment"]["uid"])]