# Set up

Import packages, dependencies, other modules

In [1]:
%load_ext autoreload
%autoreload 2

import sys
from importlib import reload
import pandas as pd

sys.path.append(r"E:\enyutan\Document\2025 MJE Advanced Analytics")
import _00_util
import _00_util_sql
reload(_00_util_sql)
from _00_util_sql import Conn_ODBC


Define key column names & values

In [2]:
# Logical column keys -> column names as they appear in the journal-entry tables.
config_col_names = dict(
    col_DrCr="Dr or Cr",
    col_Acc_Code="No_GLAcc",
    col_Acc_Name="Name_GLAcc",
)

# Debit / credit markers for the "col_DrCr" column, keyed by label.
config_col_values = dict(
    col_DrCr=dict(Dr="Dr", Cr="Cr"),
)

# Group-by key columns at JE-line and JE-document granularity.
config_group_by = dict(
    je_line=[
        "CompanyName",
        "PostingDate_GLEntry",
        "DocumentNo_GLEntry",
        "EntryNo_GLEntry",
    ],
    je_doc=[
        "CompanyName",
        "PostingDate_GLEntry",
        "DocumentNo_GLEntry",
    ],
)

Set up pyodbc connection config

In [3]:
# Shared DB helper (project class Conn_ODBC) pointed at the "test_enyu" database;
# the cells below use it to open connections, run queries and write tables.
sql_db=Conn_ODBC(database="test_enyu")

# Clustering

## 1) Manually using accounts

Import & clean base data - detailed journal entries

In [4]:
##### Import data
# Read the staged journal-entry lines. The connection is closed in `finally`
# so a failing query does not leave it open in the kernel.
conn = sql_db.odbc_conn_db_pyodbc()
try:
    sql_query = "SELECT * FROM [stg_JE_base_info]"
    df_je_base = sql_db.odbc_run_sql(conn, sql_query, return_result=True)
finally:
    conn.close()

##### Clean data
# Normalise account names to lower case (re-running this cell is harmless).
df_je_base[config_col_names["col_Acc_Name"]] = (
    df_je_base[config_col_names["col_Acc_Name"]].str.lower()
)

df_je_base.head(3)

Unnamed: 0,CompanyName,PostingDate_GLEntry,DocumentNo_GLEntry,EntryNo_GLEntry,No_GLAcc,Name_GLAcc,Description_GLEntry,Dr or Cr,DebitAmount_GLEntry,CreditAmount_GLEntry,NO_OF_JE_LINES
0,Hailong2,2023-05-25,GJ23050379,12403,20000000,ap - trade,WO13_Site Inspections in March 2023_425055-24A,Cr,0.0,78183.0,5
1,Hailong2,2023-05-25,GJ23050380,12413,20000000,ap - trade,WO15_Task 1A - Review of Employer Requirement ...,Cr,0.0,93858.0,5
2,Hailong2,2023-05-25,GJ23050381,12418,20000000,ap - trade,Consultant fee-Port Engineer (March 2023),Cr,0.0,163124.0,5


Step 1 - For each JE line, add a 0/1 indicator for every (Dr/Cr, account) combination, plus the amount posted to that side of that account

In [5]:
##### Function to add columns based on account impact (DB/CR which account)
def add_cols_indicator_account_impact(df, return_df=True):
    """Add per-account Dr/Cr indicator and amount columns to a JE-line frame.

    An account label ``"<code> (<name>)"`` is written to ``df["col_Acc"]``
    (this happens in place, whatever ``return_df`` is). For every account
    label and every side configured in ``config_col_values["col_DrCr"]`` two
    columns are derived:

    * ``"<side> <account>"``        - 1 if the line is on that side of that
      account, else 0.
    * ``"Amt - <side> <account>"``  - ``DebitAmount_GLEntry +
      CreditAmount_GLEntry`` for matching lines, else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        JE lines. Must contain the account code / account name / Dr-Cr
        columns named in ``config_col_names`` and, when ``return_df`` is
        true, ``DebitAmount_GLEntry`` and ``CreditAmount_GLEntry``.
    return_df : bool, default True
        If true the derived columns are added to ``df`` in place and ``df``
        is returned. If false only the column-name lists are computed and
        ``None`` is returned in place of the frame.

    Returns
    -------
    tuple
        ``(df_or_None, list_cols_added_DrCr, list_cols_added_Amt)``. The name
        lists are ordered side-major (all of the first side, then the next),
        accounts sorted within each side.
    """
    col_DrCr = config_col_names["col_DrCr"]
    # label -> value stored in the data; label is used in the column name.
    dict_DrCr = config_col_values["col_DrCr"]

    ### Get list of accounts
    df["col_Acc"] = df[config_col_names["col_Acc_Code"]] + " (" + df[config_col_names["col_Acc_Name"]] + ")"
    # Rows with a missing code or name produce a NaN label; drop those so the
    # sort does not fail comparing float NaN with str.
    list_acc = sorted(df["col_Acc"].dropna().unique())

    if return_df:
        # Vectorised masks replace one row-wise df.apply per derived column.
        side_masks = {label: df[col_DrCr] == value for label, value in dict_DrCr.items()}
        acc_masks = {acc: df["col_Acc"] == acc for acc in list_acc}
        amount = df["DebitAmount_GLEntry"] + df["CreditAmount_GLEntry"]

        # Indicator columns first, then amount columns (same column order as before).
        for acc in list_acc:
            for label in dict_DrCr:
                df[f"{label} {acc}"] = (acc_masks[acc] & side_masks[label]).astype("int64")

        for acc in list_acc:
            for label in dict_DrCr:
                df[f"Amt - {label} {acc}"] = amount.where(acc_masks[acc] & side_masks[label], 0)
    else:
        df = None

    # Names are derived from the same config as the columns above so the two
    # cannot drift apart.
    list_cols_added_DrCr = [f"{label} {acc}" for label in dict_DrCr for acc in list_acc]
    list_cols_added_Amt = ["Amt - " + col for col in list_cols_added_DrCr]

    return df, list_cols_added_DrCr, list_cols_added_Amt

In [6]:
##### First run takes very long - 7min to run, 6min to output excel file (60min to import to ssms)
##### Otherwise, reading from file takes 7 mins (table takes only 20 seconds but import takes too long...)

# first_run=True  -> derive the feature columns, then persist them to the DB table and to Excel.
# first_run=False -> derive only the column-name lists and reload the persisted table.
first_run = False
table_name = "cluster_01_01_JE_entry_details"
file_path = r"E:\enyutan\Document\2025 MJE Advanced Analytics\Cluster01-ManualByAccount\01 - JE Entry Details.xlsx"

if first_run:
    ##### Add columns for features
    df_je_acc_impact, list_cols_added_DrCr, list_cols_added_Amt = add_cols_indicator_account_impact(df_je_base, return_df=True)
    ##### Import to db --- 60 mins
    sql_db.fn_create_new_table_from_df(table_name=table_name, df=df_je_acc_impact, auto_data_type=True)
    res = sql_db.fn_append_df_to_table(table_name=table_name, df=df_je_acc_impact)
    print("SUCCESS: ", res)
    ##### Save excel file --- 6 mins
    df_je_acc_impact.to_excel(file_path, index=False)

else:
    _, list_cols_added_DrCr, list_cols_added_Amt = add_cols_indicator_account_impact(df_je_base, return_df=False)
    ##### Read from db --- 20 secs
    # Close the connection even if the query raises.
    conn = sql_db.odbc_conn_db_pyodbc()
    try:
        sql_query = f"SELECT * FROM [{table_name}]"
        df_je_acc_impact = sql_db.odbc_run_sql(conn, sql_query, return_result=True)
    finally:
        conn.close()
    ##### OR Read from file --- 7 mins
    # df_je_acc_impact=pd.read_excel(file_path)

list_cols_added = list_cols_added_DrCr + list_cols_added_Amt

# The name lists come from df_je_base while the frame may come from a table
# persisted earlier; fail here with a clear message if the two have diverged
# rather than with a KeyError in the aggregation step.
existing_cols = set(df_je_acc_impact.columns)
missing_cols = [col for col in list_cols_added if col not in existing_cols]
if missing_cols:
    raise KeyError(
        f"{len(missing_cols)} derived column(s) missing from df_je_acc_impact "
        f"(first few: {missing_cols[:5]}); re-run with first_run=True to rebuild [{table_name}]"
    )

Step 2 - Derive documents & clusters. Each cluster is based on the exact set of Dr & Cr accounts

In [7]:
##### Group by document
# Collapse JE lines to one row per document:
#   indicator columns -> "max" (1 if any line of the document has that side/account)
#   amount columns    -> "sum" (total posted to that side/account)
agg_cols_DrCr = dict.fromkeys(list_cols_added_DrCr, "max")
agg_cols_Amt = dict.fromkeys(list_cols_added_Amt, "sum")

agg_cols = dict(agg_cols_DrCr)
agg_cols.update(agg_cols_Amt)

df_je_doc = (
    df_je_acc_impact
    .groupby(by=config_group_by["je_doc"], as_index=False)
    .agg(agg_cols)
)
df_je_doc.shape

(16973, 459)

In [11]:
table_name = "cluster_01_02_cluster_profile"
save_results = True
file_path = rf"E:\enyutan\Document\2025 MJE Advanced Analytics\Cluster01-ManualByAccount\02 - Cluster Profile.xlsx"

##### Create clusters --- 2 seconds
# One cluster per distinct combination of Dr/Cr indicator values.
# Cluster IDs are 1-based positions in the value_counts result.
df_cluster_manual_acc_impact = (
    df_je_doc
    .value_counts(list_cols_added_DrCr)
    .reset_index(name="No. of JE Docs")
    .assign(**{"Cluster ID": lambda frame: frame.index + 1})
    [["Cluster ID", "No. of JE Docs"] + list_cols_added_DrCr]
)

##### 12 seconds
if save_results:
    ##### Insert cluster info to db
    sql_db.fn_create_new_table_from_df(
        table_name=table_name,
        df=df_cluster_manual_acc_impact,
        auto_data_type=True,
    )
    res = sql_db.fn_append_df_to_table(
        table_name=table_name,
        df=df_cluster_manual_acc_impact,
    )
    ##### Save as excel
    df_cluster_manual_acc_impact.to_excel(file_path, index=False)

df_cluster_manual_acc_impact.shape

(711, 230)

Step 3 - Tag each JE doc with a cluster ID, based on the exact set of accounts it debits and credits.

In [13]:
table_name = "cluster_01_03_JE_doc_clustered"
save_results = False
file_path = r"E:\enyutan\Document\2025 MJE Advanced Analytics\Cluster01-ManualByAccount\03 - JE Docs Clustered.xlsx"

##### Left join cluster for each JE doc --- 1.2 seconds
# Each indicator combination appears once in the cluster profile, so the join
# is many docs -> one cluster; `validate` makes pandas raise if that ever stops
# being true instead of silently duplicating JE docs.
df_je_doc_clustered = df_je_doc.merge(
    df_cluster_manual_acc_impact,
    on=list_cols_added_DrCr,
    how="left",
    validate="many_to_one",
)
df_je_doc_clustered = df_je_doc_clustered[config_group_by["je_doc"] + ["Cluster ID", "No. of JE Docs"] + list_cols_added]

if save_results:
    ##### Insert clustered JE docs to db --- 15 mins
    sql_db.fn_create_new_table_from_df(table_name=table_name, df=df_je_doc_clustered, auto_data_type=True)
    res = sql_db.fn_append_df_to_table(table_name=table_name, df=df_je_doc_clustered)
    ##### Save as excel --- 4 mins
    # dt_now=_00_util.get_current_datetime()
    # Export the clustered JE docs (previously the cluster profile frame was
    # written to this file by mistake).
    df_je_doc_clustered.to_excel(file_path, index=False)

df_je_doc_clustered

Unnamed: 0,CompanyName,PostingDate_GLEntry,DocumentNo_GLEntry,Cluster ID,No. of JE Docs,Dr 10000000 (cash on hands - twd),Dr 10000010 (cash on hands - usd),Dr 10000020 (cash on hands - eur),Dr 10000045 (cash on hands - jpy),Dr 10000050 (time deposit - twd),...,Amt - Cr 60100000 (professional fees),Amt - Cr 60250000 (bank charges),Amt - Cr 60325000 (interest income),Amt - Cr 60900100 (cost sharing - hl3),Amt - Cr 60900200 (cost capatalization),Amt - Cr 60900200 (cost capitalization),Amt - Cr 70010000 (foreign exchange gain/loss),Amt - Cr 70010100 (unrealized exchange gain/loss),Amt - Cr 70100000 (gain/loss on sale of assets),Amt - Cr 80100000 (income taxes - deferred)
0,Hailong2,2022-07-01,GJ22070004,104,11,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Hailong2,2022-07-01,GJ22070007,2,2029,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Hailong2,2022-07-01,GJ22070009,3,1994,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Hailong2,2022-07-01,GJ22070010,3,1994,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Hailong2,2022-07-20,GJ22070008,2,2029,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16968,Hailong3,2024-05-31,GJ24050528,98,13,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16969,Hailong3,2024-05-31,GJ24050529,275,3,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16970,Hailong3,2024-05-31,GJ24050530,264,3,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16971,Hailong3,2024-05-31,GJ24050531,365,2,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2) Using algorithms