# Intro<br>
This notebook produces detailed information about clusters of journal entries. Each cluster represents one distinct combination of interactions (debit, credit or zero) that a journal entry has with the accounts it involves.

Information:<ul>
<li><b>Cluster_ID:</b> Unique number given to each cluster</li>
<li><b>Cluster:</b> List of interactions with different accounts which make up the cluster name and uniqueness.<br> Format is interaction + account id (account number + account name)
    <ul>
        <li>Cr: Amount credited to account</li>
        <li>Dr: Amount debited to account</li>
        <li>Zero: 0 dollars were credited/debited to the account</li>
    </ul>
</li>
<li><b>UniqueInteractions:</b> Count of the unique interactions + account IDs found in cluster (length of cluster)</li>
<li><b>AccountCount:</b> Number of unique accounts in the cluster, i.e. the accounts that the journal documents inside the cluster interact with. An account that appears more than once (e.g. it is both credited and debited) is counted only once, so this value can be smaller than UniqueInteractions.</li>
<li><b>DocumentCount:</b> Number of unique journal documents inside each cluster</li>
</ul><br>
<b>Note: Please change the save location in the "Save" cell before running it.</b>

# Import data and libraries

In [1]:
%reload_ext autoreload
%autoreload 2

import sys
from importlib import reload
import pandas as pd
import numpy as np

import _00_util_sql
reload(_00_util_sql)
from _00_util_sql import Conn_ODBC

In [2]:
sql_db = Conn_ODBC(database="JE_ML_2025")

##### Import data
# Read the whole staging table into a DataFrame. The connection is closed in a
# `finally` block so it is released even when the query itself raises.
conn = sql_db.odbc_conn_db_pyodbc()
sql_query = "SELECT * FROM [data_Hailong_staging_JE_updated_All]"
try:
    data_22 = sql_db.odbc_run_sql(conn, sql_query, return_result=True)
finally:
    conn.close()

data_22.info()
# 1 mins (original author's note; presumably the observed runtime of this cell)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51977 entries, 0 to 51976
Data columns (total 51 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   CompanyName                             51977 non-null  object 
 1   ExcludeBalanceOnly                      51977 non-null  object 
 2   PrintReversedEntries                    51977 non-null  object 
 3   PageGroupNo                             51977 non-null  int64  
 4   PrintOnlyOnePerPage                     51977 non-null  object 
 5   PrintClosingEntries                     51977 non-null  object 
 6   PrintOnlyCorrections                    51977 non-null  object 
 7   EmptyString                             0 non-null      object 
 8   No_GLAcc                                51977 non-null  object 
 9   DetailTrialBalCaption                   51977 non-null  object 
 10  PageCaption                             51977 non-null  ob

# Clustering

In [3]:
# Old
# Cluster journal documents (JE_Doc_ID) according to interaction and unique account. 
# JE_Doc_IDs can be repeated across clusters
def cluster_journal_entries(df):
    """
    Count the distinct journal documents that touch each Dr/Cr/Zero + GL account label.

    Every row is labelled ``"<Dr|Cr|Zero> <No_GLAcc> <Name_GLAcc>"``: ``Dr`` when
    ``DebitAmount_GLEntry`` is non-zero, otherwise ``Cr`` when
    ``CreditAmount_GLEntry`` is non-zero, otherwise ``Zero``. A journal document
    is identified by ``"<CompanyName>|<DocumentNo_GLEntry>"``; a document that
    touches several labels is counted once under each of them.

    Parameters:
        df (pd.DataFrame): Input data containing the columns
            ``DebitAmount_GLEntry``, ``CreditAmount_GLEntry``, ``No_GLAcc``,
            ``Name_GLAcc``, ``CompanyName`` and ``DocumentNo_GLEntry``.
            The frame is only read; no columns are added to it.

    Returns:
        pd.DataFrame: One row per label with the columns ``Cluster`` and
        ``JE_Count`` (number of distinct documents carrying that label).
    """

    # Step 1: Determine Dr / Cr / Zero for all rows at once.
    # Debit takes precedence when both amounts are non-zero.
    debit_nonzero = df['DebitAmount_GLEntry'] != 0
    credit_nonzero = df['CreditAmount_GLEntry'] != 0
    direction = np.where(debit_nonzero, 'Dr', np.where(credit_nonzero, 'Cr', 'Zero'))

    labels = [
        f"{side} {acc_no} {acc_name}"
        for side, acc_no, acc_name in zip(direction, df['No_GLAcc'], df['Name_GLAcc'])
    ]

    # Step 2: Create a unique identifier for each JE document.
    # Both parts are cast to str so non-string company names do not break the concat.
    doc_ids = (
        df['CompanyName'].astype(str) + '|' + df['DocumentNo_GLEntry'].astype(str)
    ).to_numpy()

    # Work on a separate two-column frame so the caller's DataFrame is left untouched.
    pairs = pd.DataFrame({'Cluster': labels, 'JE_Doc_ID': doc_ids})

    # Step 3: Group by Cluster and count unique JE documents
    result = (
        pairs
        .drop_duplicates()
        .groupby('Cluster')
        .size()
        .reset_index(name='JE_Count')
    )

    return result

# Run the per-label clustering on the full extract and display the
# resulting table (one row per Dr/Cr/Zero + account label).
cluster_result = cluster_journal_entries(data_22)
cluster_result

Unnamed: 0,Cluster,JE_Count
0,Cr 10000000 Cash on Hands - TWD,3989
1,Cr 10000010 Cash on Hands - USD,722
2,Cr 10000020 Cash on Hands - EUR,816
3,Cr 10000045 Cash on Hands - JPY,17
4,Cr 10000050 Time Deposit - TWD,2
...,...,...
236,Zero 60900200 Cost Capatalization,2
237,Zero 70010000 Foreign Exchange Gain/Loss,5
238,Zero 70010100 Unrealized Exchange Gain/Loss,4
239,Zero 70100000 Gain/Loss on Sale of Assets,1


In [4]:
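# Old: first version of cluster_journal_entries_grouped_by_je. Later cells define a
# function with the same name, and whichever definition ran last is the one in effect.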
def cluster_journal_entries_grouped_by_je(df):
    """
    Build, for every journal document, the sorted tuple of Dr/Cr/Zero + GL
    account labels that the document touches.

    Every row is labelled ``"<Dr|Cr|Zero> <No_GLAcc> <Name_GLAcc>"``: ``Dr`` when
    ``DebitAmount_GLEntry`` is non-zero, otherwise ``Cr`` when
    ``CreditAmount_GLEntry`` is non-zero, otherwise ``Zero``. A journal document
    is identified by ``"<CompanyName>|<DocumentNo_GLEntry>"``.

    Note that this version does not count documents; it returns one row per
    document.

    Parameters:
        df (pd.DataFrame): Input data containing the columns
            ``DebitAmount_GLEntry``, ``CreditAmount_GLEntry``, ``No_GLAcc``,
            ``Name_GLAcc``, ``CompanyName`` and ``DocumentNo_GLEntry``.
            The frame is only read; no columns are added to it.

    Returns:
        pd.DataFrame: One row per document with the columns ``JE_Doc_ID`` and
        ``Cluster`` (tuple of the document's distinct labels, sorted).
    """

    # Step 1: Determine Dr / Cr / Zero for all rows at once.
    # Debit takes precedence when both amounts are non-zero.
    debit_nonzero = df['DebitAmount_GLEntry'] != 0
    credit_nonzero = df['CreditAmount_GLEntry'] != 0
    direction = np.where(debit_nonzero, 'Dr', np.where(credit_nonzero, 'Cr', 'Zero'))

    labels = [
        f"{side} {acc_no} {acc_name}"
        for side, acc_no, acc_name in zip(direction, df['No_GLAcc'], df['Name_GLAcc'])
    ]

    # Step 2: Create JE document ID
    doc_ids = (
        df['CompanyName'].astype(str) + '|' + df['DocumentNo_GLEntry'].astype(str)
    ).to_numpy()

    # Work on a separate two-column frame so the caller's DataFrame is left untouched.
    pairs = pd.DataFrame({'JE_Doc_ID': doc_ids, 'Cluster': labels})

    # Step 3: For each JE document, collect all unique labels it carries.
    # Sorting makes the tuple independent of the row order inside the document.
    je_clusters = (
        pairs
        .drop_duplicates()
        .groupby('JE_Doc_ID')['Cluster']
        .apply(lambda x: tuple(sorted(x)))
        .reset_index()
    )

    return je_clusters

# Build the per-document label tuples for the full extract and display them.
clus = cluster_journal_entries_grouped_by_je(data_22)
clus

Unnamed: 0,JE_Doc_ID,Cluster
0,Hailong2|EX20221025,"(Cr 70010000 Foreign Exchange Gain/Loss, Dr 10..."
1,Hailong2|FX20221101,"(Cr 10000020 Cash on Hands - EUR, Cr 70010000 ..."
2,Hailong2|FX20221104,"(Cr 10000010 Cash on Hands - USD, Cr 70010000 ..."
3,Hailong2|FX20221201,"(Cr 70010000 Foreign Exchange Gain/Loss, Dr 10..."
4,Hailong2|FX20221202,"(Cr 10000010 Cash on Hands - USD, Cr 70010000 ..."
...,...,...
16951,Hailong3|GJ24050528,"(Cr 16100000 Construction in Progress, Dr 2002..."
16952,Hailong3|GJ24050529,"(Cr 14020000 Derivative Asset - Current, Cr 30..."
16953,Hailong3|GJ24050530,"(Cr 20800000 Deferred Tax Liabilities, Cr 3040..."
16954,Hailong3|GJ24050531,"(Cr 2060EDBS Bank Loan - EBL DBS, Cr 2060EMUFG..."


In [5]:
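# Old: second version of cluster_journal_entries_grouped_by_je. A later cell defines a
# function with the same name, and whichever definition ran last is the one in effect.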
def cluster_journal_entries_grouped_by_je(df):
    """
    Count how many journal documents share each unique combination of
    Dr/Cr/Zero + GL account labels.

    Every row is labelled ``"<Dr|Cr|Zero> <No_GLAcc> <Name_GLAcc>"``: ``Dr`` when
    ``DebitAmount_GLEntry`` is non-zero, otherwise ``Cr`` when
    ``CreditAmount_GLEntry`` is non-zero, otherwise ``Zero``. A journal document
    is identified by ``"<CompanyName>|<DocumentNo_GLEntry>"``. The sorted set of
    labels of a document is its cluster.

    Parameters:
        df (pd.DataFrame): Input data containing the columns
            ``DebitAmount_GLEntry``, ``CreditAmount_GLEntry``, ``No_GLAcc``,
            ``Name_GLAcc``, ``CompanyName`` and ``DocumentNo_GLEntry``.
            The frame is only read; no columns are added to it.

    Returns:
        pd.DataFrame: One row per unique label combination with the columns
        ``Cluster`` (list of labels) and ``DocumentCount``, sorted by
        ``DocumentCount`` in descending order.
    """

    # Direction for all rows at once; debit takes precedence when both
    # amounts are non-zero.
    debit_nonzero = df['DebitAmount_GLEntry'] != 0
    credit_nonzero = df['CreditAmount_GLEntry'] != 0
    direction = np.where(debit_nonzero, 'Dr', np.where(credit_nonzero, 'Cr', 'Zero'))

    labels = [
        f"{side} {acc_no} {acc_name}"
        for side, acc_no, acc_name in zip(direction, df['No_GLAcc'], df['Name_GLAcc'])
    ]

    doc_ids = (
        df['CompanyName'].astype(str) + '|' + df['DocumentNo_GLEntry'].astype(str)
    ).to_numpy()

    # Work on a separate two-column frame so the caller's DataFrame is left untouched.
    pairs = pd.DataFrame({'JE_Doc_ID': doc_ids, 'Cluster': labels})

    # One sorted tuple of labels per document. A tuple is hashable (needed for
    # the groupby below) and sorting makes equal combinations compare equal.
    je_clusters = (
        pairs
        .drop_duplicates()
        .groupby('JE_Doc_ID')['Cluster']
        .apply(lambda x: tuple(sorted(x)))
        .reset_index()
    )

    grouped_counts = (
        je_clusters
        .groupby('Cluster')
        .size()
        .reset_index(name='DocumentCount')
        .sort_values(by='DocumentCount', ascending=False)
    )

    # Convert Cluster back to list for readability
    grouped_counts['Cluster'] = grouped_counts['Cluster'].apply(list)

    return grouped_counts

# Count documents per unique label combination for the full extract and display the table.
result = cluster_journal_entries_grouped_by_je(data_22)
result

Unnamed: 0,Cluster,DocumentCount
32,"[Cr 10000000 Cash on Hands - TWD, Dr 20000000 ...",2034
386,"[Cr 20000000 AP - Trade, Dr 11080000 Input VAT...",2028
398,"[Cr 20000000 AP - Trade, Dr 16100000 Construct...",1993
37,"[Cr 10000000 Cash on Hands - TWD, Dr 20010010 ...",1187
609,"[Cr 20080000 Withholding Tax Payable, Dr 20000...",1074
...,...,...
725,"[Cr 70010000 Foreign Exchange Gain/Loss, Cr 70...",1
726,"[Cr 70010000 Foreign Exchange Gain/Loss, Cr 70...",1
728,"[Cr 70010000 Foreign Exchange Gain/Loss, Cr 70...",1
730,"[Cr 70010000 Foreign Exchange Gain/Loss, Dr 10...",1


In [6]:
import pandas as pd
import re

def normalize_name(name):
    """
    Return a canonical form of an account name for comparison.

    The value is converted to ``str`` and lower-cased, every character other
    than ``a-z``, ``0-9`` and whitespace is dropped, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is removed.
    """
    lowered = str(name).lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()

# Count number of UNIQUE accounts in each cluster
# Remove direction prefix (Dr/Cr/Zero) to get unique account ID
def count_unique_accounts(cluster_list):
    """
    Return how many distinct accounts appear in a list of cluster labels.

    Each label has the form ``"<direction> <account id ...>"``. The first
    whitespace-separated word (Dr/Cr/Zero) is dropped and the remaining words,
    re-joined with single spaces, identify the account, so the same account
    with two different directions is counted once.
    """
    account_ids = {' '.join(label.split()[1:]) for label in cluster_list}
    return len(account_ids)

def cluster_journal_entries_grouped_by_je(df):
    """
    Count how many journal documents share each unique combination of
    Dr/Cr/Zero + GL account labels, and describe each combination.

    Every row is labelled ``"<Dr|Cr|Zero> <No_GLAcc> <normalized Name_GLAcc>"``:
    ``Dr`` when ``DebitAmount_GLEntry`` is non-zero, otherwise ``Cr`` when
    ``CreditAmount_GLEntry`` is non-zero, otherwise ``Zero``. The account name
    is passed through ``normalize_name`` before it goes into the label. A
    journal document is identified by
    ``"<CompanyName>|<DocumentNo_GLEntry>|<PostingDate_GLEntry>"``. The sorted
    set of labels of a document is its cluster.

    Parameters:
        df (pd.DataFrame): Input data containing the columns
            ``DebitAmount_GLEntry``, ``CreditAmount_GLEntry``, ``No_GLAcc``,
            ``Name_GLAcc``, ``CompanyName``, ``DocumentNo_GLEntry`` and
            ``PostingDate_GLEntry``. The frame is only read; no columns are
            added to it.

    Returns:
        pd.DataFrame: One row per unique label combination, sorted by
        ``DocumentCount`` in descending order, with the columns
        ``Cluster_ID`` (``"1"``, ``"2"``, ... as strings, in that sorted order),
        ``Cluster`` (list of labels),
        ``UniqueInteractions`` (number of labels in the cluster),
        ``AccountCount`` (result of ``count_unique_accounts`` on the cluster)
        and ``DocumentCount`` (number of documents with exactly this cluster).
    """

    # Direction for all rows at once; debit takes precedence when both
    # amounts are non-zero.
    debit_nonzero = df['DebitAmount_GLEntry'] != 0
    credit_nonzero = df['CreditAmount_GLEntry'] != 0
    direction = np.where(debit_nonzero, 'Dr', np.where(credit_nonzero, 'Cr', 'Zero'))

    # Use normalized name in the cluster key
    labels = [
        f"{side} {acc_no} {normalize_name(acc_name)}"
        for side, acc_no, acc_name in zip(direction, df['No_GLAcc'], df['Name_GLAcc'])
    ]

    doc_ids = (
        df['CompanyName'].astype(str)
        + '|' + df['DocumentNo_GLEntry'].astype(str)
        + '|' + df['PostingDate_GLEntry'].astype(str)
    ).to_numpy()

    # Work on a separate two-column frame so the caller's DataFrame is left untouched.
    pairs = pd.DataFrame({'JE_Doc_ID': doc_ids, 'Cluster': labels})

    # One sorted tuple of labels per document. A tuple is hashable (needed for
    # the groupby below) and sorting makes equal combinations compare equal.
    je_clusters = (
        pairs
        .drop_duplicates()
        .groupby('JE_Doc_ID')['Cluster']
        .apply(lambda x: tuple(sorted(x)))
        .reset_index()
    )

    grouped_counts = (
        je_clusters
        .groupby('Cluster')
        .size()
        .reset_index(name='DocumentCount')
        .sort_values(by='DocumentCount', ascending=False)
    )

    # Convert Cluster back to list for readability
    grouped_counts['Cluster'] = grouped_counts['Cluster'].apply(list)

    # Assign a unique cluster ID; IDs follow the descending DocumentCount order
    grouped_counts['Cluster_ID'] = [str(i) for i in range(1, len(grouped_counts) + 1)]

    # Count number of unique interactions in each cluster
    grouped_counts['UniqueInteractions'] = grouped_counts['Cluster'].apply(len)

    # Count accounts ignoring the direction prefix
    grouped_counts['AccountCount'] = grouped_counts['Cluster'].apply(count_unique_accounts)

    # Move Cluster_ID to the leftmost position
    grouped_counts = grouped_counts[['Cluster_ID', 'Cluster', 'UniqueInteractions', 'AccountCount', 'DocumentCount']]

    return grouped_counts
 
# Build the final cluster summary for the full extract and display it.
# `result` is the frame that the "Save" cell writes to Excel.
result = cluster_journal_entries_grouped_by_je(data_22)
result

Unnamed: 0,Cluster_ID,Cluster,UniqueInteractions,AccountCount,DocumentCount
30,1,"[Cr 10000000 cash on hands twd, Dr 20000000 ap...",2,2,2034
367,2,"[Cr 20000000 ap trade, Dr 11080000 input vat, ...",3,3,2029
379,3,"[Cr 20000000 ap trade, Dr 16100000 constructio...",2,2,1994
35,4,"[Cr 10000000 cash on hands twd, Dr 20010010 ap...",2,2,1187
590,5,"[Cr 20080000 withholding tax payable, Dr 20000...",2,2,1074
...,...,...,...,...,...
44,725,"[Cr 10000000 cash on hands twd, Dr 20400000 ot...",3,3,1
46,726,"[Cr 10000000 cash on hands twd, Dr 60325000 in...",2,2,1
697,727,"[Cr 60900200 cost capitalization, Dr 16100000 ...",2,2,1
698,728,"[Cr 60900200 cost capitalization, Dr 16100000 ...",3,3,1


## Save

In [None]:
# Change where you want to save the dataframe: edit this path before running.
MANUAL_CLUSTER_XLSX = r"D:\Shared\Internal Dev - JE ML\Hailong\ManualCluster.xlsx"

# Write the cluster summary without the DataFrame index column.
result.to_excel(MANUAL_CLUSTER_XLSX, index=False)

# Clustering using Ishi code <br>
This section repeats the clustering with the older Ishi code. It is kept for reference only and can be ignored.

In [10]:
# Direction flag per GL line. The conditions are checked in order, so a line
# with a non-zero debit is 'Dr' even if its credit is non-zero as well; a line
# with neither amount set is 'Zero'.
data_22['Dr_Cr'] = np.select(
    [data_22['DebitAmount_GLEntry'] != 0, data_22['CreditAmount_GLEntry'] != 0],
    ['Dr', 'Cr'],
    default='Zero',
)

# Account label: account number followed by the account name.
data_22['Account'] = data_22['No_GLAcc'].astype(str) + ' ' + data_22['Name_GLAcc']

In [9]:
# Sanity check: number of GL lines per direction flag (Dr / Cr / Zero).
data_22['Dr_Cr'].value_counts()

Dr      28686
Cr      22577
Zero      714
Name: Dr_Cr, dtype: int64

In [16]:
# Number of distinct account labels (account number + account name).
len(data_22['Account'].unique().tolist())

116

In [20]:
# Single amount per GL line, computed column-wise instead of with a row-wise
# Python lambda:
#   - debit amount when only the debit is non-zero,
#   - credit amount when only the credit is non-zero,
#   - 0 otherwise (both zero, or both non-zero).
debit_amt = data_22['DebitAmount_GLEntry']
credit_amt = data_22['CreditAmount_GLEntry']
data_22['Amt'] = np.where(
    (debit_amt != 0) & (credit_amt == 0),
    debit_amt,
    np.where((credit_amt != 0) & (debit_amt == 0), credit_amt, 0),
)

data_22.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51977 entries, 0 to 51976
Data columns (total 54 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   CompanyName                             51977 non-null  object 
 1   ExcludeBalanceOnly                      51977 non-null  object 
 2   PrintReversedEntries                    51977 non-null  object 
 3   PageGroupNo                             51977 non-null  object 
 4   PrintOnlyOnePerPage                     51977 non-null  object 
 5   PrintClosingEntries                     51977 non-null  object 
 6   PrintOnlyCorrections                    51977 non-null  object 
 7   EmptyString                             0 non-null      object 
 8   No_GLAcc                                51977 non-null  object 
 9   DetailTrialBalCaption                   51977 non-null  object 
 10  PageCaption                             51977 non-null  ob

In [33]:
# A journal document is identified by company + document number.
identifiers = ['CompanyName', 'DocumentNo_GLEntry']

# Create pivot table with multi-index columns: one row per document, one
# column per (direction, account) pair, holding the summed amount.
pivot = data_22.pivot_table(
    index=identifiers,
    columns=['Dr_Cr', 'Account'],
    values='Amt',
    aggfunc='sum'
)

# Flatten the column MultiIndex into single strings like "Dr_<account>"
pivot.columns = [f"{direction}_{account}" for direction, account in pivot.columns]

# Turn the amounts into 0/1 indicators: 1 when the summed amount is non-zero,
# 0 when it is zero or the document has no such (direction, account) pair.
# Done column-wise; `applymap` is avoided because it is elementwise and
# deprecated in newer pandas releases.
# NOTE(review): lines flagged 'Zero' appear to always carry Amt == 0, so the
# Zero_* columns would all be 0 here and never show up in a signature.
# Confirm whether that is intended.
pivot_binary = pivot.fillna(0).ne(0).astype('int64')
pivot_binary

Unnamed: 0_level_0,Unnamed: 1_level_0,Cr_10000000 Cash on Hands - TWD,Cr_10000010 Cash on Hands - USD,Cr_10000020 Cash on Hands - EUR,Cr_10000045 Cash on Hands - JPY,Cr_10000050 Time Deposit - TWD,Cr_10000051 Time Deposit - USD,Cr_10000052 Time Deposit - EUR,Cr_10010001 Restricted Cash - Time Deposit(NTD),Cr_10010001 Restricted Cash - Time Deposit(TWD),Cr_10010010 Restricted Cash - Time Deposit(EUR),...,Zero_60040900 Other Operation Cost,Zero_60100000 Professional Fees,Zero_60250000 Bank Charges,Zero_60325000 Interest Income,Zero_60900100 Cost Sharing - HL3,Zero_60900200 Cost Capatalization,Zero_70010000 Foreign Exchange Gain/Loss,Zero_70010100 Unrealized Exchange Gain/Loss,Zero_70100000 Gain/Loss on Sale of Assets,Zero_80100000 Income Taxes - Deferred
CompanyName,DocumentNo_GLEntry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Hailong2,EX20221025,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hailong2,FX20221101,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hailong2,FX20221104,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hailong2,FX20221201,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hailong2,FX20221202,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hailong3,GJ24050528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hailong3,GJ24050529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hailong3,GJ24050530,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hailong3,GJ24050531,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Keep the list of indicator columns. This must run before the later cell adds
# Cluster_Signature and Cluster_ID to pivot_binary, otherwise those two
# columns would be included in `features` as well.
features = pivot_binary.columns
print(len(features))

241


In [39]:
# Signature of a document: the names of all indicator columns that are
# non-zero for it, joined with ', ' in column order.
pivot_binary['Cluster_Signature'] = pivot_binary[features].apply(
    lambda flags: ', '.join(flags.index[flags != 0]), axis=1
)

# Documents with the same signature form one cluster.
clusters = pivot_binary.groupby('Cluster_Signature')

# Number the clusters; every document gets the number of its signature group.
pivot_binary['Cluster_ID'] = clusters.ngroup()
pivot_binary

Unnamed: 0_level_0,Unnamed: 1_level_0,Cr_10000000 Cash on Hands - TWD,Cr_10000010 Cash on Hands - USD,Cr_10000020 Cash on Hands - EUR,Cr_10000045 Cash on Hands - JPY,Cr_10000050 Time Deposit - TWD,Cr_10000051 Time Deposit - USD,Cr_10000052 Time Deposit - EUR,Cr_10010001 Restricted Cash - Time Deposit(NTD),Cr_10010001 Restricted Cash - Time Deposit(TWD),Cr_10010010 Restricted Cash - Time Deposit(EUR),...,Zero_60250000 Bank Charges,Zero_60325000 Interest Income,Zero_60900100 Cost Sharing - HL3,Zero_60900200 Cost Capatalization,Zero_70010000 Foreign Exchange Gain/Loss,Zero_70010100 Unrealized Exchange Gain/Loss,Zero_70100000 Gain/Loss on Sale of Assets,Zero_80100000 Income Taxes - Deferred,Cluster_Signature,Cluster_ID
CompanyName,DocumentNo_GLEntry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Hailong2,EX20221025,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Cr_70010000 Foreign Exchange Gain/Loss, Dr_100...",723
Hailong2,FX20221101,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Cr_10000020 Cash on Hands - EUR, Cr_70010000 F...",117
Hailong2,FX20221104,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Cr_10000010 Cash on Hands - USD, Cr_70010000 F...",75
Hailong2,FX20221201,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Cr_70010000 Foreign Exchange Gain/Loss, Dr_100...",723
Hailong2,FX20221202,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Cr_10000010 Cash on Hands - USD, Cr_70010000 F...",75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hailong3,GJ24050528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Cr_16100000 Construction in Progress, Dr_2002Y...",276
Hailong3,GJ24050529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Cr_14020000 Derivative Asset - Current, Cr_304...",238
Hailong3,GJ24050530,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Cr_20800000 Deferred Tax Liabilities, Cr_30400...",651
Hailong3,GJ24050531,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"Cr_2060EDBS Bank Loan - EBL DBS, Cr_2060EMUFG ...",625


In [40]:
# One row per cluster: its ID and its signature.
cluster_id_to_signature = (
    pivot_binary
    .copy()[['Cluster_ID', 'Cluster_Signature']]
    .drop_duplicates()
)

# Population = number of documents in each cluster.
cluster_counts = pivot_binary['Cluster_ID'].value_counts().sort_index()
cluster_id_to_signature['Population'] = cluster_id_to_signature['Cluster_ID'].map(cluster_counts)

# Lookup: Cluster_ID -> (Cluster_Signature, Population)
cluster_map = {
    cluster_id: (signature, population)
    for cluster_id, signature, population in zip(
        cluster_id_to_signature['Cluster_ID'],
        cluster_id_to_signature['Cluster_Signature'],
        cluster_id_to_signature['Population'],
    )
}

cluster_map

{723: ('Cr_70010000 Foreign Exchange Gain/Loss, Dr_10000020 Cash on Hands - EUR',
  5),
 117: ('Cr_10000020 Cash on Hands - EUR, Cr_70010000 Foreign Exchange Gain/Loss, Dr_10000020 Cash on Hands - EUR, Dr_70010000 Foreign Exchange Gain/Loss',
  1),
 75: ('Cr_10000010 Cash on Hands - USD, Cr_70010000 Foreign Exchange Gain/Loss, Dr_10000010 Cash on Hands - USD, Dr_70010000 Foreign Exchange Gain/Loss',
  2),
 360: ('Cr_20000000 AP - Trade, Cr_70010000 Foreign Exchange Gain/Loss, Cr_70010100 Unrealized Exchange Gain/Loss, Dr_10000010 Cash on Hands - USD, Dr_10000020 Cash on Hands - EUR, Dr_20000000 AP - Trade, Dr_70010100 Unrealized Exchange Gain/Loss',
  1),
 88: ('Cr_10000010 Cash on Hands - USD, Dr_70010000 Foreign Exchange Gain/Loss',
  1),
 55: ('Cr_10000010 Cash on Hands - USD, Cr_10000020 Cash on Hands - EUR, Cr_20000000 AP - Trade, Cr_70010100 Unrealized Exchange Gain/Loss, Dr_20000000 AP - Trade, Dr_70010000 Foreign Exchange Gain/Loss, Dr_70010100 Unrealized Exchange Gain/Loss',
 

In [41]:
# Turn the cluster_map dictionary into a table with Cluster_ID as a regular
# column, largest clusters first.
cluster_map_df = (
    pd.DataFrame.from_dict(
        cluster_map, orient='index', columns=['Cluster_Signature', 'Population']
    )
    .rename_axis('Cluster_ID')
    .reset_index()
    .sort_values(by='Population', ascending=False)
)
cluster_map_df

Unnamed: 0,Cluster_ID,Cluster_Signature,Population
31,33,"Cr_10000000 Cash on Hands - TWD, Dr_20000000 A...",2034
23,381,"Cr_20000000 AP - Trade, Dr_11080000 Input VAT,...",2028
22,392,"Cr_20000000 AP - Trade, Dr_16100000 Constructi...",1993
68,38,"Cr_10000000 Cash on Hands - TWD, Dr_20010010 A...",1187
45,603,"Cr_20080000 Withholding Tax Payable, Dr_200000...",1074
...,...,...,...
461,243,Cr_16050010 Lease(ROU) Asset - Interest Capita...,1
460,246,Cr_16060000 Lease (ROU) Asset - Accumulated Am...,1
457,488,"Cr_20010010 AP - Employee, Dr_60040030 Travel ...",1
456,318,"Cr_16100010 DEVEX - Open Balance, Cr_20000000 ...",1


## Save

In [None]:
# cluster_map_df.to_excel(rf"d:\Shared\Internal Dev - JE ML\Hailong\MC2.xlsx", index=False)