In [None]:
%%sql -r dataframe_3
-- Select the schema and warehouse this session uses for the statements that follow.
USE SCHEMA RAW_DB.PDF;
USE WAREHOUSE COMPUTE_WH;

# Processing TCB credit statements (PDF files)
* Method: `AI_PARSE_DOCUMENT`
* The stage used here was created in the `02-historical_parse` script
* The files are uploaded to the stage before this notebook is run



In [None]:
# Import python packages
# import streamlit as st
import pandas as pd

from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import Session

import snowflake.connector
import re
import time


# Obtain the currently active Snowpark session; no connection details are configured in this notebook.
session = get_active_session()


In [None]:
# Location of the stage read by this notebook: database, schema, then stage name.
DATABASE = "RAW_DB"
SCHEMA = "PDF"
CREDIT_STAGE = "TCB_CREDIT_STAGE"

In [None]:
# List the distinct files on the stage and derive a target table name for each
# one from the second '_'-separated token of its file name.
stage_listing_sql = f"""
    select distinct
        metadata$filename as file_name,
        metadata$file_last_modified as file_last_modified,
        'TCB_CREDIT_' || split(file_name, '_')[1]::string as table_name
    from @"{DATABASE}"."{SCHEMA}"."{CREDIT_STAGE}"
    """
file_list = session.sql(stage_listing_sql).to_pandas()

In [None]:
# Run AI_PARSE_DOCUMENT over every staged file, (re)creating one table per file
# that holds the parse result in a single `content` column.
for staged_file, target_table in zip(file_list['FILE_NAME'], file_list['TABLE_NAME']):
    print(f'Processing {staged_file} into {target_table}')

    # Doubled braces emit a literal SQL object constant inside the f-string.
    parse_query = f"""
        create or replace table {DATABASE}.{SCHEMA}.{target_table} as (
        SELECT AI_PARSE_DOCUMENT (
            TO_FILE('@"{DATABASE}"."{SCHEMA}"."{CREDIT_STAGE}"','{staged_file}'),
            {{'mode': 'LAYOUT', 'page_split': true}}) AS content);
            """
    session.sql(parse_query).collect()

print("Completed loading images to tables")


In [None]:
total_files = len(file_list)
print(total_files)

# Assemble one "select ... union all select ..." statement covering every
# per-file table; each branch is tagged with its source table name.
query = ""
for process_count, source_table in enumerate(file_list['TABLE_NAME'], start=1):
    # Every branch except the last one is chained to its successor.
    joiner = " union all" if process_count < total_files else ""
    query += f"""
            select '{source_table}' as file_source, content::string as content
            from {DATABASE}.{SCHEMA}.{source_table}{joiner}  --{process_count}
        """

print(query)


In [None]:
# Materialise the per-file parse results into one table, turning each file's
# stringified payload back into a VARIANT with try_parse_json.
#
# session.sql() only builds a lazy Snowpark DataFrame; without an action such
# as .collect() the CREATE TABLE statement is never sent to Snowflake, and the
# next cell would read a missing or stale UNIONED_TCB_CREDIT table.
session.sql(f"""
    create or replace table TRANSFORM_DB.BASE.UNIONED_TCB_CREDIT as
    (with unioned as ({query})

     select
            file_source,
            try_parse_json(content) as content

            from unioned)
""").collect()

In [None]:
# Reshape the parsed statement pages into one row per transaction and pull the
# result into pandas. The query is a chain of CTEs:
#   flattened - one row per page of content['pages'], then one row per
#               '|'-separated cell of that page's text; `seq` numbers the cells
#               across all files (ordered by file_source, page index, cell index).
#   anchors   - `seq` of the cells matching the header marker: '%Ghi nợ%' for
#               TCB_CREDIT_20250520, '%Diễn giải%' for every other file.
#   data_rows - cells at least 9 positions after an anchor of the same file;
#               offset_from_anchor counts cells from that point.
#   numbered  - derives col_pos from offset_from_anchor / index modulo 6 or 7
#               (rules differ per file and page) and numbers the records per
#               (file_source, page, col_pos) in cell order.
#   pivoted   - folds col_pos 0..5 into one row per record; col_pos 6 is ignored.
# The final select parses dd/mm/yyyy dates, back-fills transaction_date with
# interpolate_bfill, strips thousands separators from the amounts, forces
# debit to 0 and falls back to original_amount for credit when the description
# matches the credit/return patterns, and keeps only rows whose post_date parsed.
# NOTE(review): col_pos is only assigned for page 1 and page 2; cells on later
# pages get a NULL col_pos and drop out at the post_date filter - confirm that
# statements never exceed two pages.
# NOTE(review): if an anchor pattern matches more than one cell in a file,
# data_rows joins each cell to every anchor - confirm one anchor per file.
# NOTE(review): the string has an f prefix but contains no placeholders.
transform_df = session.sql(f"""
with flattened as (
    select 
        file_source,
        p.index + 1 as page,    
        p.value['content']::string as page_content_text,
        pt.index,
        pt.value,
        row_number() over(order by file_source, p.index, pt.index) as seq
    from TRANSFORM_DB.BASE.UNIONED_TCB_CREDIT,
    lateral flatten(input => content['pages']) as p,
    lateral split_to_table(page_content_text,'|') as pt
    order by file_source, page, seq
),
anchors as (
    select 
        file_source,
        case 
            when file_source = 'TCB_CREDIT_20250520' and value like '%Ghi nợ%' then seq 
            when file_source != 'TCB_CREDIT_20250520' and value like '%Diễn giải%' then seq
        else null end
        as anchor_index
    from flattened
    where anchor_index is not null
)
, data_rows as (
    select
        t.*,
        t.seq - a.anchor_index - 9 as offset_from_anchor
    from flattened t
    join anchors a 
        on t.seq >= a.anchor_index + 9 --has to be seq, not index
        and t.file_source = a.file_source
        
)
        
, numbered as (
    select
        file_source,
        page,
        page_content_text,
        index,
        value,
        seq,
        offset_from_anchor,
        case 
            -- special case
            when file_source = 'TCB_CREDIT_20250520' and page = 1 and mod(offset_from_anchor, 6) = 4 then 5
            when file_source = 'TCB_CREDIT_20250520' and page = 1 and mod(offset_from_anchor, 6) = 5 then 6
            when file_source = 'TCB_CREDIT_20250520' and page = 1 then mod(offset_from_anchor, 6)
            when file_source = 'TCB_CREDIT_20250520' and page = 2 and mod(index - 1, 6) < 5 then mod(index - 1, 6) - 1 
            when file_source = 'TCB_CREDIT_20250520' and page = 2 and mod(index - 1, 6) >= 5 then mod(index - 1, 6)
            --else
            when file_source != 'TCB_CREDIT_20250520' and page = 1 then mod(offset_from_anchor, 7) 
            when file_source != 'TCB_CREDIT_20250520' and page = 2 and mod(index - 1, 6) < 5 then mod(index - 1, 6) - 1 
            when file_source != 'TCB_CREDIT_20250520' and page = 2 and mod(index - 1, 6) >= 5 then mod(index - 1, 6)
        end 
            as col_pos, 
        row_number() over(partition by file_source, page, col_pos order by index) as record_num        
    from data_rows
    order by seq
)

, pivoted as (
select
    file_source,
    page,
    record_num,
    max(case when col_pos = 0 then value end) as transaction_date_tmp,
    max(case when col_pos = 1 then value end) as post_date_tmp,
    max(case when col_pos = 2 then value end) as original_amount_tmp,
    max(case when col_pos = 3 then value end) as debit_tmp,
    max(case when col_pos = 4 then value end) as credit_tmp,
    max(case when col_pos = 5 then value end) as description
    --ignore col 6
from numbered
group by all
order by 1,2,3
)
-- , final_result as (
select 
    file_source,
    page,
    record_num,
    -- transaction_date_tmp,
    interpolate_bfill(
        try_to_date(transaction_date_tmp, 'dd/mm/yyyy')) 
        over(partition by file_source, page order by record_num desc) as transaction_date,
    try_to_date(post_date_tmp, 'dd/mm/yyyy') as post_date,
    
    try_cast(replace(
        split(original_amount_tmp, ' ')[1]::string,
        ',','') as decimal(18,2))
        as original_amount,
    split(original_amount_tmp, ' ')[2]::string as original_curr,
    coalesce(case when description ilike '%the tin dung%' or description ilike '%merchandi%return%' or description ilike '%tindung%' or description ilike '%tín dụng%'then 0
        else try_cast(replace(debit_tmp, ',', '') as decimal(18,2)) end,0) as debit,
    coalesce(case when description ilike '%the tin dung%' or description ilike '%merchandi%return%' or description ilike '%tindung%' or description ilike '%tín dụng%'then 
        coalesce(try_cast(replace(credit_tmp, ',', '') as decimal(18,2)), original_amount)
        else try_cast(replace(credit_tmp, ',', '') as decimal(18,2)) end,0) as credit,
    description
from pivoted 
where post_date is not null
order by 1,2,3
""").to_pandas()

In [None]:
# Write the transformed transactions to TRANSFORM_DB.INT.HISTORY_TCB_CREDIT,
# overwriting existing contents and creating the table when it is absent.
history_target = {
    'database': 'TRANSFORM_DB',
    'schema': 'INT',
    'table_name': 'HISTORY_TCB_CREDIT',
}
session.write_pandas(transform_df, overwrite=True, auto_create_table=True, **history_target)

In [None]:
# Reconcile the parsed totals per statement against the totals recorded in
# raw_db.audit_files.tcb_credit_balance. Every row returned is a mismatch, so
# an empty result means all files reconcile.
#
# The comparison is NULL-safe on purpose: a file with no parsed rows (the left
# join yields NULL sums) or an unparseable paper total (try_cast yields NULL)
# makes `diff != 0` evaluate to NULL, which would silently hide that file.
# coalesce() turns missing sums into 0 and IS DISTINCT FROM treats NULL as a
# real difference, so such files are reported instead of dropped.
test_df = session.sql('''
    with test as (
        select
            f.file_source,
            try_cast(replace(f.paper_debit, ',','') as decimal(18,2)) as debit,
            try_cast(replace(f.paper_credit, ',','') as decimal(18,2)) as credit,
            coalesce(sum(fr.debit), 0) as sql_debit,
            coalesce(sum(fr.credit), 0) as sql_credit
        from raw_db.audit_files.tcb_credit_balance as f
        left join transform_db.int.history_tcb_credit as fr using (file_source)
        group by 1, 2, 3
        )
select * ,
    debit - sql_debit as diff_dr,
    credit - sql_credit as diff_cr
from test
where debit is distinct from sql_debit
   or credit is distinct from sql_credit
-- ordering inside the CTE is not guaranteed to survive, so sort the final result
order by file_source
'''
).collect()

test_df