# PDF extraction
## Option 1: Process all JPEG files in stage using AI_PARSE_DOCUMENT
### Step 1: Load files into stage




In [None]:
# Import python packages
import streamlit as st
import pandas as pd

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import Session

# Import required libraries
import snowflake.connector
import re
import time

# Grab the Snowpark session that is active for this notebook; the cells
# below issue all of their SQL through it.
session = get_active_session()


In [None]:
-- This step must be performed from a local machine.
-- The stage must be set up according to this guide: (TODO: the link is missing here)

In [None]:
# Build the catalogue of staged JPEG files once, then load it into pandas.
#
# `session.sql()` only builds a lazy DataFrame: it never returns None and it
# runs nothing until an action such as `.collect()` is called. The existence
# of the table is therefore looked up in the information schema, and the
# CTAS is executed explicitly.
file_list_exists = bool(
    session.sql(
        "select 1 from raw.information_schema.tables "
        "where table_schema = 'PDF' and table_name = 'FILE_LIST' limit 1"
    ).collect()
)

if not file_list_exists:
    # One row per staged file: its path, last-modified timestamp, the prefix
    # before the first '_' (from_where) and the base name of the second path
    # segment without its extension (table_name).
    session.sql('''
    CREATE OR REPLACE TABLE raw.pdf.file_list as
    SELECT distinct        
        metadata$filename as file_name,
        metadata$file_last_modified as file_last_modified,
        split_part(file_name, '_',1) as from_where,
        split_part(split_part(file_name, '/',2),'.',1) as table_name
    FROM @RAW.PDF.BNK
    where metadata$filename ilike '%jpeg'
    ''').collect()

file_list = session.sql('select * from raw.pdf.file_list').to_pandas()
# Bare expression so the notebook renders the DataFrame as the cell output.
file_list


### Step 2: Parse everything into tables

In [None]:

# Parse every staged file with AI_PARSE_DOCUMENT and store the result in its
# own table, RAW.PDF.<TABLE_NAME>, with a single `content` column.
#
# NOTE: table_name is interpolated as a bare identifier, so a file whose
# derived name is not a valid unquoted identifier will make the statement fail.
for _, row in file_list.iterrows():
    file_name = row['FILE_NAME']
    table_name = row['TABLE_NAME']
    print(file_name, table_name)

    # Double any single quote so the file name stays a valid SQL string literal.
    file_literal = file_name.replace("'", "''")
    parse_query = f"""
        create or replace table RAW.PDF.{table_name} as (
        SELECT AI_PARSE_DOCUMENT (
            TO_FILE('@"RAW"."PDF"."BNK"','{file_literal}'),
            {{'mode': 'LAYOUT', 'page_split': false}}) AS content);
            """
    # .collect() forces execution; the table is not read back here because
    # the result was never used and only cost an extra round trip per file.
    session.sql(parse_query).collect()

print("Completed loading to tables")

In [None]:
# Assemble a single "union all" statement that stacks every per-file table,
# tagging each row with the table it came from. Nothing is executed here:
# the SQL text is only built and printed.
total_files = len(file_list)
print(total_files)

process_count = 0
pieces = []
for process_count, (_, row) in enumerate(file_list.iterrows(), start=1):
    table_name = row['TABLE_NAME']
    # Every select except the last one is followed by "union all".
    joiner = " union all" if process_count < total_files else ""
    pieces.append(f"""
            select '{table_name}' as file_source, content from RAW.PDF.{table_name}{joiner}  --{process_count}
        """)

query = "".join(pieces)
print(query)


In [None]:

    -- Set up the target database and schema for the transformed tables.
    -- IF NOT EXISTS keeps this cell safe to re-run.
    create database if not exists transform;
    create schema if not exists transform.intermediate;

In [None]:
# Materialise the stacked parse results as transform.intermediate.unioned_pdf,
# adding each file's `from_where` value from raw.pdf.file_list.
#
# `query` is the union-all text built in an earlier cell. An empty string
# would produce invalid SQL (`with unioned as ()`), so fail with a clear
# message instead.
if not query.strip():
    raise ValueError("query is empty: build the union query before running this cell")

# session.sql() is lazy; .collect() is what actually runs the DDL.
session.sql(f"""
    create or replace table transform.intermediate.unioned_pdf as 
    (with unioned as ({query})
    
    select u.*,
        fl.from_where
    from unioned as u
    left join raw.pdf.file_list as fl
        on u.file_source = fl.table_name)
""").collect()

In [None]:
-- Preview the combined parse results.
SELECT *
FROM transform.intermediate.unioned_pdf

### Create TCB table out of PDFs

In [None]:
-- Builds transform.intermediate.statements_tcb: one typed row per statement
-- line, reconstructed from the parsed text of every unioned_pdf row whose
-- from_where contains 'tcb' (case-insensitive).
create or replace table transform.intermediate.statements_tcb as (
-- joined: restrict the unioned parse results to the TCB files.
with joined as (
    select *
    from transform.intermediate.unioned_pdf
    where from_where ilike '%tcb%'
),
-- cleansed: split the parsed text on '|' into one row per fragment.
-- split_to_table exposes `index` (1-based fragment position) and `value`.
cleansed as (
    select
        from_where,
        file_source,
        index,
        -- Position of the fragment inside a repeating cycle of 11 fragments.
        -- NOTE(review): presumably each statement row is 10 cells plus one
        -- row-break fragment (position 0) -- confirm against the parsed layout.
        mod(index - 1, 11) as position_in_group,
        value,
        -- n-th occurrence of this position within the file, i.e. the number
        -- of the cycle the fragment belongs to.
        row_number() over(partition by file_source, position_in_group order by index) as record_group
    from joined,
    lateral split_to_table(content['content']::string,'|')
    -- NOTE(review): an ORDER BY inside a CTE presumably does not affect the
    -- final result; the outer query applies its own ordering.
    order by file_source,index 
),
-- Debugging aid: select * from cleansed;
-- transformed_data: pivot positions 1..10 of each (file_source, record_group)
-- into named columns. Each CASE matches at most one fragment per group, so
-- MAX simply picks that fragment. Position 0 is not carried forward.
transformed_data AS (
    SELECT 
        file_source,
        record_group,
        MAX(CASE WHEN position_in_group = 1 THEN trim(value) END) AS transaction_date,
        MAX(CASE WHEN position_in_group = 2 THEN trim(value) END) AS remitter,
        MAX(CASE WHEN position_in_group = 3 THEN trim(value) END) AS remitter_bank,
        MAX(CASE WHEN position_in_group = 4 THEN trim(value) END) AS details,
        MAX(CASE WHEN position_in_group = 5 THEN trim(value) END) AS transaction_no,
        MAX(CASE WHEN position_in_group = 6 THEN trim(value) END) AS debit,
        MAX(CASE WHEN position_in_group = 7 THEN trim(value) END) AS credit,
        MAX(CASE WHEN position_in_group = 8 THEN trim(value) END) AS fee_interest,
        MAX(CASE WHEN position_in_group = 9 THEN trim(value) END) AS tax,
        MAX(CASE WHEN position_in_group = 10 THEN trim(value) END) AS balance
    from cleansed
    group by all
    -- Keep only groups whose first cell is exactly 10 characters long.
    -- NOTE(review): presumably this is the length of a dd/mm/yyyy date and is
    -- meant to drop header/separator rows -- confirm.
    having len(transaction_date) = 10
    
)
-- Debugging aid: select * from transformed_data order by file_source, record_group ;

-- Final projection: convert the text cells to typed columns.
select 
    file_source,
    -- NOTE(review): to_date raises on a value that is not a valid dd/mm/yyyy
    -- date, so a single malformed 10-character cell aborts the statement.
    to_date(transaction_date, 'dd/mm/yyyy') AS transaction_date,
    remitter,
    remitter_bank,
    details,
    transaction_no,
    -- Amounts: remove the ',' characters, then TRY_CAST so that text which
    -- is not numeric becomes NULL instead of failing.
    TRY_CAST(REPLACE(debit, ',', '') AS DECIMAL(18,2)) AS debit,
    TRY_CAST(REPLACE(credit, ',', '') AS DECIMAL(18,2)) AS credit,
    TRY_CAST(REPLACE(fee_interest, ',', '') AS DECIMAL(18,2)) AS fee_interest,
    TRY_CAST(REPLACE(tax, ',', '') AS DECIMAL(18,2)) AS tax,
    TRY_CAST(REPLACE(balance, ',', '') AS DECIMAL(18,2)) AS balance,
    -- Running sequence number across all files, in file/record order.
    row_number() over(order by file_source,record_group) AS record_sequence
from transformed_data 
order by file_source, record_group 
)