In [None]:
!pip3 install PyPDF2

In [5]:
import oci,re,json,io
import pandas as pd
from collections import deque
from PyPDF2 import PdfWriter, PdfReader

In [6]:

# Deployment-specific settings: replace every placeholder before running.
COMPARTMENT_ID = "<compartment_id>"      # compartment the processor job is created in

# Buckets, in the order the notebook uses them.
IN_BUCKET = "<name_of_bucket>"           # read by the PDF-splitting cell
MID_BUCKET = "<name_of_bucket>"          # receives the split pages; input of the processor job
OP_BUCKET = "<name_of_bucket>"           # output location of the processor job
OP_BUCKET_PREFIX = "<provide_any_name>"  # prefix given to the job's output location



In [7]:
# Load the OCI SDK configuration from a local file; all clients below share it.
CONFIG_FILE_PATH = "<config_file_path>"
CONFIG = oci.config.from_file(CONFIG_FILE_PATH)

# Object Storage: the plain client and its composite-operations wrapper.
OBJECT_STORAGE_CLIENT = oci.object_storage.ObjectStorageClient(CONFIG)
OBJECT_STORAGE_COMP_CLIENT = oci.object_storage.ObjectStorageClientCompositeOperations(
    OBJECT_STORAGE_CLIENT
)

# AI Document: the plain client and its composite-operations wrapper
# (the wrapper is what creates a processor job and waits on its state).
AIDOC_CLIENT = oci.ai_document.AIServiceDocumentClient(CONFIG)
AIDOC_COMP_CLIENT = oci.ai_document.AIServiceDocumentClientCompositeOperations(
    AIDOC_CLIENT
)

# Object Storage namespace, fetched once and reused by every request.
NAMESPACE_NAME = OBJECT_STORAGE_CLIENT.get_namespace().data

In [9]:
def multipart_upload_obj(upload_obj_name,file,object_storage_client,namespace_name,target_bucket_name,content_type,chunk_size=1024 * 1024):
    """Upload a seekable binary stream as one object via a multipart upload.

    The stream is rewound to its start and read in ``chunk_size`` pieces; each
    piece is sent as a consecutively numbered part (starting at 1). The parts
    recorded by the service are then listed and committed.

    Args:
        upload_obj_name: Name of the object to create.
        file: Binary file-like object supporting ``seek`` and ``read``.
        object_storage_client: Client exposing the multipart-upload calls
            (``create_multipart_upload``, ``upload_part``,
            ``list_multipart_upload_parts``, ``commit_multipart_upload``,
            ``abort_multipart_upload``).
        namespace_name: Object Storage namespace.
        target_bucket_name: Bucket receiving the object.
        content_type: Content type stored with the object.
        chunk_size: Number of bytes read per part. Defaults to 1 MiB.

    Returns:
        None.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        Exception: Any error raised by the client is re-raised after a
            best-effort abort of the multipart upload.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    file.seek(0)
    upload_id = object_storage_client.create_multipart_upload(
        namespace_name=namespace_name,
        bucket_name=target_bucket_name,
        create_multipart_upload_details=oci.object_storage.models.CreateMultipartUploadDetails(
            object=upload_obj_name,
            content_type=content_type,
            storage_tier=oci.object_storage.models.CreateMultipartUploadDetails.STORAGE_TIER_STANDARD
        )).data.upload_id

    try:
        part_number = 1
        while True:
            chunk = file.read(chunk_size)
            if not chunk:
                break
            object_storage_client.upload_part(
                namespace_name=namespace_name,
                bucket_name=target_bucket_name,
                object_name=upload_obj_name,
                upload_part_body=chunk,
                upload_id=upload_id,
                upload_part_num=part_number
            )
            part_number += 1

        # Ask the service which parts it holds so the commit uses its etags.
        parts = oci.pagination.list_call_get_all_results(
            object_storage_client.list_multipart_upload_parts,
            namespace_name=namespace_name,
            bucket_name=target_bucket_name,
            object_name=upload_obj_name,
            upload_id=upload_id
        ).data
        object_storage_client.commit_multipart_upload(
            namespace_name=namespace_name,
            bucket_name=target_bucket_name,
            object_name=upload_obj_name,
            upload_id=upload_id,
            commit_multipart_upload_details=oci.object_storage.models.CommitMultipartUploadDetails(
                parts_to_commit=[
                    oci.object_storage.models.CommitMultipartUploadPartDetails(part_num=x.part_number,etag=x.etag)
                    for x in parts
                ]
            )
        )
    except Exception:
        # Do not leave a half-finished upload (and its parts) behind. The abort
        # is best effort: its own failure must not hide the original error.
        try:
            object_storage_client.abort_multipart_upload(
                namespace_name=namespace_name,
                bucket_name=target_bucket_name,
                object_name=upload_obj_name,
                upload_id=upload_id
            )
        except Exception as abort_error:
            print(f"warning: could not abort multipart upload {upload_id}: {abort_error}")
        raise

In [53]:
def run_aidoc_extract(object_list):
    """Run a key-value extraction processor job and return the job's id.

    Every entry of ``object_list`` is addressed by its ``name`` inside
    MID_BUCKET; results are written to OP_BUCKET under OP_BUCKET_PREFIX. The
    job is configured for language "ENG" and document type "INVOICE".

    The call blocks until the job reaches SUCCEEDED, FAILED or CANCELED, so a
    job that fails is reported immediately instead of being waited on until
    the waiter times out.

    Args:
        object_list: Iterable of objects exposing a ``name`` attribute.

    Returns:
        The id of the succeeded processor job.

    Raises:
        ValueError: If ``object_list`` yields no objects.
        RuntimeError: If the job ends in any state other than SUCCEEDED.
    """
    models = oci.ai_document.models

    object_locations = [
        models.ObjectLocation(bucket_name=MID_BUCKET,namespace_name=NAMESPACE_NAME,object_name=o.name)
        for o in object_list
    ]
    if not object_locations:
        raise ValueError("object_list is empty; there is nothing to process")

    output_location = models.OutputLocation()
    output_location.namespace_name = NAMESPACE_NAME
    output_location.bucket_name = OP_BUCKET
    output_location.prefix = OP_BUCKET_PREFIX

    source_type = models.ObjectStorageLocations.SOURCE_TYPE_OBJECT_STORAGE_LOCATIONS
    create_processor_job_details = models.CreateProcessorJobDetails(
        compartment_id=COMPARTMENT_ID,
        input_location=models.ObjectStorageLocations(source_type=source_type,object_locations=object_locations),
        output_location=output_location,
        processor_config=models.GeneralProcessorConfig(
            language="ENG",
            features=[models.DocumentKeyValueExtractionFeature()],
            document_type="INVOICE"
        )
    )

    succeeded = models.ProcessorJob.LIFECYCLE_STATE_SUCCEEDED
    proc_job = AIDOC_COMP_CLIENT.create_processor_job_and_wait_for_state(
        create_processor_job_details=create_processor_job_details,
        # Wait on every terminal state, not only success.
        wait_for_states=[
            succeeded,
            models.ProcessorJob.LIFECYCLE_STATE_FAILED,
            models.ProcessorJob.LIFECYCLE_STATE_CANCELED,
        ]
    )
    job = proc_job.data
    print(f"({job.display_name})",job.id)

    if job.lifecycle_state != succeeded:
        details = getattr(job, "lifecycle_details", None)
        raise RuntimeError(
            f"processor job {job.id} ended in state {job.lifecycle_state}"
            + (f" ({details})" if details else "")
        )
    return job.id

In [80]:
def process_json(ocid,object_list):
    """Download processor-job result JSON objects and tabulate their key-value fields.

    Each named object is fetched from OP_BUCKET and parsed as JSON. Only the
    first page of a document is examined. A document counts as processed when
    that page has a non-null "words" entry; its "documentFields" entries of
    type "KEY_VALUE" then become one row, keyed by field label name. Documents
    with no pages, or whose first page has no "words", are collected as
    unprocessed and printed instead of raising.

    Args:
        ocid: Identifier of the processor job; used only in the progress message.
        object_list: Iterable of object names (for example a list or a pandas
            Series of strings).

    Returns:
        pandas.DataFrame with one row per processed document and one column per
        field label name seen.
    """
    print("processing for",ocid)
    names = list(object_list)  # materialise so any iterable has a length
    total = len(names)
    fields_per_document = []
    unprocessed = deque()

    for count, name in enumerate(names, start=1):
        get_object_response = OBJECT_STORAGE_CLIENT.get_object(
            namespace_name=NAMESPACE_NAME,
            bucket_name=OP_BUCKET,
            object_name=name
        )
        json_data = json.loads(get_object_response.data.content.decode())
        pages = json_data.get("pages") or []
        first_page = pages[0] if pages else {}
        if first_page.get("words") is not None:
            # A missing or null field list contributes an empty row.
            fields_per_document.append(first_page.get("documentFields") or [])
        else:
            unprocessed.append(name)
        print(f"\r{count}/{total}",end="\r")

    print()
    print(unprocessed)
    print(len(unprocessed),end="\n\n")

    FIELDS_DF = pd.DataFrame([
        {d['fieldLabel']['name']: d['fieldValue']['text'] for d in fields if d.get('fieldType') == 'KEY_VALUE'}
        for fields in fields_per_document
    ])
    return FIELDS_DF

In [None]:
# Splitting PDF
# Every object in IN_BUCKET is read as a PDF and written back to MID_BUCKET as
# one single-page PDF per page, named "<source object name>/pg<N>.pdf".
obj_list = oci.pagination.list_call_get_all_results(
    OBJECT_STORAGE_CLIENT.list_objects,
    namespace_name=NAMESPACE_NAME,
    bucket_name=IN_BUCKET
).data.objects
print(f"{len(obj_list)} object(s) found in {IN_BUCKET}")
for o in obj_list:
    get_object_response = OBJECT_STORAGE_CLIENT.get_object(
        namespace_name=NAMESPACE_NAME,
        bucket_name=IN_BUCKET,
        object_name=o.name
    )
    inputpdf = PdfReader(io.BytesIO(get_object_response.data.content))
    page_count = 0
    for page_count, page in enumerate(inputpdf.pages, start=1):
        output = PdfWriter()
        output.add_page(page)
        # The buffer is closed even if the upload raises.
        with io.BytesIO() as outputStream:
            output.write(outputStream)
            output.close()
            # multipart_upload_obj rewinds the stream itself before reading.
            multipart_upload_obj(o.name+f"/pg{page_count}.pdf",outputStream,OBJECT_STORAGE_CLIENT,NAMESPACE_NAME,MID_BUCKET,content_type='application/pdf')
    print(f"{o.name}: {page_count} page(s) uploaded")
    

In [None]:
# Submit every object currently in MID_BUCKET to the extraction job and keep
# the job id for the result-collection step.
mid_bucket_listing = oci.pagination.list_call_get_all_results(
    OBJECT_STORAGE_CLIENT.list_objects,
    namespace_name=NAMESPACE_NAME,
    bucket_name=MID_BUCKET,
)
obj_list1 = mid_bucket_listing.data.objects
ocid = run_aidoc_extract(obj_list1)

In [None]:
# List what the job wrote under "<OP_BUCKET_PREFIX>/<job id>", keep the JSON
# results, and tabulate them; the DataFrame is shown as the cell output.
job_output_prefix = OP_BUCKET_PREFIX+"/"+ocid
obj_list2 = oci.pagination.list_call_get_all_results(
    OBJECT_STORAGE_CLIENT.list_objects,
    namespace_name=NAMESPACE_NAME,
    bucket_name=OP_BUCKET,
    prefix=job_output_prefix,
).data.objects
json_names = [o.name for o in obj_list2 if o.name.endswith('.json')]
objs = pd.Series(json_names)
process_json(ocid, objs)