In [2]:
import json
import uuid
import base64
import os

def convert_docling_to_target_keep_body_order(doc):
    """
    Convert a DoclingDocument JSON dict to the flat TARGET item list.

    Items are emitted in the order of doc["body"]["children"]. Group
    references are expanded in place (recursively), so the result contains
    only text and image items.

    Args:
        doc: Parsed DoclingDocument JSON (a dict). The keys "body", "texts",
            "pictures" and "groups" are read; any that are missing or not
            of the expected container type are treated as empty.

    Returns:
        A list of dicts, each one of:
          * {"type": "text", "text": ...} plus "page_idx" when the text
            object has a provenance entry, and "text_level" when its label
            is "section_header" (defaults to 1 if "level" is absent).
          * {"type": "image", "img_path": ..., "img_caption": [...],
            "img_footnote": [...]} plus "page_idx" when the picture has a
            provenance entry. "img_path" is a fixed placeholder.

    The conversion is best-effort: a reference that cannot be resolved
    (non-numeric, negative or out-of-range index, non-dict target, or a
    collection other than texts/pictures/groups such as "#/tables/...") is
    skipped rather than raising. A group that (directly or indirectly)
    contains itself is expanded only once per chain, so malformed cyclic
    documents terminate instead of recursing until the interpreter limit.
    """

    def as_list(value):
        """Return value if it is a list, otherwise an empty list."""
        return value if isinstance(value, list) else []

    texts = as_list(doc.get("texts"))
    pictures = as_list(doc.get("pictures"))
    groups = as_list(doc.get("groups"))

    body = doc.get("body")
    body_children = as_list(body.get("children")) if isinstance(body, dict) else []

    def lookup(ref_path, prefix, collection):
        """
        Resolve a JSON pointer of the form '<prefix><index>' in collection.

        Returns an (index, obj) tuple, or None when the pointer does not
        target this collection or does not name an existing dict entry.
        """
        if not isinstance(ref_path, str) or not ref_path.startswith(prefix):
            return None
        try:
            index = int(ref_path.split("/")[-1])
        except ValueError:
            return None
        # Reject negatives explicitly: Python would otherwise index from the
        # end and silently return the wrong object.
        if index < 0 or index >= len(collection):
            return None
        obj = collection[index]
        return (index, obj) if isinstance(obj, dict) else None

    def add_page_idx(item, source_obj):
        """Copy the page number of the first provenance entry, if any."""
        prov = source_obj.get("prov")
        if isinstance(prov, list) and prov and isinstance(prov[0], dict):
            item["page_idx"] = prov[0].get("page_no", None)

    def process_text_obj(text_obj):
        """Convert a single text_obj to a dict in the desired format."""
        item = {
            "type": "text",
            "text": text_obj.get("text", "")
        }
        add_page_idx(item, text_obj)
        # Section headers additionally carry their heading level.
        if text_obj.get("label") == "section_header":
            item["text_level"] = text_obj.get("level", 1)
        return item

    def collect_texts(refs):
        """
        Turn a captions/footnotes list into a list of plain strings.

        Entries are either references into `texts` or inline dicts that
        carry their own "text".
        """
        collected = []
        for ref in as_list(refs):
            if not isinstance(ref, dict):
                continue
            if "$ref" in ref:
                hit = lookup(ref["$ref"], "#/texts/", texts)
                if hit is not None:
                    collected.append(hit[1].get("text", ""))
            else:
                # Inline entry without a reference.
                collected.append(ref.get("text", ""))
        return collected

    def process_picture_obj(pic_obj):
        """Convert a single picture_obj to a dict in the desired format."""
        item = {
            "type": "image",
            # Image data is not decoded here; a placeholder path is emitted.
            "img_path": "images/dummy.png",
            "img_caption": collect_texts(pic_obj.get("captions")),
            "img_footnote": collect_texts(pic_obj.get("footnotes"))
        }
        add_page_idx(item, pic_obj)
        return item

    def expand(child_ref, open_groups):
        """
        Convert one child reference into zero or more target items.

        open_groups holds the indices of the groups currently being
        expanded on this recursion path; it is what breaks reference cycles.
        """
        ref_path = child_ref.get("$ref") if isinstance(child_ref, dict) else None

        hit = lookup(ref_path, "#/texts/", texts)
        if hit is not None:
            return [process_text_obj(hit[1])]

        hit = lookup(ref_path, "#/pictures/", pictures)
        if hit is not None:
            return [process_picture_obj(hit[1])]

        hit = lookup(ref_path, "#/groups/", groups)
        if hit is None:
            return []

        group_idx, group_obj = hit
        if group_idx in open_groups:
            # The group is its own ancestor: stop instead of looping.
            return []

        open_groups.add(group_idx)
        try:
            flattened = []
            for nested_ref in as_list(group_obj.get("children")):
                flattened.extend(expand(nested_ref, open_groups))
            return flattened
        finally:
            # Allow the same group to be referenced again from a sibling.
            open_groups.discard(group_idx)

    # Walk the body in reading order, flattening groups as they appear.
    target_items = []
    open_groups = set()
    for child_ref in body_children:
        target_items.extend(expand(child_ref, open_groups))
    return target_items

In [14]:
# Path of the Docling JSON export (with image references) to convert.
DOC_JSON_PATH = "./results/docling/1706.03762v7/figure_export_without_table_structure/1706.03762v7-with-image-refs.json"

# Read as UTF-8 explicitly: the document contains non-ASCII characters and
# the platform default encoding is not guaranteed to be UTF-8.
with open(DOC_JSON_PATH, "r", encoding="utf-8") as f:
    doc = json.load(f)
print(doc.keys())

dict_keys(['schema_name', 'version', 'name', 'origin', 'furniture', 'body', 'groups', 'texts', 'pictures', 'tables', 'key_value_items', 'pages'])


In [4]:
# Show the first ten child references of the document body, in order.
doc["body"]["children"][:10]

[{'$ref': '#/texts/0'},
 {'$ref': '#/texts/1'},
 {'$ref': '#/texts/2'},
 {'$ref': '#/texts/3'},
 {'$ref': '#/texts/4'},
 {'$ref': '#/texts/5'},
 {'$ref': '#/texts/6'},
 {'$ref': '#/groups/0'},
 {'$ref': '#/texts/11'},
 {'$ref': '#/texts/12'}]

In [16]:
## Group example
# Body child 7 is the reference '#/groups/0'. Print that group, followed by
# the text objects at indices 7 through 10.
example_group = doc['groups'][0]
print("GROUP:", example_group)

for text_idx in range(7, 11):
    print(doc['texts'][text_idx])


GROUP: {'self_ref': '#/groups/0', 'parent': {'$ref': '#/body'}, 'children': [{'$ref': '#/texts/7'}, {'$ref': '#/texts/8'}, {'$ref': '#/texts/9'}, {'$ref': '#/texts/10'}], 'name': 'group', 'label': 'key_value_area'}
{'self_ref': '#/texts/7', 'parent': {'$ref': '#/groups/0'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 126.882, 't': 508.153, 'r': 210.552, 'b': 475.27699999999993, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 46]}], 'orig': 'Llion Jones ∗ Google Research llion@google.com', 'text': 'Llion Jones ∗ Google Research llion@google.com'}
{'self_ref': '#/texts/8', 'parent': {'$ref': '#/groups/0'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 235.407, 't': 508.153, 'r': 339.994, 'b': 475.27699999999993, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 61]}], 'orig': 'Aidan N. Gomez ∗ † University of Toronto aidan@cs.toronto.edu', 'text': 'Aidan N. Gomez ∗ † University of Toronto aidan@cs.toronto.edu'}
{'self_ref': '#/texts/9', 'pa

In [5]:
# Flatten the loaded document into the target item list (body order, groups expanded).
results = convert_docling_to_target_keep_body_order(doc)

In [10]:
# Inspect the converted item at index 7.
results[7]

{'type': 'text',
 'text': 'Llion Jones ∗ Google Research llion@google.com',
 'page_idx': 1}

In [6]:
# Show the first ten converted items.
results[:10]

[{'type': 'text',
  'text': 'arXiv:1706.03762v7  [cs.CL]  2 Aug 2023',
  'page_idx': 1},
 {'type': 'text',
  'text': 'Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.',
  'page_idx': 1},
 {'type': 'text',
  'text': 'Attention Is All You Need',
  'page_idx': 1,
  'text_level': 1},
 {'type': 'text',
  'text': 'Ashish Vaswani ∗ Google Brain avaswani@google.com',
  'page_idx': 1},
 {'type': 'text',
  'text': 'Noam Shazeer ∗ Google Brain noam@google.com',
  'page_idx': 1},
 {'type': 'text',
  'text': 'Niki Parmar ∗ Google Research nikip@google.com',
  'page_idx': 1},
 {'type': 'text',
  'text': 'Jakob Uszkoreit ∗ Google Research usz@google.com',
  'page_idx': 1},
 {'type': 'text',
  'text': 'Llion Jones ∗ Google Research llion@google.com',
  'page_idx': 1},
 {'type': 'text',
  'text': 'Aidan N. Gomez ∗ † University of Toronto aidan@cs.toronto.edu',
  'page_idx': 1},
 

In [17]:
# Collect every converted item that carries a 'text_level' (i.e. a heading)
# and number the headings consecutively from 0.
heading_entries = (entry for entry in results if 'text_level' in entry)
header_items = [
    {"idx": position, "text": entry['text']}
    for position, entry in enumerate(heading_entries)
]
# Same final value the running counter had: the number of headings found.
header_idx = len(header_items)

header_items

[{'idx': 0, 'text': 'Attention Is All You Need'},
 {'idx': 1, 'text': 'Abstract'},
 {'idx': 2, 'text': '1 Introduction'},
 {'idx': 3, 'text': '2 Background'},
 {'idx': 4, 'text': '3 Model Architecture'},
 {'idx': 5, 'text': '3.1 Encoder and Decoder Stacks'},
 {'idx': 6, 'text': '3.2 Attention'},
 {'idx': 7, 'text': 'Scaled Dot-Product Attention'},
 {'idx': 8, 'text': '3.2.1 Scaled Dot-Product Attention'},
 {'idx': 9, 'text': '3.2.2 Multi-Head Attention'},
 {'idx': 10, 'text': '3.2.3 Applications of Attention in our Model'},
 {'idx': 11, 'text': '3.3 Position-wise Feed-Forward Networks'},
 {'idx': 12, 'text': '3.4 Embeddings and Softmax'},
 {'idx': 13, 'text': '3.5 Positional Encoding'},
 {'idx': 14, 'text': '4 Why Self-Attention'},
 {'idx': 15, 'text': '5 Training'},
 {'idx': 16, 'text': '5.1 Training Data and Batching'},
 {'idx': 17, 'text': '5.2 Hardware and Schedule'},
 {'idx': 18, 'text': '5.3 Optimizer'},
 {'idx': 19, 'text': '5.4 Regularization'},
 {'idx': 20, 'text': '6 Results'