# 1. Listitem test
* Docling represents a list as a `GroupItem` whose children are the individual list items.
* When parsing PDFs with docling version <= 2.18.0, list indentation (nesting levels) was not recognized properly.

In [2]:
import json
from pathlib import Path
import os
import time

import fitz
import pandas as pd
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Notebook configuration loaded through pydantic-settings.

    Values are read from the ``../.env`` file (UTF-8 encoded); entries that do
    not correspond to a declared field are ignored (``extra="ignore"``).
    """

    # Neither field declares a default, so a value must be supplied for each.
    data_dir: str
    docling_model_weight_dir: str

    model_config = SettingsConfigDict(
        env_file="../.env",
        env_file_encoding="utf-8",
        extra="ignore",
    )
    
# Load the settings once; the rest of the notebook reads them from `settings`.
settings = Settings()

# Export the configured weights directory as HF_HOME before docling is imported
# in the next cell.
# NOTE(review): presumably this redirects the Hugging Face cache that docling
# uses for its model downloads — confirm.
os.environ.update(HF_HOME=settings.docling_model_weight_dir)

In [3]:
# Import docling
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

from docling.datamodel.document import ConversionResult

# Value assigned to `images_scale` below.
# NOTE(review): presumably a multiplier on docling's default render
# resolution — confirm against the docling docs.
IMAGE_RESOLUTION_SCALE = 2.0

# Despite its name, this object holds the PDF *pipeline* options; it is passed
# to PdfFormatOption when the converter is built.
pdf_format_options = PdfPipelineOptions()

# Processing steps: OCR is switched off, table-structure recognition is on.
pdf_format_options.do_ocr = False
pdf_format_options.do_table_structure = True

# Image generation: request page images and picture images at the scale above.
pdf_format_options.generate_page_images = True
pdf_format_options.generate_picture_images = True
pdf_format_options.images_scale = IMAGE_RESOLUTION_SCALE

# Converter restricted to PDF input; the other InputFormat members (IMAGE,
# DOCX, HTML, PPTX) are intentionally not listed in `allowed_formats`.
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],
    format_options={
        InputFormat.PDF: PdfFormatOption(
            # Choosing a backend is optional; PyPdfiumDocumentBackend (imported
            # above) is the alternative that can be swapped in here.
            backend=DoclingParseV2DocumentBackend,
            # The pipeline options configured earlier in this cell.
            pipeline_options=pdf_format_options,
        ),
    },
)

In [12]:
# Sample input: a PDF containing an indented (nested) list, created in Google Docs.
file_path = "../samples/list_group_sample_google.pdf"

# Alternative sample: the same kind of list created in MS Word.
# Uncomment the next line to convert that file instead.
# file_path = "../samples/list_group_sample_msword.pdf"

# Run the conversion with the converter configured above and keep both the
# conversion result and its document; the cells below inspect `document`.
result = converter.convert(file_path)
document = result.document

In [13]:
# Print the document's element tree to see how the list was structured.
# In the recorded output all six list_item nodes sit at the same depth under a
# single `list` group, i.e. the nesting of the source list is not reflected.
document.print_element_tree()

 0: unspecified with name=_root_
  1: list with name=list
   2: list_item
   3: list_item
   4: list_item
   5: list_item
   6: list_item
   7: list_item


In [14]:
# Dump every group of the document: first the group itself, then its label and
# its child references, each on its own line.
for group in document.groups:
    print(group, group.label, group.children, sep="\n")

self_ref='#/groups/0' parent=RefItem(cref='#/body') children=[RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/texts/2'), RefItem(cref='#/texts/3'), RefItem(cref='#/texts/4'), RefItem(cref='#/texts/5')] name='list' label=<GroupLabel.LIST: 'list'>
list
[RefItem(cref='#/texts/0'), RefItem(cref='#/texts/1'), RefItem(cref='#/texts/2'), RefItem(cref='#/texts/3'), RefItem(cref='#/texts/4'), RefItem(cref='#/texts/5')]


In [15]:
# Print the text of each entry in `document.texts`, one per line.
# As the recorded output shows, the bullet glyph (●, ·, ■) is part of the text.
for item in document.texts:
    print(item.text)

● Test1
· Test1-1
■ Test1-1-1
■ Test1-1-2
· Test1-2
● Test2


In [18]:
# The Markdown export does not apply the list indentation: in the recorded
# output every item is emitted as a top-level "- " bullet.
print(document.export_to_markdown())

- ● Test1
- · Test1-1
- ■ Test1-1-1
- ■ Test1-1-2
- · Test1-2
- ● Test2
