In [None]:
# design goals:
#
# - understandability
# - modularity
# - configurability

In [None]:
from pathlib import Path

# Workspace layout: every artifact of a run lives under workspaces/<WORKSPACE_NAME>/.
WORKSPACE_NAME = "default"

WORKSPACE_ROOT = Path("workspaces")
WORKSPACE_DIR = WORKSPACE_ROOT / WORKSPACE_NAME

SOURCE_DOCUMENT = None # to process a specific document, set its path here; otherwise, the entire source documents repository will be used
SOURCE_DOCUMENT_DIR = WORKSPACE_DIR / "source_documents"

# One output directory per pipeline stage.
CONVERSION_OUTPUT_DIR = WORKSPACE_DIR / "conversion"
CHUNKING_OUTPUT_DIR = WORKSPACE_DIR / "chunking"
AUTHORING_OUTPUT_DIR = WORKSPACE_DIR / "authoring"
SEED_EXAMPLE_OUTPUT_DIR = WORKSPACE_DIR / "seed-examples"
SDG_OUTPUT_DIR = WORKSPACE_DIR / "sdg"

# Create the whole tree up front with identical flags, including the source
# document directory so there is an obvious place to drop input PDFs.
for _workspace_path in (
    WORKSPACE_ROOT,
    WORKSPACE_DIR,
    SOURCE_DOCUMENT_DIR,
    CONVERSION_OUTPUT_DIR,
    CHUNKING_OUTPUT_DIR,
    AUTHORING_OUTPUT_DIR,
    SEED_EXAMPLE_OUTPUT_DIR,
    SDG_OUTPUT_DIR,
):
    _workspace_path.mkdir(parents=True, exist_ok=True)

# Document Conversion

This notebook uses [Docling](https://github.com/docling-project/docling) to convert any type of document into a Docling Document. A Docling Document is the representation of the document after conversion, and it can be exported as JSON. The JSON output of this notebook can then be used in other notebooks, such as one that uses Docling's chunking methods.

In [None]:
!pip install -qq docling

In [None]:
# Build the list of input files: either the single configured document or
# every PDF found (recursively) in the source document directory.
files = []

if SOURCE_DOCUMENT:
    files.append(Path(SOURCE_DOCUMENT))
else:
    print("***** WARNING! Only one file at a time is supported at this time.")
    # Sort so the file reported as "first" is the same on every run.
    files = sorted(SOURCE_DOCUMENT_DIR.rglob("*.pdf"))
    if not files:
        # Fail with an actionable message instead of an IndexError on files[0].
        raise FileNotFoundError(
            f"No PDF files found under {SOURCE_DOCUMENT_DIR.resolve()}; "
            "add documents there or set SOURCE_DOCUMENT."
        )
    print(f"***** Using {files[0]}")

print(f"Files to convert: {files}")

Next, we set the configuration options for our conversion pipeline. The PDF conversion options set here are the defaults. More information about pipeline configuration can be found in the Docling documentation.

In [None]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions

# PDF pipeline options, left at their defaults.
pipeline_options = PdfPipelineOptions() # TODO: show the options that can be set

# Converter that applies the options above to PDF inputs.
doc_converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
)

Finally, we convert every document into Docling JSON, as long as it is a file type that Docling can convert.

In [None]:
import json

for file in files:
    # Convert the source file and export the resulting Docling document as a plain dict.
    doc = doc_converter.convert(source=file).document
    doc_dict = doc.export_to_dict()

    # One JSON file per source document, named after the source file's stem.
    json_output_path = CONVERSION_OUTPUT_DIR / f"{file.stem}.json"
    json_output_path.write_text(json.dumps(doc_dict))
    print(f"Path of JSON output is: {json_output_path.resolve()}")

## Chunking

The goal of chunking the converted documents is to provide the teacher model small and logical pieces of the source document to generate data off of.

In this notebook we are doing chunking with [Docling](https://docling-project.github.io/docling/examples/hybrid_chunking/#hybrid-chunking).

The input to this notebook is a docling JSON file created after a docling conversion, or a directory of docling JSON files.

## Initialize the Chunker

Docling provides two chunkers, the `HierarchicalChunker` and the `HybridChunker`.
The `HierarchicalChunker` creates chunks based on the hierarchy in the Docling document.

The `HybridChunker` builds on the `HierarchicalChunker` by making it tokenization-aware.

The `HybridChunker` has options for a `tokenizer`, the `max_tokens` in a chunk, and whether to merge undersized peer chunks.

In [None]:
from docling.chunking import HybridChunker

# Chunker shared by the chunking and authoring cells below; constructed with
# no arguments, so none of its options are overridden here.
chunker = HybridChunker() # TODO: expose configuration options

## Load and chunk the converted docling document

Next, let's convert the document we want to chunk into a Docling Document.

In [None]:
# all_chunks: one dict per chunk ({"chunk": serialized text, "file": source file stem}).
# docs: the conversion result of every file, reused by the authoring cells.
all_chunks = []
docs = []
for file in files:
    # Reuse the converter configured earlier so the pipeline options apply here too.
    doc = doc_converter.convert(source=file)
    docs.append(doc)

    chunk_objs = list(chunker.chunk(dl_doc=doc.document))
    chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_objs]

    # The document name lives on the converted document (a bare `document` is undefined here).
    print(f"Extracted {len(chunks)} chunks from {doc.document.name}")

    all_chunks.extend(dict(chunk=chunk, file=file.stem) for chunk in chunks)

# TODO: save all chunks to single file for review

## View the Chunks

To view the chunks, run through the following cell. As you can see the document is broken into small pieces with metadata about the chunk based on the document's format

In [None]:
# Show the first extracted chunk as a sample. Read it from all_chunks rather than
# from a variable left over from the chunking loop, and cope with an empty result.
if all_chunks:
    print(all_chunks[0]["chunk"])
else:
    print("No chunks were extracted.")

## Save the chunks to a text file for each chunk

Each chunk is saved to an individual text file named `{docling-json-file-name}-{chunk #}.txt`. Saving the chunks in this format is important because they are the input to the create-sdg-seed-data notebook.

In [None]:
# Write every chunk to its own text file. The index runs over all chunks of all
# files, so file names stay unique across documents.
for i, chunk in enumerate(all_chunks):
    chunk_path = CHUNKING_OUTPUT_DIR / f"{chunk['file']}-{i}.txt"
    # Explicit UTF-8: chunk text comes from arbitrary documents and may contain
    # characters the platform-default encoding cannot represent.
    chunk_path.write_text(chunk["chunk"], encoding="utf-8")

# Authoring

In [None]:
!pip install -qq docling-sdg

In [None]:
from docling_sdg.qa.utils import get_qa_chunks

# Keep only chunks whose text is longer than 500 characters.
filters = [
    lambda chunk: len(str(chunk.text)) > 500
]

# dataset maps each document name to its list of filtered QA chunks.
dataset = {}
for doc in docs:
    print(f"Chunking and filtering document {doc.document.name}")

    # Chunk this document and filter *its own* chunks; do not reuse the
    # chunk list left over from the chunking cell's last iteration.
    chunks = list(chunker.chunk(dl_doc=doc.document))
    qa_chunks = list(get_qa_chunks(doc.document.name, chunks, filters))
    dataset[doc.document.name] = qa_chunks

    print(f"Created dataset {doc.document.name} with {len(qa_chunks)} QA chunks")

## Initialize QA generator, supplying details for which model to use

GenerateOptions controls which model is used for QA generation by setting generate_options.provider below. Three options are available:

* `LlmProvider.WATSONX` for watsonx
* `LlmProvider.OPENAI` for OpenAI
* `LlmProvider.OPENAI_LIKE` for any model provider with OpenAI-compatible APIs

In [None]:
from docling_sdg.qa.generate import Generator
from docling_sdg.qa.base import GenerateOptions, LlmProvider
from pydantic import SecretStr

# Build the options with placeholder values first, then override individual
# fields for the provider actually used.
# NOTE(review): "fake" is a placeholder API key; presumably a real key is required
# for hosted providers -- confirm, and load it from the environment rather than hardcoding it.
generate_options = GenerateOptions(api_key="fake", project_id="project_id")
generate_options.provider = LlmProvider.OPENAI_LIKE  # any provider with OpenAI-compatible APIs
generate_options.api_key = SecretStr("fake")
generate_options.model_id = "granite3.3"  # model used for QA generation

## Configure subset selection

In [None]:
# Number of QA chunks randomly sampled per document for QA generation.
NUM_CHUNKS_TO_SELECT_FOR_AUTHORING = 5

## Run QA generation on selected chunks

In [None]:
import random #TODO: replace random sampling with subset selection

gen = Generator(generate_options=generate_options)
for doc, chunks in dataset.items():
    if not chunks:
        # Nothing passed the filters for this document; there is nothing to sample.
        print(f"{doc}: no QA chunks available, skipping")
        continue

    generate_options.generated_file = AUTHORING_OUTPUT_DIR / f"qagen-{doc}.json"

    print(f"processing chunks that looks like:\n{chunks[0].text}")
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size at the number of available chunks.
    num_to_select = min(NUM_CHUNKS_TO_SELECT_FOR_AUTHORING, len(chunks))
    selected_chunks = random.sample(chunks, num_to_select)
    print(f"Selected {len(selected_chunks)} contexts")

    results = gen.generate_from_chunks(selected_chunks) # automatically saves to file

    print(f"{doc}: {results.status}")
    # Only the first document with chunks is processed; the following cells
    # read the single file referenced by generate_options.generated_file.
    break

## Read generated QAs and restructure

In [None]:
import json
import yaml
from textwrap import wrap

# qnas: chunk id -> list of {'question', 'answer'} dicts.
# chunk_id_to_text: chunk id -> context text of that chunk.
qnas = {}
chunk_id_to_text = {}
with open(generate_options.generated_file, "rt") as f:
    # Each line of the generated file is parsed as one JSON entry.
    for line in f:
        entry = json.loads(line)
        chunk_id = entry['chunk_id']
        # The first context seen for a chunk id is the one that is kept.
        if chunk_id not in chunk_id_to_text:
            chunk_id_to_text[chunk_id] = entry['context']
        qa_pair = {'question': entry['question'], 'answer': entry['answer']}
        qnas.setdefault(chunk_id, []).append(qa_pair)

print(list(qnas.values())[0])

## Output qna.yaml

In [None]:

# Output file for the YAML: same stem as the generated QA file, inside the authoring directory.
# NOTE(review): the resulting path has no file extension -- confirm whether ".yaml" is intended.
qna_output_path = AUTHORING_OUTPUT_DIR / generate_options.generated_file.stem

def str_presenter(dumper, data):
    """Represent a string as a YAML scalar, using block style for long text.

    - Strings spanning more than one line are emitted in literal block style (``|``).
    - Single-line strings longer than 80 characters are first wrapped at
      80 columns and then emitted in literal block style.
    - Everything else is emitted with the dumper's default scalar style.
    """
    tag = 'tag:yaml.org,2002:str'
    if len(data.splitlines()) > 1:
        style = '|'
    elif len(data) > 80:
        # Wrapping inserts newlines, so the text becomes a multi-line block.
        data = "\n".join(wrap(data, 80))
        style = '|'
    else:
        style = None
    return dumper.represent_scalar(tag, data, style=style)

# Register str_presenter as the representer for `str` values.
yaml.add_representer(str, str_presenter)

# Also register it on SafeRepresenter so that safe_dump uses the same presentation.
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

class IndentedDumper(yaml.Dumper):
    """yaml.Dumper variant that never uses the "indentless" layout."""

    def increase_indent(self, flow=False, indentless=False):
        # The caller's `indentless` value is deliberately ignored; False is always forwarded.
        return super().increase_indent(flow, False)

# One seed example per chunk: its context plus every question/answer pair generated for it.
data = {
    'seed_examples': [
        {
            'context': context,
            'questions_and_answers': [
                {'question': example['question'], 'answer': example['answer']}
                for example in qnas[chunk_id]
            ],
        }
        for chunk_id, context in chunk_id_to_text.items()
    ]
}

# Keep the insertion order of keys (sort_keys=False) and use block style throughout.
with open(qna_output_path, 'w') as yaml_file:
    yaml.dump(
        data,
        yaml_file,
        Dumper=IndentedDumper,
        default_flow_style=False,
        sort_keys=False,
        width=80,
    )

print("Done")

In [None]:
## View generated qna.yaml

In [None]:
# Print the YAML file that was just written.
print(qna_output_path.read_text())

In [None]:
# sdg