In [1]:
# design goals:
#
# - understandability
# - modularity
# - configurability

In [None]:
# Configuration

# Root directory for this notebook's working files, kept as a plain string.
WORKSPACE_DIR = "workspaces/default"
# TODO: replace with pathlib

# TODO: mkdir etc

In [None]:
# Conversion step

# String path "<WORKSPACE_DIR>/conversion"; presumably where the conversion step
# writes its output (nothing in the visible cells reads it yet — TODO confirm).
CONVERSION_OUTPUT_DIR = f"{WORKSPACE_DIR}/conversion"
# TODO: replace this string path with pathlib

## Chunking
The goal of chunking for InstructLab SDG is to provide the teacher model with small, logical pieces of the source document from which to generate data.

In this notebook we are doing chunking with [Docling](https://docling-project.github.io/docling/examples/hybrid_chunking/#hybrid-chunking).

The input to this notebook is a docling JSON file created after a docling conversion, or a directory of docling JSON files.

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install docling

In [None]:
from pathlib import Path

from docling.chunking import HybridChunker, HierarchicalChunker
from docling.document_converter import DocumentConverter
from docling.exceptions import ConversionError

## Set the source document path

Here we're going to want to set the converted.json that comes from the conversion notebook.

If the conversion notebook was not run, then setting the path to the source document in any form is fine.

In [None]:
# Collect the Docling JSON files to chunk.
# `doc_path` may point either at a single file or at a directory, which is
# searched recursively for `*.json` files.
doc_path = Path("output")

if doc_path.is_file():
    files = [doc_path]
else:
    # rglob makes no ordering guarantee; sort so that the chunk numbering
    # used when the chunks are saved later is the same on every run.
    files = sorted(doc_path.rglob("*.json"))
print(f"Docling JSON's to chunk: {files}")

## Initialize the Chunker

Docling provides two chunkers, the `HierarchicalChunker` and the `HybridChunker`.
The `HierarchicalChunker` creates chunks based on the hierarchy in the Docling document.

The `HybridChunker` builds on the `HierarchicalChunker` by making it tokenization-aware.

The `HybridChunker` has options for a `tokenizer`, the `max_tokens` in a chunk, and whether to merge undersized peer chunks.

In [None]:
# Alternative (hierarchy-based chunking only): chunker = HierarchicalChunker()
# The HybridChunker is created here with its default options.
chunker = HybridChunker()

In [None]:
## Load and chunk the converted docling document

Next, let's convert the document we want to chunk up into a Docling Document.

In [None]:
# Convert each input file into a Docling document, chunk it, and collect the
# serialized text of every chunk together with the stem of its source file.
# Each entry of `all_chunks` is a dict with the keys "chunk" and "file".
converter = DocumentConverter()  # one converter is reused for every file
all_chunks = []
for file in files:
    try:
        doc = converter.convert(source=file).document
    except ConversionError as e:
        # Best effort: report the failing file (and why) and carry on.
        print(f"Skipping file {file}: {e}")
        continue
    chunks = [chunker.serialize(chunk=chunk) for chunk in chunker.chunk(dl_doc=doc)]
    all_chunks.extend(dict(chunk=chunk, file=file.stem) for chunk in chunks)

## View the Chunks

To view the chunks, run through the following cell. As you can see, the document is broken into small pieces with metadata about the chunk based on the document's format.

In [None]:
# Show only the first few chunks; displaying all of them would flood the output.
all_chunks[:3]

## Save the chunks to a text file for each chunk

Each chunk is saved to an individual text file in the format: `{docling-json-file-name}-{chunk #}.txt`. Saving the chunks in this format is important because they are the input to the create-sdg-seed-data notebook.

In [None]:
# Write every serialized chunk to its own text file named
# "<source file stem>-<index>.txt", where the index is the chunk's position
# in `all_chunks` (it keeps counting across source files).
output_dir = Path("output/chunks")
output_dir.mkdir(parents=True, exist_ok=True)  # the directory may not exist yet
for i, chunk in enumerate(all_chunks):
    # Single quotes inside the f-string: reusing double quotes is a
    # SyntaxError on Python versions older than 3.12.
    chunk_path = output_dir / f"{chunk['file']}-{i}.txt"
    # Explicit UTF-8 so non-ASCII text does not depend on the platform default.
    chunk_path.write_text(chunk["chunk"], encoding="utf-8")

In [None]:
# Authoring

In [None]:
# Predicates handed to get_qa_chunks; the only one is true for chunks whose
# text is longer than 500 characters.
filters = [
    lambda chunk: len(str(chunk.text)) > 500
]

# NOTE(review): `docs` and `get_qa_chunks` are not defined in the cells shown
# in this notebook — confirm where they come from.
dataset = {}
for doc in docs:
    doc_name = doc.document.name
    print(f"Chunking and filtering document {doc_name}")

    chunks = list(chunker.chunk(dl_doc=doc.document))
    qa_chunks = list(get_qa_chunks(doc_name, chunks, filters))
    dataset[doc_name] = qa_chunks

    print(f"Created dataset {doc_name} with {len(qa_chunks)} QA chunks")

# Initialize QA generator, supplying details for which model to use

GenerateOptions controls which model is used for QA generation by setting generate_options.provider below. Three options are available:

* LlmProviders.WATSONX for watsonx
* LlmProviders.OPENAI for OpenAI
* LlmProviders.OPENAI_LIKE for any model provider with OpenAI compatible APIs

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install docling-sdg

In [None]:
from docling_sdg.qa.generate import Generator
from docling_sdg.qa.base import GenerateOptions, LlmProviders

# Build the generation options with placeholder credentials.
generate_options = GenerateOptions(api_key="fake", project_id="project_id")
# Use a provider that exposes OpenAI-compatible APIs.
generate_options.provider = LlmProviders.OPENAI_LIKE
generate_options.api_key = "fake"
generate_options.model_id = "mixtral" # for local ollama
# File the generated QA pairs are written to; the same path is read back later.
# NOTE(review): `filename_base` is not defined in any cell shown in this
# notebook — confirm where it is set before running.
generate_options.generated_file = f"data/chunks-{filename_base}.jsonl"

# Generator configured with the options above.
gen = Generator(generate_options=generate_options)

In [None]:
# Trial run: generate QA pairs for a single document only. The loop stops
# after the first document that actually has chunks.
for doc, chunks in dataset.items():
    if not chunks:
        # A document can end up with no chunks; chunks[0] below would then
        # raise IndexError, so move on to the next document instead.
        print(f"{doc}: no chunks to process, skipping")
        continue
    print(f"processing chunks that looks like:\n{chunks[0].text}")
    results = gen.generate_from_chunks(chunks)
    print(f"{doc}: {results.status}")
    break

# Read generated QAs and restructure

In [None]:
import json
import yaml
from textwrap import wrap

# Group the generated question/answer pairs by the chunk they belong to.
# The generated file is read line by line, one JSON object per line; each
# object is read for its 'chunk_id', 'context', 'question' and 'answer' keys.
qnas = {}              # chunk_id -> list of {'question': ..., 'answer': ...}
chunk_id_to_text = {}  # chunk_id -> 'context' of the first entry seen for it
with open(generate_options.generated_file, "rt") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        entry = json.loads(line)
        chunk_id = entry['chunk_id']
        chunk_id_to_text.setdefault(chunk_id, entry['context'])
        qnas.setdefault(chunk_id, []).append(
            {'question': entry['question'], 'answer': entry['answer']}
        )

# A dict cannot be sliced, so preview the first entry through its items.
print(list(qnas.items())[:1])

# Output qna.yaml

In [None]:
def str_presenter(dumper, data):
    """Represent a string as a YAML scalar, using literal style for long text.

    Strings spanning more than one line are emitted unchanged in literal
    ('|') style. Single-line strings longer than 80 characters are re-wrapped
    at 80 columns and emitted in literal style as well. Every other string is
    emitted with the dumper's default scalar style.
    """
    str_tag = 'tag:yaml.org,2002:str'
    is_multiline = len(data.splitlines()) > 1
    if not is_multiline and len(data) <= 80:
        return dumper.represent_scalar(str_tag, data)
    # Multi-line text is kept as is; long single-line text is wrapped first.
    text = data if is_multiline else "\n".join(wrap(data, 80))
    return dumper.represent_scalar(str_tag, text, style='|')

# Register str_presenter as the representer for `str` values via PyYAML's
# module-level helper.
yaml.add_representer(str, str_presenter)

# Also register it on SafeRepresenter so that yaml.safe_dump uses it too.
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

class IndentedDumper(yaml.Dumper):
    """Dumper that always increases indentation with ``indentless=False``."""

    def increase_indent(self, flow=False, indentless=False):
        # The caller's `indentless` value is deliberately ignored; False is
        # always forwarded to the base implementation.
        return super().increase_indent(flow, False)

# Assemble one seed example per chunk: the chunk's context together with the
# question/answer pairs collected for it.
seed_examples = [
    {
        'context': context,
        'questions_and_answers': [
            {'question': qa['question'], 'answer': qa['answer']}
            for qa in qnas[chunk_id]
        ],
    }
    for chunk_id, context in chunk_id_to_text.items()
]
data = {'seed_examples': seed_examples}

# Dump as block-style YAML, keeping the key order used above.
with open('qna.yml', 'w') as yaml_file:
    yaml.dump(
        data,
        yaml_file,
        Dumper=IndentedDumper,
        default_flow_style=False,
        sort_keys=False,
        width=80,
    )

print("Done")

In [None]:
# sdg