In [1]:
# design goals:
#
# - understandability
# - modularity
# - configurability

# config

In [None]:
from pathlib import Path

In [None]:
# Root directory under which every pipeline stage keeps its outputs.
WORKSPACE_DIR = "workspaces/default"

# One output directory per pipeline stage. Each mkdir passes parents=True so
# that no directory depends on an earlier line having created the workspace
# root first (safe if these lines are reordered or run individually).
CONVERSION_OUTPUT_DIR = Path(WORKSPACE_DIR) / "conversion"
CONVERSION_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

CHUNKING_OUTPUT_DIR = Path(WORKSPACE_DIR) / "chunking"
CHUNKING_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

SEED_EXAMPLE_OUTPUT_DIR = Path(WORKSPACE_DIR) / "seed-examples"
SEED_EXAMPLE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

SDG_OUTPUT_DIR = Path(WORKSPACE_DIR) / "sdg"
SDG_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


# conversion 

In [None]:
!pip install docling

This notebook uses [Docling](https://docling-project.github.io/docling/examples/custom_convert/) to convert documents of any supported type into a Docling Document. A Docling Document is the representation of a document after conversion, and it can be exported as JSON. The JSON output of this notebook can then be used by other notebooks, such as one that applies Docling's chunking methods.

In [None]:
from docling.document_converter import DocumentConverter, ConversionError, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
import json
from pathlib import Path

First we set the paths for the documents we want to convert and where the JSON output should live.

In [None]:
# Source to convert: either a single PDF file or a directory containing PDFs.
doc_path = Path("/path/to/pdf")
# Converted JSON goes into the conversion directory created in the config cell.
output_dir = CONVERSION_OUTPUT_DIR

# A single file is used as-is; anything else is searched recursively for PDFs.
# The result is sorted so the conversion order is the same on every run.
if doc_path.is_file():
    files = [doc_path]
else:
    files = sorted(doc_path.rglob("*.pdf"))
print(f"Files to convert: {files}")

Next we set the configuration options for our conversion pipeline. The PDF conversion options set here are the defaults. More information about pipeline configuration can be found in the Docling documentation.

In [None]:
# Default PDF pipeline options; adjust attributes on this object to tune conversion.
pipeline_options = PdfPipelineOptions()

# Per-format settings handed to the converter: PDFs use the options above.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

Finally, we convert every document into Docling JSON, skipping any file that cannot be converted.

In [None]:
# Convert each input file to a Docling document and store it as JSON in output_dir.
# The destination is created up front so the first write cannot fail on a missing directory.
output_dir.mkdir(parents=True, exist_ok=True)

for file in files:
    # Only the conversion itself is guarded: a file that cannot be converted is
    # reported (with the reason) and skipped, while write errors still surface.
    try:
        doc = doc_converter.convert(source=file).document
    except ConversionError as e:
        print(f"Skipping file {file}: {e}")
        continue

    doc_dict = doc.export_to_dict()
    json_output_path = output_dir / f"{file.stem}.json"
    with open(json_output_path, "w", encoding="utf-8") as f:
        json.dump(doc_dict, f)
    print(f"Path of JSON output is: {Path(json_output_path).resolve()}")

In [None]:
# chunking

In [None]:
# authoring

# TODO: plug into docs
# NOTE(review): `docs`, `chunker` and `get_qa_chunks` are not defined in any
# cell visible in this notebook; they must be provided before this cell runs.

# Predicates handed to get_qa_chunks; here: chunk text longer than 500 characters.
filters = [
    lambda candidate: len(str(candidate.text)) > 500,
]

# Maps each document name to the list of QA chunks produced for it.
dataset = {}
for doc in docs:
    doc_name = doc.document.name
    print(f"Chunking and filtering document {doc_name}")

    chunks = [piece for piece in chunker.chunk(dl_doc=doc.document)]
    qa_chunks = [qa_chunk for qa_chunk in get_qa_chunks(doc_name, chunks, filters)]
    dataset[doc_name] = qa_chunks

    print(f"Created dataset {doc_name} with {len(qa_chunks)} QA chunks")

# Initialize QA generator, supplying details for which model to use

`GenerateOptions` controls which model is used for QA generation; the provider is selected by setting `generate_options.provider` below. Three options are available:

* LlmProviders.WATSONX for watsonx
* LlmProviders.OPENAI for OpenAI
* LlmProviders.OPENAI_LIKE for any model provider with OpenAI compatible APIs

In [None]:
!pip install docling-sdg

In [None]:
from docling_sdg.qa.generate import Generator
from docling_sdg.qa.base import GenerateOptions, LlmProviders

# Options for QA generation. The api_key is supplied once, in the constructor;
# "fake" and "project_id" are placeholders to be replaced with real values.
generate_options = GenerateOptions(api_key="fake", project_id="project_id")
generate_options.provider = LlmProviders.OPENAI_LIKE
generate_options.model_id = "mixtral" # for local ollama
# Generated QA pairs are written as JSON Lines into the SDG directory created
# in the config cell (the previous path used an undefined `filename_base`).
generate_options.generated_file = SDG_OUTPUT_DIR / "chunks.jsonl"

gen = Generator(generate_options=generate_options)

In [None]:
# Generate QA pairs for the first document that has at least one chunk.
# The trailing `break` limits generation to a single document.
for doc, chunks in dataset.items():
    if not chunks:
        # Filtering can leave a document without chunks; chunks[0] would raise.
        print(f"{doc}: no QA chunks, skipping")
        continue
    print(f"processing chunks that looks like:\n{chunks[0].text}")
    results = gen.generate_from_chunks(chunks)
    print(f"{doc}: {results.status}")
    break

# Read generated QAs and restructure

In [None]:
import json
import yaml
from textwrap import wrap

# Read the generated JSON Lines file and group its entries by chunk:
#   qnas:             chunk_id -> list of {'question', 'answer'} dicts
#   chunk_id_to_text: chunk_id -> context text of the first entry seen for that chunk
qnas = {}
chunk_id_to_text = {}
with open(generate_options.generated_file, "rt", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        entry = json.loads(line)
        chunk_id = entry['chunk_id']
        if chunk_id not in chunk_id_to_text:
            chunk_id_to_text[chunk_id] = entry['context']
        qnas.setdefault(chunk_id, []).append(
            {'question': entry['question'], 'answer': entry['answer']}
        )

# Preview the first chunk's QA pairs. A dict cannot be sliced, so slice its items.
print(list(qnas.items())[:1])

# Output qna.yaml

In [None]:
def str_presenter(dumper, data):
    """Represent a string for YAML output, using literal block style for long text.

    Strings spanning more than one line are emitted with the literal style
    ('|') unchanged. Single-line strings longer than 80 characters are first
    wrapped to 80 columns and then emitted with the literal style. Every
    other string is emitted with the dumper's default scalar style.

    Args:
        dumper: object providing ``represent_scalar(tag, value, style=...)``.
        data: the string to represent.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the chosen style.
    """
    str_tag = 'tag:yaml.org,2002:str'
    is_multiline = len(data.splitlines()) > 1

    # Short single-line strings keep the default scalar style.
    if not is_multiline and len(data) <= 80:
        return dumper.represent_scalar(str_tag, data)

    # Long single-line strings are wrapped before being emitted as a block.
    if not is_multiline:
        data = "\n".join(wrap(data, 80))
    return dumper.represent_scalar(str_tag, data, style='|')

# Register str_presenter as the representer for `str` via the module-level
# yaml.add_representer helper (this changes global PyYAML state).
yaml.add_representer(str, str_presenter)

# Also register it on SafeRepresenter so that yaml.safe_dump uses the same
# string presentation.
yaml.representer.SafeRepresenter.add_representer(str, str_presenter)

class IndentedDumper(yaml.Dumper):
    """yaml.Dumper variant that always requests indented (never indentless) output."""

    def increase_indent(self, flow=False, indentless=False):
        """Forward to the base implementation, forcing ``indentless`` to False.

        The caller's ``indentless`` argument is deliberately ignored —
        presumably so that list items are indented beneath their parent key.
        """
        return super().increase_indent(flow, False)

# Build the seed-example structure: one entry per chunk, holding the chunk's
# context text and every question/answer pair generated for it.
data = {
    'seed_examples': [
        {
            'context': context,
            'questions_and_answers': [
                {
                    'question': example['question'],
                    'answer': example['answer'],
                } for example in qnas[chunk_id]
            ],
        }
        for chunk_id, context in chunk_id_to_text.items()
    ]
}

# Write qna.yaml into the seed-examples directory created in the config cell
# instead of dropping a file into the current working directory.
qna_output_path = SEED_EXAMPLE_OUTPUT_DIR / "qna.yaml"
with open(qna_output_path, 'w', encoding='utf-8') as yaml_file:
    yaml.dump(data, yaml_file, Dumper=IndentedDumper, default_flow_style=False, sort_keys=False, width=80)

print(f"Wrote seed examples to {qna_output_path.resolve()}")
print("Done")

In [None]:
# sdg