In [1]:
# Enable IPython's autoreload extension so edits to imported modules are picked
# up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell is executed.
%autoreload 2

### Install SDG
 - `git clone https://github.com/Red-Hat-AI-Innovation-Team/SDG-Research.git && cd SDG-Research`
 - `pip install -r requirements.txt`
 - `pip install -e .`
 - `pip install rich datasets tabulate transformers`
 - If you haven't already, run the document pre-processing notebook to create the seed data

In [4]:
!pip install openai sdg_hub

Collecting sdg_hub
  Downloading sdg_hub-0.1.0a3-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain-text-splitters (from sdg_hub)
  Using cached langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain-text-splitters->sdg_hub)
  Downloading langchain_core-0.3.55-py3-none-any.whl.metadata (5.9 kB)
Collecting langsmith<0.4,>=0.1.125 (from langchain-core<1.0.0,>=0.3.51->langchain-text-splitters->sdg_hub)
  Downloading langsmith-0.3.33-py3-none-any.whl.metadata (15 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<1.0.0,>=0.3.51->langchain-text-splitters->sdg_hub)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting packaging (from datasets<4.0.0,>=2.18.0->sdg_hub)
  Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.4,>=0.1.125->langchain-core<1.0.0,>=0.3.51->langchain-text-splitters->sdg_hub)
  Using cached orjson-3.10.16-

In [5]:
# Standard
import os
import sys

# Third Party
from datasets import load_dataset
from openai import OpenAI
from sdg_hub.flow import Flow
from sdg_hub.pipeline import Pipeline
from sdg_hub.sdg import SDG

### Set up the OpenAI client for interacting with the model

In [7]:
# Connection settings for the OpenAI-compatible endpoint that serves the teacher model.
# OPENAI_BASE_URL / OPENAI_API_KEY from the environment take precedence, so real
# credentials never have to be saved in the notebook; if they are not set, edit
# the placeholders below.
endpoint = os.environ.get("OPENAI_BASE_URL", "<INSERT ENDPOINT HERE>")
openai_api_key = os.environ.get("OPENAI_API_KEY", "<INSERT KEY HERE>")
openai_api_base = endpoint

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# The first model reported by the endpoint is used as the teacher model.
# Fail with a clear message instead of a bare IndexError when nothing is served.
available_models = client.models.list().data
if not available_models:
    raise RuntimeError(f"No models are being served at {openai_api_base}")
teacher_model = available_models[0].id
print(teacher_model)

mistralai/Mixtral-8x7B-Instruct-v0.1


### Run SDG
- This creates the knowledge flow from the provided YAML file
- We run it on a small dataset for demo purposes
- For large-scale generation, please use the Python command provided in the next cell
- You can analyze the generated data to ensure its quality is similar to the provided QnA pairs

In [8]:
# Build the knowledge flow from its YAML definition and wrap it in an SDG
# runner using a single worker and a batch size of one.
knowledge_agentic_pipeline = "utils/synth_knowledge1.5.yaml"
flow_cfg = Flow(client).get_flow_from_file(knowledge_agentic_pipeline)

sdg_settings = {"num_workers": 1, "batch_size": 1, "save_freq": 1000}
sdg = SDG([Pipeline(flow_cfg)], **sdg_settings)

In [9]:
# Draw a small, reproducible sample of the seed data for the demo run.
number_of_samples = 5
seed_data_dir = "output"
seed_ds = load_dataset('json', data_files=f'{seed_data_dir}/seed_data.jsonl', split='train')
# Clamp the sample size so a seed file with fewer rows than requested does not
# make select() fail on an out-of-range index.
sample_size = min(number_of_samples, seed_ds.num_rows)
ds = seed_ds.shuffle(seed=42).select(range(sample_size))

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
# Run the SDG pipeline over the sampled seed dataset `ds`.
# The checkpoint directory is used to save the intermediate datasets.
# The 5-sample run recorded in this notebook's output took roughly 8 minutes.
generated_data = sdg.generate(ds, checkpoint_dir="Tmp")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 116508.44it/s]


  0%|                                                                                                                                                                                                                                                                | 0/5 [00:00<?, ?it/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Filter:   0%|          | 0/29 [00:00<?, ? examples/s]

Filter:   0%|          | 0/29 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Filter:   0%|          | 0/23 [00:00<?, ? examples/s]

Filter:   0%|          | 0/23 [00:00<?, ? examples/s]

 20%|█████████████████████████████████████████████████▌                                                                                                                                                                                                      | 1/5 [01:35<06:23, 95.84s/it]

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Filter:   0%|          | 0/33 [00:00<?, ? examples/s]

Filter:   0%|          | 0/33 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Filter:   0%|          | 0/29 [00:00<?, ? examples/s]

Filter:   0%|          | 0/29 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Filter:   0%|          | 0/24 [00:00<?, ? examples/s]

Filter:   0%|          | 0/24 [00:00<?, ? examples/s]

 40%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                                                    | 2/5 [03:12<04:49, 96.56s/it]

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

Filter:   0%|          | 0/42 [00:00<?, ? examples/s]

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Filter:   0%|          | 0/38 [00:00<?, ? examples/s]

Filter:   0%|          | 0/38 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37 [00:00<?, ? examples/s]

 60%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 3/5 [04:50<03:14, 97.08s/it]

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Filter:   0%|          | 0/24 [00:00<?, ? examples/s]

Filter:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ? examples/s]

 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                 | 4/5 [06:39<01:41, 101.74s/it]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26 [00:00<?, ? examples/s]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [07:54<00:00, 94.94s/it]


In [11]:
# Show a summary of the generated dataset, then write it to gen.jsonl in the
# same directory as the seed data.
gen_path = f"{seed_data_dir}/gen.jsonl"
print(generated_data)
generated_data.to_json(gen_path)

Dataset({
    features: ['document_outline', 'document_title', 'domain', 'icl_document', 'icl_query_1', 'icl_response_1', 'icl_query_2', 'icl_response_2', 'icl_query_3', 'icl_response_3', 'raw_document', 'dataset_type', 'document', 'question', 'response'],
    num_rows: 111
})


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

710793

### Save the generated data into training format

In [12]:
from sdg_hub.utils.parse_and_convert import create_knowledge_regular_ds, create_knowledge_pretraining_ds
from datasets import concatenate_datasets

# Directory that holds gen.jsonl and receives the training-format files.
output_dir = "output"

# Add the system prompt to final dataset if needed. For instructlab we use system prompt similar to below
# (defined here but not referenced elsewhere in this cell).
system_prompt_lab = (
    "I am a LAB Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.1-8b-base model. My primary role is to serve as a chat assistant."
)

# Reload the generated samples from the JSON Lines file on disk.
generated_ds = load_dataset('json', data_files=f'{output_dir}/gen.jsonl', split='train')

# Create Pretraining Knowledge Dataset (Also known as Phase 0.7/Phase 7)
phase_0_7_ds = create_knowledge_pretraining_ds(generated_ds)
phase_0_7_ds.to_json(f'{output_dir}/phase_0_7_ds.jsonl', orient='records', lines=True)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Filter:   0%|          | 0/111 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

809597