In [7]:
# generate_dataset.py
import ollama
import json
from tqdm import tqdm
import random

# --- Configuration ---
# JSONL file holding the source text chunks that are read as input.
INPUT_CHUNKS_FILE = r"E:\Kishan Reddy Generation\cvpr_papers_chunks.jsonl"

# JSONL file that the generated examples are written to.
OUTPUT_DATASET_FILE = r"E:\Kishan Reddy Generation\cvpr_finetuning_dataset.jsonl"

# Target number of training examples.
NUM_EXAMPLES_TO_GENERATE = 200

# Name of the Ollama model used for generation.
OLLAMA_MODEL = "llama3"

# --- Prompt Template ---
# Instructions given to the LLM so that it returns one question/answer pair as
# a JSON object with the keys "instruction", "context" and "response".
# The single `{context}` placeholder is filled in with str.format() for each
# chunk. Despite the former "System Prompt" label, the formatted text is sent
# as the content of a *user*-role message, not as a system message.
PROMPT_TEMPLATE = """
From the following context taken from a computer vision research paper, please generate one high-quality question-and-answer pair that would be suitable for fine-tuning a large language model.

**Instructions:**
1.  The question should be a specific, technical question that can be answered *only* from the provided context.
2.  The answer should be a concise, clear, and direct response based *only* on the information in the context.
3.  Do not make up information.
4.  Your output MUST be a single, valid JSON object with the keys "instruction", "context", and "response".

**Context:**
"{context}"

**JSON Output:**
"""

# --- Main Script ---

# 1. Load the source chunks
# The input is read as JSON Lines: every non-empty line is parsed as one JSON
# document and collected into `chunks`.
print(f"Loading chunks from {INPUT_CHUNKS_FILE}...")
with open(INPUT_CHUNKS_FILE, 'r', encoding='utf-8') as f:
    # Skip blank / whitespace-only lines (e.g. a trailing empty line at the end
    # of the file): json.loads("") raises JSONDecodeError and would otherwise
    # abort the whole load.
    chunks = [json.loads(line) for line in f if line.strip()]
print(f"Loaded {len(chunks)} chunks.")

# Only chunks with enough text can yield an example: the generation loop skips
# any chunk whose 'page_content' has fewer than 50 whitespace-separated words.
# Filter those out *before* sampling; otherwise every short chunk that happens
# to be drawn silently shrinks the dataset below NUM_EXAMPLES_TO_GENERATE.
MIN_CONTEXT_WORDS = 50
eligible_chunks = [
    chunk for chunk in chunks
    # `or ''` treats a missing or null 'page_content' as empty text.
    if len((chunk.get('page_content') or '').split()) >= MIN_CONTEXT_WORDS
]

# Select a random subset of the eligible chunks to generate from, to get
# variety; when there are not more of them than the target, use them all.
if len(eligible_chunks) > NUM_EXAMPLES_TO_GENERATE:
    chunks_to_process = random.sample(eligible_chunks, NUM_EXAMPLES_TO_GENERATE)
else:
    chunks_to_process = eligible_chunks

# 2. Initialize the Ollama client
client = ollama.Client()

# 3. Generate the dataset
# Report the number of chunks actually queued: it is smaller than
# NUM_EXAMPLES_TO_GENERATE when fewer chunks than the target are available.
print(f"Generating {len(chunks_to_process)} examples using '{OLLAMA_MODEL}'...")
required_keys = ("instruction", "context", "response")
num_written = 0  # number of examples that passed validation and were saved

with open(OUTPUT_DATASET_FILE, 'w', encoding='utf-8') as f_out:
    for chunk_data in tqdm(chunks_to_process, desc="Generating Data"):
        # A chunk without usable text must not abort the whole run, so the
        # lookup is tolerant instead of raising KeyError/TypeError.
        context = chunk_data.get('page_content') if isinstance(chunk_data, dict) else None
        if not isinstance(context, str):
            print(f"Warning: Skipping chunk without 'page_content' text: {str(chunk_data)[:200]}")
            continue

        # Skip chunks that are too short to have meaningful content
        if len(context.split()) < 50:
            continue

        prompt = PROMPT_TEMPLATE.format(context=context)

        try:
            # Call the local Ollama model, requesting its built-in JSON mode
            response = client.chat(
                model=OLLAMA_MODEL,
                messages=[{'role': 'user', 'content': prompt}],
                format='json',
            )
            # The response content should be a JSON string; parse it
            data = json.loads(response['message']['content'])
        except Exception as e:
            # Deliberately broad: generation is best-effort per chunk, so any
            # failure of the model call or of JSON parsing is reported and the
            # loop moves on to the next chunk.
            print(f"\nAn error occurred: {e}")
            print(f"Problematic context: {context[:200]}...")
            continue

        # Validate the shape. The isinstance check matters: `k in data` on a
        # JSON array or string tests membership/substrings, not object keys.
        if isinstance(data, dict) and all(k in data for k in required_keys):
            # Write the valid example directly to the output file
            f_out.write(json.dumps(data) + '\n')
            num_written += 1
        else:
            print(f"Warning: Skipping malformed JSON object: {data}")

# Skipped and failed chunks produce no example, so report the real yield.
print(f"Wrote {num_written} valid examples from {len(chunks_to_process)} chunks.")

# Closing status report: completion notice, output location, and a reminder
# that the generated data still needs a manual quality check.
closing_messages = (
    "\nSynthetic dataset generation complete.",
    f"Data saved to {OUTPUT_DATASET_FILE}",
    "IMPORTANT: Please manually review the generated file for quality before fine-tuning.",
)
for message in closing_messages:
    print(message)



Loading chunks from E:\Kishan Reddy Generation\cvpr_papers_chunks.jsonl...
Loaded 2310 chunks.
Generating 200 examples using 'llama3'...


Generating Data:   2%|▎         | 5/200 [00:26<15:57,  4.91s/it]



Generating Data:   8%|▊         | 17/200 [01:36<16:44,  5.49s/it]



Generating Data:  22%|██▏       | 43/200 [12:48<57:25, 21.95s/it]   



Generating Data:  23%|██▎       | 46/200 [13:00<26:18, 10.25s/it]



Generating Data:  26%|██▌       | 52/200 [13:20<11:10,  4.53s/it]



Generating Data:  35%|███▌      | 70/200 [14:09<06:52,  3.17s/it]



Generating Data:  51%|█████     | 102/200 [15:32<05:38,  3.46s/it]



Generating Data:  56%|█████▋    | 113/200 [16:08<05:25,  3.75s/it]



Generating Data:  62%|██████▏   | 124/200 [16:30<02:25,  1.92s/it]



Generating Data:  76%|███████▌  | 152/200 [17:39<02:11,  2.74s/it]



Generating Data:  82%|████████▎ | 165/200 [18:17<02:08,  3.67s/it]



Generating Data:  90%|█████████ | 181/200 [19:02<00:58,  3.06s/it]



Generating Data:  98%|█████████▊| 197/200 [19:43<00:08,  2.98s/it]



Generating Data: 100%|██████████| 200/200 [19:54<00:00,  5.97s/it]


Synthetic dataset generation complete.
Data saved to E:\Kishan Reddy Generation\cvpr_finetuning_dataset.jsonl
IMPORTANT: Please manually review the generated file for quality before fine-tuning.



