In [85]:
import sys
import os

# Put the parent directory on the import path (presumably so the project-local
# `config` and `evaluation` packages resolve -- confirm against the repo layout).
# Guarded so that re-running this cell does not stack duplicate entries.
_parent_dir = os.path.abspath("../")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [86]:
from openai import AzureOpenAI
from config import settings
import json
from evaluation.schemas import QAPairList

In [87]:
# Shared Azure OpenAI client used by every request in this notebook.
# The endpoint and key are read from the project settings; the API version is pinned here.
client = AzureOpenAI(
    api_version="2025-01-01-preview",
    azure_endpoint=settings.AZURE_GPT_ENDPOINT,
    api_key=settings.AZURE_GPT_KEY,
)

In [88]:
def generate_context(chunk: dict) -> str:
    """Return the context text for ``chunk``.

    Placeholder implementation: ``chunk`` is not read and the result is always
    a string made of two newline characters.
    """
    blank_context = "\n" * 2
    return blank_context

In [89]:
def generate_prompt(context: str, num_pairs: int = 2) -> str:
    """Build the QA-generation prompt for one piece of documentation.

    Args:
        context: Text placed between the ``<start documentation>`` and
            ``<end documentation>`` markers of the prompt.
        num_pairs: How many question-answer pairs the prompt asks for.
            Defaults to 2, which reproduces the previously hard-coded prompt.

    Returns:
        The full prompt with leading and trailing whitespace stripped.

    Raises:
        ValueError: If ``num_pairs`` is smaller than 1.
    """
    if num_pairs < 1:
        raise ValueError(f"num_pairs must be at least 1, got {num_pairs}")

    # The prompt body is runtime text sent to the model; only the pair count
    # and the documentation are interpolated.
    prompt = f"""
## **SYSTEM INSTRUCTION**
You are an expert in generating diverse, high-quality Question-Answer pairs from structured documents. 
Your goal is to create QA pairs that can effectively evaluate a **text-to-SQL Retrieval-Augmented Generation (RAG)** system. 
You will be provided with a small set of document chunks in JSON format, each representing a database record. 
These are **partial views** of the full dataset and not the entire database.

## **Guidelines**

### DO:
- Base your questions primarily on the **'usevec_description'** field of each document.
- Ensure questions are **contextually relevant** to the overall theme of the provided chunks.
- Keep questions **clear, natural, and realistically answerable** from the documentation alone.
- Ensure that **answers are self-contained**, without requiring external knowledge.
- You can create the question relevant sometimes without specifying the exact details like "Dairy products" instead of "Milk, Whey, Curd, Butter, etc"
- Do not be too specific all the time, and not also too vague.

### AVOID THE FOLLOWING:
- Referring to “this data” or “provided data” in the question. because in RAG System "this data" refers to entire document corpus.
- Asking about clustering metadata (e.g., `cluster_tsne`, `cluster_kmeans`).
- Making questions that assume the user has access to the full database.

### **Bad Questions**
- What goods are categorized under chapter heading '5303' according to the provided data?
- Compare IGST rates for goods in chapter headings '6506' and '5609'.
- Which items have the same numeric values?
- Are there any items listed in the records with specified GST rates?
- Considering the clustering information provided...
- How are records grouped by 'cluster_kmeans' and 'cluster_tsne'?

### **Good Questions**
- What services are exempt when offered at ancient monuments protected under Indian heritage laws?
- Which item under chapter heading 9801 relates to machinery for project setups?
- Name the item classified under chapter 3002 and describe its medical application.
- Are domestic services taxable under GST?
- Are there any items that carry Compensation Cess but no GST?
- What are the Tax rates for “Sugar Items” and “Caffeinated Drinks”?

## **OUTPUT INSTRUCTIONS**

- Generate **{num_pairs} distinct question-answer pairs** based on the documentation below.
- Use a mix of **question types**: factual, inferential, analytical, or multi-record-based.
- Ensure **variation in difficulty**: some should be straightforward, others requiring synthesis.
- **Each QA pair should be self-contained**, fully answerable using the provided context.

## **DOCUMENTATION**
<start documentation>
{context}
<end documentation>
""".strip()
    return prompt


def generate_qa_pairs(context: str):
    """Ask the chat model for QA pairs about ``context`` and return a ``QAPairList``.

    The prompt comes from ``generate_prompt(context)`` and is sent as a single
    user message with ``QAPairList`` as the structured response format.

    Args:
        context: Documentation text to generate questions from.

    Returns:
        The ``QAPairList`` parsed by the SDK when available, otherwise the
        result of validating the stripped message content as JSON.

    Raises:
        ValueError: If the response has no choices, the model refused, or the
            message carries no content to parse (previously this surfaced as
            an ``AttributeError`` on ``None.strip()``).
    """
    response = client.beta.chat.completions.parse(
        messages=[{"role": "user", "content": generate_prompt(context)}],
        model="gpt-4o",
        temperature=0.8,
        response_format=QAPairList,
    )

    if not response.choices:
        raise ValueError("Model response contained no choices")
    message = response.choices[0].message

    # A refusal leaves no usable content, so surface the stated reason.
    refusal = getattr(message, "refusal", None)
    if refusal:
        raise ValueError(f"Model refused to generate QA pairs: {refusal}")

    # The parse helper may already hold the validated object; reuse it rather
    # than re-parsing the raw JSON text.
    parsed = getattr(message, "parsed", None)
    if isinstance(parsed, QAPairList):
        return parsed

    content = message.content
    if not content or not content.strip():
        raise ValueError("Model returned no content to parse into QA pairs")
    return QAPairList.model_validate_json(content.strip())

In [90]:
FILENAME = "./synthetic_dataset/synthetic_dataset.json"

# Load the samples written so far; main() appends to this list and rewrites FILENAME.
# The file is opened in a `with` block so the handle is closed right after loading.
try:
    with open(FILENAME, "r", encoding="utf-8") as _existing_file:
        EXISTING_LIST = json.load(_existing_file)
except FileNotFoundError:
    # No dataset written yet: start from an empty list instead of failing the cell.
    EXISTING_LIST = []


def main(file_name):
    """Generate QA pairs for every record of one sample file and persist them.

    Reads ``./sample_data/{file_name}_sample.json``, calls ``generate_qa_pairs``
    on each record (serialised as indented JSON), and after every record
    rewrites ``FILENAME`` with ``EXISTING_LIST`` plus the new sample. A record
    whose sample cannot be saved is reported and skipped; ``EXISTING_LIST``
    only grows after a successful write.

    Args:
        file_name: Prefix of the sample file; also used as the prefix of the
            stored ``cluster_tsne`` label.

    Raises:
        Whatever ``generate_qa_pairs`` or the record lookups raise; only
        failures while saving a sample are caught.
    """
    # Load everything up front so the input handle is closed before the
    # (slow) model calls start.
    with open(f"./sample_data/{file_name}_sample.json", "r", encoding="utf-8") as sample_file:
        data = json.load(sample_file)

    for i, item in enumerate(data):
        qa_pair = generate_qa_pairs(json.dumps(item, indent=2))

        sample = qa_pair.model_dump()
        sample.update(
            {
                "id": item["id"],
                "cluster_tsne": file_name + "_" + str(item["cluster_tsne"]),
                "chunk": item,
            }
        )

        try:
            # Serialise BEFORE opening the output in "w" mode: opening truncates
            # the file, so a serialisation error after that point would wipe
            # every previously saved sample.
            payload = json.dumps(EXISTING_LIST + [sample], indent=2, default=str)
            with open(FILENAME, "w", encoding="utf-8") as out_file:
                out_file.write(payload)
        except Exception as exc:
            # Best-effort per row, but say why it failed and let
            # KeyboardInterrupt / SystemExit propagate.
            print("Failed for row", i, "-", repr(exc))
        else:
            EXISTING_LIST.append(sample)
            print("Generated for row", i)

In [91]:
# Sample sets to process; each name is passed to main() in turn.
files = ["goods", "services"]

for sample_set in files:
    main(sample_set)

Generated for row 0
Generated for row 1
Generated for row 2
Generated for row 3
Generated for row 4
Generated for row 5
Generated for row 6
Generated for row 7
Generated for row 8
Generated for row 9
Generated for row 10
Generated for row 11
Generated for row 12
Generated for row 13
Generated for row 14
Generated for row 15
Generated for row 16
Generated for row 17
Generated for row 18
Generated for row 19
Generated for row 20
Generated for row 21
Generated for row 22
Generated for row 23
Generated for row 24
Generated for row 25
Generated for row 26
Generated for row 27
Generated for row 28
Generated for row 29
Generated for row 30
Generated for row 31
Generated for row 32
Generated for row 33
Generated for row 34
Generated for row 35
Generated for row 36
Generated for row 37
Generated for row 38
Generated for row 39
Generated for row 40
Generated for row 41
Generated for row 42
Generated for row 43
Generated for row 44
Generated for row 45
Generated for row 46
Generated for row 47
Ge