# Search Evaluation - Generate ground truth dataset

Notes following the [3.3.2 video](https://www.youtube.com/watch?v=bpxi6fKcyLw).

# Step 1: Generate IDs for each record
Here, I am using the already-generated IDs.

In [None]:
import os
import json
import hashlib
from tqdm.auto import tqdm

from huggingface_hub import InferenceClient
from dotenv import load_dotenv
import pandas as pd
load_dotenv()


def generate_document_id(doc):
    """Derive a short, deterministic identifier for a FAQ record.

    The identifier is the first 8 hex characters of the MD5 digest of the
    record's course, its question and the first 10 characters of its text,
    joined with hyphens.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]

# download documents
!wget -nc https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/01-intro/documents.json

# load documents.json and flatten them
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

# Stamp every flattened record with its generated identifier.
for doc in documents:
    doc.update(id=generate_document_id(doc))

# Persist the ID-annotated records as pretty-printed JSON.
with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(json.dumps(documents, indent=2))

# Step 2: Generate questions for each document

In [None]:
def llm(prompt, provider="novita", model="deepseek-ai/DeepSeek-V3-0324"):
    """Send `prompt` as a single user message and return the reply text.

    A new InferenceClient is built on every call, using the given provider
    and the API key read from the HF_API_KEY environment variable. The
    content of the first returned choice is handed back to the caller.
    """
    api_key = os.getenv('HF_API_KEY')
    client = InferenceClient(provider=provider, api_key=api_key)

    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

def generate_questions(doc, llm_params=None):
    """Fill the prompt template from `doc` and return the raw LLM reply.

    `doc` supplies the template's placeholders as keyword arguments;
    `llm_params`, when given, is forwarded to `llm` as keyword arguments.
    """
    extra_kwargs = {} if llm_params is None else llm_params
    filled_prompt = prompt_template.format(**doc)
    return llm(filled_prompt, **extra_kwargs)


# Prompt template used to generate questions for one FAQ record.
# The {section}, {question} and {text} placeholders are filled from a document
# dict via str.format() in generate_questions(). The model is asked to reply
# with a bare JSON array of five questions (no code blocks) so that the reply
# can be parsed as JSON in a later step. The trailing .strip() removes the
# newlines that directly follow and precede the triple quotes.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

# Ask the LLM for extra questions, one request per distinct document ID.
# Only the first five documents are processed here.
results = {}
for doc in tqdm(documents[:5]):
    doc_id = doc['id']
    # A repeated ID means this record was already handled; skip the extra call.
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)


# Step 3: Parse results into ground truth dataset

In [None]:
def _strip_code_fence(raw):
    """Return `raw` without a surrounding Markdown code fence, if it has one."""
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (it may carry a language tag such as "json").
        _, _, text = text.partition("\n")
        # Drop the closing fence and anything that follows it.
        text = text.rsplit("```", 1)[0]
    return text.strip()


# Decode each raw LLM reply into a Python list of questions.
parsed_results = {}

for doc_id, json_questions in results.items():
    try:
        # The prompt asks for bare JSON, but a reply may still arrive wrapped
        # in a code fence; unwrap it before decoding.
        parsed_results[doc_id] = json.loads(_strip_code_fence(json_questions))
    except json.JSONDecodeError as err:
        # Re-raise the same exception type, but say which document's reply was
        # malformed so it can be inspected or regenerated.
        raise json.JSONDecodeError(
            f"could not parse questions for document {doc_id}: {err.msg}",
            err.doc,
            err.pos,
        ) from err

# Look up documents by ID to recover each one's course.
doc_index = {d['id']: d for d in documents}

# One (question, course, document ID) row per generated question.
final_results = []
for doc_id, questions in parsed_results.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

# Write the ground-truth dataset without the DataFrame index column.
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])
df.to_csv('ground-truth-data.csv', index=False)

