In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [None]:
import hashlib

def generate_document_id(doc):
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [10]:
n = len(documents)

for doc in documents:
    doc['id'] = generate_document_id(doc)

In [12]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '0bbf41ec'}

In [14]:
import json

with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [23]:
prompt_template = """
You emulate a student who's taking our course. 
Formulate 5 questions this student might ask based on a FAQ record.
The record should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record.

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in a parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [19]:
from openai import OpenAI

client = OpenAI()

In [22]:
doc = documents[9]

prompt = prompt_template.format(**doc)

In [28]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content

    return json.loads(json_response)

In [30]:
json_response = generate_questions(doc)

In [32]:
from tqdm.auto import tqdm

In [33]:
results = {}

for doc in tqdm(documents):
    doc_id = doc['id']
    
    if doc_id in results:
        continue
        
    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/948 [00:00<?, ?it/s]

In [37]:
final_results = []

for doc_id, questions in results.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

In [54]:
for doc_id, questions in results.items():
    course = doc_index[doc_id]['course']
    print(questions)
        

[
  "When is the start date and time for the course?",
  "How can I subscribe to the course's Google Calendar?",
  "Where do I register before the course begins?",
  "How do I join the course's Telegram channel?",
  "Should I register in DataTalks.Club's Slack for this course?"
]
[
  "Where can I find the prerequisites for this course?",
  "What should I know before joining the course?",
  "Is there a list of required skills for this course?",
  "Can you point me to the course prerequisites?",
  "What background knowledge is necessary for this course?"
]
[
  "Can I register for the course after it has already started?",
  "Am I allowed to submit homework if I join the course late?",
  "Are there deadlines for submitting final projects if I join late?",
  "If I don’t register on time, can I still participate in the course?",
  "What should I keep in mind if I join the course after it has begun?"
]
[
  "Is it necessary to wait for a confirmation email before starting the Data Engineering

In [57]:
import pickle

with open('results.pkl', 'wb') as archivo:
    pickle.dump(results, archivo)

In [38]:
import pandas as pd

In [39]:
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])

In [40]:
df

Unnamed: 0,question,course,document
0,[,data-engineering-zoomcamp,c02e79ef
1,\n,data-engineering-zoomcamp,c02e79ef
2,,data-engineering-zoomcamp,c02e79ef
3,,data-engineering-zoomcamp,c02e79ef
4,"""",data-engineering-zoomcamp,c02e79ef
...,...,...,...
389497,n,mlops-zoomcamp,886d1617
389498,s,mlops-zoomcamp,886d1617
389499,?,mlops-zoomcamp,886d1617
389500,"""",mlops-zoomcamp,886d1617


In [None]:
df.to_csv('ground-truth-data.csv', index=False)