In [1]:
import requests

# Source of the FAQ corpus: a JSON array with one entry per course, each entry
# holding the course name and that course's list of FAQ documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Bound the wait with a timeout and fail loudly on an HTTP error, so an error
# page is never handed to .json() and mistaken for the corpus.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten into a single list, tagging every document with its course name.
documents = []
for course in documents_raw:
    course_name = course['course']
    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
import hashlib
def generate_document_id(doc):
    """Return a short, deterministic identifier for a FAQ document.

    The identifier is the first 8 hex characters of the MD5 digest of
    "<course>-<question>-<first 10 characters of text>".

    Args:
        doc: Mapping providing 'course', 'question' and 'text' entries.

    Returns:
        An 8-character hexadecimal string. Documents that share the course,
        the question and the first 10 characters of the text get the same id.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    return digest[:8]

In [3]:
# Attach the generated identifier to every document, in place.
for record in documents:
    record.update(id=generate_document_id(record))

In [4]:
# Spot-check a single record to confirm the 'id' field is now present.
documents[100]

{'text': 'In this section of the course, the 5432 port of pgsql is mapped to your computer’s 5432 port. Which means you can access the postgres database via pgcli directly from your computer.\nSo No, you don’t need to run it inside another container. Your local system will do.',
 'section': 'Module 1: Docker and Terraform',
 'question': 'PGCLI - INKhould we run pgcli inside another docker container?',
 'course': 'data-engineering-zoomcamp',
 'id': '176ce516'}

In [5]:
from collections import defaultdict

In [6]:
# Group the documents by id so that ids shared by more than one document can
# be counted by comparing the number of groups with the number of documents.
hashes = defaultdict(list)
for record in documents:
    hashes[record['id']].append(record)
    

In [7]:
# Distinct ids vs. total documents; a smaller first number means some id is shared.
len(hashes), len(documents) 

(947, 948)

In [8]:
# Look for ids shared by several documents and print each with its document
# count; with just a single duplicate it is practically irrelevant.
for doc_id, group in hashes.items():
    if len(group) <= 1:
        continue
    print(doc_id, len(group))

593f7569 2


In [9]:
hashes['593f7569']  # show the documents that share this id

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [10]:
import json

In [11]:
# Persist the id-tagged documents as indented JSON.
serialized = json.dumps(documents, indent=2)
with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(serialized)

In [12]:
# Peek at the first lines of the written file to confirm its structure.
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [None]:
# Prompt sent to the model for a single FAQ record. The {section}, {question}
# and {text} placeholders are filled with str.format, and the model is asked
# to answer with a bare JSON array of five questions (no code block).
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [None]:
from dotenv import load_dotenv
import os

# Load settings via python-dotenv; presumably this supplies the API key used
# by the OpenAI client -- verify against your local .env setup.
load_dotenv() 

In [None]:
from openai import OpenAI
# Client built with no explicit arguments; presumably it reads its credentials
# from the environment -- TODO confirm.
client = OpenAI()

In [None]:
def generate_questions(doc):
    """Ask the chat model for student-style questions about one FAQ record.

    Args:
        doc: Mapping whose entries fill the placeholders of ``prompt_template``
            (section, question, text).

    Returns:
        The message content of the first completion choice, returned as-is
        without any parsing.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [13]:
from tqdm.auto import tqdm
import pickle

In [14]:
# results maps document id -> raw model response.
# Start from the checkpoint of a previous run when one exists, so documents
# that were already processed are skipped by the generation loop instead of
# being sent to the API again; on the first run fall back to an empty mapping.
# NOTE: unpickling can execute arbitrary code; only load a results.bin that
# you produced yourself.
try:
    with open('results.bin', 'rb') as f_in:
        results = pickle.load(f_in)
except FileNotFoundError:
    results = {}

In [None]:
SAVE_EVERY = 5  # write a checkpoint after this many newly generated documents

# Generate questions for every document whose id is not yet in `results`.
# Checkpoints are counted in newly generated documents rather than list
# positions, and the `finally` clause writes whatever was generated since the
# last checkpoint. That way neither the tail of a complete run nor the
# progress made before an interruption or API error is missing from
# results.bin.
generated = 0
try:
    for doc in tqdm(documents):
        doc_id = doc['id']
        if doc_id in results:
            continue

        results[doc_id] = generate_questions(doc)
        generated += 1

        # Periodic checkpoint
        if generated % SAVE_EVERY == 0:
            with open('results.bin', 'wb') as f_out:
                pickle.dump(results, f_out)
finally:
    # Final checkpoint for responses not covered by the last periodic save
    if generated % SAVE_EVERY != 0:
        with open('results.bin', 'wb') as f_out:
            pickle.dump(results, f_out)

In [15]:
# Reload the responses from the checkpoint file written by the generation loop.
# NOTE: unpickling can execute arbitrary code; only load a file you produced yourself.
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

In [16]:
# Inspect one stored response: it is still a JSON-encoded string, not a list.
results['1f6520ca']

'[\n    "Where can I find the prerequisites for the course?",\n    "Are there prerequisites listed on GitHub for this course?",\n    "What platform contains the course prerequisites?",\n    "Which specific GitHub repository outlines the prerequisites?",\n    "Can I access the prerequisites on DataTalksClub\'s GitHub?"\n]'

In [17]:
# First, naive attempt: decode every raw response directly as JSON.
# json.loads raises JSONDecodeError on a response that is not bare JSON, which
# aborts the loop and leaves parsed_resulst only partially filled.
parsed_resulst = {}

for doc_id, json_questions in results.items():
    parsed_resulst[doc_id] = json.loads(json_questions)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [18]:
# Decode the raw model responses, tolerating Markdown code fences.
#
# The prompt asks for bare JSON, but a response may still arrive wrapped in a
# fenced block. Stripping only a leading "```json" leaves a bare "```" fence
# or any other language tag in place, which then fails to parse and ends up as
# an empty list; the pattern below removes an opening fence with any tag.
import json
import re

# An opening fence (with optional language tag) at the very start, or a
# closing fence at the very end, of an already-stripped response.
_CODE_FENCE = re.compile(r"^```[A-Za-z]*\s*|\s*```$")

parsed_resulst = {}

for doc_id, json_questions in results.items():
    cleaned = _CODE_FENCE.sub("", json_questions.strip()).strip()
    try:
        parsed_resulst[doc_id] = json.loads(cleaned)
    except json.JSONDecodeError as e:
        # Report the offending response but keep going; this document simply
        # contributes no questions.
        print(f"[ERROR] No se pudo parsear JSON en '{doc_id}': {e}")
        print("Contenido:", json_questions)
        parsed_resulst[doc_id] = []


In [28]:
 # Display the parsed questions keyed by document id.
 parsed_resulst

{'c02e79ef': ['When does the course officially begin?',
  'What time is the first live session scheduled?',
  'How can I access the course schedule?',
  'Where should I register before the course starts?',
  'How do I join the course communication channels?'],
 '1f6520ca': ['Where can I find the prerequisites for the course?',
  'Are there prerequisites listed on GitHub for this course?',
  'What platform contains the course prerequisites?',
  'Which specific GitHub repository outlines the prerequisites?',
  "Can I access the prerequisites on DataTalksClub's GitHub?"],
 '7842b56a': ['Is it possible to enroll in the course after it has already begun?',
  'Are students allowed to participate even if they miss the initial registration?',
  'Can assignments be submitted if I join the course late?',
  'Does joining after the start date affect my ability to submit final projects?',
  'Are there any restrictions for late enrollees in terms of assignment deadlines?'],
 '0bbf41ec': ['When will 

In [19]:
# Lookup table from document id to its document; when two documents share an
# id, the one that appears later in `documents` wins.
doc_index = {}
for record in documents:
    doc_index[record['id']] = record

In [27]:
# Display the id -> document lookup table.
doc_index

{'c02e79ef': {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 '1f6520ca': {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 '7842b56a': {'text': "Yes, even if you don't register, you're still eligibl

In [20]:
# Flatten to one (question, course, document id) row per generated question.
final_results = []

for doc_id, questions in parsed_resulst.items():
    course = doc_index[doc_id]['course']
    final_results.extend((question, course, doc_id) for question in questions)

In [21]:
import pandas as pd

In [22]:
# Tabulate the rows; the 'document' column holds the id of the source document.
ground_truth_columns = ['question', 'course', 'document']
df = pd.DataFrame(final_results, columns=ground_truth_columns)

In [26]:
# Preview the first rows of the ground-truth table.
df.head()

Unnamed: 0,question,course,document
0,When does the course officially begin?,data-engineering-zoomcamp,c02e79ef
1,What time is the first live session scheduled?,data-engineering-zoomcamp,c02e79ef
2,How can I access the course schedule?,data-engineering-zoomcamp,c02e79ef
3,Where should I register before the course starts?,data-engineering-zoomcamp,c02e79ef
4,How do I join the course communication channels?,data-engineering-zoomcamp,c02e79ef


In [23]:
# Write the ground-truth table to CSV, leaving out the DataFrame index column.
df.to_csv('ground-truth-data.csv', index=False)

In [24]:
# Peek at the first lines of the CSV to confirm the header and row layout.
!head ground-truth-data.csv

question,course,document
When does the course officially begin?,data-engineering-zoomcamp,c02e79ef
What time is the first live session scheduled?,data-engineering-zoomcamp,c02e79ef
How can I access the course schedule?,data-engineering-zoomcamp,c02e79ef
Where should I register before the course starts?,data-engineering-zoomcamp,c02e79ef
How do I join the course communication channels?,data-engineering-zoomcamp,c02e79ef
Where can I find the prerequisites for the course?,data-engineering-zoomcamp,1f6520ca
Are there prerequisites listed on GitHub for this course?,data-engineering-zoomcamp,1f6520ca
What platform contains the course prerequisites?,data-engineering-zoomcamp,1f6520ca
Which specific GitHub repository outlines the prerequisites?,data-engineering-zoomcamp,1f6520ca
