In [1]:
import requests 

# Source of the FAQ records: a JSON list of courses, each holding its documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list, tagging every record
# with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
import hashlib

def generate_document_id(doc):
    """Return a short, content-derived identifier for a document.

    The course, the question and the first 10 characters of the text are
    joined with '-', the result is hashed with MD5, and the first 8
    hexadecimal digits of the digest are returned.

    Args:
        doc: mapping providing the 'course', 'question' and 'text' keys.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    # Only a prefix of the digest is kept, so ids are short but not guaranteed unique.
    return digest[:8]

In [3]:
# Stamp every document with its content-derived id (the dicts are mutated in place).
for doc in documents:
    doc.update(id=generate_document_id(doc))

In [4]:
# Spot-check the first record: it should now carry the 'course' and 'id' fields.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [5]:
from collections import defaultdict
# Bucket the documents by id: id -> list of documents sharing that id
# (presumably so that ids shared by several documents can be spotted).
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    same_id_docs = hashes[doc_id]
    same_id_docs.append(doc)

In [8]:
import json
# Persist the documents list as pretty-printed JSON (2-space indent).
with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(json.dumps(documents, indent=2))

In [9]:
# Prompt sent to the chat model for each FAQ record. The {section}, {question}
# and {text} placeholders are filled with str.format from a document dict, and
# the model is asked to reply with a bare JSON list of five questions.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as few words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [11]:
from openai import OpenAI
# Shared API client used by generate_questions. No credentials are passed here,
# so they presumably come from the environment (e.g. OPENAI_API_KEY) — TODO confirm.
client = OpenAI()

In [12]:
def generate_questions(doc):
    """Ask the chat model for student-style questions about one FAQ record.

    The prompt is built by filling prompt_template with the fields of `doc`
    and is sent as a single user message to the 'gpt-4o' model.

    Args:
        doc: mapping whose keys cover the placeholders of prompt_template.

    Returns:
        The content of the first choice's message, exactly as returned by the
        API. It is not parsed here, so callers receive the raw text.
    """
    messages = [{"role": "user", "content": prompt_template.format(**doc)}]
    completion = client.chat.completions.create(model='gpt-4o', messages=messages)
    return completion.choices[0].message.content

In [13]:
from tqdm.auto import tqdm

In [19]:
# Generate questions once per document id; a document whose id has already
# been processed is skipped, so duplicate ids cost only one API call.
results = {}
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        questions = generate_questions(doc)
        results[doc_id] = questions

  0%|          | 0/948 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [20]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load 'results.bin' if it comes from a trusted source.
# This rebinds `results`, replacing whatever the generation loop stored.
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

29

In [22]:
import pandas as pd
# Load the ground-truth CSV; the rows printed from it show the columns
# 'question', 'course' and 'document'.
file_path = 'ground-truth-data.csv'
df = pd.read_csv(file_path)

In [29]:
# Print every row of df as a Series to eyeball the loaded data.
for _, json_questions in df.iterrows():
    print(json_questions)

question    When does the course begin?
course        data-engineering-zoomcamp
document                       c02e79ef
Name: 0, dtype: object
question    How can I get the course schedule?
course               data-engineering-zoomcamp
document                              c02e79ef
Name: 1, dtype: object
question    What is the link for course registration?
course                      data-engineering-zoomcamp
document                                     c02e79ef
Name: 2, dtype: object
question    How can I receive course announcements?
course                    data-engineering-zoomcamp
document                                   c02e79ef
Name: 3, dtype: object
question    Where do I join the Slack channel?
course               data-engineering-zoomcamp
document                              c02e79ef
Name: 4, dtype: object
question    Where can I find the prerequisites for this co...
course                              data-engineering-zoomcamp
document                                 

In [31]:
# Collect the questions of each document: document id -> list of questions,
# kept in the row order of df.
parsed_results = defaultdict(list)

for _, result in df.iterrows():
    document_key = result['document']
    parsed_results[document_key].append(result['question'])

In [32]:
# Display the mapping of document id -> list of questions.
parsed_results

defaultdict(list,
            {'c02e79ef': ['When does the course begin?',
              'How can I get the course schedule?',
              'What is the link for course registration?',
              'How can I receive course announcements?',
              'Where do I join the Slack channel?'],
             '1f6520ca': ['Where can I find the prerequisites for this course?',
              'How do I check the prerequisites for this course?',
              'Where are the course prerequisites listed?',
              'What are the requirements for joining this course?',
              'Where is the list of prerequisites for the course?'],
             '7842b56a': ['Can I enroll in the course after it starts?',
              'Is late registration possible?',
              'Am I eligible to submit homework if I join late?',
              'Are there deadlines for final projects if I join late?',
              'Can I submit all assignments at the end of the course if I start late?'],
           