In [1]:
import json 
# Load the raw FAQ export: a list of course entries, each holding its own list
# of FAQ documents under the 'documents' key.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default (JSON interchange text is UTF-8).
with open('document.json', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten into a single list, tagging each document with its course.
# Note: the document dicts inside docs_raw are modified in place.
documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [2]:
import hashlib

def generate_document_id(doc):
    """Derive a short, deterministic text id for a FAQ document.

    The id is the first 8 hex characters of the MD5 digest of
    "<course>-<question>-<first 10 characters of text>".

    Args:
        doc: Mapping with 'course', 'question' and 'text' entries.

    Returns:
        An 8-character hexadecimal string.
    """
    # Only a 10-character prefix of the answer text takes part in the fingerprint.
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    return digest[:8]

In [3]:
# Give every document two identifiers: its position in the list ("id") and a
# content-derived hash ("text_id").
for position, document in enumerate(documents):
    document["id"] = position
    document["text_id"] = generate_document_id(document)

In [4]:
# Prompt used to generate ground-truth questions from one FAQ record.
# The {section}, {question} and {text} placeholders are filled via str.format.
prompt_template = "\n".join((
    "You emulate a student who's taking our course.",
    "Formulate 5 questions this student might ask based on a FAQ record.",
    "",
    "The record:",
    "",
    "section: {section}",
    "question: {question}",
    "answer: {text}",
    "",
    "* NOTE: PROVIDE THE OUTPUT IN PARSABLE JSON JUST OUTPUT TEXT WITHOUT USING CODE BLOCKS",
    "",
    '["question1", "question2", ..., "question5"]',
))

In [5]:
import google.generativeai as genai
import os 
from dotenv import load_dotenv

# Call load_dotenv() first so GEMINI_API_KEY can come from a .env file, then
# configure the Gemini SDK with it. A missing key raises KeyError here.
load_dotenv()
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Generative model used by generate_questions().
model = genai.GenerativeModel(model_name="gemini-1.5-flash-002")

In [6]:
def generate_questions(prompt):
    """Send `prompt` to the module-level `model` and return the reply text.

    Args:
        prompt: Fully formatted prompt string.

    Returns:
        The `.text` attribute of the response from `model.generate_content`.
    """
    return model.generate_content(prompt).text

In [7]:
from tqdm.auto import tqdm
# Raw model replies keyed by document text_id; filled in by the generation loop.
# Re-running this cell discards anything collected so far.
results = dict()

In [8]:
import time 
# Generate questions for every document, one API call per distinct text_id.
# Documents whose text_id is already in `results` are skipped, so this cell can
# be re-run after an interruption or API error without repeating finished
# calls; documents that hash to the same text_id are queried only once (the
# first one encountered wins).
REQUEST_DELAY_SECONDS = 5  # pause between calls; presumably for API rate limits

for doc in tqdm(documents):
    doc_id = doc['text_id']
    if doc_id in results:
        continue
    prompt = prompt_template.format(**doc)
    # Store the reply before sleeping so an interrupt during the pause does not
    # throw away a response that was already received.
    results[doc_id] = generate_questions(prompt)
    time.sleep(REQUEST_DELAY_SECONDS)

  0%|          | 0/948 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
# Persist the documents (including the "id"/"text_id" fields added earlier in
# the notebook) as pretty-printed JSON.
with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(json.dumps(documents, indent=2))

In [17]:
def _strip_code_fence(raw_text):
    """Return `raw_text` without surrounding whitespace or a Markdown code fence.

    A reply wrapped in triple backticks (optionally with a language tag such as
    "json" on the opening line) is reduced to the text between the fences.
    Text that does not start with a fence is only stripped of leading and
    trailing whitespace.
    """
    text = raw_text.strip()
    if text.startswith("```"):
        first_line, newline, rest = text.partition("\n")
        # Multi-line reply: drop the whole opening fence line (incl. language tag).
        # Single-line reply: drop just the three opening backticks.
        text = rest if newline else first_line[3:]
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


# Decode each raw model reply into a Python object (expected: a list of
# question strings). Replies that still fail to parse are reported and left
# out of parsed_results.
parsed_results = {}

for doc_id, json_questions in results.items():
    try:
        # Models sometimes wrap JSON in a code fence despite the prompt's
        # instruction; remove it so json.loads sees the payload itself.
        parsed_results[doc_id] = json.loads(_strip_code_fence(json_questions))
    except json.JSONDecodeError as e:
        # Log the failure and continue with the remaining documents.
        print(f"Error parsing JSON for doc_id {doc_id}: {e}")

Error parsing JSON for doc_id e3106e07: Expecting value: line 1 column 1 (char 0)
Error parsing JSON for doc_id 72229da5: Expecting value: line 1 column 1 (char 0)
Error parsing JSON for doc_id f476a606: Expecting value: line 1 column 1 (char 0)
Error parsing JSON for doc_id d21bff1d: Expecting ',' delimiter: line 1 column 435 (char 434)


In [18]:
# Lookup table from text_id to its document. If several documents share a
# text_id, the one appearing last in `documents` is kept.
doc_index = {}
for document in documents:
    doc_index[document['text_id']] = document

In [19]:
# Flatten the parsed questions into (question, course, text_id) triples.
final_results = []

for text_id, generated in parsed_results.items():
    course_name = doc_index[text_id]['course']
    final_results.extend((question, course_name, text_id) for question in generated)

In [20]:
import pandas as pd
# One row per generated question, labelled with its course and the text_id of
# the source document; written to CSV without the index column.
ground_truth_columns = ['question', 'course', 'document']
df = pd.DataFrame(final_results, columns=ground_truth_columns)
df.to_csv('ground-truth-data.csv', index=False)

In [21]:
# Preview the generated questions column.
question_series = df['question']
print(question_series)

0      What happens if I can't make the first Office ...
1      Is there a recording of the Office Hours sessi...
2      Where can I find the link to register for the ...
3      How do I subscribe to the Google Calendar from...
4      Will the course materials be available before ...
                             ...                        
485    If I'm not using MacOS, how can I identify and...
486    The FAQ mentions using 'pgcli -h localhost -p ...
487    The FAQ suggests changing the port to 5431, bu...
488    I tried changing the port, but I'm still getti...
489    The FAQ references a video (1.4.2) for more de...
Name: question, Length: 490, dtype: object


In [7]:
# Sanity check before inserting: number of documents, the field names of the
# first one, and the length of its "vector" entry.
first_document = documents[0]
print("Data has", len(documents), "entities, each with fields: ", first_document.keys())
print("Vector dim:", len(first_document["vector"]))

Data has 948 entities, each with fields:  dict_keys(['text', 'section', 'question', 'course', 'id', 'vector'])
Vector dim: 768


In [8]:
# Insert all documents into the "course_info" collection and show the result.
# NOTE(review): the recorded run of this cell failed with DataNotMatchException
# (schema expects an int64 "id", but a str was supplied) — confirm the type of
# each document's "id" before re-running.
res = mv_client.insert(
    collection_name="course_info",
    data=documents,
)
print(res)

RPC error: [insert_rows], <DataNotMatchException: (code=1, message=The Input data type is inconsistent with defined schema, {id} field should be a int64, but got a {<class 'str'>} instead.)>, <Time:{'RPC start': '2024-10-07 04:10:50.408053', 'RPC error': '2024-10-07 04:10:50.410195'}>


DataNotMatchException: <DataNotMatchException: (code=1, message=The Input data type is inconsistent with defined schema, {id} field should be a int64, but got a {<class 'str'>} instead.)>

In [36]:
def elastic_search(query, course="data-engineering-zoomcamp", limit=5):
    """Retrieve the FAQ entries closest to `query` from the vector collection.

    Despite its name, this function searches the "course_info" collection
    through `mv_client.search`; no Elasticsearch call is made here.

    Args:
        query: Free-text question to embed and search for.
        course: Course identifier used to restrict the search. Defaults to the
            previously hard-coded "data-engineering-zoomcamp". The value is
            placed inside single quotes in the filter expression, so it must
            not itself contain a single quote.
        limit: Maximum number of hits to return (default 5).

    Returns:
        Element 0 of the search response, i.e. the hits for the single query
        vector that was submitted.
    """
    # NOTE(review): `model` must expose `.encode` here (an embedding model). An
    # earlier cell binds `model` to a Gemini GenerativeModel — confirm which
    # object is in scope when this function runs.
    query_vectors = [model.encode(query)]
    res = mv_client.search(
        collection_name="course_info",
        data=query_vectors,
        filter=f"course == '{course}'",
        limit=limit,
        output_fields=["text", "question", "section", "course"],
    )
    return res[0]

In [42]:
def build_prompt(query, search_result):
    """Build the LLM prompt for `query` from the retrieved FAQ entries.

    Args:
        query: The user's question; inserted after "QUESTION:".
        search_result: Iterable of hits, each a mapping whose 'entity' value
            provides 'section', 'question' and 'text'.

    Returns:
        The prompt string: instructions, the question, then one
        section/question/answer record per hit, with records separated by a
        blank line. Leading and trailing whitespace is stripped.
    """
    # Built from explicit lines rather than an indented triple-quoted string so
    # the function body's indentation does not end up inside the prompt.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "If the CONTEXT doesn't contain the answer, output NONE\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    records = []
    for hit in search_result:
        entity = hit['entity']
        records.append(
            f"section: {entity['section']}\nquestion: {entity['question']}\nanswer: {entity['text']}"
        )
    context = "\n\n".join(records)

    return prompt_template.format(question=query, context=context).strip()
    

In [44]:
from openai import OpenAI

# OpenAI-compatible client pointed at a local endpoint (localhost:11434, /v1/).
# NOTE(review): the api_key value is presumably a placeholder for the local
# server — confirm.
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')
def llm(prompt):
    """Send `prompt` as a single user message to the 'phi3' model.

    Args:
        prompt: Text sent as the only message of the chat completion request.

    Returns:
        The message content of the first choice in the completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model='phi3', messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [45]:
def rag(query):
    """Answer `query` by retrieval, prompt construction, then LLM generation.

    Args:
        query: The user's question.

    Returns:
        The text returned by `llm` for the prompt built from the search hits.
    """
    hits = elastic_search(query)
    return llm(build_prompt(query, hits))

In [48]:
# Run the whole pipeline once with a sample question and show the answer.
sample_answer = rag('I just discovered the course. Can i still join ?')
print(sample_answer)

Based on the CONTEXT provided:

It appears that since you're just now discovering or registering for this Data Engineering Bootcamp hosted by Coursera through DATAtalkSquare, there is still time to join if registration hasn't been processed yet as courses typically have a period during which they remain open. The context does not explicitly state whether it’s still possible to enroll after certain deadlines or policies are in place; therefore from this information alone I would say YES, you could likely register even though the course has started since registration usually opens before the start date of 15th Jan 2024 at 17h00 as per Question: "Course - When will the course start?" Here's your answer assuming a general policy for educational institutions allowing prospective students to still enroll until official class materials or sessions begin.

As there is no specific mention of deadlines, policies on joining after starting in other sections of questions within these provided conte