In [33]:
from openai import OpenAI
import json

In [6]:
# Shared OpenAI client used by llm() below. No credentials are passed here,
# so the SDK has to locate them itself (presumably via the OPENAI_API_KEY
# environment variable -- confirm for your setup).
client = OpenAI()

In [43]:
def llm(user_prompt, instructions=None, model='gpt-4o-mini'):
    """Send a single-turn request to the model and return its text reply.

    Args:
        user_prompt: Content of the user message.
        instructions: Optional system message. It is only sent when truthy,
            so ``None`` and ``''`` both mean "no system message".
        model: Name of the model to call.

    Returns:
        The ``output_text`` attribute of the response object.
    """
    user_message = {'role': 'user', 'content': user_prompt}

    # The system message, when present, must precede the user message.
    if instructions:
        system_message = {'role': 'system', 'content': instructions}
        conversation = [system_message, user_message]
    else:
        conversation = [user_message]

    reply = client.responses.create(model=model, input=conversation)
    return reply.output_text

In [17]:
# Example question used for the baseline call below; note it does not say
# which course it is about.
prompt = 'what is the objective of the course?'

In [35]:
# Baseline: ask the model directly, with no system instructions and no
# retrieved context.
llm(prompt)

"To provide an accurate response, I would need some context about the specific course you're referring to. Generally, the objectives of a course can include:\n\n1. **Knowledge Acquisition**: To help students gain a deep understanding of the subject matter.\n2. **Skill Development**: To equip students with practical skills relevant to the field.\n3. **Critical Thinking**: To encourage analytical thinking and problem-solving skills.\n4. **Application of Knowledge**: To enable students to apply what they’ve learned in real-world situations.\n5. **Collaboration and Communication**: To promote teamwork and effective communication skills.\n6. **Assessment and Reflection**: To engage students in evaluating their understanding and progress.\n\nIf you provide the subject or name of the course, I can give you a more detailed objective."

In [21]:
import requests 

# Download the FAQ dataset: a JSON list of courses, each with a 'course' name
# and a list of 'documents'.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on HTTP errors instead of trying to parse an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per FAQ entry, tagging each with its course name.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Copy instead of mutating so documents_raw stays exactly as
        # downloaded; 'course' is still added as the last key.
        documents.append({**doc, 'course': course_name})

In [23]:
from minsearch import Index

# Build the search index over the flattened FAQ records.
# "question", "text" and "section" are registered as text fields, and
# "course" as a keyword field (search() later filters on it via filter_dict).
index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Fit on the documents list built above. As the last expression in the cell,
# the value returned by fit() is what the cell displays.
index.fit(documents)

<minsearch.minsearch.Index at 0x7965ecfe39e0>

In [26]:
# Quick sanity check of the index: a plain query with no course filter and
# no field boosting, limited to the top 5 hits.
question = 'I just found the course. Can I join now?'

results = index.search(
    question,
    num_results=5
)

# Print each hit on its own line to inspect the raw records.
for result in results:
    print(result)

{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.', 'section': 'General course-related questions', 'question': 'The course has already started. Can I still join it?', 'course': 'machine-learning-zoomcamp'}
{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and joi

In [38]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries relevant to ``query`` in the module-level ``index``.

    Args:
        query: Free-text search query.
        course: Value the ``course`` field must match. Defaults to the
            data engineering course. Pass ``None`` (or ``''``) to search
            across all courses.
        num_results: Maximum number of hits to request from the index.

    Returns:
        Whatever ``index.search`` returns for the query (the matching
        documents).
    """
    # Weight matches in the question three times higher and matches in the
    # section name half as much as the default.
    boost = {'question': 3.0, 'section': 0.5}

    # An empty filter means "no restriction".
    filter_dict = {'course': course} if course else {}

    return index.search(
        query=query,
        filter_dict=filter_dict,
        boost_dict=boost,
        num_results=num_results
    )

In [30]:
# System message passed to llm() by rag(): restricts the model to answering
# from the supplied CONTEXT only.
instructions = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.
""".strip()

# User-message template filled in by build_prompt(). It has two placeholders,
# {question} and {context}; the tags delimit them for the model.
prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    """Fill ``prompt_template`` with the question and the retrieved context.

    Args:
        question: The user's question, inserted verbatim.
        search_results: JSON-serialisable search hits; they are dumped to a
            JSON string and used as the context.

    Returns:
        The formatted prompt string.
    """
    fields = {
        'question': question,
        'context': json.dumps(search_results),
    }
    return prompt_template.format(**fields)

In [40]:
def rag(query, verbose=True):
    """Answer ``query`` with retrieval-augmented generation.

    Runs the three pipeline steps in order: ``search`` for relevant FAQ
    entries, ``build_prompt`` to combine them with the question, and ``llm``
    with the module-level ``instructions`` as the system message.

    Args:
        query: The user's question.
        verbose: When true (the default, matching the previous behaviour),
            print the full prompt before calling the model. Pass ``False``
            to keep the cell output limited to the answer.

    Returns:
        The model's answer as returned by ``llm``.
    """
    search_results = search(query)
    prompt = build_prompt(query, search_results)

    # The prompt embeds every retrieved document, so the dump is large;
    # keep it opt-out rather than unconditional.
    if verbose:
        print(prompt)

    return llm(prompt, instructions=instructions)

In [44]:
# End-to-end run of the pipeline; rag() prints the prompt it built and the
# cell then displays the returned answer.
rag('how do I install Kafka in Python?')

<QUESTION>
how do I install Kafka in Python?
</QUESTION>

<CONTEXT>
[{"text": "confluent-kafka: `pip install confluent-kafka` or `conda install conda-forge::python-confluent-kafka`\nfastavro: pip install fastavro\nAbhirup Ghosh\nCan install Faust Library for Module 6 Python Version due to dependency conflicts?\nThe Faust repository and library is no longer maintained - https://github.com/robinhood/faust\nIf you do not know Java, you now have the option to follow the Python Videos 6.13 & 6.14 here https://www.youtube.com/watch?v=BgAlVknDFlQ&list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&index=80  and follow the RedPanda Python version here https://github.com/DataTalksClub/data-engineering-zoomcamp/tree/main/06-streaming/python/redpanda_example - NOTE: I highly recommend watching the Java videos to understand the concept of streaming but you can skip the coding parts - all will become clear when you get to the Python videos and RedPanda files.", "section": "Module 6: streaming with kafka", "que

"To install Kafka in Python, you can use the following commands:\n\n1. Install the Confluent Kafka client:\n   ```bash\n   pip install confluent-kafka\n   ```\n   or if you're using conda:\n   ```bash\n   conda install conda-forge::python-confluent-kafka\n   ```\n\n2. If needed, also install `fastavro`:\n   ```bash\n   pip install fastavro\n   ```\n\nIf you encounter compatibility issues, especially with specific versions of Python, consider using:\n```bash\npip uninstall kafka-python\npip install git+https://github.com/dpkp/kafka-python.git\n``` \n\nMake sure you adapt the installation commands based on your environment and requirements."