In [42]:
import min_search
import json

In [43]:
# Load the raw FAQ dump. The file is decoded as UTF-8 explicitly: the records
# contain non-ASCII punctuation, so relying on the platform's default text
# encoding can fail or silently garble characters on some systems.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    doc_raw = json.load(f_in)

In [45]:
# Flatten the per-course structure into one list of records, tagging every
# record with the course it belongs to. Each record is a new dict, so the
# nested dicts inside `doc_raw` are left untouched and this cell can be
# re-run without side effects on the raw data.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in doc_raw
    for doc in course_dict['documents']
]

In [46]:
# Inspect the first flattened record to confirm the 'course' key was attached.
documents[0]

{'text': "dThe purpose of this document is to capture frequently asked technical questions\nThe next cohort starts in Jan 2025. More info at DTC Article.\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start？',
 'course': 'data-engineering-zoomcamp'}

In [49]:
# Create the search index and bind it to `index`, the name the following cells
# use. 'question', 'text' and 'section' are declared as text fields and
# 'course' as a keyword field.
index = min_search.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course'],
)
# Keep the object as the cell's displayed value.
index

<min_search.Index at 0x7323dc60bd60>

In [50]:
# Fit the index on the flattened FAQ records. The call returns an Index
# object, which is what the cell displays.
index.fit(documents)

<min_search.Index at 0x7323dc90bb80>

In [51]:
# Sample user question reused by the search and LLM cells below.
q='the course already started. Can i still enroll?'

In [52]:
# Per-field weights handed to the index: 'question' is weighted 3.0 and
# 'section' 0.5.
boost = {'question': 3.0, 'section': 0.5}

# Top five matches for the sample question. No course filter is passed here,
# so hits can come from any course.
results = index.search(query=q, boost_dict=boost, num_results=5)

In [53]:
# Display the retrieved records for the sample question.
results

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related question',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks as long as the form is still open and accepting submissions.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything to the last minute.",
  'section': 'General course questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'mlops-zoomcamp'},
 {'text': "Yes, even i

In [54]:
from openai import OpenAI

In [55]:
# Create the OpenAI client with default arguments; no credentials are passed
# in code (presumably they come from the environment - TODO confirm).
client=OpenAI()

In [57]:
# Baseline: send the bare question to the model, without any FAQ context.
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[
        {'role': 'user', 'content': q},
    ],
)

In [63]:
# Text of the first choice returned for the context-free request.
response.choices[0].message.content

"It depends on the specific course and the institution offering it. Here are a few steps you can take to find out if you can still enroll:\n\n1. **Check the Enrollment Policy:** Review the course description or the institution's website for information about late enrollment policies.\n\n2. **Contact the Instructor or Administration:** Reach out directly to the course instructor or the administrative office. Explain your situation and ask if it is still possible to enroll.\n\n3. **Consider Self-Paced or Rolling Admissions Courses:** Some institutions offer self-paced courses or have rolling admissions, which might give you more flexibility for late enrollment.\n\n4. **Catch Up on Missed Material:** If late enrollment is allowed, ask the instructor for guidance on how to catch up on any material you've missed so far.\n\nIf you explain your situation and show a strong interest in the course, many instructors and institutions are willing to accommodate late enrollments. Good luck!"

In [74]:
# Prompt skeleton with two placeholders: {question} for the user's question and
# {context} for the retrieved FAQ entries. The trailing .strip() removes the
# newlines that the triple-quoted layout adds at both ends.
prompt_template="""
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from CONTEXT when answering the QUESTION
If the CONTEXT doesn't contain the answer, output NONE

QUESTION: {question}

CONTEXT:
{context}

""".strip()

In [70]:
# Render every retrieved record as a section/question/answer entry followed by
# a blank line, then glue the entries together into one context string.
context = "".join(
    f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    for doc in results
)

In [82]:
# Fill the template with the sample question and the rendered context. The
# final strip() drops the blank lines left after the last context entry.
prompt=prompt_template.format(question=q,context=context).strip()

In [83]:
# print() is used so the embedded newlines render as line breaks.
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from CONTEXT when answering the QUESTION
If the CONTEXT doesn't contain the answer, output NONE

QUESTION: the course already started. Can i still enroll?

CONTEXT:
section: General course-related question
question: The course has already started. Can I still join it?
answer: Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.
In order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.

section: General course questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks as long as the form is still open and a

In [86]:
# Same request as the baseline, but the message now carries the filled-in
# prompt (instructions + question + FAQ context) instead of the bare question.
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[
        {'role': 'user', 'content': prompt},
    ],
)

In [88]:
# Print the text of the first choice returned for the context-grounded prompt.
print(response.choices[0].message.content)

Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course. In order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. 

Additionally, even if you don't register, you're still eligible to submit the homeworks as long as the form is still open and accepting submissions. Be aware, however, that there will be deadlines for turning in the final projects.


In [116]:
def search(query, course='data-engineering-zoomcamp', num_results=10):
    """Look up FAQ records for ``query`` in the module-level ``index``.

    Args:
        query: Free-text user question.
        course: Value the records' 'course' field must match. Defaults to
            'data-engineering-zoomcamp', the value that used to be hard-coded.
        num_results: Maximum number of records to request (default 10).

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # 'question' is weighted 3.0 and 'section' 0.5 via boost_dict.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )


In [117]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from retrieved FAQ records.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of dicts, each providing the keys
            'section', 'question' and 'text'.

    Returns:
        The prompt string: the instructions, the question, then one
        section/question/answer entry per record, separated by blank lines.
        Leading and trailing whitespace is stripped.

    Raises:
        KeyError: If a record lacks one of the three keys.
    """
    # The template is assembled from explicit pieces rather than an indented
    # triple-quoted string, so this function's indentation cannot leak into
    # the prompt text.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. \n"
        "Use only the facts from CONTEXT when answering the QUESTION\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One entry per record, each terminated by a blank line.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    # strip() removes the blank lines left after the last context entry.
    return prompt_template.format(question=query, context=context).strip()

In [118]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send.
        model: Model name passed to the chat completions call. Defaults to
            'gpt-4o', the value that used to be hard-coded.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.choices[0].message.content

In [124]:
# Sample question used by the call in the next cell.
query = 'how do i run the Kafka'


def rag(query):
    """Answer ``query``: retrieve with search(), render the prompt, ask the LLM.

    Returns the string produced by llm().
    """
    return llm(build_prompt(query, search(query)))

In [125]:
# Run the whole retrieve -> prompt -> generate pipeline on the sample question.
rag(query)

"To run Kafka, you need to follow the instructions based on your specific scenario:\n\n1. **Java Kafka**:\n   - If you want to run a Kafka producer or consumer with a Java implementation, navigate to your project directory.\n   - Use the command:\n     ```bash\n     java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n     ```\n\n2. **Python Kafka**:\n   - **Environment Setup**: If you're running Python scripts, such as `producer.py`, and encountering a module not found error:\n     - Create a virtual environment and install the necessary packages using `requirements.txt`:\n       ```bash\n       python -m venv env\n       source env/bin/activate\n       pip install -r ../requirements.txt\n       ```\n     - To activate the virtual environment for subsequent uses:\n       ```bash\n       source env/bin/activate  # On Windows, use env/Scripts/activate\n       ```\n     - To deactivate it:\n       ```bash\n       deactivate\n       ```\n\n3. **

In [126]:
# Same pipeline on a second question, passed inline instead of via `query`.
rag('the course already started, how do i enroll?')

"Yes, you can still join the course after the start date. Even if you don't register, you're eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects, so it's important not to leave everything for the last minute."

In [127]:
from elasticsearch import Elasticsearch

In [128]:
# Client for the Elasticsearch node expected at localhost:9200.
es_client=Elasticsearch('http://localhost:9200')

In [129]:
# Ask the node for its metadata; a response confirms the connection works.
es_client.info()

ObjectApiResponse({'name': '6ca038f3194f', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'zX6HTCrcSf-mQH0rUiaWOw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [131]:
# Index definition: a single shard with no replicas, three analyzed text
# fields, and 'course' as a keyword field so it can be matched exactly.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Creating an index that already exists fails, which would break this cell on
# a second run. Drop any index left over from a previous run first.
# NOTE: this deletes the documents stored in it, so the indexing cell below
# must be re-run afterwards. ignore_unavailable=True makes the delete a no-op
# when the index does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

es_client.indices.create(
    index=index_name,
    # The dict carries both "settings" and "mappings", so it is sent as the
    # whole request body rather than through the `settings` argument alone.
    body=index_settings,
)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [133]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [134]:
# Send the FAQ records to Elasticsearch one request per record; tqdm wraps the
# list to show a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1026/1026 [00:31<00:00, 32.22it/s]


In [135]:
# Sample question for the Elasticsearch query built below. This rebinds the
# module-level `query` used by earlier cells.
query="I just discovered the course. Can i still enroll it?"

In [136]:
# Elasticsearch request body for the sample question.
search_query={
    # Return at most five hits.
    "size": 5,
    "query": {
        "bool": {
            # Full-text match of the question against three fields; the
            # "^3" suffix boosts the 'question' field.
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            # Restrict hits to one course via an exact term on the keyword field.
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [138]:
# Run the query against the course index. Note that this rebinds `response`,
# which earlier cells used for the OpenAI reply.
response=es_client.search(index=index_name,body=search_query)

In [141]:
# Keep only the stored document ('_source') of every hit, in result order.
results_doc = [hit['_source'] for hit in response['hits']['hits']]

In [142]:
# Display the documents returned by Elasticsearch for the sample question.
results_doc

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (insta

In [143]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Retrieve FAQ records for ``query`` from Elasticsearch.

    Uses the module-level ``es_client`` and ``index_name``.

    Args:
        query: Free-text user question.
        course: Exact value the 'course' field must have. Defaults to
            "data-engineering-zoomcamp", the value that used to be hard-coded.
        size: Maximum number of hits to request (default 5).

    Returns:
        A list with the '_source' document of every hit, in result order.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                # Full-text match over three fields; "^3" boosts 'question'.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                # Exact-match restriction on the keyword field.
                "filter": {
                    "term": {
                        "course": course,
                    }
                },
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit["_source"] for hit in response["hits"]["hits"]]
    
    

In [144]:
# Sample question used by the call in the next cell.
query = 'how do i run the Kafka'


def rag(query):
    """Answer ``query`` using Elasticsearch retrieval.

    Fetches records with elastic_search(), renders them into a prompt with
    build_prompt(), and returns the string produced by llm().
    """
    hits = elastic_search(query)
    filled_prompt = build_prompt(query, hits)
    return llm(filled_prompt)

In [145]:
# Run the pipeline on the sample question with the rag() defined in the cell above.
rag(query)

"To run Kafka producer, consumer, or kstreams in the terminal, navigate to your project directory and execute the following command:\n\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\nReplace `<jar_name>` with the appropriate name of your JAR file. This command assumes you're using Java and have already built your project."