In [2]:
## Download documents.json
# !wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json

--2024-06-20 06:23:52--  https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 658332 (643K) [text/plain]
Saving to: ‘documents.json.1’


2024-06-20 06:23:53 (101 MB/s) - ‘documents.json.1’ saved [658332/658332]



In [1]:
import json
from openai import OpenAI
from tqdm.auto import tqdm
import elasticsearch
from elasticsearch import Elasticsearch

  from .autonotebook import tqdm as notebook_tqdm


![rag_chart.png](./imgs/rag_chart.png)

In [3]:
def config_elastic_search():
    index_settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "properties": {
                "text": {"type": "text"},
                "section": {"type": "text"},
                "question": {"type": "text"},
                "course": {"type": "keyword"} 
            }
        }
    }
    return index_settings


def build_search_query(num_results, query, text_boost_fields, query_type, filter_dict):
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": text_boost_fields,
                        "type": query_type
                    }
                },
                "filter": {
                    "term": filter_dict
                }
            }
        }
    }

    return search_query

# https://techoverflow.net/2021/08/04/how-to-fix-elasticsearch-exceptions-requesterror-requesterror400-resource_already_exists_exception-index-already-exists-in-python/
def es_create_index_if_not_exists(es, index_name, body, documents):
    """Create the given ElasticSearch index and ignore error if it already exists"""
    try:
        es.indices.create(index=index_name, body=body)
        for doc in tqdm(documents):
            es.index(index=index_name, document=doc)
        for doc in tqdm(documents):
            es.index(index=index_name, document=doc)
        for doc in tqdm(documents):
            es.index(index=index_name, document=doc)
    except elasticsearch.exceptions.RequestError as ex:
        if ex.error == 'resource_already_exists_exception':
            pass # Index already exists. Ignore.
        else: # Other exception - raise it
            raise ex

def build_elastic_search(elasticsearch_url, documents, index_name="course-questions"):
    index_settings = config_elastic_search()
    es_client = Elasticsearch(elasticsearch_url) 
    # es_client.indices.create(index=index_name, body=index_settings)
    es_create_index_if_not_exists(es=es_client, index_name=index_name, 
                                  body=index_settings, documents=documents)
    return es_client

def elastic_search(index_name, elastic_query, es_client):
    response = es_client.search(index=index_name, body=elastic_query)
    result_docs = []
    for hit in tqdm(response['hits']['hits']):
        result_docs.append(hit['_source'])
    return result_docs

In [4]:
def build_documents_from_json(json_path):
    '''
    Convert document json format to a list of
    elements which contain 4 objects: 
    course, text, question, and section
    '''
    with open(json_path, 'rt') as f_in:
        docs_raw = json.load(f_in) 
    documents = []
    
    for course_dict in docs_raw:
        for doc in course_dict['documents']:
            doc['course'] = course_dict['course']
            documents.append(doc)
    return documents


def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.
QUESTION: {question}
CONTEXT: {context}
""".strip()
    
    context = ""
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt


def build_llm(base_url, api_key):
    client = OpenAI(
        base_url=base_url,
        api_key=api_key
    )
    return client


def query_llm(prompt, client, model_name):
    response = client.chat.completions.create(
        model=model_name,
        messages=[{'role':'user', 'content':prompt}]
    )
    return response.choices[0].message.content


In [6]:
elastic_results

[{'text': "Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).",
  'section': 'Workshop 1 - dlthub',
  'question': 'How do I install the necessary dependencies to run the code?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).",
  'section': 'Workshop 1 - dlthub',
  'question': 'How do I install the necessary dependencies to run the code?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Answer: To run the provided code, ensure that the 'dlt[duckdb]' package

In [5]:
%%time
query = 'how do I run kafka?'

json_doc_path = 'documents.json'
cvt_documents = build_documents_from_json(json_doc_path)
# print(cvt_documents)

elasticsearch_url = 'http://localhost:9200'
index_name = "course-questions2"
es_client = build_elastic_search(elasticsearch_url, cvt_documents, index_name)

num_results = 10
text_boost_fields = ["question^3", "text", "section"]
query_type = "best_fields"
# keyword_fields = ["course"]
filter_dict = {'course': 'data-engineering-zoomcamp'}
elastic_query = build_search_query(num_results=num_results, query=query, 
                                       text_boost_fields=text_boost_fields,
                                       query_type=query_type,filter_dict=filter_dict)
elastic_results = elastic_search(index_name, elastic_query, es_client)

prompt = build_prompt(query=query, search_results=elastic_results)
base_url = 'http://localhost:11434/v1/'
api_key = 'ollama'
model_name = 'phi3'
phi3_client = build_llm(base_url, api_key)
response_res = query_llm(prompt=prompt, client=phi3_client, model_name=model_name)

100%|██████████████████████████████████████████████████████████████████████████████████| 948/948 [00:23<00:00, 40.58it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 948/948 [00:23<00:00, 40.77it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 948/948 [00:23<00:00, 40.72it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 98922.26it/s]


APIConnectionError: Connection error.

In [6]:
print(response_res)

NameError: name 'response_res' is not defined

In [None]:
def elastic_rag(query):
    elastic_query = build_search_query(num_results=num_results, query=query, 
                                       text_boost_fields=text_boost_fields,
                                       query_type=query_type,filter_dict=filter_dict)
    elastic_results = elastic_search(index_name, elastic_query, es_client)
    prompt = build_prompt(query=query, search_results=elastic_results)
    response_res = query_llm(prompt=prompt, client=phi3_client, model_name=model_name)
    return response_res

In [None]:
print(elastic_rag(query= 'the course has already started, can I still enroll?'))