In [1]:
import minsearch
import os
from dotenv import load_dotenv
import google.generativeai as genai

# Load environment variables from .env
load_dotenv()

# Read the Gemini key from the environment; os.getenv returns None when the
# variable is unset, and that value is passed to configure() unchanged.
api_key = os.getenv("GOOGLE_GEMINI_API_KEY")
genai.configure(api_key=api_key)
# Module-level model handle; llm() further down calls generate_content on it.
model = genai.GenerativeModel("gemini-1.5-flash")



In [2]:
import json
# Read the FAQ dump from disk. The encoding is given explicitly so the load
# does not depend on the platform's default text encoding (JSON text is UTF-8).
with open('documents.json', encoding='utf-8') as obj:
    raw_doc = json.load(obj)

In [3]:
# Flatten the per-course structure into one list of FAQ entries and tag every
# entry with the course it belongs to. The entries are the same dict objects
# held by raw_doc, so raw_doc's entries gain the 'course' key as well.
documents = []
for course_info in raw_doc:
    for doc in course_info['documents']:
        doc.update(course=course_info['course'])
        documents.append(doc)

In [4]:
# In-memory minsearch index over the FAQ entries: three free-text fields plus
# 'course' as a keyword field (search() below filters on it via filter_dict).
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course'],
)


In [5]:
# Fit the index on the flattened FAQ entries. This call is the cell's last
# expression, so the notebook echoes its return value (the Index repr below).
index.fit(documents)

<minsearch.Index at 0x1cdb84382f0>

In [6]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the module-level minsearch ``index`` for FAQ entries.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` keyword field must equal. Defaults to
            the data-engineering course, which keeps one-argument calls
            behaving as before.
        num_results: Maximum number of entries to request (default 5).

    Returns:
        The value returned by ``index.search``. ``build_prompt`` iterates it
        and reads ``section``, ``question`` and ``text`` from each item.
    """
    # Per-field boost values handed to minsearch: 3 for the question field,
    # 0.4 for the section field; fields not listed get the library default.
    boost = {'question': 3, 'section': 0.4}
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )


In [7]:
def llm(prompt):
    """Send ``prompt`` to the module-level Gemini ``model`` and return the reply text.

    Args:
        prompt: Full prompt string to submit.

    Returns:
        The ``text`` attribute of the object returned by
        ``model.generate_content``.
    """
    return model.generate_content(prompt).text

In [8]:
def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for ``query`` from retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing ``section``,
            ``question`` and ``text`` keys.

    Returns:
        The formatted prompt with leading and trailing whitespace stripped.
    """
    # Every line after the first carries a four-space indent (and the blank
    # separator lines are four spaces); that whitespace is part of the
    # emitted prompt text.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "    Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "    \n"
        "    QUESTION: {question}\n"
        "    \n"
        "    CONTEXT: \n"
        "    {context}"
    )

    # One "section / question / answer" stanza per hit, each followed by a blank line.
    stanzas = [
        f"section: {hit['section']}\n question: {hit['question']}\n answer: {hit['text']}\n\n"
        for hit in search_results
    ]

    return template.format(question=query, context="".join(stanzas)).strip()


In [9]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation over the minsearch index.

    Runs three steps in order: ``search`` retrieves FAQ entries,
    ``build_prompt`` turns them into a prompt, and ``llm`` produces the answer.

    Args:
        query: The user's question.

    Returns:
        Whatever ``llm`` returns for the built prompt.
    """
    return llm(build_prompt(query, search(query)))

In [10]:
# Run the pipeline end to end on a sample question; the answer string is the
# cell's last expression, so the notebook echoes it.
rag("Can I still join the course?")

"Yes, you can still join the course even if you don't register before the start date.  However, be aware of deadlines for final projects.\n"

In [None]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch node expected on localhost:9200.
es_client = Elasticsearch(
    "http://localhost:9200")

# Index definition: a single shard, no replicas, free-text fields for the FAQ
# content, and 'course' as a keyword so it can be matched with a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "faq-document"

# Creating an index that already exists raises an error, which made this cell
# fail on every re-run. Drop any index left over from a previous run first;
# ignore_unavailable makes the delete a no-op on a fresh cluster.
# NOTE: this discards the index contents, so re-run the indexing cell afterwards.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


In [None]:
from tqdm.auto import tqdm 

# Send the FAQ entries to Elasticsearch one request per entry; tqdm wraps the
# list to show progress.
# NOTE(review): no document id is passed, so re-running this cell presumably
# adds duplicate entries to the index — confirm.
for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

In [15]:
def elastic_search(query, course="machine-learning-zoomcamp", size=3):
    """Search the Elasticsearch FAQ index and return the matching documents.

    Builds a bool query: a ``multi_match`` (``best_fields``) over ``question``
    (boosted ``^4``) and ``text``, restricted by a ``term`` filter on
    ``course``.

    Args:
        query: Free-text question to search for.
        course: Exact ``course`` value the hits must have. Defaults to the
            machine-learning course, so one-argument calls behave as before.
        size: Maximum number of hits to request (default 3).

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; scores and other hit metadata are dropped.
    return [hit["_source"] for hit in response["hits"]["hits"]]


In [None]:
# One question drives both the retrieval and the prompt. Previously the search
# used this question while the prompt was formatted with a different,
# hard-coded one, so the CONTEXT did not belong to the QUESTION.
question = "How do copy a file to a Docker container?"

results = elastic_search(question)

# Despite its name this holds the rendered context: one "Q:/A:" stanza per hit.
context_template = ''.join(
    f"Q: {data['question']}\nA: {data['text']}\n\n" for data in results
)

# Remove tab characters that appear inside the FAQ text.
context_template = context_template.replace('\t', '')

prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

prompt = prompt_template.format(
    question=question,
    context=context_template.strip()
)

# print (rather than echoing the value) so the newlines render as line breaks.
print(prompt)


In [None]:
# Look at the third hit (index 2); elastic_search asks Elasticsearch for three
# results, so this is the last one returned.
elastic_search("How do copy a file to a Docker container?")[2]

In [17]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# Bound the request so a stalled connection cannot hang the notebook, and fail
# with the HTTP status on an error response instead of trying to parse an
# error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list, tagging each entry with its
# course name (the entries inside documents_raw are updated in place).
test_documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        test_documents.append(doc)

In [18]:
# Spot-check one flattened record by position; the echoed dict below shows the
# 'course' key that the flattening loop attached.
test_documents[943]

{'text': 'Problem description\nThis is the step in the ci yml file definition:\n- name: Run Unit Tests\nworking-directory: "sources"\nrun: ./tests/unit_tests/run.sh\nWhen executing github ci action, error raises:\n…/tests/unit_test/run.sh Permission error\nError: Process completed with error code 126\nSolution description\nAdd execution  permission to the script and commit+push:\ngit update-index --chmod=+x .\\sources\\tests\\unit_tests\\run.sh\nAdded by MarcosMJD',
 'section': 'Module 6: Best practices',
 'question': 'Github actions: Permission denied error when executing script file',
 'course': 'mlops-zoomcamp'}

In [19]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation backed by Elasticsearch.

    Same pipeline shape as the earlier ``rag`` in this notebook, with
    ``elastic_search`` as the retrieval step instead of ``search``.

    Args:
        query: The user's question.

    Returns:
        Whatever ``llm`` returns for the prompt built from the retrieved hits.
    """
    hits = elastic_search(query)
    return llm(build_prompt(query, hits))

In [None]:
# Exercise the Elasticsearch-backed rag() defined in the previous cell.
rag("How would I join the course")

In [None]:
# Sample question passed to elastic_search and build_prompt further down in this cell.
query = "How do copy a file to a Docker container?"

def elastic_search(query, course="machine-learning-zoomcamp", size=3):
    """Run the FAQ search against Elasticsearch and return the stored documents.

    NOTE(review): an earlier cell already defines ``elastic_search``; once this
    cell runs, this definition replaces it.

    The request is a bool query: ``multi_match`` (``best_fields``) over
    ``question`` (boosted ``^4``) and ``text``, with a ``term`` filter on
    ``course``.

    Args:
        query: Free-text question to search for.
        course: Exact ``course`` value to filter on. The default keeps
            one-argument calls restricted to the machine-learning course.
        size: Maximum number of hits to request (default 3).

    Returns:
        A list containing the ``_source`` dict of each hit, in response order.
    """
    match_clause = {
        "multi_match": {
            "query": query,
            "fields": ["question^4", "text"],
            "type": "best_fields"
        }
    }
    course_filter = {"term": {"course": course}}

    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": match_clause,
                "filter": course_filter
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Only the stored documents are returned; hit metadata is discarded.
    return [hit['_source'] for hit in response['hits']['hits']]

# NOTE(review): this is not the cell's last expression and the result is not
# assigned, so the returned hits are discarded; the call only exercises the
# function defined above.
elastic_search(query)

# q5 - building a prompt
def build_prompt(query, search_results):
    """Build the teaching-assistant prompt for ``query`` from retrieved FAQ entries.

    NOTE(review): an earlier cell already defines ``build_prompt``; this
    definition replaces it once the cell runs. Compared with that one, the
    context omits the ``section`` line and the result is not stripped after
    formatting.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts providing ``question`` and ``text``.

    Returns:
        The formatted prompt string. When there is at least one result it ends
        with the blank line that follows the last context stanza.
    """
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()

    # Render each retrieved entry as a "question / answer" stanza followed by
    # a blank line, and concatenate them into the context block.
    llmcontext = "".join(
        f"question: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    # Use the caller's question. A fixed question string was formatted in here
    # before, so the `query` argument had no effect on the prompt.
    return prompt_template.format(question=query, context=llmcontext)

# Retrieve the hits for `query` and build the prompt from them.
# Note: prompt_response holds the prompt text returned by build_prompt; no LLM
# call is made in this cell.
search_results = elastic_search(query)
prompt_response = build_prompt(query, search_results)

# Character count of the built prompt, echoed as the cell output.
len(prompt_response)