In [1]:
import google.generativeai as genai
import os

In [2]:
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
model = genai.GenerativeModel('gemini-2.5-pro')

In [3]:
prompt = 'introduce yourself in 100 words'
response = model.generate_content(prompt)
print(response.text)

Hello! I am a large language model, an AI trained by Google. My purpose is to understand and generate human-like text to assist you with a wide range of tasks. I can answer your questions, draft emails, write stories, summarize complex topics, translate languages, and even help you code. My knowledge is based on the vast amount of information I was trained on, allowing me to engage in detailed conversations. While I don't have personal feelings or consciousness, I am designed to be a helpful and creative partner in your tasks and explorations.


In [7]:
type(response)

google.generativeai.types.generation_types.GenerateContentResponse

In [3]:
!pip install minsearch



In [4]:
import minsearch

In [5]:
import json
with open('/workspaces/dez_llm/01-intro/documents.json','rt') as f_in:
    docs_raw = json.load(f_in)

In [6]:
documents = []
for c_dt in docs_raw:
    for d in c_dt['documents']:
        d['course'] = c_dt['course']
        documents.append(d)

In [7]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [8]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [9]:
index.fit(documents)

<minsearch.minsearch.Index at 0x7321fc046180>

In [40]:
q = 'the course has already started, can I still enroll?'
boost = {'question':3.0, 'section':0.5}
results = index.search(
    query=q,
    boost_dict=boost,
    filter_dict={'course':'data-engineering-zoomcamp'},
    num_results=5
)
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [19]:
prompt_template = """
You're a course assistant. Answer the QUESTION based on the CONTEXT.
Use only the facts from CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE.

QUESTION: {question}

CONTEXT: {context}
""".strip()

In [17]:
context = ""

for doc in results:
    context = context + f"section:{doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

In [18]:
print(context)

section:General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section:General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

section:General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with t

In [21]:
prompt = prompt_template.format(question=q, context=context).strip()

In [22]:
response = model.generate_content(prompt)

In [24]:
response.text

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects."

# Put search and Prompt into funciton

In [31]:
def search(q):
    boost = {'question':3.0, 'section':0.5}
    results = index.search(
    query=q,
    boost_dict=boost,
    filter_dict={'course':'data-engineering-zoomcamp'},
    num_results=5
    )
    return results

In [52]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a course assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Reasonable infer the answer use the facts from CONTEXT when answering QUESTION.
   
    QUESTION: {question}
    
    CONTEXT: {context}
    """.strip()
    context = ""

    for doc in search_results:
        context = f"section:{doc['section']}\nquestion:{doc['question']}\nanswer:{doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [29]:
def llm(prompt):
    model = genai.GenerativeModel('gemini-2.5-pro')
    response = model.generate_content(prompt)
    return response.text

In [53]:
q = "the course has already started, can I still enroll?"

search_results = search(q)


prompt = build_prompt(q, search_results)

llm(prompt)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}, {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.', 'section': 'General course-related questions', 'question': 'Course - Can I follow the course after it finishes?', 'course': 'data-engineering-zoomcamp'}, {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. Th

'Yes, you should be able to enroll. The course offers a "self-paced mode," which allows you to start the course at your convenience.\n\nAdditionally, you will still be able to get support and ask questions in the Slack channel.'

In [51]:
for doc in search_results:
        context = f"section:{doc['section']}\nquestion:{doc['question']}\nanswer:{doc['text']}\n\n"
print(context)

section:General course-related questions
question:Course - Can I get support if I take the course in the self-paced mode?
answer:Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are already answered here.
You can also tag the bot @ZoomcampQABot to help you conduct the search, but don’t rely on its answers 100%, it is pretty good though.


