In [1]:
import os
import pandas as pd
import json
import requests

In [2]:
# Location of the evaluation assets in the course repository.
base_url = "https://github.com/khushal2911/llm-zoomcamp/blob/main"
relative_url = "03-vector-search/eval"
doc_url = f'{base_url}/{relative_url}/documents-with-ids.json?raw=1'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into an explicit HTTPError
# instead of a confusing JSON decoding failure on the next line.
doc_response = requests.get(doc_url, timeout=30)
doc_response.raise_for_status()
documents = doc_response.json()

In [3]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [4]:
# Ground-truth CSV lives next to the documents file; pandas reads it directly from the URL.
truth_url = f'{base_url}/{relative_url}/ground-truth-data.csv?raw=1'
df_ground_truth = pd.read_csv(truth_url)

In [5]:
df_ground_truth

Unnamed: 0,question,course,document
0,When does the course begin?,data-engineering-zoomcamp,c02e79ef
1,How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
2,What is the link for course registration?,data-engineering-zoomcamp,c02e79ef
3,How can I receive course announcements?,data-engineering-zoomcamp,c02e79ef
4,Where do I join the Slack channel?,data-engineering-zoomcamp,c02e79ef
...,...,...,...
4622,How should I destroy infrastructure created us...,mlops-zoomcamp,886d1617
4623,What is the first step to destroy AWS infrastr...,mlops-zoomcamp,886d1617
4624,Can I destroy infrastructure created with GitH...,mlops-zoomcamp,886d1617
4625,What command initializes Terraform with specif...,mlops-zoomcamp,886d1617


In [6]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the same `model` object encodes both the indexed
# documents and the incoming questions later in the notebook.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node expected at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 

# Index definition: the FAQ fields plus one dense vector field used for kNN search.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword type so `course` can be matched exactly by the term filter in the kNN query
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": {
                "type": "dense_vector",
                # NOTE(review): presumably the embedding length produced by `model` — confirm
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

# Delete first, then create, so re-running the cell starts from an empty index;
# ignore_unavailable=True is passed so the delete is tolerated when the index is absent.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [9]:
from tqdm.auto import tqdm

# Embed "question + answer text" for every FAQ entry and store the vector on
# the entry itself (mutates `documents` in place).
for entry in tqdm(documents):
    combined_text = entry['question'] + ' ' + entry['text']
    entry['question_text_vector'] = model.encode(combined_text)

100%|██████████████████████████████████████████████████████████████████████| 948/948 [00:58<00:00, 16.16it/s]


In [13]:
# Index the documents one request at a time; each `doc` already carries the
# 'question_text_vector' computed in the embedding loop.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████████████████████████████████████████████████████████████████| 948/948 [00:22<00:00, 42.95it/s]


In [14]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a kNN vector search restricted to a single course.

    Args:
        field: Name of the dense-vector field to search.
        vector: Query embedding to compare against `field`.
        course: Value the `course` field must match exactly (term filter).
        k: Number of nearest neighbours to return. Defaults to 5.
        num_candidates: Size of the candidate pool requested from
            Elasticsearch. Defaults to 10000.

    Returns:
        A list with the `_source` dict of every hit, restricted to the
        text, section, question, course and id fields.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        # Leave the stored vector out of the response; only the FAQ fields are needed.
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [15]:
def question_text_vector_knn(q):
    """Embed the record's question and search the combined question+text vector field.

    `q` is a mapping with at least the keys 'question' and 'course'.
    Returns whatever `elastic_search_knn` returns for that query.
    """
    question_text, course_name = q['question'], q['course']
    query_vector = model.encode(question_text)
    return elastic_search_knn('question_text_vector', query_vector, course_name)

In [16]:
# One dict per ground-truth row, keyed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')

In [17]:
question_text_vector_knn(ground_truth[0])

[{'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'id': 'c02e79ef'},
 {'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave e

In [18]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and the retrieved FAQ entries.

    Args:
        query: Value substituted into the QUESTION slot of the template.
        search_results: Iterable of dicts with the keys 'section',
            'question' and 'text'.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    # One blank-line-terminated block per retrieved entry, in retrieval order.
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    context = "".join(entries)

    return prompt_template.format(question=query, context=context).strip()


In [19]:
from openai import OpenAI

client = OpenAI()

def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API.

    Args:
        prompt: Text sent as the only user message.
        model: Name of the chat model to call. Defaults to 'gpt-4o-mini',
            so existing `llm(prompt)` calls behave as before.

    Returns:
        The message content of the first returned choice.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [20]:
def rag(query):
    """Answer one ground-truth record with retrieval-augmented generation.

    Args:
        query: Mapping with at least the keys 'question' and 'course'
            (a row of the ground-truth data).

    Returns:
        The answer text produced by `llm` for the assembled prompt.
    """
    search_results = question_text_vector_knn(query)
    # build_prompt interpolates its first argument verbatim into the QUESTION
    # slot, so pass the question text, not the whole record. Passing the dict
    # would embed its repr (including the expected document id) in the prompt.
    prompt = build_prompt(query['question'], search_results)
    answer = llm(prompt)
    return answer

In [34]:
ground_truth[2]

{'question': 'What is the link for course registration?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

In [35]:
rag(ground_truth[2])

'The link for course registration is included in the document, but the exact link is not provided in the context. Please refer to the document or associated communication for the exact registration link.'

In [36]:
# Lookup table from document id to the full document dict
# (if two documents shared an id, the later one would win).
doc_ids = {doc['id']: doc for doc in documents}

In [37]:
doc_ids['c02e79ef']['text']

"The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."

In [38]:
answer_orig = "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."
answer_llm = 'The link for course registration is included in the document, but the exact link is not provided in the context. Please refer to the document or associated communication for the exact registration link.'

In [42]:
# Embed the reference answer and the LLM answer with the same model.
v_orig = model.encode(answer_orig)
v_llm = model.encode(answer_llm)

# NOTE(review): a plain dot product equals cosine similarity only when both
# vectors are unit-length — presumably true for this model; confirm.
v_orig.dot(v_llm)

0.3583333

In [48]:
from tqdm.auto import tqdm
# Score every ground-truth question: generate an answer with the RAG pipeline
# and compare it with the text of the document the question was written from.
# Each iteration makes one LLM call, so a full pass over `ground_truth` is slow.
similarity = []
for rec in tqdm(ground_truth):
    # `doc_id` rather than `id`, so the built-in id() is not shadowed for later cells.
    doc_id = rec['document']
    answer_llm = rag(rec)
    answer_orig = doc_ids[doc_id]['text']

    v_orig = model.encode(answer_orig)
    v_llm = model.encode(answer_llm)

    # NOTE(review): dot product is used as cosine similarity, which assumes
    # the model returns unit-length embeddings — confirm.
    cos_sim = v_orig.dot(v_llm)
    similarity.append(cos_sim)

  1%|▊                                                                   | 56/4627 [04:09<5:39:50,  4.46s/it]


KeyboardInterrupt: 