In [1]:
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType
from pymilvus import utility
connections.connect(
    host="localhost", 
    port="19530"
)

In [2]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("cointegrated/rubert-tiny")

  from tqdm.autonotebook import tqdm, trange
No sentence-transformers model found with name cointegrated/rubert-tiny. Creating a new one with mean pooling.


In [3]:
output = model.encode("Phải như là cha mày ")

In [5]:
output.shape

(312,)

In [26]:
import json 

with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [7]:
utility.drop_collection("course_info")


In [8]:


fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="text_id", dtype=DataType.VARCHAR, max_length=535),
    FieldSchema(name="section", dtype=DataType.VARCHAR, max_length=535),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=50000),
    FieldSchema(name="question", dtype=DataType.VARCHAR, max_length=535),
    FieldSchema(name="course", dtype=DataType.VARCHAR, max_length=535),
    FieldSchema(name="question_vector", dtype=DataType.FLOAT_VECTOR, dim=312), 
    FieldSchema(name="text_vector", dtype=DataType.FLOAT_VECTOR, dim=312), 
    FieldSchema(name="question_text_vector", dtype=DataType.FLOAT_VECTOR, dim=312)]

schema = CollectionSchema(fields=fields,enable_dynamic_field=True)

collection = Collection(name="course_info", schema=schema)

index_params = {
    "metric_type": "IP",
    "params": {},
}

collection.create_index("question_vector", index_params)
collection.create_index("text_vector", index_params)
collection.create_index("question_text_vector", index_params)


entities = []
for doc in documents: 
    try: 
        question = doc['question']
        text = doc['text']
        qt = question + ' ' + text
        doc['question_vector'] = model.encode(question)
        doc['text_vector'] = model.encode(text)
        doc['question_text_vector'] = model.encode(qt)
        entity = {
                "text_id": doc['text_id'],
                "id": doc['id'],
                "section": doc['section'],
                "text": text,
                "question": question,
                "course": doc['course'],
                "question_vector": doc['question_vector'],
                "text_vector": doc['text_vector'], 
                "question_text_vector": doc['question_text_vector'], 
            }
        entities.append(entity)
    except KeyError as e:
        print(f"Missing key {e} in document {doc['text_id']}")
        continue
    
collection.insert(entities)

(insert count: 948, delete count: 0, upsert count: 0, timestamp: 453340693104164867, success count: 948, err count: 0

In [9]:
collection.release()

In [10]:
collection.load()

In [3]:
collection = Collection(name="course_info")


In [4]:
def search(field, vector, course):
    # Perform the search operation
    res = collection.search(
        anns_field=f"{field}", 
        filter=f"course == '{course}'",
        param={"metric_type": "IP", "params": {}},
        data=[vector],
        output_fields=["text_id", "text", "section", "question", "course", "id"], 
        limit=5,  # Max. number of search results to return
    )

    # Initialize an empty list to hold the results
    result_docs = []

    # Loop through the hits
    for hits in res:
        for hit in hits:
            # Append each hit as a dictionary containing the desired fields
            hit_dict = {
                "text_id": hit.entity.get("text_id"),
                "text": hit.entity.get("text"),
                "section": hit.entity.get("section"),
                "question": hit.entity.get("question"),
                "course": hit.entity.get("course"),
            }
            result_docs.append(hit_dict)
    
    # Return the list of result documents
    return result_docs


In [5]:
def question_vector_search(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return search('question_vector', v_q, course)

In [6]:
def question_text_vector_search(q):
    question = q['question']
    course = q['course']
    
    v_q = model.encode(question)

    return search('question_text_vector', v_q, course)

In [19]:
def build_prompt(query, search_result) : 
    prompt_template = """
        You're a course teaching assistant. Answer the QUESTION based on the the CONTEXT. 
        Use only the facts from the CONTEXT when answering the QUESTION. 
        QUESTION: {question}

        CONTEXT: 
        {context}
        """.strip()
    context = ""

    for doc in search_result: 
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt
    

In [20]:
from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',)
def llm(prompt):
    response = client.chat.completions.create(
        model='gemma2',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [9]:
import pandas as pd

In [10]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [11]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [12]:
ground_truth[10]

{'question': 'If I join late, will I have access to all the course materials from the beginning?',
 'course': 'data-engineering-zoomcamp',
 'document': '7842b56a'}

In [21]:
def rag(query): 
    results = question_text_vector_search(query)
    prompt = build_prompt(query, results)
    answer = llm(prompt)
    return answer

In [30]:
ground_truth[10]

{'question': 'If I join late, will I have access to all the course materials from the beginning?',
 'course': 'data-engineering-zoomcamp',
 'document': '7842b56a'}

In [22]:
llm_answer = rag(ground_truth[10])

In [39]:
orig_answer = [d for d in documents if d.get('text_id') == ground_truth[10]["document"]][0]["text"]

In [44]:
orig_answer

"Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

In [43]:
llm_answer

'Yes, according to the text "we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes". \n'

In [41]:
v_llm = model.encode(llm_answer)
v_orig = model.encode(orig_answer)


In [42]:
print(v_llm.dot(v_orig))

50.064358
