## Building the Knowledge Base

This document shows how to perform Parent Document Retrieval using Atlas Vector Search and OpenAI embeddings + LLM with custom splitting logic in Python.

In [None]:
import os
import copy
from typing import List


In [None]:
import os
import openai


In [None]:
# Download the source PDF for the knowledge base; it is saved as hnsw_paper.pdf.
# Alternative paper, left disabled:
# !curl -o paper.pdf https://arxiv.org/pdf/1706.03762.pdf
!curl -o hnsw_paper.pdf https://arxiv.org/pdf/1603.09320.pdf

In [None]:
# Open the downloaded PDF and extract the text of its first page only.
# The full document is processed page by page in the ingestion loop further down.
pdf_path = "./hnsw_paper.pdf"
from pypdf import PdfReader

reader = PdfReader(pdf_path)
number_of_pages = len(reader.pages)
# `page` and `text` are rebound for every page by the ingestion loop.
page = reader.pages[0]
text = page.extract_text()

In [None]:
import bson
import time
import uuid
from openai import ChatCompletion

# Accumulates one dict per stored chunk; appended to by recursive_text_splitter.
docs = []
# Chunks shorter than this many characters are embedded and stored as-is.
min_chunk_size = 200
# Chunks longer than this many characters are only split, never stored themselves.
max_chunk_size = 2000
# Substring that find_splits searches for to locate candidate split points.
delimiter = '. '


def find_splits(chunk: str, sep=None) -> List[int]:
    """Return the start index of every occurrence of the delimiter in *chunk*.

    Args:
        chunk: Text to scan for split points.
        sep: Delimiter to search for. When omitted, the module-level
            ``delimiter`` is used, which keeps existing one-argument calls
            working unchanged.

    Returns:
        The indexes, in ascending order, at which the delimiter starts.
        The list is empty when the delimiter does not occur in *chunk*.
    """
    if sep is None:
        sep = delimiter

    indexes = []
    position = 0
    while position < len(chunk):
        position = chunk.find(sep, position)
        if position == -1:
            break
        indexes.append(position)
        # Resume one character past the match so the same hit is not found again.
        position += 1
    return indexes
    

import pymongo


# Connect to the MongoDB cluster; the connection string must be filled in before running.
client = pymongo.MongoClient("") # mongodb cluster URI

db = client['vector-test']
# NOTE: `coll` is rebound to a different collection in the cell that inserts the documents.
coll = db['parent_child_example']
# index_name = "parent_child_example"



def _embed_text(text: str):
    """Return the ``text-embedding-ada-002`` embedding of *text* from the OpenAI API."""
    embeddings = openai.Embedding.create(
        input=text,
        model="text-embedding-ada-002"
    )
    return embeddings.data[0].embedding


def _generate_sample_question(chunk: str) -> str:
    """Ask ``gpt-3.5-turbo`` for a question that *chunk* would be the answer to.

    The request is retried once after a one-second pause; if the second
    attempt also fails, its exception propagates to the caller.
    """
    messages = [
        {"role": "system", "content": "You are a quiz writer. I will give you an answer, and you will come up with a question that might be asked of it."},
        {"role": "user", "content": chunk},
    ]
    try:
        response = ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    except Exception:
        # Best-effort single retry for any failed request.
        time.sleep(1)
        response = ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    # Read the reply through the response's public mapping interface rather
    # than the private ``_previous`` attribute.
    return response['choices'][0]['message']['content']


def recursive_text_splitter(chunk: str, parent_id):
    """Recursively halve *chunk* at delimiter positions, collecting documents in ``docs``.

    Each stored document is a dict with ``_id``, ``text`` and
    ``text_embedding``; what else it holds depends on its size and position:

    * Shorter than ``min_chunk_size``: stored as a leaf with ``parent_id``
      and an embedding of its own text. Recursion stops here.
    * Up to ``max_chunk_size`` with a parent: stored with ``parent_id`` and
      an embedding of its own text, then split further.
    * Up to ``max_chunk_size`` without a parent: stored as a root with a
      generated ``sample_question``; the embedding is of that question
      rather than of the text. Then split further.
    * Longer than ``max_chunk_size``: not stored; its halves are processed
      with no parent.

    A chunk of at least ``min_chunk_size`` characters that ``find_splits``
    cannot split is neither stored nor recursed into. Blank chunks are
    skipped entirely.

    Args:
        chunk: Text to split.
        parent_id: ``_id`` of the document this chunk was cut from, or
            ``None`` when the chunk has no stored parent.

    Returns:
        None. Results are appended to the module-level ``docs`` list.
    """
    # Whitespace-only fragments (e.g. the remainder after a trailing
    # delimiter) carry no content worth embedding or storing.
    if not chunk.strip():
        return

    chunk_length = len(chunk)
    doc_id = bson.objectid.ObjectId()

    if chunk_length < min_chunk_size:
        docs.append({
            "_id": doc_id,
            "text": chunk,
            "parent_id": parent_id,
            "text_embedding": _embed_text(chunk),
        })
        # A leaf is final. Falling through would store a second document
        # under the same _id and keep splitting below the minimum size.
        return

    indexes = find_splits(chunk)
    if not indexes or indexes == [0]:
        return

    # Cut at the delimiter closest to the middle; the delimiter's first
    # character stays with the left half.
    midpoint = chunk_length // 2
    split_idx = min(indexes, key=lambda idx: abs(idx - midpoint))
    chunk_left, chunk_right = chunk[:split_idx + 1], chunk[split_idx + 1:]

    if chunk_length > max_chunk_size:
        # Too large to store: the halves are processed without a parent.
        doc_id = None
    else:
        chunk_dict = {"_id": doc_id, "text": chunk}
        if parent_id is not None:
            chunk_dict["parent_id"] = parent_id
            chunk_dict["text_embedding"] = _embed_text(chunk)
        else:
            # Root documents are indexed by a generated question instead of
            # by their own text.
            sample_question = _generate_sample_question(chunk)
            chunk_dict["sample_question"] = sample_question
            chunk_dict["text_embedding"] = _embed_text(sample_question)
        docs.append(chunk_dict)

    recursive_text_splitter(chunk_left, doc_id)
    recursive_text_splitter(chunk_right, doc_id)

# Split every page of the PDF independently; None means the page text has no parent document.
for page in reader.pages:
    text = page.extract_text()
    recursive_text_splitter(text, None)


In [None]:
# Inspect the text of one collected chunk.
docs[3]['text']

In [None]:
import sys

# Write every collected chunk document to the collection used by the query cells.
coll = db['hnsw_parent_retrieval_example_with_sample_questions']
for elem in docs:
    try:
        coll.insert_one(elem)
    except pymongo.errors.PyMongoError as exc:
        # Best effort: a document that cannot be written (for example a
        # duplicate _id) is reported and skipped instead of silently ignored.
        print(f"Skipping document {elem.get('_id')}: {exc}", file=sys.stderr)


In [None]:
# Query parent documents using lookup

# Number of results to return; numCandidates is k * multiplier.
k = 3
multiplier = 10

embeddings = openai.Embedding.create(
    input="How does HNSW differ from NSW?",
    model="text-embedding-ada-002"
)

query_vector = embeddings.data[0].embedding

agg_pipeline = [
    {
        "$vectorSearch": {
            "index": 'nested_search_index',
            "path": "text_embedding",
            "queryVector": query_vector,
            "limit": k,
            "numCandidates": k * multiplier,
        },
    },
    # Keep only documents without a generated sample question.
    {"$match": {"sample_question": {"$exists": False}}},
    # The embedding vector is not needed downstream.
    {"$project": {"text_embedding": 0}},
    {
        '$lookup': {
            # All documents are inserted into `coll`, so the parent must be
            # joined from that same collection.
            "from": coll.name,
            "localField": "parent_id",
            "foreignField": "_id",
            "as": 'parent_documents',
        },
    },
    # One output row per matched parent; rows without a parent are dropped.
    {'$unwind': {"path": "$parent_documents"}},
    {"$limit": k},
]


cursor = coll.aggregate(agg_pipeline)

# Raises StopIteration when the pipeline returns no documents.
top_result = cursor.next()

context = top_result['parent_documents']['text']



In [None]:
# Query parent documents using graphLookup

# Number of results to return; the kNN search retrieves k * multiplier candidates.
k = 3
multiplier = 10

embeddings = openai.Embedding.create(
    input="How does HNSW differ from NSW?",
    model="text-embedding-ada-002"
)

query_vector = embeddings.data[0].embedding

agg_pipeline = [
    {
        "$search": {
            "index": 'nested_search_index',
            "knnBeta": {
                "path": "text_embedding",
                "vector": query_vector,
                "k": k * multiplier,
            },
        },
    },
    # Keep only documents without a generated sample question.
    {"$match": {"sample_question": {"$exists": False}}},
    # The embedding vector is not needed downstream.
    {"$project": {"text_embedding": 0}},
    {
        '$graphLookup': {
            # All documents are inserted into `coll`, so the ancestor chain
            # must be walked inside that same collection.
            "from": coll.name,
            "startWith": "$parent_id",
            "connectFromField": "parent_id",
            "connectToField": "_id",
            "as": 'parent_documents',
        },
    },
    {
        "$project": {
            # $graphLookup does not guarantee the order of its output array,
            # so the top-most ancestor is selected explicitly: it is the one
            # whose parent_id is missing or null.
            "parent_document": {
                "$arrayElemAt": [
                    {
                        "$filter": {
                            "input": "$parent_documents",
                            "as": "ancestor",
                            "cond": {
                                "$eq": [
                                    {"$ifNull": ["$$ancestor.parent_id", None]},
                                    None,
                                ]
                            },
                        }
                    },
                    0,
                ]
            }
        }
    },
    {"$limit": k},
]


cursor = coll.aggregate(agg_pipeline)

# Raises StopIteration when the pipeline returns no documents.
top_result = cursor.next()

context = top_result['parent_document']['text']

