# 100% Self-hosted RAG using ⌘ R


<img src="./resources/flow.png" width=800px>

In [1]:
import uuid
import nest_asyncio
from dotenv import load_dotenv
from IPython.display import Markdown, display

import qdrant_client
from qdrant_client.models import Distance, VectorParams, Batch

from llama_index.core import Settings
from llama_index.core import PromptTemplate
from llama_index.llms.ollama import Ollama
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SentenceTransformerRerank

from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore

In [2]:
# Patch asyncio so the event loop can be used in a nested fashion, i.e. code
# in this notebook may drive the loop even though one is already running.
nest_asyncio.apply()

Wait for Qdrant server to start

In [3]:
from qdrant_server_status import wait_for_qdrant_container

# Block until the Qdrant vector DB server is reachable; it usually takes
# around a minute. `timeout` and `interval` are forwarded to the helper
# (presumably seconds — confirm in qdrant_server_status).
qdrant_ready = wait_for_qdrant_container(timeout=180, interval=20)

# Fail fast here instead of hitting a connection error in a later cell.
# NOTE(review): the helper returned True on success in the recorded run; this
# assumes it returns a falsy value (rather than raising) on timeout — confirm.
if not qdrant_ready:
    raise RuntimeError(
        "Qdrant vector DB server did not become ready in time; "
        "check the container and re-run this cell."
    )

# Keep the readiness flag as the cell's displayed result.
qdrant_ready


Waiting for Qdrant vector DB server to start...


Waiting for Qdrant vector DB server to start...
Waiting for Qdrant vector DB server to start...
Waiting for Qdrant vector DB server to start...
Running Qdrant container(s): ['zealous_noyce']
Qdrant vector DB server is up and running.


True

In [4]:
# Directory that is scanned for source documents: add your files here
# (drag & drop works).
# NOTE(review): this is an absolute, environment-specific path — adjust it
# when running the notebook anywhere else.
input_dir_path = '/teamspace/studios/this_studio/test-dir'

In [5]:

# Set up the LLM and the embedding model (the reranker is created in a later cell).
# Instantiating the embedding model fetches its files, as the download
# progress bars in this cell's output show.
llm = Ollama(
    model="command-r",
    request_timeout=120.0,
)
embed_model = FastEmbedEmbedding(
    model_name="BAAI/bge-large-en-v1.5",
)

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

#### `Note:` If the following cell throws a connection error, don't worry — just run the cell again and it should work.

In [7]:
# Global llama-index settings used when the index is built:
# chunk size / overlap for splitting, and the embedding model defined above.
Settings.chunk_size = 256
Settings.chunk_overlap = 20
Settings.embed_model = embed_model

# Load every PDF found under input_dir_path, descending into sub-directories.
loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    required_exts=[".pdf"],
    recursive=True,
)
docs = loader.load_data()

# Connect to the Qdrant server on localhost and target a collection whose
# name is unique to this run (random UUID suffix).
client = qdrant_client.QdrantClient(host="localhost", port=6333)
unique_collection_name = f"document_chat_{uuid.uuid4()}"
vector_store = QdrantVectorStore(
    client=client,
    collection_name=unique_collection_name,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Create the index over the loaded documents, backed by the Qdrant vector store.
index = VectorStoreIndex.from_documents(docs, storage_context=storage_context)

In [8]:
# Reranker built on a cross-encoder model; top_n=2 limits how many nodes it keeps.
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2",
    top_n=2,
)

  return self.fget.__get__(instance, owner)()


In [9]:
# Create the query engine: retrieve the 4 most similar nodes, then pass them
# through the `rerank` postprocessor (a SentenceTransformer cross-encoder
# defined in the previous cell, not a Cohere reranker).
Settings.llm = llm
query_engine = index.as_query_engine(
    similarity_top_k=4,
    node_postprocessors=[rerank],
)

# ====== Customise prompt template ======
# {context_str} and {query_str} are the placeholders of the QA template.
qa_prompt_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information above I want you to think step by step to answer the query in a crisp manner, in case you don't know the answer say 'I don't know!'.\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

# Swap the response synthesizer's text QA prompt for the custom template.
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

# Generate the response
response = query_engine.query("What exactly is DSPy?")

In [10]:
# Render the response text as Markdown in the cell output.
display(Markdown(str(response)))

DSPy is a framework for solving advanced tasks with language and retrieval models by composing and declaring modules. Instead of using free-form string prompts, a DSPy programmer defines specific signatures that instruct the LM on what needs to be done. The core module in DSPy responsible for managing these signatures is called Predict.