In [14]:
import warnings
# Suppress every warning for the rest of the session so library warnings
# do not clutter the cell outputs.
warnings.filterwarnings('ignore')

In [15]:
import utils

import os
import openai
# Take the API key from the environment instead of hard-coding it in the notebook.
# Indexing os.environ raises KeyError when OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
from llama_index import SimpleDirectoryReader

# Load the input file(s) into `documents`, which later cells iterate over.
# NOTE(review): 'file' looks like a placeholder path - point it at the real
# source document before running.
documents = SimpleDirectoryReader(
    input_files=['file']
).load_data()

In [None]:
# Optional sanity checks on the loaded documents (disabled).
# Kept as real comments rather than a bare string literal, which the notebook
# would echo back as the cell's output. Uncomment to inspect the loader result.
# print(type(documents), "\n")
# print(len(documents), "\n")
# print(type(documents[0]))
# print(documents[0])

In [18]:
# Collapse every loaded document into one Document, with a blank line
# between the text of consecutive documents.
from llama_index import Document

document = Document(
    text='\n\n'.join([doc.text for doc in documents])
)

## Sentence Window Retrieval Setup

In [19]:
# SentenceWindowNodeParser - an object that will split a document into individual sentences (chunks)
# and augment each sentence (chunk) with the surrounding context around that sentence.
from llama_index.node_parser import SentenceWindowNodeParser

# Create the sentence window node parser. The window size and both metadata
# keys are passed explicitly; 'window' is the key the
# MetadataReplacementPostProcessor later in this notebook is configured to read.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key='window',
    original_text_metadata_key='original_text'
)

## Building the VectorStore Index

In [23]:
from llama_index.llms import OpenAI

# LLM handed to the service context in the next cell (low temperature: 0.1).
llm = OpenAI(model='gpt-3.5-turbo', temperature=0.1)

In [24]:
# ServiceContext - wrapper object that contains all the context needed for indexing (eg: VectorStore Index)
from llama_index import ServiceContext

# Bundle the LLM, the embedding model name and the sentence-window node parser
# so the index cells below can share one configuration.
sentence_service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model='local:BAAI/bge-small-en-v1.5',
    node_parser=node_parser
)

In [25]:
from llama_index import VectorStoreIndex

# Build the vector store index from the single merged document.
# NOTE(review): this build is unconditional and the next cell persists it to
# 'sentence_vector_store_index', so the os.path.exists() check in the
# load-or-rebuild cell further down always finds the directory in a
# top-to-bottom run. Consider keeping only the guarded version.
sentence_vs_index = VectorStoreIndex.from_documents(
    [document], service_context=sentence_service_context
)

In [26]:
# Store the vector store index locally so it can be loaded later instead of being rebuilt.
sentence_vs_index.storage_context.persist(persist_dir='sentence_vector_store_index')

In [27]:
# Load the persisted index when its directory already exists; otherwise build
# the index from the merged document and persist it for the next run.

import os
from llama_index import VectorStoreIndex, StorageContext, load_index_from_storage

# Single source of truth for the on-disk location of the index.
SENTENCE_INDEX_DIR = "sentence_vector_store_index"

if os.path.exists(SENTENCE_INDEX_DIR):
    # Reuse the stored index; the same service context is supplied on load.
    sentence_vs_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=SENTENCE_INDEX_DIR),
        service_context=sentence_service_context
    )
else:
    sentence_vs_index = VectorStoreIndex.from_documents(
        [document], service_context=sentence_service_context
    )
    sentence_vs_index.storage_context.persist(persist_dir=SENTENCE_INDEX_DIR)

## Building the PostProcessor 

##### The post-processor replaces the embedded text with its surrounding sentences for more context

In [28]:
# MetadataReplacementPostProcessor - takes a value stored in the metadata and
# replaces the node.text with that value.
# This is done after retrieving the nodes and before sending it to the LLM (synthesis).

from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# 'window' is the same key passed as window_metadata_key when node_parser was created.
postproc = MetadataReplacementPostProcessor(
    target_metadata_key='window'
)

In [29]:
# Parse a short sample text into sentence-window nodes, keep an untouched deep
# copy (nodes_old) and wrap the originals as scored_nodes.
# MetadataReplacementPostProcessor is applied to scored_nodes in a later cell,
# which replaces each node's text with its surrounding sentences.
from copy import deepcopy

from llama_index import Document
from llama_index.schema import NodeWithScore

# `nodes` was never defined anywhere in this notebook (it only existed as
# leftover kernel state), so it is created here with the notebook's node parser.
# The sample text is reconstructed from the recorded outputs of the next cells
# and kept verbatim (typo included) so those outputs still match.
sample_text = (
    "Hello. How are you? "
    "I, Anirudh, am a recent gradaute from Boston University with a major in Data Analytics. "
    "This is my project on RAG applications using different RAG techniques. "
)
nodes = node_parser.get_nodes_from_documents([Document(text=sample_text)])

scored_nodes = [NodeWithScore(node=x, score=1.0) for x in nodes]
# Deep copies keep the pre-replacement text available for comparison.
nodes_old = [deepcopy(n) for n in nodes]

In [30]:
# Text of the second node, taken from the deep copy made before post-processing.
nodes_old[1].text

'How are you? '

In [31]:
# Run the metadata replacement post-processor over the scored nodes.
replaced_nodes = postproc.postprocess_nodes(scored_nodes)

In [32]:
# Text of the second node after post-processing, for comparison with nodes_old[1].text.
replaced_nodes[1].text

'Hello.  How are you?  I, Anirudh, am a recent gradaute from Boston University with a major in Data Analytics.  This is my project on RAG applications using different RAG techniques. '

#### As can be seen in the result above, the text in the node (an individual sentence) has been replaced by the surrounding sentences for more context.

## Adding a Reranker

##### Re-orders the top K retrieved chunks (those relevant to the user query) using a reranking model. Of these K chunks, only the top_n are used as input for the LLM.

In [33]:
from llama_index.indices.postprocessor import SentenceTransformerRerank

# Reranker model: BAAI/bge-reranker-base
# link: https://huggingface.co/BAAI/bge-reranker-base
# top_n=2 is the number of reranked chunks that go on to the LLM.
rerank = SentenceTransformerRerank(
    top_n=2, model='BAAI/bge-reranker-base'
)


In [34]:
from llama_index import QueryBundle
from llama_index.schema import TextNode, NodeWithScore

# Toy reranker input: a query about a dog and two candidate nodes whose
# initial scores deliberately favour the less relevant "cat" sentence.
query = QueryBundle("I want a dog.")

scored_nodes = [
    NodeWithScore(node=TextNode(text=node_text), score=initial_score)
    for node_text, initial_score in (
        ("This is a cat", 0.6),
        ("This is a dog", 0.4),
    )
]

In [35]:
# Rerank the two toy nodes against the query.
reranked_nodes = rerank.postprocess_nodes(
    scored_nodes, query_bundle=query
)

In [36]:
# Show each node's text alongside the score it has after reranking.
print([(x.text, x.score) for x in reranked_nodes])

[('This is a dog', 0.91827387), ('This is a cat', 0.001404068)]


## Running the Query Engine

In [37]:
# Query engine over the sentence-window index: retrieve the 6 most similar nodes
# and pass them through the window replacement and rerank post-processors
# (presumably applied in list order - confirm against the library docs).
sentence_window_engine = sentence_vs_index.as_query_engine(
    similarity_top_k=6, node_postprocessors=[postproc,rerank]
)

In [38]:
# Ask a sample question through the sentence-window query engine.
window_response = sentence_window_engine.query('What are the technologies used in the project')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [39]:
from llama_index.response.notebook_utils import display_response

# Render the response in the notebook.
display_response(window_response)

**`Final Response:`** The project uses Python, Flask, and Docker.

## Putting everything together

In [40]:
# '''
# Reference copy (disabled) of two helper functions; functions with these
# names are imported from utils in the next cell.
# 1) Vector Store Index - build_sentence_window_index()
# 2) Query engine - get_sentence_window_query_engine() '''

# import os
# from llama_index import ServiceContext, VectorStoreIndex, StorageContext, load_index_from_storage
# from llama_index.node_parser import SentenceWindowNodeParser
# from llama_index.indices.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank

# # Creating a Vector Store Index for sentence window RAG technique
# def build_sentence_window_index(
#         documents,
#         llm,
#         embed_model='local:BAAI/bge-small-en-v1.5',
#         sentence_window_size=3,
#         save_dir='sentence_window_index'
# ):
#     # creating the sentence window node parser with default settings
#     node_parser = SentenceWindowNodeParser.from_defaults(
#         window_size=sentence_window_size,
#         window_metadata_key='window',
#         original_text_metadata_key='original_text'
#     )

#     sentence_context = ServiceContext.from_defaults(
#         llm=llm,
#         embed_model=embed_model,
#         node_parser=node_parser
#     )

#     if not os.path.exists(save_dir):
#         sentence_index = VectorStoreIndex.from_documents(
#             documents, service_context=sentence_context
#         )
#         sentence_index.storage_context.persist(persist_dir=save_dir)
#     else:
#         sentence_index = load_index_from_storage(
#             StorageContext.from_defaults(persist_dir=save_dir),
#             service_context=sentence_context,
#         )

#     return sentence_index

# # creating a sentence window query engine
# def get_sentence_window_query_engine(
#         sentence_index, similarity_top_k=6, rerank_top_n=2
# ):
#     # defining postprocessors
#     postproc = MetadataReplacementPostProcessor(target_metadata_key='window')
#     rerank = SentenceTransformerRerank(top_n=rerank_top_n, model='BAAI/bge-reranker-base')

#     sentence_window_engine = sentence_index.as_query_engine(
#         similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
#     )

#     return sentence_window_engine

In [41]:
# Build the sentence-window vector store index through the utils helper.
from utils import build_sentence_window_index, get_sentence_window_query_engine
from llama_index.llms import OpenAI

# NOTE(review): 'sentece_index' looks like a typo for 'sentence_index'. It is left
# unchanged because renaming would point the helper at a different directory -
# confirm before changing.
index = build_sentence_window_index(
    [document],
    llm=OpenAI(model='gpt-3.5-turbo', temperature=0.1),
    save_dir='./sentece_index'
)

In [42]:
# Create a query engine from the index built above, retrieving the top 6 similar nodes.
query_engine = get_sentence_window_query_engine(index, similarity_top_k=6)

## TruLens Evaluation

In [43]:
# Read the evaluation questions, one per line. Surrounding whitespace is
# stripped and blank lines are skipped so that no empty question is ever sent
# to the query engine. The encoding is given explicitly so the result does not
# depend on the platform default.
with open('eval_questions.txt', 'r', encoding='utf-8') as questions_file:
    questions = [line.strip() for line in questions_file if line.strip()]

In [44]:
questions

['What are the technologies used in the project?',
 'What is the installation process?',
 'Can you talk about the dataset used in the project?',
 'Can you explain the usage of the project?',
 'What is the process of deployment used in the project?']

In [45]:
from trulens_eval import Tru

def run_evals(questions, tru_recorder, query_engine):
    """Send every question to the query engine inside the recorder's context.

    Args:
        questions: Iterable of question strings.
        tru_recorder: Context manager that is entered once per question
            (the notebook passes a TruLens recorder).
        query_engine: Object exposing ``query(question)``.

    Returns:
        None. The responses are not collected here; the queries are issued
        only so that the recorder can observe them.
    """
    for question in questions:
        # The recorder is entered and exited separately for each question.
        with tru_recorder:
            query_engine.query(question)

In [46]:
# Reset the TruLens database before the evaluation runs below.
# NOTE(review): presumably this discards previously recorded results - confirm
# before running against data you want to keep.
Tru().reset_database()

## Feedback Functions

#### Answer Relevance

In [47]:
from trulens_eval import Feedback
# Imported under an alias so it does not clash with llama_index.llms.OpenAI used elsewhere.
from trulens_eval import OpenAI as fOpenAI

# Provider whose methods back the feedback functions defined below.
provider = fOpenAI()

In [48]:
# Answer Relevance: evaluated on the record's main input (prompt) and main output (response).
f_qa_relevance = Feedback(
    provider.relevance_with_cot_reasons,
    name="Answer Relevance"
).on_input_output()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .


#### Context Relevance

In [49]:
from trulens_eval import TruLlama
import numpy as np

# Selector for the text of the source nodes returned by a query; shared by the
# Context Relevance and Groundedness feedback functions.
context_selection = TruLlama.select_source_nodes().node.text

In [50]:
# Context Relevance: evaluated on the main input (question) and the source node
# text selected above, then aggregated with the mean.
f_qs_relevance = (
    Feedback(provider.qs_relevance,
             name="Context Relevance")
    .on_input()
    .on(context_selection)
    .aggregate(np.mean)
)

✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input statement will be set to __record__.app.query.rets.source_nodes[:].node.text .


#### Groundedness

In [51]:
from trulens_eval.feedback import Groundedness

# Groundedness helper backed by the same provider as the other feedback functions.
grounded = Groundedness(groundedness_provider=provider)

In [52]:
# Groundedness: the source node text is the source and the main output is the
# statement; results are combined with the helper's own aggregator.
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons,
             name="Groundedness"
            )
    .on(context_selection)
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


### Sentence Window (size=1)

In [53]:
# Index for the window-size-1 experiment, stored in its own directory.
# NOTE(review): this passes the raw `documents` list, whereas the earlier
# build_sentence_window_index call passed the merged `[document]` - confirm
# which input is intended.
sentence_index_1 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=1,
    save_dir="sentence_index_1",
)

In [54]:
# Query engine for the window-size-1 index, using the helper's default arguments.
sentence_window_engine_1 = get_sentence_window_query_engine(
    sentence_index_1
)

In [55]:
# Manual recorder construction, kept for reference only (disabled):
# tru_recorder_1 = TruLlama(
#     sentence_window_engine_1,
#     app_id='sentence window engine 1',
#     feedbacks = [
#         f_qa_relevance,
#         f_qs_relevance,
#         f_groundedness
#     ]
# )

# The recorder is created through the utils helper instead.
from utils import get_prebuilt_trulens_recorder
tru_recorder_1 = get_prebuilt_trulens_recorder(sentence_window_engine_1, app_id='sentence window engine 1')

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input statement will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [56]:
# Run all evaluation questions through the window-size-1 engine under its recorder.
run_evals(questions, tru_recorder_1, sentence_window_engine_1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [57]:
# Start the TruLens dashboard to inspect the recorded results.
Tru().run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.4.34:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

### Sentence Window (Size=3)

In [58]:
# Index for the window-size-3 experiment, stored in its own directory.
# NOTE(review): this passes the raw `documents` list, whereas the earlier
# build_sentence_window_index call passed the merged `[document]` - confirm
# which input is intended.
sentence_index_3 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir="sentence_index_3",
)

In [59]:
# Query engine for the window-size-3 index, using the helper's default arguments.
sentence_window_engine_3 = get_sentence_window_query_engine(
    sentence_index_3
)

In [60]:
# Recorder for the window-size-3 engine, built directly with TruLlama and the
# three feedback functions defined earlier. The window-size-1 recorder was
# created through utils.get_prebuilt_trulens_recorder instead.
tru_recorder_3 = TruLlama(
    sentence_window_engine_3,
    app_id='sentence window engine 3',
    feedbacks = [
        f_qa_relevance,
        f_qs_relevance,
        f_groundedness
    ]
)

In [61]:
# Run all evaluation questions through the window-size-3 engine under its recorder.
run_evals(questions, tru_recorder_3, sentence_window_engine_3)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [62]:
# The dashboard was already started by the earlier run_dashboard() call (the
# recorded output reports it as already running), so in a top-to-bottom run
# this call only re-reports its URL.
Tru().run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Network URL: http://192.168.4.34:8501



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>