# RAG

## Overview

### Document and Documentation Search - Retrieval Step Demo


In [None]:
## OPEN AI EMBEDDINGS:
from langchain_openai import OpenAIEmbeddings
import os
# import
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_text_splitters import CharacterTextSplitter

# Read the OpenAI key from the environment so the secret is never stored in
# the notebook file; falls back to an empty string when OPENAI_API_KEY is unset.
API_KEY = os.environ.get("OPENAI_API_KEY", "")

# Create the embeddings function used to vectorise both chunks and queries
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=API_KEY)

# Create a text splitter: chunks of up to 1000 characters with no overlap
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
print('Cell finished')

In [None]:
# Load the document and split it into chunks.
# The PDF is expected next to the notebook (document_dir is the current directory).
document_dir = "./"
filename = "powerbi_book.pdf"
file_path = os.path.join(document_dir, filename)

# Load the PDF with PyPDFLoader, then split the result again with the
# `text_splitter` created in the first cell.
pages = PyPDFLoader(file_path).load_and_split()
docs = text_splitter.split_documents(pages)

# Embed the chunks with `embeddings` and load them into a Chroma collection
# persisted under ./chroma_db.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collection again — confirm before relying on result counts.
db = Chroma.from_documents(docs, embeddings, persist_directory="./chroma_db")
print('Cell finished')

In [None]:
# Show the text of the first element of `docs`.
print(docs[0].page_content, '\n\n')

# Fetch the stored embeddings from the underlying collection and print the first vector.
# NOTE(review): `_collection` is a private attribute of the Chroma wrapper —
# confirm it is still available in the installed version.
data = db._collection.get(include=['embeddings'])
print(data['embeddings'][0])

In [None]:
# Query the vector store with a natural-language question.

# Alternative sample questions (uncomment one to try it):
#user_question = 'How do I build one of those charts that look like a Swiss Cheese?'
#user_question = 'Can I onboard data using SQL in powerBi?'
#user_question = 'Tell me about Line Charts'

user_question = "How can I do a pie chart in PowerBI?"
# NOTE: this rebinds `docs` from the indexed chunks to the 10 retrieved results;
# the following cells use `docs` in that sense.
docs = db.similarity_search(user_question, k=10)

# Print only the first three of the retrieved results
for doc in docs[0:3]:
    print(doc.page_content, '\n')
    print()

In [None]:
def _get_document_prompt(docs):
    prompt = '\n'
    for doc in docs:
        prompt += '\nContent:\n'
        prompt += doc.page_content + '\n\n'
    return prompt

# Preview the context string built from the current contents of `docs`.
print(_get_document_prompt(docs))

### Generation Step Demo

In [None]:
from openai import OpenAI

# Build the single-message prompt: role description, the retrieved context
# (rendered by _get_document_prompt from `docs`), answer restrictions, the task
# and the expected response layout. `user_question` is interpolated twice: once
# in the introduction and once as the final conversation turn.
prompt = f"""
## INTRODUCTION
You are a Chatbot designed to help answer technical questions about a software.
The user asked: "{user_question}"

## CONTEXT
Technical Documentation for the software:
'''
{_get_document_prompt(docs)}
'''

## RESTRICTIONS
Refer to the products by their names.
Be clear, transparent, and factual: only state what is in the context without providing opinions or subjectivity.
Answer the question based solely on the context above; if you do not know the answer, be clear with the user that you do not know.
Only respond to questions related to the products, avoiding jokes, offensive remarks, and discussions on religion or sexuality.
If the user does not provide sufficient context, do not answer and instead ask for more information on what the user wants to know.

## TASK
First, answer directly to the user, if possible
Second, point the user in the right direction of the documentation
Lastly, answer in Markdown format

## RESPONSE STRUCTURE:
'''
# [Answer Title]
[answer text]

Source:
• From pages [...] of the Technical Documentation for *Product1* (link)
• From pages [...] of the Technical Documentation for *Product2* (link)
'''

## CONVERSATION:
User: {user_question}
Agent:
"""

# Create the OpenAI client with the API_KEY defined in the first cell.
client = OpenAI(api_key = API_KEY)

# Send the whole prompt as a single user message; the request is given a
# 120-second timeout.
messages = [{'role':'user', 'content':prompt}]
model_params = {'model': 'gpt-4o', 'temperature': 0.4, 'max_tokens': 3000}
completion = client.chat.completions.create(messages=messages, **model_params, timeout=120)


# Keep the text of the first choice and the model name reported by the response.
answer = completion.choices[0].message.content
model = completion.model

# Print the question and the raw answer text, then render the same answer as Markdown.
print(user_question)
print(f'From Model: {model}:\n')
print(answer)
print('\n ------------ \n')
from IPython.display import display, HTML, Markdown
display(Markdown(answer))

## The steps of a RAG System:

### Imports

In [31]:
# Import Necessary Libraries

import os
from langchain.embeddings import OpenAIEmbeddings, SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Set your OpenAI API key
#API_KEY = ""



In [None]:
## Load the document
# The PDF is expected next to the notebook (document_dir is the current directory).
document_dir = "./"
filename = "powerbi_book.pdf"
file_path = os.path.join(document_dir, filename)

# Use PyPDFLoader to load the PDF; `pages` is the input to every splitter below.
pages = PyPDFLoader(file_path).load_and_split()
print(f"Loaded {len(pages)} pages from the document.")

### Embeddings models

In [None]:
# Create the embeddings function using OpenAI's model through the API.
# API_KEY is not set in this section; it must already be defined by an earlier cell.
openai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=API_KEY)
print("Initialized OpenAI embeddings.")

In [None]:
# Create the embeddings function using SentenceTransformer's model - LOCAL MODEL
# (no API key is passed here, unlike the OpenAI embeddings).
from langchain.embeddings import HuggingFaceEmbeddings

sentence_transformer_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
print("Initialized SentenceTransformer embeddings.")

In [None]:
#### COMPARE EMBEDDING VECTORS
# Embed the same sentence with both models and compare the resulting vectors.

# Sample text
sample_text = "How can I create a pie chart in PowerBI?"

# Generate embeddings for the sample text with each model
openai_vector = openai_embeddings.embed_query(sample_text)
sentence_transformer_vector = sentence_transformer_embeddings.embed_query(sample_text)

# Display vector dimensions and only the first five components of each vector
print(f"OpenAI Embedding Dimension: {len(openai_vector)}")
print(f"OpenAI Embedding Vector: {openai_vector[0:5]}")
print()
print(f"SentenceTransformer Embedding Dimension: {len(sentence_transformer_vector)}")
print(f"SentenceTransformer Embedding Vector: {sentence_transformer_vector[0:5]}")

### Experimenting with Text Splitters

To understand text splitting in more depth, see this notebook:
https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb

In [None]:
# Initialize CharacterTextSplitter with 1000-character chunks and no overlap
char_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Split the loaded `pages` and report how many chunks were produced
docs_char_split = char_text_splitter.split_documents(pages)
print(f"Number of chunks with CharacterTextSplitter: {len(docs_char_split)}")


In [None]:
# Initialize RecursiveCharacterTextSplitter: https://python.langchain.com/api_reference/text_splitters/character/langchain_text_splitters.character.RecursiveCharacterTextSplitter.html
# Same chunk_size / chunk_overlap as the CharacterTextSplitter cell, so the chunk counts are comparable.
recursive_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Split the loaded `pages`; these chunks are the ones indexed in both vector stores below
docs_recursive_split = recursive_text_splitter.split_documents(pages)
print(f"Number of chunks with RecursiveCharacterTextSplitter: {len(docs_recursive_split)}")

In [None]:
# Compare the 6th chunk (index 5) produced by each splitter, showing at most
# the first 500 characters of each.
print("6th chunk using CharacterTextSplitter:")
print(docs_char_split[5].page_content[:500], "\n")

print("6th chunk using RecursiveCharacterTextSplitter:")
print(docs_recursive_split[5].page_content[:500])

#### Honorable mention - Semantic Chunking: still experimental


In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Build the semantic chunker on top of the local sentence-transformer
# embeddings (the OpenAI embeddings are not used in this cell).
semantic_chunk_splitter = SemanticChunker(sentence_transformer_embeddings)

# Split the loaded `pages` and report how many chunks were produced
docs_semantic_split = semantic_chunk_splitter.split_documents(pages)
print(f"Number of chunks with SemanticChunker: {len(docs_semantic_split)}")

![semantic_chunking](./img/semantic_chunking.jpg)

In [None]:
# Preview ten semantic chunks (indices 100-109), each followed by a separator.
# Slicing instead of indexing keeps the cell from raising IndexError when the
# document yields fewer than 110 chunks; it then prints whatever exists in range.
for chunk in docs_semantic_split[100:110]:
    print(chunk.page_content)
    print("="*80)

### Vector Stores

In [None]:
# Create ChromaDB instance - OpenAI
# Indexes the recursive-splitter chunks and persists the store under ./chroma_db_openai.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collection again — confirm.
db_openai = Chroma.from_documents(docs_recursive_split, 
                                  openai_embeddings, 
                                  persist_directory="./chroma_db_openai")

print("ChromaDB with OpenAI embeddings created.")

In [None]:
# Create ChromaDB instance - HuggingFace Local Model
# Indexes the same recursive-splitter chunks as the OpenAI store, in a separate
# directory, so the two embedding models can be compared on identical chunks.
db_sentence_transformer = Chroma.from_documents(docs_recursive_split, 
                                                sentence_transformer_embeddings, 
                                                persist_directory="./chroma_db_sentence_transformer")

print("ChromaDB with SentenceTransformer embeddings created.")

### Retrieval with ChromaDB


In [None]:
#### Retrieve Documents Using OpenAI Embeddings
# User question (also reused by the SentenceTransformer retrieval cell)
user_question = "How can I create a pie chart in PowerBI?"

# Retrieve the 5 most similar chunks from the OpenAI-embedded store
retrieved_docs_openai = db_openai.similarity_search(user_question, k=5)

# Display the first 3 results, at most 500 characters each
print("Top 3 documents retrieved using OpenAI embeddings:\n")
for idx, doc in enumerate(retrieved_docs_openai[:3], 1):
    print(f"Document {idx}:\n{doc.page_content[:500]}\n{'-'*80}\n")

In [None]:
#### Retrieve Documents Using SentenceTransformer Embeddings
# Retrieve the 5 most similar chunks for the same `user_question`,
# this time from the locally-embedded store
retrieved_docs_sentence = db_sentence_transformer.similarity_search(user_question, k=5)

# Display the first 3 results, at most 500 characters each
print("Top 3 documents retrieved using SentenceTransformer embeddings:\n")
for idx, doc in enumerate(retrieved_docs_sentence[:3], 1):
    print(f"Document {idx}:\n{doc.page_content[:500]}\n{'-'*80}\n")

In [None]:
### Analyse Retrieval Results:

# Helper that shortens each document to a preview snippet
def extract_snippets(docs):
    """Return the first 200 characters of each document's ``page_content``.

    Args:
        docs: iterable of objects exposing a ``page_content`` string.

    Returns:
        A list with one snippet per document, in the same order.
    """
    snippets = []
    for doc in docs:
        snippets.append(doc.page_content[:200])
    return snippets

# Extract a short snippet from every retrieved document of each store
snippets_openai = extract_snippets(retrieved_docs_openai)
snippets_sentence = extract_snippets(retrieved_docs_sentence)

# Display comparison of the top results, rank by rank.
# zip over the top-3 slices stops at the shorter list, so the cell does not
# raise IndexError when either store returns fewer than three documents.
print("Comparison of retrieval results:\n")
for i, (snippet_openai, snippet_sentence) in enumerate(zip(snippets_openai[:3], snippets_sentence[:3])):
    print(f"Result {i+1} with OpenAI embeddings:\n{snippet_openai}\n")
    print(f"Result {i+1} with SentenceTransformer embeddings:\n{snippet_sentence}\n")
    print("="*80)

### Retrieval Conclusions

In this exercise, we:

- Explored two embedding models: OpenAI's `text-embedding-ada-002` and SentenceTransformer's `all-MiniLM-L6-v2`.
- Compared their embedding dimensions and initialization processes.
- Experimented with two text splitting methods: `CharacterTextSplitter` and `RecursiveCharacterTextSplitter`.
- Observed the number of chunks produced and the content of a sample chunk (the 6th) from each splitter.
- Built two separate ChromaDB vector stores using the different embeddings and split documents.
- Performed similarity searches to retrieve documents relevant to the user's question.
- Compared the retrieval results to analyze which combination provided more relevant information.

**Key Takeaways:**

- Different embedding models produce vectors of different dimensions, which may impact retrieval performance.
- The choice of text splitter affects how the document is chunked and can influence the context preserved in each chunk.
- Comparing retrieval results helps in selecting the best combination of embedding models and text splitters for specific use cases.

### Generation

#### Retrieve the best documents for the user query

In [46]:
# User's question
user_question = "How can I create a pie chart in PowerBI?"

# Retrieve the 5 chunks most similar to the question; `retrieved_docs` is the
# list that the context string is built from.
# This example queries the store built with OpenAI embeddings.
# To use the SentenceTransformer embeddings instead, replace 'db_openai' with 'db_sentence_transformer'
retrieved_docs = db_openai.similarity_search(user_question, k=5)

# Function to combine documents into a single string
def _get_document_prompt(docs):
    return "\n\n".join([doc.page_content for doc in docs])

# Prepare the context from the retrieved documents; `context` is interpolated
# into the prompt built in the next cell.
context = _get_document_prompt(retrieved_docs)


#### Build the prompt

In [47]:
# Build the prompt: role description, the retrieved `context`, answer
# restrictions, the task and the expected response layout. `user_question` is
# interpolated twice: in the introduction and as the final conversation turn.
prompt = f"""
## INTRODUCTION
You are a Chatbot designed to help answer technical questions about a software.
The user asked: "{user_question}"

## CONTEXT
Technical Documentation for the software:
'''
{context}
'''

## RESTRICTIONS
Refer to the products by their names.
Be clear, transparent, and factual: only state what is in the context without providing opinions or subjectivity.
Answer the question based solely on the context above; if you do not know the answer, be clear with the user that you do not know.
Only respond to questions related to the products, avoiding jokes, offensive remarks, and discussions on religion or sexuality.
If the user does not provide sufficient context, do not answer and instead ask for more information on what the user wants to know.

## TASK
First, answer directly to the user, if possible.
Second, point the user in the right direction of the documentation.
Lastly, answer in Markdown format.

## RESPONSE STRUCTURE:
'''
# [Answer Title]
[answer text]

Source:
• From pages [...] of the Technical Documentation for *Product1* (link)
• From pages [...] of the Technical Documentation for *Product2* (link)
'''

## CONVERSATION:
User: {user_question}
Agent:
"""

#### Generate the answer

In [None]:
# Import necessary libraries
import openai
from IPython.display import display, Markdown

model_gpt = "gpt-4o-mini"

# Create the client in this cell instead of relying on a `client` variable
# left over from the earlier demo section, so the cell works when only this
# section of the notebook has been run. API_KEY must already be defined.
client = openai.OpenAI(api_key=API_KEY)

# Prepare the messages payload: the whole prompt is sent as one user message
messages = [{'role': 'user', 'content': prompt}]

# Set model parameters
model_params = {'model': model_gpt, 'temperature': 0.4, 'max_tokens': 3000}

# Pass model_params to the request; previously only the model name was sent,
# so temperature and max_tokens were silently ignored.
chat_completion = client.chat.completions.create(
    messages=messages,
    **model_params,
)

# Keep the text of the first choice, print it raw, then render it as Markdown
# (the prompt asks the model to answer in Markdown format).
answer = chat_completion.choices[0].message.content
print(answer)
print()
display(Markdown(answer))
