##INSTALLS

In [None]:
!pip install PyPDF2 langchain llama-index nltk faiss-cpu langchain-community


In [None]:
import nltk
nltk.download('punkt')


In [None]:
!pip install --upgrade openai

In [5]:
%pip install -qU langchain-text-splitters

In [None]:
!pip install chromadb

In [7]:
!pip install lark



In [None]:
!pip install unstructured

In [None]:
!pip install unstructured[inference]

In [None]:
!apt-get install -y tesseract-ocr
!pip install pytesseract

In [None]:
 pip install "unstructured[local-inference]"

In [None]:
!pip install pdfminer.six

In [None]:
!pip install pillow_heif

In [None]:
!apt-get install poppler-utils

In [None]:
!python -m nltk.downloader averaged_perceptron_tagger

In [None]:
!pip install llama-index

In [None]:
%pip install llama-index-vector-stores-faiss
%pip install llama-index-vector-stores-pinecone
%pip install llama-index-vector-stores-chroma

In [None]:
pip install faiss-cpu  pinecone-client chromadb

In [None]:
%pip install llama-index-embeddings-huggingface
%pip install llama-index-embeddings-langchain

In [None]:
!pip install sentence_transformers

In [24]:
!pip install llama-index>=0.9.31 pinecone-client>=3.0.0

In [25]:
%pip install -qU ragchecker llama-index

In [None]:
!pip install ragchecker
!python -m spacy download en_core_web_sm

In [27]:
!pip install -qU scikit-learn

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
pip install scikit-learn-intelex


In [None]:
!pip install --upgrade scikit-learn
!pip install --upgrade scikit-learn-intelex

In [None]:
!pip install refchecker==0.2.12

In [None]:
!pip install litellm==1.47.1

##CHUNKING FUNCTIONS

In [52]:
############ SET YOUR API KEYS #################
# Replace every "set-your-key" placeholder with your own credentials before
# running the notebook, and never commit real keys.

# Imported here so this cell also works when the notebook is run top to bottom
# (the other `import os` statements live in later cells).
import os

os.environ["AWS_ACCESS_KEY_ID"] = "set-your-key"
os.environ["AWS_SECRET_ACCESS_KEY"] = "set-your-key"
os.environ["AWS_REGION_NAME"] = "us-east-1" # us-east-1, us-east-2, us-west-1, us-west-2
os.environ["PINECONE_API_KEY"] = "set-your-key"
# Kept as a module-level name because the Pinecone client cell reads `api_key`.
api_key = os.environ["PINECONE_API_KEY"]
openai_api_key = 'set-your-key'
os.environ['OPENAI_API_KEY'] = openai_api_key

In [33]:
import os
import nltk
import openai
from PyPDF2 import PdfReader
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter, CharacterTextSplitter, MarkdownTextSplitter, LatexTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from langchain.schema import Document
from langchain.vectorstores import Chroma

In [34]:
def read_pdf(file_path):
    """Return the text of every page of the PDF at *file_path*, in page order.

    The pages are read with ``PdfReader`` and their extracted text is
    concatenated without any separator.
    """
    reader = PdfReader(file_path)
    # Iterate the pages directly and join once instead of growing a string
    # page by page through an index loop.
    return "".join(page.extract_text() for page in reader.pages)
# Chunking strategies
def fixed_length_chunking(text, chunk_size, chunk_overlap):
    """Split *text* with a ``RecursiveCharacterTextSplitter``.

    Prints the text length, the requested chunk size and overlap, and the
    number of chunks produced, then returns the documents created by the
    splitter's ``create_documents``.
    """
    print(f"Text length: {len(text)}")
    print(f"Chunk size: {chunk_size}")
    print(f"Chunk overlap: {chunk_overlap}")

    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    documents = RecursiveCharacterTextSplitter(**splitter_options).create_documents([text])

    print(f"\n Number of chunks created \n: {len(documents)}")
    print("------------------")
    return documents

def markdown_chunking(text, chunk_size, chunk_overlap):
    """Split *text* with a ``MarkdownTextSplitter`` and return its documents.

    *chunk_size* and *chunk_overlap* are forwarded unchanged to the splitter.
    """
    splitter = MarkdownTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.create_documents([text])

def latex_chunking(text, chunk_size, chunk_overlap):
    """Split *text* with a ``LatexTextSplitter`` and return its documents.

    *chunk_size* and *chunk_overlap* are forwarded unchanged to the splitter.
    """
    splitter = LatexTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.create_documents([text])

def create_embeddings(documents, persist_directory):
    """Build a Chroma vector store from *documents* and return it.

    A default-configured ``OpenAIEmbeddings`` instance is used as the
    embedding, and *persist_directory* is passed through to
    ``Chroma.from_documents``.
    """
    embedder = OpenAIEmbeddings()
    return Chroma.from_documents(
        documents=documents,
        embedding=embedder,
        persist_directory=persist_directory,
    )

In [35]:
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf
from langchain_community.vectorstores.utils import filter_complex_metadata

def context_chunking(file_path):
    """Partition the PDF at *file_path* and chunk its elements by title.

    The elements returned by ``partition_pdf`` are grouped with
    ``chunk_by_title`` (``combine_text_under_n_chars=500``,
    ``max_characters=2048``). Each chunk becomes a ``Document`` whose
    ``page_content`` is the chunk text, and the list is returned after
    passing through ``filter_complex_metadata``.
    """
    pdf_elements = partition_pdf(file_path)
    titled_chunks = chunk_by_title(
        pdf_elements,
        combine_text_under_n_chars=500,
        max_characters=2048,
    )

    documents = []
    for chunk in titled_chunks:
        # NOTE(review): the whole chunk object is stored under "source";
        # filter_complex_metadata presumably strips such non-primitive
        # values, leaving the metadata empty - confirm this is intended.
        documents.append(Document(page_content=chunk.text, metadata={"source": chunk}))
    return filter_complex_metadata(documents)

##HELPER FUNCTIONS


In [51]:
import csv
import pandas as pd

# Function to save RAGResults to CSV with chunking strategy, embedding, and vector store names
def save_rag_results_to_csv(rag_results, chunking_strategy_name, embedding_strategy_name, vector_store_name, query_num, filename="rag_results.csv"):
    """Append one row of RAG evaluation metrics to a CSV file.

    Args:
        rag_results: Object whose ``metrics`` mapping holds the
            ``'overall_metrics'``, ``'retriever_metrics'`` and
            ``'generator_metrics'`` dictionaries read below.
        chunking_strategy_name: Label written to the 'Chunking Strategy' column.
        embedding_strategy_name: Label written to the 'Embedding Strategy' column.
        vector_store_name: Label written to the 'Vector Store' column.
        query_num: Value written to the 'Query Number' column.
        filename: Destination CSV. The row is appended; the header line is
            written only when the file is missing or still empty.

    Raises:
        KeyError: If an expected metric is missing from ``rag_results.metrics``.

    Failures while writing the file are reported with a message on stdout and
    are not re-raised, so a long evaluation run is not aborted by a failed save.
    """
    import os  # local import keeps this cell independent of notebook cell order

    metrics = rag_results.metrics
    overall_metrics = metrics['overall_metrics']
    retriever_metrics = metrics['retriever_metrics']
    generator_metrics = metrics['generator_metrics']

    # One flat record: run identifiers first, then the metrics.
    row = {
        'Query Number': query_num,
        'Chunking Strategy': chunking_strategy_name,
        'Embedding Strategy': embedding_strategy_name,
        'Vector Store': vector_store_name,
        'Precision': overall_metrics['precision'],
        'Recall': overall_metrics['recall'],
        'F1 Score': overall_metrics['f1'],
        'Claim Recall': retriever_metrics['claim_recall'],
        'Context Precision': retriever_metrics['context_precision'],
        'Context Utilization': generator_metrics['context_utilization'],
        'Noise Sensitivity (Relevant)': generator_metrics['noise_sensitivity_in_relevant'],
        'Noise Sensitivity (Irrelevant)': generator_metrics['noise_sensitivity_in_irrelevant'],
        'Hallucination': generator_metrics['hallucination'],
        'Self Knowledge': generator_metrics['self_knowledge'],
        'Faithfulness': generator_metrics['faithfulness'],
    }

    try:
        # A header is needed for a new file and also for an existing but empty
        # one; otherwise the first data row would later be read as the header.
        # Non-path targets (e.g. buffers) always get a header, as before.
        is_path = isinstance(filename, (str, os.PathLike))
        has_content = is_path and os.path.isfile(filename) and os.path.getsize(filename) > 0
        write_header = not has_content

        pd.DataFrame([row]).to_csv(filename, mode='a', index=False, header=write_header)
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller continue.
        print(f"Error saving RAG results to CSV: {str(e)}")

In [47]:
# NOTE: `retry`, `stop_after_attempt`, `wait_exponential`, `time` and `asyncio`
# are imported in the MAIN CODE import cell, which must be run before this one.
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def wait_for_index_ready(pc, index_name, timeout=300, poll_interval=7):
    """Poll a Pinecone index until it answers a stats request.

    Args:
        pc: Pinecone client used to obtain the index handle.
        index_name: Name of the index to poll.
        timeout: Maximum number of seconds to keep polling (default 5 minutes).
        poll_interval: Seconds to sleep between two polls (default 7).

    Returns:
        True as soon as ``describe_index_stats()`` returns a truthy value.

    Raises:
        TimeoutError: If the index did not respond within *timeout* seconds.
            Because of the ``@retry`` decorator the whole wait is attempted
            up to three times before the failure reaches the caller.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            stats = pc.Index(index_name).describe_index_stats()
        except Exception as e:
            # Any failure is treated as "not ready yet"; keep polling.
            print(f"Index not ready yet: {e}")
        else:
            if stats:  # index is ready
                print(f"Index {index_name} is ready")
                return True

        await asyncio.sleep(poll_interval)

    raise TimeoutError(f"Index {index_name} not ready after {timeout} seconds")

In [49]:
# NOTE: `retry`, `stop_after_attempt`, `wait_exponential` and `ServerlessSpec`
# are imported in the MAIN CODE import cell, which must be run before this one.
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def setup_pinecone_index(pc, index_name, dimension=1536):
    """(Re)create a serverless Pinecone index and return a handle to it.

    An existing index with the same name is deleted first, so the returned
    index always starts empty.

    Args:
        pc: Pinecone client.
        index_name: Name of the index to create.
        dimension: Vector dimension of the index. Defaults to 1536; pass the
            dimension of the embedding model in use (for example 768 for
            all-mpnet-base-v2) so that vectors of that size can be stored.

    Returns:
        The object returned by ``pc.Index(index_name)``.

    Raises:
        Exception: Any error from the Pinecone client or from the readiness
            wait is printed and re-raised; the ``@retry`` decorator then
            retries the whole setup up to three times.
    """
    try:
        existing_names = {index.name for index in pc.list_indexes()}
        if index_name in existing_names:
            pc.delete_index(index_name)
            print(f"Deleted old index: {index_name}")
        else:
            print(f"Creating new index: {index_name}")

        # Both paths create the index the same way, so the call is shared.
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
        # Wait for index to be ready using polling
        await wait_for_index_ready(pc, index_name)
        print(f"Index {index_name} created and ready")

        return pc.Index(index_name)
    except Exception as e:
        print(f"Error in setup_pinecone_index: {e}")
        raise

##QUERIES AND GROUND TRUTH ANSWERS TO USE IN TESTING

In [53]:
# Queries array
# One question string per entry. Entries are paired by position with
# ground_truth_answers below: the evaluation loop zips the two lists, so a
# query without a matching answer is silently skipped.
queries = [
    #e.g.
    #1
    #"What is the primary function of the Cisco ME 4924-10GE Ethernet switch?",
]

# Ground truth answers array
# The reference answer for queries[i] goes at index i.
ground_truth_answers = [
    #e.g.
    #"The Cisco ME 4924-10GE Ethernet switch, also referred to as the switch, is a metro Ethernet switch that can be used as user facing provider edge aggregation equipment to connect to service provider customer routers, switches, or other devices. The switch can be deployed as a broadband aggregation switch, aggregating 1000BASE-X SFP Ethernet traffic from other network devices to 10-Gigabit uplinks. It supports Layer 2, Layer 3, and Layer 4 switching services.",

]


## MAIN CODE

In [40]:
import os
from llama_index.core import (
    SimpleDirectoryReader,
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.embeddings.openai import OpenAIEmbedding
import chromadb
import faiss
from llama_index.vector_stores.faiss import FaissVectorStore
from pinecone import Pinecone, ServerlessSpec
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from IPython.display import Markdown, display
from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_exponential
import time
import asyncio
from typing import List
import shutil
from llama_index.core.storage.index_store import SimpleIndexStore
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core import Document as LlamaDocument
from langchain.schema import Document as LangchainDocument
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import RelevancyEvaluator
from ragchecker.integrations.llama_index import response_to_rag_results
from ragchecker import RAGResults, RAGChecker
from ragchecker.metrics import all_metrics
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
import litellm
from litellm import completion
from sklearnex import patch_sklearn
patch_sklearn()

In [54]:
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Path of the PDF to evaluate. NOTE: the placeholder ends with a space, so
# replace the whole string rather than appending to it.
file_path = "your_file " #eg. "/content/me492410ge.pdf"
# Default for the persist_directory parameter of the main test function; it is
# only referenced by a commented-out persist call there.
persist_directory = "your_dir" #eg. "/content/vector_store_llamaIndex"

In [48]:
# Pinecone client passed to the index helpers; api_key is the value of
# PINECONE_API_KEY set in the API-keys cell.
pc = Pinecone(api_key=api_key)

In [None]:
# LangChain HuggingFace model, wrapped in LangchainEmbedding so it can be
# passed to LlamaIndex as an embed model.
lc_embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2" # 768 dimensions
)
mpnet_embeddings= LangchainEmbedding(lc_embed_model)

# OpenAIEmbeddings is the LangChain class imported in the chunking-functions cell.
openai_small_embeddings = OpenAIEmbeddings(model="text-embedding-3-small") # 1536 dimensions
openai_embeddings = OpenAIEmbeddings() #1536 dimensions (text-embedding-ada-002)


def _reset_chroma_collection(chroma_client, collection_name):
    """Drop *collection_name* if it exists and return a newly created collection."""
    # Depending on the chromadb release, list_collections() yields Collection
    # objects or plain names; normalise to names so the membership test works
    # in both cases.
    existing_names = {getattr(item, "name", item) for item in chroma_client.list_collections()}
    if collection_name in existing_names:
        chroma_client.delete_collection(collection_name)
    return chroma_client.create_collection(collection_name)


async def test_chunking_strategies_with_llamaIndex_query(docs_path, chunk_size1, chunk_overlap1, chunk_size2, chunk_overlap2, persist_directory=persist_directory, results_filename="rag_res_me492.csv"):
    """Evaluate every chunking / embedding / vector-store combination.

    The PDF at *docs_path* is chunked with two fixed-length configurations
    and with title-based chunking. Each chunk set is indexed with three
    embedding models into FAISS, Pinecone and Chroma. Every entry of the
    module-level ``queries`` list is then run against each index, scored with
    RAGChecker against the matching ``ground_truth_answers`` entry, and the
    metrics are appended to *results_filename*.

    Args:
        docs_path: Path of the PDF to evaluate.
        chunk_size1: Chunk size of the first fixed-length strategy.
        chunk_overlap1: Chunk overlap of the first fixed-length strategy.
        chunk_size2: Chunk size of the second fixed-length strategy.
        chunk_overlap2: Chunk overlap of the second fixed-length strategy.
        persist_directory: Currently unused; kept for the optional local
            persist step that is commented out below.
        results_filename: CSV file that receives one row per evaluated query.

    Raises:
        Exception: If building an index fails; the original error is chained.

    Queries whose response text is 'Empty Response' are reported and skipped.
    ``queries`` and ``ground_truth_answers`` are zipped, so surplus entries in
    the longer list are ignored.
    """
    chroma_collection_name = "quickstart-me492"

    text = read_pdf(docs_path)

    # Apply the different chunking strategies
    strategies = {
        "fixed_length1": fixed_length_chunking(text, chunk_size=chunk_size1, chunk_overlap=chunk_overlap1),
        "fixed_length2": fixed_length_chunking(text, chunk_size=chunk_size2, chunk_overlap=chunk_overlap2),
        "context": context_chunking(docs_path)
    }

    embedding_strategies = [
        ("openai_small_embeddings", openai_small_embeddings),
        ("openai_embeddings", openai_embeddings),
        ("mpnet_embeddings", mpnet_embeddings),
    ]

    # Built on first use and then shared by all evaluations: the configuration
    # never changes, and nothing is constructed when there is nothing to score.
    evaluator = None

    for chunking_strategy_name, chunks in strategies.items():
        for embedding_strategy_name, embed_model in embedding_strategies:

            print(f"___TESTING STRATEGY: {chunking_strategy_name} with {embedding_strategy_name}___")

            if embedding_strategy_name == "mpnet_embeddings":
                dimension = 768
                index_name = "your_index_name"
            else:
                dimension = 1536
                index_name = "your_index_name"

            #--------------Initialize vector stores---------------------

            # ----Initialize FAISS vector store
            faiss_index = faiss.IndexFlatL2(dimension)
            faiss_vector_store = FaissVectorStore(faiss_index=faiss_index)
            faiss_storage_context = StorageContext.from_defaults(vector_store=faiss_vector_store)

            # ----Initialize Pinecone vector store
            # NOTE(review): setup_pinecone_index is called without a dimension,
            # so the Pinecone index is created with that helper's default size
            # and the 768-dim mpnet run presumably fails on insert - confirm
            # and pass the dimension through.
            print(f"-----Index name = {index_name}")
            pinecone_index = await setup_pinecone_index(pc, index_name)
            pinecone_vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
            pinecone_storage_context = StorageContext.from_defaults(vector_store=pinecone_vector_store)
            # Check if the index is being created correctly
            index_description = pinecone_index.describe_index_stats()
            # Non-blocking pause: time.sleep() would stall the event loop.
            await asyncio.sleep(8)
            print(f"----- Pinecone Index  status: {index_description}")

            # ----Initialize Chroma vector store
            chroma_client = chromadb.EphemeralClient()
            chroma_collection = _reset_chroma_collection(chroma_client, chroma_collection_name)
            chroma_vector_store = ChromaVectorStore(chroma_collection=chroma_collection, embedding_function=embed_model)
            chroma_storage_context = StorageContext.from_defaults(vector_store=chroma_vector_store)

            vector_stores = [
                ("faiss", faiss_storage_context),
                ("pinecone", pinecone_storage_context),
                ("chroma", chroma_storage_context),
            ]

            for vector_store_name, storage_context in vector_stores:

                print(f"*Using vector store: {vector_store_name}")

                llama_index_docs = [LlamaDocument(text=c.page_content) for c in chunks]
                try:
                    index = VectorStoreIndex.from_documents(llama_index_docs,
                                                            embed_model=embed_model,
                                                            storage_context=storage_context)

                    # index.storage_context.persist(persist_dir=persist_directory) if you want to persist in local directory

                except Exception as e:
                    # Chain the cause so the original traceback stays visible.
                    raise Exception(f"Failed to index or persist the document: {str(e)}") from e

                query_engine = index.as_query_engine(similarity_top_k=3)

                for query_num, (query, gt_answer) in enumerate(zip(queries, ground_truth_answers), start=1):

                    response = query_engine.query(query)

                    print(f"\nResponse from {query_num},   qeustion {query}   , from chunking strategy: {chunking_strategy_name} from embedding strategy:{embedding_strategy_name} and vector store {vector_store_name}\n")
                    print(response)

                    if response.response == 'Empty Response':
                        print(f"\nSkipping evaluation from chunking:{chunking_strategy_name} embedding:{embedding_strategy_name} vector:{vector_store_name}due to empty response.\n")
                        continue

                    rag_result = response_to_rag_results(
                        query=query,
                        gt_answer=gt_answer,
                        response_object=response,
                    )
                    rag_result['query_id'] = str(query_num)
                    print(rag_result)
                    rag_results = RAGResults.from_dict({"results": [rag_result]})
                    print("----Rag Results to evaluate----")
                    print(rag_results)

                    if evaluator is None:
                        # Initialize RAGChecker
                        evaluator = RAGChecker(
                            extractor_name="openai/gpt-4o-mini",
                            checker_name="openai/gpt-4o-mini",
                            batch_size_extractor=16,
                            batch_size_checker=16
                        )
                    # Evaluate using RAGChecker
                    evaluator.evaluate(rag_results, all_metrics)
                    print("\n----Output Results----\n")
                    print(rag_results)
                    save_rag_results_to_csv(rag_results, chunking_strategy_name, embedding_strategy_name, vector_store_name, query_num, filename=results_filename)

                print("\n#########################################################\n")

            print("Finished looping in vector stores")



# Example usage: compares two fixed-length configurations (1024/128 and
# 2048/256 characters of size/overlap) with title-based chunking.
await test_chunking_strategies_with_llamaIndex_query(file_path, chunk_size1=1024, chunk_overlap1=128, chunk_size2=2048, chunk_overlap2=256, persist_directory=persist_directory)

In [None]:
from google.colab import files  # Only needed for Google Colab

# Download the results CSV written by the evaluation run ('rag_res_me492.csv')
files.download('rag_res_me492.csv')
