<a href="https://colab.research.google.com/github/kavya7628/Deep_Learning/blob/main/RAG_TRI_CIRCULARS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regulatory RAG using LlamaIndex and KDB.AI vector store



In [None]:
# Mount Google Drive at /content/drive (the dataset paths used later live under it).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q llama-index llama-index-llms-groq llama-index-embeddings-fastembed llama-index-readers-file
!pip install -q pandas opik litellm
!pip install -q tqdm nest_asyncio python-dotenv
!pip install -q llama-index llama-index-storage-kvstore-redis llama-index-vector-stores-kdbai llama-index-callbacks-opik

## Import dependencies

In [None]:
from getpass import getpass
import re
import os
import shutil
import time
import urllib
import datetime
from dotenv import load_dotenv
import asyncio
from tqdm.asyncio import tqdm_asyncio

import pandas as pd

from llama_index.core import (
    Settings,
    StorageContext,
    VectorStoreIndex,
    SimpleDirectoryReader
)

from llama_index.readers.file import FlatReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.llms.groq import Groq
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.vector_stores.kdbai import KDBAIVectorStore
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.core.storage.kvstore.simple_kvstore import SimpleKVStore as SimpleCache
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.core import set_global_handler

import kdbai_client as kdbai

import litellm
from litellm.integrations.opik.opik import OpikLogger
from opik import track
from opik.opik_context import get_current_span_data


import time
import os
import nest_asyncio
nest_asyncio.apply()  # Apply nest_asyncio to allow nested event loops



# Configure LiteLLM

In [None]:
# Add the LiteLLM OpikLogger callback to log traces and steps to Opik.
#
# The Opik credentials are put into the environment *before* OpikLogger() is
# constructed: the logger is expected to read its Opik settings at construction
# time (TODO confirm against the LiteLLM Opik integration docs), so they must
# already be present when the next lines run.
from google.colab import userdata

for _opik_var in ("OPIK_API_KEY", "OPIK_WORKSPACE"):
    # Respect values that are already set; otherwise pull them from Colab secrets.
    if _opik_var not in os.environ:
        os.environ[_opik_var] = userdata.get(_opik_var)

os.environ["OPIK_PROJECT_NAME"] = "omdena_sl_rag"
opik_logger = OpikLogger()
litellm.callbacks = [opik_logger]

In [None]:
# Copy the Opik credentials from Colab secrets into the process environment.
from google.colab import userdata

for _secret_name in ("OPIK_API_KEY", "OPIK_WORKSPACE"):
    os.environ[_secret_name] = userdata.get(_secret_name)

In [None]:
# setting Opik API Key
#os.environ["OPIK_API_KEY"] = (
#    os.environ["OPIK_API_KEY"]
#    if "OPIK_API_KEY" in os.environ
#    else getpass("OPIK API Key: ")
#)

#os.environ["OPIK_WORKSPACE"] = (
#    os.environ["OPIK_WORKSPACE"]
#    if "OPIK_WORKSPACE" in os.environ
#    else getpass("OPIK_WORKSPACE Key: ")
#)

In [None]:
# Register "opik" as LlamaIndex's global handler.
set_global_handler("opik")

# Load TRI Circular dataset

In [None]:
# Full path of the circulars CSV on the mounted Google Drive
CSV_NAME = "/content/drive/MyDrive/Omdena_SL/tri_circular.csv"
# Name of the CSV column that contains the main text of each circular
text_column = 'markdown_content'

In [None]:
# Load the CSV at CSV_NAME into a DataFrame and preview the first rows
df = pd.read_csv(CSV_NAME)
df.head()

Unnamed: 0,url,markdown_content,title,issue_date,markdown_path
0,https://www.tri.lk/wp-content/uploads/2024/03/...,# ADVISORY CIRCULAR\n\n# No.DM JHL 925VynvT\n\...,Protection of Tea from Blister Blight,2024-02-01,/content/drive/MyDrive/Omdena_SL/Circulars/Adv...
1,https://www.tri.lk/wp-content/uploads/2024/03/...,# ADVISORY CIRCULAR No.DM 2\n\nIssued in: Febr...,PROTECTION OF TEA FROM ROOT DISEASES,2024-02-01,/content/drive/MyDrive/Omdena_SL/Circulars/Adv...
2,https://www.tri.lk/wp-content/uploads/2024/03/...,# ADVISORY CIRCULAR No.DM 4\n\n# Issued in: Fe...,PROTECTION OF TEA FROM RED RUST DISEASE IN THE...,2024-02-01,/content/drive/MyDrive/Omdena_SL/Circulars/Adv...
3,https://www.tri.lk/wp-content/uploads/2024/03/...,# ADVISORY CIRCULAR No.DM 5\n\nIssued in: Febr...,PROTECTION OF TEA FROM STEM AND BRANCH CANKER ...,2024-02-01,/content/drive/MyDrive/Omdena_SL/Circulars/Adv...
4,https://www.tri.lk/wp-content/uploads/2024/03/...,# ADVISORY CIRCULAR No.DM 6\n\nIssued in: Febr...,PROTECTION OF TEA FROM COLLAR AND BRANCH CANKE...,2024-02-01,/content/drive/MyDrive/Omdena_SL/Circulars/Adv...


In [None]:
# Show column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   url               49 non-null     object
 1   markdown_content  49 non-null     object
 2   title             49 non-null     object
 3   issue_date        49 non-null     object
 4   markdown_path     49 non-null     object
dtypes: object(5)
memory usage: 2.0+ KB


In [None]:
# Convert the 'issue_date' column to datetime, then re-check the dtypes
df['issue_date'] = pd.to_datetime(df['issue_date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   url               49 non-null     object        
 1   markdown_content  49 non-null     object        
 2   title             49 non-null     object        
 3   issue_date        49 non-null     datetime64[ns]
 4   markdown_path     49 non-null     object        
dtypes: datetime64[ns](1), object(4)
memory usage: 2.0+ KB


In [None]:
# Preview the first rows again after the 'issue_date' conversion
df.head()

Unnamed: 0,url,markdown_content,title,issue_date,markdown_path
0,https://www.tri.lk/wp-content/uploads/2024/03/...,# ADVISORY CIRCULAR\n\n# No.DM JHL 925VynvT\n\...,Protection of Tea from Blister Blight,2024-02-01,/content/drive/MyDrive/Omdena_SL/Circulars/Adv...
1,https://www.tri.lk/wp-content/uploads/2024/03/...,# ADVISORY CIRCULAR No.DM 2\n\nIssued in: Febr...,PROTECTION OF TEA FROM ROOT DISEASES,2024-02-01,/content/drive/MyDrive/Omdena_SL/Circulars/Adv...
2,https://www.tri.lk/wp-content/uploads/2024/03/...,# ADVISORY CIRCULAR No.DM 4\n\n# Issued in: Fe...,PROTECTION OF TEA FROM RED RUST DISEASE IN THE...,2024-02-01,/content/drive/MyDrive/Omdena_SL/Circulars/Adv...
3,https://www.tri.lk/wp-content/uploads/2024/03/...,# ADVISORY CIRCULAR No.DM 5\n\nIssued in: Febr...,PROTECTION OF TEA FROM STEM AND BRANCH CANKER ...,2024-02-01,/content/drive/MyDrive/Omdena_SL/Circulars/Adv...
4,https://www.tri.lk/wp-content/uploads/2024/03/...,# ADVISORY CIRCULAR No.DM 6\n\nIssued in: Febr...,PROTECTION OF TEA FROM COLLAR AND BRANCH CANKE...,2024-02-01,/content/drive/MyDrive/Omdena_SL/Circulars/Adv...


In [None]:
# Number of rows in the DataFrame
len(df)

49

In [None]:
def create_metadata(df):
  """Build a per-file metadata lookup from the circulars DataFrame.

  Args:
    df: DataFrame with 'markdown_path', 'url', 'title' and 'issue_date' columns.

  Returns:
    dict mapping each row's 'markdown_path' to a dict holding that row's
    'url', 'title' and 'issue_date'. If the same path appears in several
    rows, the last row wins. An empty DataFrame yields an empty dict.
  """
  final_metadata = {}

  # The row index is not needed; printing it for every row only flooded the
  # cell output, so the debug print has been removed.
  for _, row in df.iterrows():
    final_metadata[row['markdown_path']] = {
        'url': row['url'],
        'title': row['title'],
        'issue_date': row['issue_date'],
    }

  return final_metadata


In [None]:
# Build the markdown_path -> metadata lookup that get_metadata() reads from later
METADATA = create_metadata(df)
METADATA

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48


{'/content/drive/MyDrive/Omdena_SL/Circulars/Advisory_Circular_DM1e_2024.md': {'url': 'https://www.tri.lk/wp-content/uploads/2024/03/Advisory_Circular_DM1e_2024.pdf',
  'title': 'Protection of Tea from Blister Blight',
  'issue_date': Timestamp('2024-02-01 00:00:00')},
 '/content/drive/MyDrive/Omdena_SL/Circulars/Advisory_Circular_DM2e_2024.md': {'url': 'https://www.tri.lk/wp-content/uploads/2024/03/Advisory_Circular_DM2e_2024.pdf',
  'title': 'PROTECTION OF TEA FROM ROOT DISEASES',
  'issue_date': Timestamp('2024-02-01 00:00:00')},
 '/content/drive/MyDrive/Omdena_SL/Circulars/Advisory_Circular_DM4e_2024.md': {'url': 'https://www.tri.lk/wp-content/uploads/2024/03/Advisory_Circular_DM4e_2024.pdf',
  'title': 'PROTECTION OF TEA FROM RED RUST DISEASE IN THE LOW COUNTRY',
  'issue_date': Timestamp('2024-02-01 00:00:00')},
 '/content/drive/MyDrive/Omdena_SL/Circulars/Advisory_Circular_DM5e_2024.md': {'url': 'https://www.tri.lk/wp-content/uploads/2024/03/Advisory_Circular_DM5e_2024.pdf',
  '

In [None]:
# Paths of the markdown files to load, read straight from the column
# (iterating over every row with iterrows just to pick one field is unnecessary).
local_files = df['markdown_path'].tolist()
local_files

['/content/drive/MyDrive/Omdena_SL/Circulars/Advisory_Circular_DM1e_2024.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/Advisory_Circular_DM2e_2024.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/Advisory_Circular_DM4e_2024.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/Advisory_Circular_DM5e_2024.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/Advisory_Circular_DM6e_2024.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/Advisory_Circular_DM7e_2024.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/TRISL_Advisory_Circular_HP01e_Jun2013.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/TRI_HP02e.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/TRI_HP03e.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/TRISL_Advisory_Circular_HP04e_Jun2013.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/TRI_Advisory_Ciculars_LU_01.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/TRI_Advisory_Ciculars_LU_02.md',
 '/content/drive/MyDrive/Omdena_SL/Circulars/TRI_PA01e.md',
 '/content/drive/MyD

In [None]:
# Load local md files with LlamaIndex
def get_metadata(filepath):
    """Return the metadata dict stored in METADATA for `filepath`.

    Passed to SimpleDirectoryReader as its `file_metadata` callback.

    Raises:
        KeyError: if `filepath` is not a key of METADATA.
    """
    return METADATA[filepath]

documents = SimpleDirectoryReader(
    input_files=local_files,
    file_metadata=get_metadata,
    file_extractor={'.md':FlatReader()}
)

docs = documents.load_data()
# Print a short summary rather than print(docs), which dumps the full text of
# every loaded document into the cell output.
print(f"Loaded {len(docs)} documents")




#### Set GROQ API key and choose the LLM and Embedding model to use:

In [None]:
# Copy the Groq and KDB.AI credentials from Colab secrets into the environment
from google.colab import userdata

for _secret_name in ("GROQ_API_KEY", "KDBAI_API_KEY", "KDBAI_ENDPOINT"):
    os.environ[_secret_name] = userdata.get(_secret_name)

In [None]:
# setting Groq API Key
#os.environ["GROQ_API_KEY"] = (
#    os.environ["GROQ_API_KEY"]
#    if "GROQ_API_KEY" in os.environ
#    else getpass("GROQ API Key: ")
#)

In [None]:
#https://huggingface.co/BAAI/bge-large-en-v1.5
#EMBEDDING_MODEL  = "BAAI/bge-small-en-v1.5"
# Model identifiers: the embedding model name is passed to FastEmbedEmbedding,
# the generation model name is passed to the Groq LLM wrapper.
EMBEDDING_MODEL  = "BAAI/bge-large-en-v1.5"
GENERATION_MODEL = 'llama-3.1-8b-instant'

# The LLM is created with temperature=0.0 and the Groq key from the environment.
llm = Groq(model=GENERATION_MODEL, api_key=os.environ['GROQ_API_KEY'], temperature=0.0)
embed_model = FastEmbedEmbedding(model_name=EMBEDDING_MODEL)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

# Create KDB.AI session and table

In [None]:
# vector DB imports
import time

In [None]:
# Resolve the KDB.AI endpoint and API key: use the environment variable when it
# is set, otherwise prompt for the value interactively.
if "KDBAI_ENDPOINT" in os.environ:
    KDBAI_ENDPOINT = os.environ["KDBAI_ENDPOINT"]
else:
    KDBAI_ENDPOINT = input("KDB.AI endpoint: ")

if "KDBAI_API_KEY" in os.environ:
    KDBAI_API_KEY = os.environ["KDBAI_API_KEY"]
else:
    KDBAI_API_KEY = getpass("KDB.AI API key: ")

# Open the KDB.AI session with the resolved endpoint and key.
session = kdbai.Session(api_key=KDBAI_API_KEY, endpoint=KDBAI_ENDPOINT)

Create the schema for your KDB.AI table

!!! Note: The 'dims' parameter in the embedding column must reflect the output dimensions of the embedding model you choose.

In [None]:
# Column definitions for the KDB.AI table. 'url', 'title' and 'issue_date' use the
# same names as the per-document metadata keys produced by create_metadata().
schema = [
    {"name": "document_id", "type": "bytes"},
    {"name": "text", "type": "bytes"},
    {"name": "embeddings", "type": "float32s"},
    {"name": "url", "type": "str"},
    {"name": "title", "type": "str"},
    {"name": "issue_date", "type": "datetime64[ns]"},
]


# EMBEDDING_MODEL  = "BAAI/bge-large-en-v1.5" dimension is 1024
# https://kdb.ai/learning-hub/articles/indexing-basics/
# Flat index named "flat_index" over the "embeddings" column, using the L2 metric.
# "dims" is hard-coded to 1024 and must be changed if EMBEDDING_MODEL changes.
indexFlat = {
    "name": "flat_index",
    "type": "flat",
    "column": "embeddings",
    "params": {"dims": 1024, "metric": "L2"},
}

## Setup LlamaIndex RAG pipeline using KDB.AI vector store

In [None]:
KDBAI_TABLE_NAME = "rag"
database = session.database("default")

# Start clean: if a table with this name already exists, drop it first
stale_table = next(
    (candidate for candidate in database.tables if candidate.name == KDBAI_TABLE_NAME),
    None,
)
if stale_table is not None:
    stale_table.drop()

# Create the table with the schema and flat index defined earlier
table = database.create_table(
    KDBAI_TABLE_NAME, indexes=[indexFlat], schema=schema
)

In [None]:
# Assign the LLM and embedding model to LlamaIndex's global Settings object
Settings.llm = llm
Settings.embed_model = embed_model


In [None]:
%%time

vector_store = KDBAIVectorStore(table)
print("vector_store:",vector_store)

storage_context = StorageContext.from_defaults(vector_store=vector_store)
print("storage_context:",storage_context)

index = VectorStoreIndex.from_documents(
    docs,
    use_async=True,  # Async processing
    storage_context=storage_context,  # Your storage context that defines where the vector store is saved
    transformations=[SentenceSplitter(chunk_size=2048, chunk_overlap=0)]
)
print(index)



vector_store: stores_text=True is_embedding_query=True flat_metadata=True hybrid_search=False batch_size=100
storage_context: StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x7818a5e2f760>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x7818a5e2e1d0>, vector_stores={'default': KDBAIVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, hybrid_search=False, batch_size=100), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x7818a5e2e170>, property_graph_store=None)
<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x7818a5e2d210>
CPU times: user 6min 1s, sys: 884 ms, total: 6min 2s
Wall time: 6min 13s


# Pipelines
### Not working: the pipeline approach does not pick up the KDB.AI index name even though it is set, so this notebook uses `VectorStoreIndex.from_documents` instead

In [None]:
# Display the name configured for the KDB.AI index
indexFlat['name']

'flat_index'

In [None]:
"""
%%time

vector_store1 = KDBAIVectorStore(table,  schema=schema, index=indexFlat['name'])
print("vector_store1:",vector_store1)

#storage_context = StorageContext.from_defaults(vector_store=vector_store1)
#print("storage_context:",storage_context)

# Initialize the cache
cache = IngestionCache(cache=SimpleCache())


# Create the ingestion pipeline with transformations
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=2048, chunk_overlap=0),
        embed_model,
    ],
    vector_store=vector_store1,  # Use vector store from storage context
    cache=cache,
)


# Run the pipeline
nodes = pipeline.run(documents=docs)

# Print the number of ingested nodes
print(f"Ingested {len(nodes)} nodes.")
"""

vector_store1: stores_text=True is_embedding_query=True flat_metadata=True hybrid_search=False batch_size=100
Ingested 62 nodes.
CPU times: user 5min 59s, sys: 843 ms, total: 6min
Wall time: 6min 12s


In [None]:
#Persist the pipeline state
#pipeline.persist(persist_dir="./pipeline_storage")
# Persist the storage context data
#storage_context.persist(persist_dir="./storage")


    Persistence: Both the pipeline and storage context are persisted to disk using their respective persist methods.
    Loading: The StorageContext and IngestionPipeline are reloaded from disk, allowing you to resume operations with the same configuration and data.


In [None]:
"""
#https://docs.llamaindex.ai/en/stable/module_guides/storing/save_load/
# --- Loading Part ---
# Load the storage context from disk
loaded_storage_context = StorageContext.from_defaults(persist_dir="./storage")
# Load the pipeline state
loaded_pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=2048, chunk_overlap=0),
        embed_model,
    ],
    vector_store=loaded_storage_context.vector_store,  # Use loaded vector store
    cache=cache,
)
# Load the pipeline state from disk
loaded_pipeline.load(persist_dir="./pipeline_storage")
# Now you can use loaded_pipeline and loaded_storage_context as needed
"""

The differences between SimpleKVCache, StorageContext.persist, and Pipeline.persist are based on their roles and functionalities within the LlamaIndex framework:

    SimpleKVCache (the LlamaIndex class is `SimpleKVStore`, imported in this notebook as `SimpleCache`):
        Purpose: SimpleKVCache is an in-memory key-value store used to cache intermediate results or data during processing. It helps in reducing redundant computations by storing and retrieving data quickly.
        Scope: It is typically used within the context of data processing pipelines to cache results that can be reused, thus improving performance.
        Persistence: SimpleKVCache keeps its data in memory and does not write it to disk automatically. It is designed for temporary storage, so unless the store is explicitly persisted, its data is lost when the application is restarted.

    StorageContext.persist:
        Purpose: This method is used to persist the state of storage components managed by the StorageContext, such as vector stores, document stores, and other data storage elements.
        Scope: It focuses on saving the data and state of these storage components to ensure data durability across application restarts.
        Use Case: Ideal for persisting data that needs to be durable, such as indexed documents or vectors, allowing them to be reloaded later.

    Pipeline.persist:
        Purpose: This method is used to persist the state of an ingestion pipeline, including its configuration and any intermediate states or results that need to be saved for later use.
        Scope: It involves saving the state of the transformations, cache, and any other components directly associated with the pipeline's operation.
        Use Case: Useful when you want to save the entire pipeline setup and its state, so you can resume processing or debugging without reinitializing everything from scratch.

In summary, SimpleKVCache is for temporary in-memory caching, StorageContext.persist is for persisting data managed by storage components, and Pipeline.persist is for saving the operational state of the pipeline itself. Each serves a distinct purpose within the data processing and management workflow.

## Setup the LlamaIndex Query Engine

Traces can be observed on Comet's Opik after executing the next cell

https://docs.llamaindex.ai/en/stable/examples/observability/OpikCallback/

In [None]:
"""
%%time
# Create a VectorStoreIndex from the vector store
index = VectorStoreIndex.from_vector_store(vector_store=vector_store1)
# Set up a query engine
query_engine = index.as_query_engine()
# Define your query string
query_str = "sumarize all regulatory changes that happened in 2022 related to tea"
# Perform the query
response = query_engine.query(query_str)
# Print the response
print(str(response))
"""

ValueError: Could not run the search. Please provide KDBAI index name.

In [None]:
# The KDB.AI vector store has to be told which table index to search; without it
# queries fail with "Could not run the search. Please provide KDBAI index name."
# `vector_store_kwargs` is forwarded to the vector store on every query, so the
# index name defined in `indexFlat` is passed through here.
query_engine = index.as_query_engine(
    vector_store_kwargs={"index": indexFlat["name"]},
)

In [None]:
%%time

# Send one question to the query engine and print the text of its answer.
# NOTE(review): "sumarize" in the prompt looks like a typo for "summarize";
# the prompt string is left unchanged here — confirm before editing it.
result = query_engine.query(
    """
    sumarize all regulatory changes that happened in 2022 related to tea
    """
)
print(result.response)

ValueError: Could not run the search. Please provide KDBAI index name.

## Delete the KDB.AI Table

Once finished with the table, it is best practice to drop it.

In [None]:
# Drop the KDB.AI table referenced by `table`
table.drop()

In [None]:
# Clear the ingestion cache, if there is one. `cache` is only assigned inside the
# disabled (string-quoted) ingestion-pipeline cell, so on a normal run the name
# does not exist and an unconditional cache.clear() raises NameError.
if "cache" in globals():
    cache.clear()
