<a href="https://colab.research.google.com/github/krishnannarayanaswamy/GenAI-Relevance-LLM-demo/blob/main/PDF_RAG_Astra_Q%26A_Sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install the Python packages that this notebook depends on.

In [None]:
!python3 -m pip install pandas
!python3 -m pip install openai
!python3 -m pip install langchain
!python3 -m pip install cassandra-driver
!python3 -m pip install tiktoken
!python3 -m pip install cassio
!python3 -m pip install PyPDF





# New Section

Importing the packages
การนำเข้าแพ็คเกจ

In [None]:
import os
from getpass import getpass

import cassandra
import numpy as np
import pandas as pd
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Cassandra
print("Successfully Imported")


Successfully Imported


In [None]:
# Set up the AstraDB connection
import cassandra
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
def get_astra(keyspace='lab', secure_connect_bundle=None, client_id=None, client_secret=None):
    """Open a session against an Astra DB cluster.

    Args:
        keyspace: Keyspace name handed back to the caller. The session itself is
            not bound to it; callers pass it on explicitly.
        secure_connect_bundle: Path to the Astra secure connect bundle. Falls back
            to the ``ASTRA_DB_SECURE_BUNDLE_PATH`` environment variable, then to
            the placeholder text that must be edited by hand.
        client_id: Astra client id. Falls back to ``ASTRA_DB_CLIENT_ID``, then to
            the placeholder text.
        client_secret: Astra client secret. Falls back to
            ``ASTRA_DB_CLIENT_SECRET``, then to the placeholder text.

    Returns:
        A ``(session, keyspace)`` tuple.
    """
    bundle_path = secure_connect_bundle or os.environ.get(
        'ASTRA_DB_SECURE_BUNDLE_PATH', '<path to astra secure connect bundle>'
    )
    user = client_id or os.environ.get('ASTRA_DB_CLIENT_ID', '<your client id>')
    secret = client_secret or os.environ.get('ASTRA_DB_CLIENT_SECRET', '<your client password>')

    cloud_config = {'secure_connect_bundle': bundle_path}
    auth_provider = PlainTextAuthProvider(user, secret)
    # Request native protocol v4 explicitly. When the driver was left to negotiate,
    # the server rejected its first attempt with 'Beta version of the protocol used
    # (5/v5-beta), but USE_BETA flag is unset' and an ERROR line was logged.
    cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider, protocol_version=4)
    session = cluster.connect()
    return session, keyspace



Loading the PDF files as embeddings into AstraDB
กำลังโหลดไฟล์โหลด PDF เป็นการฝังไปยัง AstraDB

In [None]:
# Upload the PDF chunks together with their metadata.
SOURCE_DIR = "/content"
FILE_SUFFIX = ".pdf"

# Use a key already present in the environment; otherwise prompt for it without
# echoing, so the secret is never written into the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("Your OpenAI API key: ")
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# One loader per regular file in SOURCE_DIR whose name ends with FILE_SUFFIX.
# Sorting keeps the ingestion order independent of os.listdir()'s ordering.
pdf_paths = sorted(
    path
    for path in (os.path.join(SOURCE_DIR, name) for name in os.listdir(SOURCE_DIR))
    if os.path.isfile(path) and path.endswith(FILE_SUFFIX)
)
pdf_loaders = [PyPDFLoader(path) for path in pdf_paths]

# set up the vector store
session, keyspace = get_astra()
vectorstore = Cassandra(
    embedding=embeddings,
    session=session,
    keyspace=keyspace,
    table_name="firsttable",
)

# split each PDF into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80,
)
documents = [
    doc
    for loader in pdf_loaders
    for doc in loader.load_and_split(text_splitter=text_splitter)
]

# Fail with an explicit message instead of an opaque tuple-unpacking error
# when there is nothing to upload.
if not documents:
    raise ValueError(
        f"No content to upload: found {len(pdf_paths)} '{FILE_SUFFIX}' file(s) in {SOURCE_DIR!r}."
    )

texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]
vectorstore.add_texts(texts=texts, metadatas=metadatas)
index = VectorStoreIndexWrapper(vectorstore=vectorstore)
print("Uploaded")

ERROR:cassandra.connection:Closing connection <AsyncoreConnection(137623622127568) 2a5f7bcf-0f64-44f3-952e-3b2eb25f7e78-us-east1.db.astra.datastax.com:29042:bb2b3874-d0a5-4737-8953-999977fe9989> due to protocol error: Error from server: code=000a [Protocol error] message="Beta version of the protocol used (5/v5-beta), but USE_BETA flag is unset"


Uploaded


Querying the AstraDB vector store
การสืบค้นจากร้านค้าเวกเตอร์ AstraDB

In [None]:
# Question used by every retrieval and question-answering cell below.
QUESTION = 'What is this document about?'

In [None]:
# Ask the vector store for two matches using the 'mmr' search type, then list
# each match's text next to its position in the result.
matchesMMR = vectorstore.search(QUESTION, search_type='mmr', k=2)
for position in range(len(matchesMMR)):
    chunk_text = matchesMMR[position].page_content
    print(f'[{position:2}]: "{chunk_text}"')



[ 0]: "MIT Technology Review Insights
www.technologyreview.com
insights@technologyreview.com"
[ 1]: "architecture has an open-source technology 
component. Using open source is not mandatory, but 
Rakuten Card’s Ameen recommends developing initial 
data pipelines with open-source technologies, because 
not only does this allow organizations to avoid vendor 
lock-in, it also lets them develop in-house expertise 
and establish a baseline for expectations. In addition, 
whether to use open-source technology will depend 
on the criticality of the data and the need for support."


In [None]:
# Same question, this time with the 'similarity' search type and five matches.
matchesSim = vectorstore.search(
    QUESTION,
    search_type='similarity',
    k=5,
)
for rank, match in enumerate(matchesSim):
    snippet = match.page_content
    print(f'[{rank:2}]: "{snippet}"')

Use an LLM to answer the queries: choose one of the LLM providers below.


In [None]:
# Pick the LLM provider and collect its credentials into environment variables.
llmProvider = 'OpenAI'  # 'GCP_VertexAI', 'Azure_OpenAI'
from getpass import getpass
if llmProvider == 'OpenAI':
    # getpass keeps the secret out of the cell output.
    apiSecret = getpass(f'Your secret for LLM provider "{llmProvider}": ')
    os.environ['OPENAI_API_KEY'] = apiSecret
elif llmProvider == 'GCP_VertexAI':
    # we need a json file
    print(f'Please upload your Service Account JSON for the LLM provider "{llmProvider}":')
    # Imported here because the module is only needed for this provider.
    from google.colab import files
    uploaded = files.upload()
    if uploaded:
        # Use the first uploaded file as the credentials file.
        vertexAIJsonFileTitle = next(iter(uploaded))
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.join(os.getcwd(), vertexAIJsonFileTitle)
    else:
        raise ValueError(
            'No file uploaded. Please re-run the cell.'
        )
elif llmProvider == 'Azure_OpenAI':
    # a few parameters must be input
    # The API key is a secret: read it with getpass, like the OpenAI branch,
    # so it is not echoed into the notebook output.
    apiSecret = getpass(f'Your API Key for LLM provider "{llmProvider}": ')
    os.environ['AZURE_OPENAI_API_KEY'] = apiSecret
    apiBase = input('The "Base URL" for your models (e.g. "https://YOUR-RESOURCE-NAME.openai.azure.com"): ')
    os.environ['AZURE_OPENAI_API_BASE'] = apiBase
    apiLLMDepl = input('The name of your LLM Deployment: ')
    os.environ['AZURE_OPENAI_LLM_DEPLOYMENT'] = apiLLMDepl
    apiLLMModel = input('The name of your LLM Model (e.g. "gpt-4"): ')
    os.environ['AZURE_OPENAI_LLM_MODEL'] = apiLLMModel
    apiEmbDepl = input('The name for your Embeddings Deployment: ')
    os.environ['AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT'] = apiEmbDepl
    apiEmbModel = input('The name of your Embedding Model (e.g. "text-embedding-ada-002"): ')
    os.environ['AZURE_OPENAI_EMBEDDINGS_MODEL'] = apiEmbModel

    # The following is probably not going to change for some time...
    os.environ['AZURE_OPENAI_API_VERSION'] = '2023-03-15-preview'
else:
    raise ValueError('Unknown/unsupported LLM Provider')

Your secret for LLM provider "OpenAI":  ········


In [None]:
import os
# creation of the LLM resources


# Build `llm` and `myEmbedding` for the selected provider. Unsupported values
# are rejected first; each provider's classes are imported only in its branch.
if llmProvider not in ('GCP_VertexAI', 'OpenAI', 'Azure_OpenAI'):
    raise ValueError('Unknown LLM provider.')

if llmProvider == 'OpenAI':
    os.environ['OPENAI_API_TYPE'] = 'open_ai'
    from langchain.llms import OpenAI
    from langchain.embeddings import OpenAIEmbeddings
    llm = OpenAI(temperature=0)
    myEmbedding = OpenAIEmbeddings()
    print('LLM+embeddings from OpenAI')
elif llmProvider == 'Azure_OpenAI':
    os.environ['OPENAI_API_TYPE'] = 'azure'
    # Copy each AZURE_OPENAI_* setting onto its OPENAI_* counterpart.
    for target, source in (
        ('OPENAI_API_VERSION', 'AZURE_OPENAI_API_VERSION'),
        ('OPENAI_API_BASE', 'AZURE_OPENAI_API_BASE'),
        ('OPENAI_API_KEY', 'AZURE_OPENAI_API_KEY'),
    ):
        os.environ[target] = os.environ[source]
    from langchain.llms import AzureOpenAI
    from langchain.embeddings import OpenAIEmbeddings
    llm = AzureOpenAI(
        temperature=0,
        model_name=os.environ['AZURE_OPENAI_LLM_MODEL'],
        engine=os.environ['AZURE_OPENAI_LLM_DEPLOYMENT'],
    )
    myEmbedding = OpenAIEmbeddings(
        model=os.environ['AZURE_OPENAI_EMBEDDINGS_MODEL'],
        deployment=os.environ['AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT'],
    )
    print('LLM+embeddings from Azure OpenAI')
else:
    # Only 'GCP_VertexAI' can reach this branch after the check above.
    from langchain.llms import VertexAI
    from langchain.embeddings import VertexAIEmbeddings
    llm = VertexAI()
    myEmbedding = VertexAIEmbeddings()
    print('LLM+embeddings from Vertex AI')

LLM+embeddings from OpenAI


In [None]:

# Ask the question through the index wrapper, using the selected LLM, and show the answer.
indexAnswer = index.query(QUESTION, llm=llm)
print(indexAnswer)

 This document is about the terms and conditions of a Health Companion Health Insurance Plan.


In [None]:
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
# Retriever over the vector store: 'mmr' search type, four results per query.
retrieverMMR = vectorstore.as_retriever(search_type='mmr', search_kwargs={'k': 4})

# Wire the retriever and the LLM into a "RetrievalQA" chain.
chainMMR = RetrievalQA.from_chain_type(llm=llm, retriever=retrieverMMR)

# Run the chain on the question and print its answer.
responseMMR = chainMMR.run(QUESTION)
print(responseMMR)

 This document is about the terms and conditions of a Health Companion Health Insurance Plan.
