In [2]:
import os
from typing import Optional

import cassio
from dotenv import load_dotenv
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.text_splitter import CharacterTextSplitter
from langchain_astradb import AstraDBVectorStore
from langchain_openai import OpenAI, OpenAIEmbeddings
from PyPDF2 import PdfReader

# Load environment variables via python-dotenv; this runs before the
# get_env_var() lookups below so they can see the loaded values.
load_dotenv()

# Environment variables (helper function)
def get_env_var(key: str, default: Optional[str] = None) -> str:
    """Return the value of environment variable *key*.

    An unset or empty variable is treated as missing. In that case
    *default* is returned when one was supplied (any value other than
    ``None``, including the empty string); otherwise the lookup fails.

    Args:
        key: Name of the environment variable to read.
        default: Fallback returned when the variable is unset or empty.

    Returns:
        The variable's value, or *default* when the variable is missing.

    Raises:
        ValueError: If the variable is unset or empty and no default was
            given.
    """
    value = os.environ.get(key)
    if value:
        return value
    # Compare against None rather than truthiness so an explicit empty-string
    # default is honoured instead of being mistaken for "no default".
    if default is not None:
        return default
    raise ValueError(f"Environment variable {key} not set")

# Connection settings; each required variable raises ValueError if missing.
ASTRA_DB_APPLICATION_TOKEN = get_env_var("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_API_ENDPOINT = get_env_var("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_NAMESPACE = get_env_var("ASTRA_DB_NAMESPACE")
OPENAI_API_KEY = get_env_var("OPENAI_API_KEY")
# The database id is configurable like the other settings. The default is the
# id this script was written against, so setups without ASTRA_DB_ID behave
# exactly as before.
ASTRA_DB_ID = get_env_var("ASTRA_DB_ID", "7d55f754-6018-46c5-8034-0e271ba1f33f")

# Initialize the Astra DB connection
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

# Initialize LLM and embeddings
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialize AstraDBVectorStore
astra_vector_store = AstraDBVectorStore(
    embedding=embedding,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    namespace=ASTRA_DB_NAMESPACE,
    token=ASTRA_DB_APPLICATION_TOKEN,
    collection_name="qa_mini_demo_two",
)

# PDF file reading logic
def extract_pdf_text(pdf_path: str) -> str:
    """Concatenate the extracted text of every page in the PDF at *pdf_path*.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped. Page texts are joined with no separator.
    """
    page_texts = (page.extract_text() for page in PdfReader(pdf_path).pages)
    return "".join(chunk for chunk in page_texts if chunk)

raw_text = extract_pdf_text('budget_speech.pdf')

# Split the document on newlines into overlapping chunks for vector indexing
text_splitter = CharacterTextSplitter(
    separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
)
texts = text_splitter.split_text(raw_text)

# Only the first 50 chunks are added to the vector store
chunks_to_insert = texts[:50]
astra_vector_store.add_texts(chunks_to_insert)
print(f"Inserted {len(chunks_to_insert)} text chunks.")

# Wrap the store in an index object that query_loop() queries with the LLM
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

# Question answering loop
def query_loop() -> None:
    """Interactively answer questions against the indexed document.

    Repeatedly prompts on stdin, answers each question through
    ``astra_vector_index`` using ``llm``, and prints the top 4 matches
    returned by ``astra_vector_store.similarity_search_with_score`` along
    with their scores. The loop ends when the user types ``quit``
    (case-insensitive) or stdin reaches end-of-file. Blank input is ignored
    and the prompt is shown again.
    """
    first_question = True
    while True:
        if first_question:
            prompt = "\nEnter your question (or type 'quit' to exit): "
        else:
            prompt = "\nWhat's your next question (or type 'quit' to exit): "

        try:
            query_text = input(prompt).strip()
        except EOFError:
            # Closed or exhausted stdin (Ctrl-D, piped input) means there is
            # nothing more to ask; exit cleanly instead of with a traceback.
            break

        if query_text.lower() == 'quit':
            break

        if not query_text:
            continue

        # Only a real question switches the wording of the prompt.
        first_question = False

        # Query for an answer
        print(f"\nQUESTION: \"{query_text}\"")
        answer = astra_vector_index.query(query_text, llm=llm).strip()
        print(f"ANSWER: \"{answer}\"\n")

        # Show top 4 relevant documents, each truncated to 84 characters
        print("FIRST DOCUMENTS BY RELEVANCE:")
        matches = astra_vector_store.similarity_search_with_score(query_text, k=4)
        for doc, score in matches:
            print(f"    [{score:.4f}] \"{doc.page_content[:84]} ...\"")

# Run the query loop: starts the interactive question-answering session,
# which waits on input() for each question.
query_loop()


Inserted 50 text chunks.

QUESTION: "What is the current GDP?"
ANSWER: "7 per cent."

FIRST DOCUMENTS BY RELEVANCE:
    [0.8953] "estimated to be at 7 per cent. It is notable that this is the highest among all 
the ..."
    [0.8948] "With the theme of ‘ Vasudhaiva Kutumbakam’ , we are steering an 
ambitious, people-c ..."
    [0.8942] "multiplier impact on growth and employment. After the subdued period of 
the pandemi ..."
    [0.8907] "February 1, 2023 
Hon’ble Speaker,  
 I present the Budget for 2023-24. This is the  ..."

QUESTION: "How much the agriculture target will be increased to and what the focus will be"
ANSWER: "The agriculture target will be increased to 20 lakh crore and the focus will be on animal husbandry, dairy, and fisheries."

FIRST DOCUMENTS BY RELEVANCE:
    [0.9325] "of Millet Research, Hyderabad  will be supported as the Centre of Excellence 
for sh ..."
    [0.9205] "Agriculture Accelerator Fund  
17. An Agriculture Accelerator Fund will be set-up to ..."
   