In [19]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.llms import openai
from langchain_openai import OpenAI
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.embeddings import OpenAIEmbeddings

from datasets import load_dataset

# CassIO is the engine powering the Astra DB integration in LangChain; it also helps set up the database connection
import cassio

In [8]:
import os
from PyPDF2 import PdfReader
from dotenv import load_dotenv
# Load environment variables via python-dotenv so that the os.getenv()
# lookups in later cells can find the Astra DB and OpenAI credentials.
load_dotenv()

True

In [9]:
# Path of the PDF that is indexed, relative to the notebook's working directory.
PDF_PATH = 'budget_speech.pdf'

# Open the PDF; its pages are read in the next cell.
pdfreader = PdfReader(PDF_PATH)

In [None]:
from typing_extensions import Concatenate
# Read the text of every page of the PDF and concatenate it into one string.
# Pages for which extract_text() returns nothing (None or '') are skipped.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(text for text in page_texts if text)

In [10]:
# Initialize the connection to the database. Both credentials are read from
# the environment and are None when the variable is not set.
ASTRA_DB_APPLICATION_TOKEN = os.environ.get('ASTRA_DB_APPLICATION_TOKEN')
database_id = os.environ.get('ASTRA_DB_ID')
cassio.init(
    database_id=database_id,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

## Create the embeddings and LLM objects for later use


In [None]:
# Read the OpenAI API key from the environment (None if it is not set) and
# build the embedding model that is passed to the vector store.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
embedding = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

  embedding = OpenAIEmbeddings(openai_api_key = OPENAI_API_KEY)


In [13]:
# Vector store over the 'qa_mini_demo' table, using the embedding model built
# above. session and keyspace are passed as None -- presumably so the store
# uses the connection registered by cassio.init(); confirm against the
# LangChain Cassandra documentation.
astra_vector_store = Cassandra(
    table_name='qa_mini_demo',
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [14]:
from langchain.text_splitter import CharacterTextSplitter
# Chunking parameters, measured with len() (i.e. in characters).
CHUNK_SIZE = 800
CHUNK_OVERLAP = 200

# Split the extracted PDF text on newlines into overlapping chunks.
text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [15]:
# Preview only the first few chunks; displaying all 50 floods the cell output.
texts[:3]

['GOVERNMENT OF INDIA\nBUDGET 2025-2026\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2025 \nCONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nBudget Theme  1 \nAgriculture as the 1st engine  3 \nMSMEs as the 2nd engine  6 \nInvestment as the 3rd engine  8 \nA. Investing in People  8 \nB. Investing in  the Economy  10 \nC. Investing in Innovation  14 \nExports as the 4th engine  15 \nReforms as the Fuel  16 \nFiscal Policy  18 \n \n \nPART – B \nIndirect taxes  20 \nDirect Taxes   23 \n \nAnnexure to Part -A 29 \nAnnexure to Part -B 31 \n \n   \n \nBudget 202 5-2026 \n \nSpeech of  \nNirmala Sitharaman  \nMinister of Finance  \nFebruary 1 , 202 5 \nHon’ble Speaker,  \n I present the Budget for 2025 -26. \nIntroduction  \n1. This Budget continues our Government ’s efforts to:  \na) accelerate growth,',
 'Minister of Finance  \nFebruary 1 , 202 5 \nHon’ble Speaker,  \n I present the Budget for 2025 -26. \nIntroduction  \n1. This Budget continues our Government

In [16]:
# Store only the first 50 chunks in the vector store and report how many
# were passed in (the message calls them "headlines"; they are text chunks).
chunks_to_index = texts[:50]
astra_vector_store.add_texts(chunks_to_index)
print("Inserted %i headlines." % len(chunks_to_index))

# Wrap the store in the index helper whose query() method is used by the
# question-answering loop.
astra_vector_index = VectorStoreIndexWrapper(
    vectorstore=astra_vector_store,
)


Inserted 50 headlines.


In [20]:
# LLM handed to the index wrapper's query() call to answer questions.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
)

In [None]:
# Interactive question-answering loop.
#
# Each round reads a question from stdin, asks the index wrapper to answer it
# with the LLM, and then prints the four closest stored chunks together with
# their similarity scores. Typing "quit" (any casing) ends the loop; empty
# input is ignored and the prompt is shown again.

first_question = True
while True:
    # Only the wording of the prompt differs between the first and later rounds.
    if first_question:
        prompt = "\nEnter your question (or type quit to exit): "
    else:
        prompt = "\nWhat's Your next question: "

    try:
        query_text = input(prompt).strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the cell was interrupted while waiting for
        # input: end the session cleanly instead of leaving a traceback.
        break

    if query_text.lower() == 'quit':
        break
    if query_text == '':
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm = llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        # Show the score and only the first 84 characters of each chunk.
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))
    


QUESTION: "what is the current gdp"
ANSWER: "I do not have enough information to answer that question. The provided context does not mention the current GDP (gross domestic product) of India."

FIRST DOCUMENTS BY RELEVANCE:
    [0.8940] "geopolitical headwinds suggest lower  global economic growth over the 
medium term.  ..."
    [0.8804] "which needs high skills and talent. Our youth have both. Our Government  will 
suppo ..."
    [0.8797] "Minister of Finance  
February 1 , 202 5 
Hon’ble Speaker,  
 I present the Budget f ..."
    [0.8785] "blended finance facility with contribution from the Government , banks and 
private  ..."

QUESTION: "how much the agritcultural target will be increased to and what the focus will be"
ANSWER: "The agricultural target will be increased by 1.7 crore farmers and the focus will be on rural women, young farmers, rural youth, marginal and small farmers, and landless families."

FIRST DOCUMENTS BY RELEVANCE:
    [0.9178] "rural areas so that migration