In [None]:
!pip install -q cassio datasets langchain openai tiktoken PyPDF2

In [None]:
import os
from dotenv import load_dotenv

# Load settings from a local .env file (python-dotenv) so that the
# os.getenv lookups in the next cell can find them.
load_dotenv()

In [None]:
# Connection settings are read from the process environment. A variable that
# is not set yields None here instead of raising.
ASTRA_DB_ID, ASTRA_DB_TOKEN, OPENAI_KEY = (
    os.environ.get(variableName)
    for variableName in ("ASTRA_DB_ID", "ASTRA_DB_TOKEN", "OPENAI_KEY")
)

In [None]:
import cassio
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from datasets import load_dataset
from PyPDF2 import PdfReader


In [None]:
# Open the source PDF. The path is relative, so the file must sit in the
# notebook's working directory.
pdfReader = PdfReader('budget_speech.pdf')

In [None]:
# Read the text of every page of the PDF into a single string.
# Pages whose extraction result is falsy (e.g. an empty string) are skipped,
# and the remaining page texts are concatenated with no separator.
pageTexts = (page.extract_text() for page in pdfReader.pages)
text = ''.join(pageText for pageText in pageTexts if pageText)

# Show only a short preview; echoing the whole document bloats the notebook.
text[:1000]

In [None]:
# Initialise cassio with the Astra DB token and database id that were read
# from the environment earlier in the notebook.
cassio.init(token=ASTRA_DB_TOKEN, database_id=ASTRA_DB_ID)

In [None]:
# The completion model and the embedding model are both configured with the
# same OpenAI key.
openaiAuth = {"openai_api_key": OPENAI_KEY}
llm = OpenAI(**openaiAuth)
embeding = OpenAIEmbeddings(**openaiAuth)

### LangChain vector store

In [None]:
# Vector store over the "demo" table, using the OpenAI embedding model above.
# session and keyspace are passed as None; presumably the store then falls
# back to the connection set up by cassio.init -- TODO confirm against the
# LangChain Cassandra vector store documentation.
astraVectorStore = Cassandra(embedding=embeding, table_name="demo", session=None, keyspace=None)

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split the extracted text on newlines into chunks. Sizes are measured with
# len, i.e. in characters: at most 800 per chunk, with 200 characters of
# overlap between consecutive chunks.
# The keyword is `separator`; the previous spelling `seperator` is not a
# parameter of CharacterTextSplitter and made the constructor fail.
textSplitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)

texts = textSplitter.split_text(text)

In [None]:
# Preview the chunks that get inserted (the same slice is passed to add_texts below).
texts[:50]

### Load the text chunks into the vector store

In [None]:
# Only the first 50 chunks are added to the store.
chunkBatch = texts[:50]
astraVectorStore.add_texts(chunkBatch)
print("Inserted %i headlines" % len(chunkBatch))

# NOTE: from here on `astraVectorStore` names the index wrapper, not the raw
# store; the underlying store stays reachable as `astraVectorStore.vectorstore`.
# Re-running this cell without re-creating the store wraps the wrapper again.
astraVectorStore = VectorStoreIndexWrapper(vectorstore=astraVectorStore)

### QA Cycle

In [None]:
# Interactive question/answer loop. Type 'exit' to stop; blank input re-prompts.
firstQuestion = True

# `astraVectorStore` is the VectorStoreIndexWrapper at this point. The wrapper
# answers questions via .query(), but similarity search lives on the wrapped
# store, so keep a handle to it.
underlyingStore = astraVectorStore.vectorstore

while True:
    # The first prompt is worded differently from follow-up prompts.
    if firstQuestion:
        promptText = "\nAsk me a question (or type 'exit' to exit): "
    else:
        promptText = "\nWhat else would you like to know? (or type 'exit' to exit): "
    query = input(promptText).strip()

    if query.lower() == 'exit':
        break
    if not query:
        continue

    firstQuestion = False
    answer = astraVectorStore.query(query, llm=llm).strip()

    print("\nQ: \"%s\"" % query)
    # Closing quote added so the answer is quoted symmetrically like the question.
    print("\nA: \"%s\"\n" % answer)

    # List up to 4 matching chunks with their scores, truncated to 84 characters.
    print("\nFirst Document By Relevance: ")
    for doc, score in underlyingStore.similarity_search_with_score(query, k=4):
        print("[%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))
    