In [1]:
import hashlib

import cassio
from datasets import load_dataset
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.vectorstores.cassandra import Cassandra
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``, failing loudly if unset.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The non-empty string value of the variable.

    Raises:
        RuntimeError: If the variable is missing or empty. Without this check
            a missing variable surfaces as an opaque ``TypeError`` from
            ``os.environ[...] = None`` or as a ``None`` credential further down.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "define it in the environment or in the .env file."
        )
    return value


# Publish the API credentials under the variable names written below, and keep
# the Astra DB credentials as module-level constants for the cassio.init call.
os.environ["HF_TOKEN"] = _require_env('HUGGINGFACE_TOKEN')
os.environ["GROQ_API_KEY"] = _require_env('GROQ_API_KEY')
ASTRA_DB_APPLICATION_TOKEN = _require_env('ASTRA_TOKEN')
ASTRA_DB_ID = _require_env('ASTRA_DB_ID')

In [3]:
from PyPDF2 import PdfReader

In [4]:
# Open the source PDF; `pdfreader.pages` is iterated by the extraction cell.
pdf_path = '9789240094703-eng.pdf'
pdfreader = PdfReader(pdf_path)
pdfreader

<PyPDF2._reader.PdfReader at 0x7a4ffd300ac0>

In [5]:
from typing_extensions import Concatenate
# Extract the text of every page and concatenate it in page order.
# Pages for which extract_text() returns nothing (None or '') are skipped.
# A single join replaces repeated `+=` on a growing string, and the unused
# enumerate index is gone.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(text for text in page_texts if text)

# Display only a short preview: echoing the whole document bloats the
# notebook file and buries the cells that follow.
raw_text[:500]



In [6]:
# Initialise cassio with the Astra DB id and application token read from the
# environment in the configuration cell.
cassio.init(
    token=ASTRA_DB_APPLICATION_TOKEN,
    database_id=ASTRA_DB_ID,
)

In [7]:
# Model identifiers kept in one place so they are easy to swap.
# NOTE(review): the chat model id is a "preview" release — confirm it is still
# served by the provider before relying on this notebook.
embedding_model_name = 'all-MiniLM-L6-v2'
chat_model_name = "llama-3.2-11b-vision-preview"

# Embedding model used to vectorise text, and the chat model used to answer.
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
llm = ChatGroq(model=chat_model_name)

In [8]:
# Vector store in the 'vectorstore' table, embedding with `embeddings`.
# session/keyspace are passed as None — presumably resolved from the
# connection set up by cassio.init(); confirm against the library docs.
vector_store = Cassandra(
    embedding=embeddings,
    table_name='vectorstore',
    session=None,
    keyspace=None,
)

In [9]:
from langchain.text_splitter import CharacterTextSplitter

# Split the extracted text on newlines into chunks of at most 2048 characters
# (measured with len), with 256 characters of overlap between chunks.
splitter_options = {
    "separator": "\n",
    "chunk_size": 2048,
    "chunk_overlap": 256,
    "length_function": len,
}
splitter = CharacterTextSplitter(**splitter_options)
chunked_text = splitter.split_text(raw_text)

In [10]:
# Give every chunk a deterministic id derived from its position and content.
# With explicit ids, re-running this cell overwrites the same rows instead of
# inserting a second copy of every chunk into the persistent table (the
# default is a fresh random id per call). The position is part of the hash so
# identical chunks at different positions keep separate rows.
chunk_ids = [
    hashlib.sha256(f"{position}:{chunk}".encode("utf-8")).hexdigest()
    for position, chunk in enumerate(chunked_text)
]
vector_store.add_texts(chunked_text, ids=chunk_ids)
print(f"Added {len(chunked_text)} chunks to the vector store")

# Wrap the store so it can be queried with an LLM via `.query(...)`.
vector_store_indx = VectorStoreIndexWrapper(vectorstore=vector_store)
vector_store_indx

Added 147 chunks to the vector store


VectorStoreIndexWrapper(vectorstore=<langchain_community.vectorstores.cassandra.Cassandra object at 0x7a4f10164d30>)

In [14]:
# Interactive question/answer loop over the indexed document.
# - Typing "quit" (any letter case) ends the loop.
# - Blank input is ignored and the prompt repeats, so an empty query is never
#   sent to the retriever or the LLM.
# For each question the LLM answer is printed, followed by the five most
# similar chunks with their similarity score and first 100 characters.
first_question = True
while True:
    if first_question:
        prompt = "Ask a question(or type quit to exit): "
    else:
        prompt = "Ask next question(or type quit to exit): "
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        # Nothing was asked; keep the current prompt wording and ask again.
        continue
    first_question = False

    print(f"\nQuestion: {query_text}")
    answer = vector_store_indx.query(query_text, llm=llm).strip()
    print(f"Answer: {answer}\n")

    print('Docs by relevance:')
    for doc, score in vector_store.similarity_search_with_score(query_text, k=5):
        print(f"{score:.4f} - {doc.page_content[:100]}")


Question: what health issue do refugees face
Answer: According to the provided text, refugees and migrants often face worse health outcomes in countries of transit and destination due to various barriers. Some of the health issues they face include:

1. Infectious diseases
2. Mental health conditions
3. Malnutrition and starvation due to disrupted food supplies
4. Poor access to health services, including:
 * No access to any health services (in 45% of countries)
 * Access to emergency health-care services only (in 8% of countries)
 * Access to all services dependent on migration status (in 37% of countries)
5. Negatively impacted health due to their travels or circumstances in their home country
6. Poorer health outcomes compared to host populations, including:
 * Cancer diagnosed at advanced stages
 * Gaps in cervical cancer awareness and prevention
 * Barriers to HPV vaccination
7. Increased risks of non-communicable diseases (NCDs)

These health issues are exacerbated by factors s