In [4]:
!pip install langchain faiss-cpu sentence-transformers
!pip install -U langchain-community
!pip install -q "unstructured[pdf]"
!apt-get install -y poppler-utils  # For PDF parsing

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.66 (from langchain-community)
  Downloading langchain_core-0.3.68-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain<1.0.0,>=0.3.26 (from langchain-community)
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain<1.0.0,>=0.3.26->langchain-community)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith>=0.1.125 (from langchain-community)
  Downloading langsmith-0.4.4-py3-none-any.whl.metadata (15 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-se

In [11]:
%pip install -qU langchain-groq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.8/130.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hNote: you may need to restart the kernel to use updated packages.


In [None]:
import os
from getpass import getpass

# Never hardcode the key in the notebook. Reuse a key that is already present
# in the environment (e.g. injected by the platform's secrets mechanism) and
# only prompt for one when it is missing or empty; getpass keeps the typed
# value out of the cell source and the rendered output.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")


In [7]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# PDFs to index, read from the attached Kaggle dataset.
pdf_paths = [
    "/kaggle/input/corona/ceu176.pdf",
    "/kaggle/input/corona/ceu_175.pdf",
    "/kaggle/input/corona/ceu_177.pdf",
]

# Parse each PDF in order and flatten the resulting documents into one list.
all_docs = [
    document
    for pdf_path in pdf_paths
    for document in UnstructuredPDFLoader(pdf_path).load()
]

# Break the documents into chunks (chunk_size=500, chunk_overlap=50) so each
# piece can be embedded and retrieved on its own.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(all_docs)
print(f"✅ Loaded {len(chunks)} chunks.")


✅ Loaded 416 chunks.


In [9]:
# Import from langchain_community directly: the `langchain.vectorstores` and
# `langchain.embeddings` paths are deprecated re-export shims, and the object
# they hand back is the langchain_community class anyway.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Embed every chunk with the MiniLM sentence-transformer and build a FAISS
# vector store from the results.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(chunks, embedding_model)
print("Done!")

Done!


In [12]:
from langchain_groq import ChatGroq

# Groq-hosted chat model used to answer questions; the API key is read from
# the GROQ_API_KEY environment variable set earlier in the notebook.
llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    groq_api_key=os.environ["GROQ_API_KEY"],
)


In [13]:
from langchain.chains import RetrievalQA

# Expose the FAISS store as a retriever that returns 3 chunks per query.
retriever = db.as_retriever(search_kwargs={"k": 3})

# Wire the retriever and the LLM into a question-answering chain, and keep
# the retrieved documents in the response so sources can be listed.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)


In [15]:
# Ask one question through the RAG chain.
query = "What happened in 2025 regarding the number of COVID-19 cases and deaths?"
response = rag_chain.invoke(query)

# Show the generated answer, then the source file of every retrieved chunk
# (one line per chunk, so a file can appear more than once).
print("📌 Answer:", response["result"])
print("\n📚 Sources:")
for source_doc in response["source_documents"]:
    print(f"- {source_doc.metadata['source']}")

📌 Answer: According to the provided context, there are two time points mentioned in 2025: 

1. As of 2 February 2025, and 
2. As of 5 January 2025.

For the first time point (2 February 2025), the provided context includes the following information:

- The total number of COVID-19 cases globally is approximately 120 million.
- The total number of COVID-19 deaths globally is approximately 450,000.

However, no information is provided in the context about the overall trend or changes in the number of cases and deaths in 2025, other than the fact that there is a figure about percentage changes in cases and deaths over the last 28 days relative to the previous 28 days (Figure 5), but this figure is not explicitly stated in the provided text.

For the second time point (5 January 2025), no information is provided in the context.

📚 Sources:
- /kaggle/input/corona/ceu_177.pdf
- /kaggle/input/corona/ceu_177.pdf
- /kaggle/input/corona/ceu176.pdf


In [16]:
# Sanity check: display the vector store object built earlier (the cell
# output is just its repr, confirming which FAISS class backs `db`).
db

<langchain_community.vectorstores.faiss.FAISS at 0x7a6c1a80de10>