# Lab10: RAG-based Question Answering
The exercise introduces modern approaches to Question Answering using Retrieval Augmented Generation (RAG) with LLMs and vector databases.

## Load PDF

In [13]:
from langchain_community.document_loaders import PyPDFLoader
import numpy as np

loader = PyPDFLoader(
    "./Understanding-Feline-Communication-Automated-Classification-of-Cat-Vocalizations-Using-VGG16.pdf",
)

docs = loader.load()
print("Num of pages (docs):", len(docs))
print("Max doc len", np.max(np.array([len(doc.page_content) for doc in docs ])))
print("Avg doc len", np.mean(np.array([len(doc.page_content) for doc in docs ])))

Num of pages (docs): 7
Max doc len 3608
Avg doc len 2823.0


## Split docs

In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=256,
    length_function=len,
    is_separator_regex=False,
)
splitted_docs = text_splitter.create_documents(
    texts=[doc.page_content for doc in docs], 
    metadatas=[doc.metadata for doc in docs]
)

        
print(splitted_docs[0])
print("\nNum of splitted docs", len(splitted_docs))

page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/382973617
Understanding Feline Communication: Automated Classiﬁcation of Cat
Vocalizations Using VGG16
Article · June 2024
DOI: 10.5281/zenodo.11480888
CITATIONS
0
READS
91
5 authors, including:
Oishi Singh
American International University-Bangladesh
3 PUBLICATIONS   0 CITATIONS   
SEE PROFILE
Ashadu Jaman Shawon
American International University-Bangladesh
8 PUBLICATIONS   15 CITATIONS   
SEE PROFILE
All content following this page was uploaded by Ashadu Jaman Shawon on 09 August 2024.
The user has requested enhancement of the downloaded file.' metadata={'source': './Understanding-Feline-Communication-Automated-Classification-of-Cat-Vocalizations-Using-VGG16.pdf', 'page': 0, 'page_label': '1'}

Num of splitted docs 26


## Calculate Embeddings

In [27]:
from langchain_ollama import OllamaEmbeddings

embedder = OllamaEmbeddings(
    model="llama3"
)

documents = [doc.page_content for doc in splitted_docs]

doc_max_len = 1024

embeddings = embedder.embed_documents(documents)

Exception: [401] Unauthorized
Authentication failed
Please check or regenerate your API key.

## Setup Qdrant vector database

In [18]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Access the variables
qdrant_api_key = os.getenv("QDRANT_API_KEY")
qdrant_url = os.getenv("QDRANT_URL")

In [19]:
from langchain_qdrant import QdrantVectorStore
qdrant = QdrantVectorStore.from_documents(
    splitted_docs,
    doc_embeddings,
    url=qdrant_url,
    prefer_grpc=True,
    api_key=qdrant_api_key,
    collection_name="feline_comm",
)

AttributeError: 'Tensor' object has no attribute 'embed_documents'