In [None]:
#!pip install -r requirements.txt

In [2]:
!pip install langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.0-py3-none-any.whl.metadata (996 bytes)
Downloading langchain_huggingface-0.3.0-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.3.0


In [3]:
# Cell 2: Import all necessary libraries
import os
import json
from typing import List, Dict, Any
from pathlib import Path

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.schema import Document

# Other imports
import warnings
# Suppresses every Python warning from this point on, which also hides
# deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

All libraries imported successfully!


In [4]:
# Cell 3: Configuration settings
class RAGConfig:
    """Tunable settings for the RAG pipeline, gathered in one place.

    Every setting is a keyword-only argument whose default is the value the
    notebook was originally written with, so ``RAGConfig()`` produces the
    original configuration while individual values can be overridden, e.g.
    ``RAGConfig(chunk_size=500, chunk_overlap=50)``.

    Attributes:
        model_name: Model identifier handed to ``OllamaLLM``.
        embedding_model: Model name handed to ``HuggingFaceEmbeddings``.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter.
        top_k: Number of chunks requested from the retriever (``k``).
        similarity_threshold: Stored only.
            NOTE(review): not read by any cell visible in this notebook section.
        vector_store_path: Stored only.
            NOTE(review): not read by any cell visible in this notebook section.
        memory_key: ``memory_key`` given to ``ConversationBufferMemory``.
        return_messages: ``return_messages`` given to ``ConversationBufferMemory``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, ``chunk_overlap`` is
            negative or larger than ``chunk_size``, or ``top_k`` is below 1.
    """

    def __init__(
        self,
        *,
        model_name: str = "gemma2:2b",
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        top_k: int = 4,
        similarity_threshold: float = 0.7,
        vector_store_path: str = "vector_store",
        memory_key: str = "chat_history",
        return_messages: bool = True,
    ):
        # Fail here, with a message naming the setting, rather than later
        # inside the splitter or retriever.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if chunk_overlap < 0 or chunk_overlap > chunk_size:
            raise ValueError(
                f"chunk_overlap must be between 0 and chunk_size ({chunk_size}), "
                f"got {chunk_overlap}"
            )
        if top_k < 1:
            raise ValueError(f"top_k must be at least 1, got {top_k}")

        # Model settings
        self.model_name = model_name
        self.embedding_model = embedding_model

        # Text splitting settings
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Retrieval settings
        self.top_k = top_k
        self.similarity_threshold = similarity_threshold

        # Vector store settings
        self.vector_store_path = vector_store_path

        # Memory settings
        self.memory_key = memory_key
        self.return_messages = return_messages

# Module-level settings object; the functions defined in later cells read it
# as a global rather than taking it as a parameter.
config = RAGConfig()
print("Configuration loaded successfully!")
print(f"Model: {config.model_name}")
print(f"Embedding model: {config.embedding_model}")
print(f"Chunk size: {config.chunk_size}")
print(f"Top K retrieval: {config.top_k}")

Configuration loaded successfully!
Model: gemma2:2b
Embedding model: sentence-transformers/all-MiniLM-L6-v2
Chunk size: 1000
Top K retrieval: 4


In [None]:
# Cell 4: Initialize models and embeddings
def initialize_models():
    """Create the Ollama LLM and the embedding model, smoke-testing each.

    Reads ``config.model_name`` and ``config.embedding_model``. Each model is
    exercised once (one prompt, one query embedding) so that a broken setup is
    reported here rather than on first real use.

    Returns:
        ``(llm, embeddings)`` on success, or ``(None, None)`` if either model
        cannot be created or fails its smoke test. Failures are printed, not
        raised.
    """
    # The LLM and the embeddings are handled separately so that the
    # troubleshooting hint matches the component that actually failed.
    try:
        # Initialize Ollama LLM
        llm = OllamaLLM(
            model=config.model_name,
            temperature=0.7,
            top_p=0.9,
            num_ctx=4096
        )

        # Test the model
        test_response = llm.invoke("Hello, how are you?")
        print(f"LLM Test Response: {test_response[:100]}...")

    except Exception as e:
        print(f"Error initializing models: {e}")
        # Name the configured model instead of a hard-coded one, so the hint
        # stays correct when config.model_name is changed.
        print(f"Make sure Ollama is running and {config.model_name} is installed")
        return None, None

    try:
        # Initialize embeddings
        embeddings = HuggingFaceEmbeddings(
            model_name=config.embedding_model,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

        # Test embeddings
        test_embedding = embeddings.embed_query("test query")
        print(f"Embedding dimension: {len(test_embedding)}")

    except Exception as e:
        print(f"Error initializing models: {e}")
        print(f"Make sure the embedding model {config.embedding_model} can be loaded")
        return None, None

    return llm, embeddings

# Both names are None if initialization failed; later cells read them as globals.
llm, embeddings = initialize_models()

LLM Test Response: I am doing well! 😊  

How can I help you today? 
...


In [None]:
# Cell 5: Document processing functions
def load_pdf_documents(pdf_paths: List[str]) -> List[Document]:
    """Load the given PDF files into a flat list of documents.

    Args:
        pdf_paths: Paths of the PDF files to load. A single path passed as a
            bare ``str`` or path object is treated as a one-element list
            instead of being iterated character by character.

    Returns:
        The documents produced by ``PyPDFLoader`` for every file that loaded,
        in input order. Each document's metadata gains a ``source_file`` entry
        holding the file's base name. A file that fails to load is reported
        with ``print`` and skipped, so the result may be empty.
    """
    # Iterating a lone string would try to open every character as a file.
    if isinstance(pdf_paths, (str, os.PathLike)):
        pdf_paths = [pdf_paths]

    documents = []

    for pdf_path in pdf_paths:
        try:
            docs = PyPDFLoader(pdf_path).load()

            # Add metadata to identify source
            source_name = os.path.basename(pdf_path)
            for doc in docs:
                doc.metadata['source_file'] = source_name

            documents.extend(docs)
            print(f"Loaded {len(docs)} pages from {pdf_path}")

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error loading {pdf_path}: {e}")

    return documents

def split_documents(documents: List[Document]) -> List[Document]:
    """Break ``documents`` into overlapping chunks.

    Chunk size and overlap are taken from the module-level ``config``. Length
    is measured in characters (``len``), and the splitter tries paragraph
    breaks, line breaks, spaces and finally single characters, in that order.
    The input and output counts are printed.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    splitter_options = dict(
        chunk_size=config.chunk_size,
        chunk_overlap=config.chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )
    chunker = RecursiveCharacterTextSplitter(**splitter_options)

    splits = chunker.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(splits)} chunks")
    return splits

def create_vector_store(documents: List[Document], embeddings) -> FAISS:
    """Build a FAISS index over ``documents`` using ``embeddings``.

    Returns:
        The new vector store, or ``None`` when ``documents`` is empty/falsy or
        when building the index raises (the error is printed, not propagated).
    """
    if documents:
        try:
            index = FAISS.from_documents(documents, embeddings)
            print(f"Created vector store with {len(documents)} documents")
        except Exception as e:
            print(f"Error creating vector store: {e}")
            index = None
        return index

    # Nothing to index.
    print("No documents provided for vector store creation")
    return None

# Confirmation only: printed once the three function definitions above have run.
print("Document processing functions defined successfully!")

In [None]:
# Cell 6: RAG chain setup
def create_rag_chain(llm, vector_store):
    """Assemble a conversational retrieval chain on top of ``vector_store``.

    The chain combines:
      * a ``ConversationBufferMemory`` configured from ``config.memory_key``
        and ``config.return_messages``, recording the ``'answer'`` output;
      * a similarity retriever returning ``config.top_k`` chunks per query;
      * the given ``llm``.
    Source documents are returned with each answer and the chain is verbose.

    Returns:
        The chain, or ``None`` when ``vector_store`` is falsy or when
        construction raises (the error is printed, not propagated).
    """
    if not vector_store:
        print("No vector store provided")
        return None

    try:
        # Keyword arguments are evaluated in the order written, so the memory
        # is still built before the retriever.
        chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            memory=ConversationBufferMemory(
                memory_key=config.memory_key,
                return_messages=config.return_messages,
                output_key='answer',
            ),
            retriever=vector_store.as_retriever(
                search_type="similarity",
                search_kwargs={"k": config.top_k},
            ),
            return_source_documents=True,
            verbose=True,
        )
        print("RAG chain created successfully!")
        return chain
    except Exception as e:
        print(f"Error creating RAG chain: {e}")
        return None

# Confirmation only: printed once create_rag_chain has been defined.
print("RAG chain setup function defined!")

In [None]:
# Cell 7: Test with sample PDF
# PDF_PATH is read by test_single_pdf() below. A relative path such as this
# placeholder is resolved against the kernel's current working directory.
PDF_PATH = "sample.pdf"  # Change this to your PDF path

def test_single_pdf():
    """Build the whole RAG pipeline for the single PDF at ``PDF_PATH``.

    Steps: load the PDF, split it into chunks, index the chunks in a vector
    store, then wrap the store in a conversational chain. Uses the
    module-level ``llm`` and ``embeddings``.

    Returns:
        ``(qa_chain, vector_store)``. Both are ``None`` when the models are
        not initialized, the file is missing, nothing could be loaded, or the
        vector store could not be built. ``qa_chain`` alone is ``None`` when
        only the chain creation failed.
    """
    # Without this guard a failed model initialization only shows up after the
    # PDF has been loaded and split, as a misleading vector-store error.
    if llm is None or embeddings is None:
        print("Models are not initialized; fix the errors reported by initialize_models() and re-run it")
        return None, None

    if not os.path.exists(PDF_PATH):
        print(f"PDF file not found: {PDF_PATH}")
        print("Please update PDF_PATH with your actual PDF file path")
        return None, None

    # Load documents
    documents = load_pdf_documents([PDF_PATH])

    if not documents:
        print("No documents loaded")
        return None, None

    # Split documents
    splits = split_documents(documents)

    # Create vector store
    vector_store = create_vector_store(splits, embeddings)

    if not vector_store:
        print("Failed to create vector store")
        return None, None

    # Create RAG chain
    qa_chain = create_rag_chain(llm, vector_store)

    return qa_chain, vector_store

# Test the system
# Either result may be None on failure; check qa_chain before querying it.
print("Testing RAG system with single PDF...")
qa_chain, vector_store = test_single_pdf()