# Capstone RAG Pipeline
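This notebook implements a Retrieval-Augmented Generation (RAG) pipeline: it loads PDF documents, splits their text into overlapping chunks, embeds the chunks into a FAISS vector index, and answers natural-language queries using the retrieved chunks as context.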

In [None]:

# Install dependencies
!pip install langchain openai faiss-cpu tiktoken python-dotenv PyPDF2
    

## Step 1: Import Required Libraries

In [None]:

import os
import faiss
import pickle
import numpy as np
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from dotenv import load_dotenv

# Load key=value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is how OPENAI_API_KEY reaches the OpenAI-backed
# cells below — confirm the .env file defines it.
load_dotenv()
    

## Step 2: Load and Read Documents

In [None]:

def load_pdfs_from_directory(directory_path):
    """Extract the text of every PDF located directly inside a directory.

    Args:
        directory_path: Folder to scan. Sub-folders are not searched.

    Returns:
        A list of ``{'filename': str, 'text': str}`` dicts, one per PDF,
        ordered by filename so that repeated runs yield the same sequence.

    Raises:
        FileNotFoundError: If ``directory_path`` does not exist (raised by
            ``os.listdir``).
    """
    docs = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory_path)):
        # Case-insensitive match so "REPORT.PDF" is picked up as well.
        if not filename.lower().endswith('.pdf'):
            continue
        reader = PdfReader(os.path.join(directory_path, filename))
        # Defensive: a page with no extractable text may yield None instead of
        # a string; treat it as empty rather than failing on concatenation.
        page_texts = [page.extract_text() or '' for page in reader.pages]
        docs.append({'filename': filename, 'text': ''.join(page_texts)})
    return docs

# Folder (relative to the notebook's working directory) holding the source PDFs.
pdf_directory = "data/pdfs"

# Each entry is a dict with the PDF's filename and its extracted text.
documents = load_pdfs_from_directory(pdf_directory)

print(
    f"Loaded {len(documents)} documents."
)
    

## Step 3: Split Documents into Chunks

In [None]:

# 1000-character windows with a 200-character overlap between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Flatten every document into chunk records that remember their source file.
texts = [
    {'filename': document['filename'], 'content': piece}
    for document in documents
    for piece in splitter.split_text(document['text'])
]
print(f"Generated {len(texts)} text chunks.")
    

## Step 4: Generate Embeddings and Store in FAISS

In [None]:

embedding_model = OpenAIEmbeddings()
texts_only = [t['content'] for t in texts]
# Carry the source filename with each chunk so retrieved passages can be
# traced back to the PDF they came from.
chunk_metadatas = [{'filename': t['filename']} for t in texts]

# Fail early with an actionable message instead of an opaque error from
# inside the vector store when there is nothing to embed.
if not texts_only:
    raise ValueError(
        "No text chunks to embed; check that the PDF directory contains "
        "PDFs with extractable text."
    )

faiss_index = FAISS.from_texts(
    texts_only,
    embedding_model,
    metadatas=chunk_metadatas,
)
faiss_index.save_local("vectorstore/faiss_index")

print("FAISS index created and saved successfully!")
    

## Step 5: Query Using Retrieval-Augmented Generation (RAG)

In [None]:

# Retrieve the 3 most similar chunks for each question.
retriever = faiss_index.as_retriever(search_kwargs={"k": 3})
llm = ChatOpenAI(model="gpt-4-turbo")

# "stuff" places all retrieved chunks into a single prompt for the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff"
)

# NOTE: only the k retrieved chunks are shown to the model, so the answer
# reflects those passages rather than literally every document.
query = "Summarize the main points from all documents."
# `invoke` replaces the deprecated `Chain.run`; RetrievalQA takes its input
# under the "query" key and returns the answer text under "result".
result = qa_chain.invoke({"query": query})["result"]
print(result)
    

## Step 6: Save and Load Index

In [None]:

# Persist the in-memory index to disk, then read it back to check the round trip.
index_dir = "vectorstore/faiss_index"
faiss_index.save_local(index_dir)

# SECURITY: the stored index includes a pickle file, and unpickling can execute
# arbitrary code. The opt-in flag is acceptable here only because the files
# were written by this notebook a moment ago; never enable it for an index
# obtained from an untrusted source.
new_faiss_index = FAISS.load_local(
    index_dir,
    embedding_model,
    allow_dangerous_deserialization=True,
)
retriever = new_faiss_index.as_retriever()
print("Index reloaded successfully!")
    