In [2]:
import PyPDF2

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import LlamaCpp


from langchain.embeddings import HuggingFaceEmbeddings # import hf embedding
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer, util
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Step 1: Extract PDF content and metadata

In [10]:
# PDFs to index, one path per entry.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere if it is replaced or made configurable.
pdf_files = [
    "C:/Users/Mrinal Kalita/Python Projects/AIML Capstone Project - CV - Pneumonia Detection-1.pdf",
]

In [13]:
def process_pdf(pdf_files):
    """Extract per-page text and matching metadata from one or more PDF files.

    Args:
        pdf_files: An iterable of PDF paths, or a single path (``str`` or
            ``os.PathLike``), which is treated as a one-element list.

    Returns:
        A ``(content, metadata)`` tuple of two parallel lists with one entry
        per page, ordered by file and then by page. ``content[k]`` is the
        text extracted from the page (``""`` when extraction yields nothing)
        and ``metadata[k]`` is ``{"title": "<path> page <n>"}`` where ``n``
        is the 1-based page number.
    """
    import os  # local import: only needed for the single-path check below

    # A bare path would otherwise be iterated character by character.
    if isinstance(pdf_files, (str, os.PathLike)):
        pdf_files = [pdf_files]

    content = []
    metadata = []
    for pdf_path in pdf_files:
        reader = PyPDF2.PdfReader(pdf_path)
        for page_number, page in enumerate(reader.pages, start=1):
            # Normalise a missing/empty extraction result to "" so the
            # returned content list only ever contains strings.
            content.append(page.extract_text() or "")
            # f-string (rather than "+") so non-str path objects also work.
            metadata.append({"title": f"{pdf_path} page {page_number}"})

    print("Content and metadata are extracted from the documents")
    return content, metadata

In [17]:
# Run the extraction step; `content` and `metadata` are the two lists returned by process_pdf.
content, metadata = process_pdf(pdf_files)

Content and metadata are extracted from the documents


# Step 2: Split the content into smaller portions

In [26]:
def split_content(content, metadata, chunk_size=512, chunk_overlap=256):
    """Split extracted texts into smaller, overlapping passages.

    Args:
        content: List of text strings to split.
        metadata: List of metadata dicts, passed to the splitter as
            ``metadatas`` alongside ``content``.
        chunk_size: Maximum passage size handed to
            ``RecursiveCharacterTextSplitter.from_tiktoken_encoder``
            (default 512).
        chunk_overlap: Overlap between consecutive passages, in the same
            unit as ``chunk_size`` (default 256).

    Returns:
        The list of documents produced by the splitter's
        ``create_documents`` call.
    """
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    smaller_docs = splitter.create_documents(content, metadatas=metadata)
    print(f"Docs are split into {len(smaller_docs)} passages")
    return smaller_docs

In [27]:
# Chunk the extracted page texts; `smaller_docs` is what the ingestion step below consumes.
smaller_docs=split_content(content, metadata)

Docs are split into 7 passages


# Step 3: Ingest into a local vector database

In [28]:
def ingest_into_vectordb(smaller_docs,
                         db_path='vectorstore/db_faiss',
                         model_name='sentence-transformers/all-MiniLM-L6-v2',
                         device='cpu'):
    """Embed the given documents, build a FAISS store and save it locally.

    Args:
        smaller_docs: Documents to index (passed to ``FAISS.from_documents``).
        db_path: Location handed to ``save_local`` for the FAISS store
            (default ``'vectorstore/db_faiss'``).
        model_name: Embedding model name passed to ``HuggingFaceEmbeddings``
            (default ``'sentence-transformers/all-MiniLM-L6-v2'``).
        device: Value for the ``'device'`` entry of the embedding model's
            ``model_kwargs`` (default ``'cpu'``).

    Returns:
        The FAISS vector store that was built and saved.
    """
    emb = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': device})
    db = FAISS.from_documents(smaller_docs, emb)

    # Persist the store so it can be reloaded later without re-embedding.
    db.save_local(db_path)
    return db

In [29]:
# Build the FAISS store and save it via save_local. The returned store is not bound to a
# name here, so it only appears as this cell's output.
ingest_into_vectordb(smaller_docs)

<langchain_community.vectorstores.faiss.FAISS at 0x20a1cdb0ca0>