### RAG - Pre-processing Documents with Pinecone

In [9]:
import os

from dotenv import load_dotenv
from pypdf import PdfReader

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore

In [10]:
# Run load_dotenv() before reading any keys below
# (presumably it populates os.environ from a local .env file — confirm).
load_dotenv()

# Fail fast: indexing os.environ raises KeyError when OPENAI_API_KEY is unset.
openai_api_key = os.environ["OPENAI_API_KEY"]

In [11]:
def get_pdf_text(pdf_document):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_document: Whatever ``PdfReader`` accepts as its source
            (the caller in this notebook passes a file path).

    Returns:
        The per-page ``extract_text()`` results concatenated in page order,
        with no separator inserted between pages.
    """
    reader = PdfReader(pdf_document)
    return "".join(pdf_page.extract_text() for pdf_page in reader.pages)

In [12]:
def create_documents(pdf_files):
    """Build one ``Document`` per PDF, holding that file's full text.

    Args:
        pdf_files: Iterable of PDF sources; each is passed to
            ``get_pdf_text`` and recorded as the document's ``source``.

    Returns:
        A list of ``Document`` objects in the same order as ``pdf_files``.
        Each carries the whole extracted text as ``page_content`` (the text
        is not split into chunks here) plus fixed ``type``/``owner`` metadata.
    """
    return [
        Document(
            page_content=get_pdf_text(pdf_path),
            metadata={
                "source": pdf_path,
                "type": "PDF",
                "owner": "Ramkumar",
            },
        )
        for pdf_path in pdf_files
    ]

In [16]:
def create_embeddings():
    """Construct the OpenAI embeddings client used for indexing.

    Returns:
        An ``OpenAIEmbeddings`` instance configured for the
        ``text-embedding-3-large`` model and authenticated with the
        module-level ``openai_api_key``.
    """
    return OpenAIEmbeddings(
        model="text-embedding-3-large",
        openai_api_key=openai_api_key,
    )

In [14]:
def push_documents_to_vector_db(index_name, embeddings, documents):
    """Add ``documents`` to the Pinecone index named ``index_name``.

    Args:
        index_name: Name of the target Pinecone index.
        embeddings: Embedding object handed to ``PineconeVectorStore``.
        documents: Sized collection of documents passed to ``add_documents``.

    Returns:
        None. Progress is reported on stdout. If any argument is ``None``,
        "Invalid arguments!" is printed and no vector store is created.
    """
    # Guard clause: refuse to contact the vector store with missing arguments.
    if index_name is None or embeddings is None or documents is None:
        print("Invalid arguments!")
        return

    vector_store = PineconeVectorStore(
        index_name=index_name,
        embedding=embeddings)

    vector_store.add_documents(documents)

    # Compute the count first so the f-string stays on one physical line;
    # a replacement field split across lines only parses on Python >= 3.12.
    stored_count = len(documents)
    print(f"Totally {stored_count} Embeddings are stored in the Vectore DB successfully!")

In [17]:
# Driver cell: collect the PDFs under directory_path, turn each into a
# Document, and push them to the Pinecone index named index_name.
index_name = "trainingindex"
directory_path = "./Docs"
files = os.listdir(directory_path)

# os.listdir returns every directory entry (hidden files, other formats,
# sub-directories). Keep only names ending in ".pdf" so PdfReader is never
# handed a non-PDF, and sort them so the processing order is deterministic.
pdf_files = [
    f"{directory_path}/{file}"
    for file in sorted(files)
    if file.lower().endswith(".pdf")
]

print(f"Totally {len(pdf_files)} PDF files are found!")

documents = create_documents(pdf_files)
embeddings = create_embeddings()

push_documents_to_vector_db(index_name, embeddings, documents)

print("Pre-processing completed ...")

Totally 7 PDF files are found!
Totally 7 Embeddings are stored in the Vectore DB successfully!
Pre-processing completed ...
