In [7]:
from dotenv import load_dotenv
import os
import gradio as gr
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
import time

In [2]:
load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``, failing loudly if absent.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's non-empty string value.

    Raises:
        RuntimeError: If the variable is unset or empty. Without this check the
            assignment into ``os.environ`` below would fail with an opaque
            ``TypeError: str expected, not NoneType``.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; add it to your .env file."
        )
    return value


# The key is read from AZURE_OPENAI_KEY and re-exported as AZURE_OPENAI_API_KEY.
# NOTE(review): AZURE_OPENAI_API_KEY is presumably the name the Azure OpenAI
# client looks up by default -- confirm against the langchain-openai docs.
os.environ["AZURE_OPENAI_API_KEY"] = _require_env("AZURE_OPENAI_KEY")
# Re-assigning the endpoint is a no-op when it is set; it is kept as a
# presence check so a missing endpoint is reported here rather than later.
os.environ["AZURE_OPENAI_ENDPOINT"] = _require_env("AZURE_OPENAI_ENDPOINT")
# These two may legitimately be None here; they are only read, not re-exported.
AZURE_DEPLOYMENT_NAME = os.getenv("AZURE_DEPLOYMENT_NAME")
AZURE_EMBEDDING = os.getenv("AZURE_EMBEDDING")

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [3]:
# Azure OpenAI REST API version used for embedding requests.
AZURE_OPENAI_API_VERSION = "2024-03-01-preview"

# Embedding client bound to the deployment named by AZURE_EMBEDDING.
embeddings = AzureOpenAIEmbeddings(
    openai_api_version=AZURE_OPENAI_API_VERSION,
    azure_deployment=AZURE_EMBEDDING,
)


In [4]:
class CustomFileLoader(UnstructuredPDFLoader):
    """PDF loader that reports only the file's base name as its ``source``."""

    def _get_metadata(self) -> dict:
        """Return document metadata containing the base name of ``self.file_path``.

        Presumably overrides the parent loader's metadata hook -- verify against
        the installed ``UnstructuredPDFLoader``.

        Returns:
            A dict with a single ``"source"`` key holding the file name without
            its directory components.
        """
        return {"source": os.path.basename(self.file_path)}

In [9]:
# Folder holding the PDFs to ingest. DPI_PDF_FOLDER overrides the original
# hard-coded location so the notebook can run on another machine.
folder_path = os.getenv("DPI_PDF_FOLDER", '/Users/jaylee/Documents/work/dpi/pdf3')

# The splitter does not depend on the file being processed, so build it once
# instead of once per loop iteration.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()

# Sorted so the processing order is the same on every run; os.listdir returns
# entries in arbitrary order.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)

    # Only regular, non-hidden *.pdf files are handed to the PDF loader. This
    # covers the original '.DS_Store' skip and also keeps sub-directories and
    # stray non-PDF files away from the loader.
    if filename.startswith('.') or not filename.lower().endswith('.pdf'):
        continue
    if not os.path.isfile(file_path):
        continue

    # Brief pause per ingested file -- presumably to throttle the embedding /
    # Pinecone calls; TODO confirm it is still needed.
    time.sleep(0.1)

    loader = CustomFileLoader(file_path)
    docs = loader.load_and_split(text_splitter=splitter)

    # NOTE(review): re-running this cell presumably uploads the same chunks
    # again, since no explicit ids are supplied -- confirm before re-running.
    vectorstore = PineconeVectorStore.from_documents(docs, embeddings, index_name="dpi")

In [None]:
# Attach to an existing Pinecone index and expose it as a retriever.
# NOTE(review): the ingestion loop in this notebook writes to index "dpi" while
# this cell reads from 'researchers' -- confirm the different index is intended.
vector_store = PineconeVectorStore(embedding=embeddings, index_name='researchers')
retriever = vector_store.as_retriever()