In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
import os
# Load variables from a local .env file (python-dotenv), then read the
# credentials this notebook needs. A missing variable yields None.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
PINECONE_ENV = os.environ.get('PINECONE_ENV')

# Configure the Pinecone client with the key and environment read above.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

# Handle to the Pinecone index used throughout the notebook.
# NOTE(review): presumably the index 'slp' already exists - nothing here creates it.
index_name = 'slp'
index = pinecone.Index(index_name)
    

  from tqdm.autonotebook import tqdm


In [2]:
# Root directory that DirectoryLoader scans for PDF files.
# NOTE(review): the value is empty here - presumably redacted; set it before running.
path = ""

In [3]:
# Collect every PDF below `path` (recursive glob), parsing each file with
# PyPDFLoader; a progress bar is shown and multithreading is enabled.
loader = DirectoryLoader(
    path,
    glob="**/*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True,
)
documents = loader.load()

100%|██████████| 4467/4467 [00:55<00:00, 80.85it/s] 


In [4]:
# Embedding model applied to every chunk.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment - confirm.
embeddings = OpenAIEmbeddings()

# Split the loaded documents into chunks of at most 1000 characters
# (measured with len) with a 20-character overlap, trying the separators
# in the order listed.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    separators=["\n\n", "\n", " ", ""],
    length_function=len,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks and store them in the Pinecone index named by index_name.
# NOTE(review): re-running this cell likely inserts the same chunks again - verify before re-running.
db = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [10]:
# Report index statistics (dimension, fullness, per-namespace vector counts)
# to confirm the upload; total_vector_count should match len(texts).
# NOTE(review): this cell's execution count (10) is out of sequence with its neighbours.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.06484,
 'namespaces': {'': {'vector_count': 6484}},
 'total_vector_count': 6484}

In [6]:
# Show the user and project that the configured Pinecone credentials resolve to.
# NOTE(review): the output includes the project name - clear it before sharing the notebook.
pinecone.whoami()

WhoAmIResponse(username='', user_label='', projectname='30876aa')

In [7]:
# Number of Document objects returned by loader.load().
len(documents)

5303

In [8]:
# Number of chunks produced by the text splitter.
len(texts)

6484

## Delete your index

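Once finished with the index, we can delete it to save resources. Note that `Index.delete(delete_all=True, namespace='')` only removes the vectors stored in that namespace; to remove the index itself, call `pinecone.delete_index(index_name)`.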

In [9]:
# Uncomment to remove every vector in the default namespace (irreversible).
# index.delete(delete_all=True, namespace='')