In [6]:
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_pinecone import PineconeVectorStore
from dotenv import load_dotenv
import os


# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI / Pinecone API keys used
# by the cells below come from — confirm the .env contents.
load_dotenv()

True

In [2]:
# Target Pinecone index and the namespace inside it that this notebook writes to.
index_name = "psa-openai-ada002-embeddings"
namespace = "psa-press-releases"
# Embedding client created with the library defaults.
# NOTE(review): the index name suggests ada-002 embeddings — confirm that the
# default OpenAIEmbeddings model matches what the index was built with.
embedding_model = OpenAIEmbeddings()

# Handle to the existing index; reused below for duplicate lookups and inserts.
vectorstore = PineconeVectorStore(
    index_name=index_name,
    embedding=embedding_model,
    namespace=namespace,
)

In [26]:
# Directory that holds the source PDFs.
root_dir = "./data"
# Collect every "psa*.pdf" file in root_dir.
# os.listdir() returns entries in arbitrary order, so the result is sorted to
# make `file_paths[0]` (used by the next cell) the same file on every run.
file_paths = sorted(
    os.path.join(root_dir, file_name)
    for file_name in os.listdir(root_dir)
    if file_name.startswith("psa") and file_name.endswith(".pdf")
)
file_paths

['./data/psa_sines.pdf']

In [41]:
# Demo scope: only the first discovered PDF is ingested.
# Raises IndexError if no matching PDF was found in the previous cell.
file_path = file_paths[0] # for demo, we work with 1 pdf file
loader = PyPDFLoader(file_path=file_path)
# Result of loading the PDF; later cells read and rewrite `.page_content` on each item.
documents = loader.load() 

In [42]:
# Basic text cleaning, applied in place to every loaded document:
# str.split() with no arguments splits on any whitespace run (newlines
# included), so re-joining with single spaces both removes newlines and
# collapses consecutive whitespace.
for page in documents:
    page.page_content = ' '.join(page.page_content.split())

In [43]:
# Chunking parameters: maximum chunk length and the overlap shared by
# neighbouring chunks (both measured by the splitter's length function).
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
# Split the cleaned documents into the chunks that will be embedded and stored.
docs = text_splitter.split_documents(documents)

In [45]:
def insert_document(texts, vectorstore: PineconeVectorStore, embedding_model: OpenAIEmbeddings, index_name:str, namespace:str):
    """Insert each chunk in ``texts`` into ``vectorstore`` unless it is already stored.

    A chunk counts as already stored when the top-1 similarity hit in
    ``namespace`` has exactly the same ``page_content``, or when an identical
    chunk was already handled earlier in this same call. The second check
    matters because a freshly written vector may not be returned by a query
    issued immediately afterwards, which would let in-batch duplicates through.

    Args:
        texts: Iterable of document chunks exposing a ``page_content`` string.
        vectorstore: Store used for both the duplicate lookup and the insert,
            so the check and the write always hit the same index.
        embedding_model: Unused; kept for backward compatibility. The store
            embeds with the model it was constructed with.
        index_name: Unused; kept for backward compatibility. The store writes
            to the index it was constructed with.
        namespace: Pinecone namespace to query and to write into.

    Returns:
        None. Progress and insertion errors are reported via ``print``.
        An insertion error is logged and the loop continues with the next
        chunk; errors raised by the similarity lookup propagate.
    """
    # Contents known to be in the store (found there, or inserted by this call).
    seen_contents = set()
    for text in texts:
        content = text.page_content
        if content in seen_contents:
            print("Exact match found, skipping insertion")
            continue

        similarity = vectorstore.similarity_search(query=content, k=1, namespace=namespace)
        if similarity and similarity[0].page_content == content:
            seen_contents.add(content)
            print("Exact match found, skipping insertion")
            continue

        print("Inserting new document")
        try:
            # Write through the existing store instance. The previous
            # `vectorstore.from_documents(...)` is a classmethod that builds a
            # brand-new store (and connection) for every single chunk.
            vectorstore.add_documents([text], namespace=namespace)
        except Exception as e:
            # Best effort: report and move on; the chunk is not marked as seen,
            # so a later identical chunk will be retried.
            print(f"Error inserting document: {e}")
        else:
            seen_contents.add(content)
            print(f"Document inserted: {content[:50]}...")

In [46]:
# Push the chunks to Pinecone: one similarity query per chunk, and an insert
# for every chunk that has no exact match in the namespace. Progress is printed.
insert_document(docs, vectorstore, embedding_model, index_name, namespace)

Inserting new document
Document inserted: Page 1 of 2 This communication contains confidenti...
Inserting new document
Document inserted: RUBBER TIRED GANTRY CRANES On 29th February 2024, ...
Inserting new document
Document inserted: the terminal achieve its ambitious sustainability ...
Inserting new document
Document inserted: the company’s sustainability goal of cutting its c...
Inserting new document
Document inserted: TEUs, thereby strengthening its position as one of...
Inserting new document
Document inserted: ambitions. PSA Sines will progressively incorporat...
Inserting new document
Document inserted: Page 2 of 2 This communication contains confidenti...
Inserting new document
Document inserted: of 2.7 MTEUs and a quay wall extension of 1.350 me...
Inserting new document
Document inserted: PSA International PSA International (PSA) is a lea...
Inserting new document
Document inserted: on the deep expertise and experience from a divers...
Inserting new document
Document inserted

Bad pipe message: %s [b'g\x8ewQ\xfd\xe9_\x7f\x8d\nJ\xdb\xe8\xcf\x10\xb7\x97\xf1 \x9d\xaf\xbf\x0c&\x14\xbc\x9f\xaa\xc7\x92VQH7\xe3\xb6\x9e\x81\xb1i3j\x06\xc1\x8f\x93n]\x10^\x85\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f']
Bad pipe message: %s [b'_\x8f\xffu\x8f\xe6\x1f\xd8\xb9\x07\xcf\xa9\tY//\xda\x89\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00']
Bad pipe message: %s [b"j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00"]
Bad pipe message: %s [b'\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x000\x00.\x04\x03\x05\x03\x06']
Ba