In [37]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_pinecone import PineconeVectorStore
from dotenv import load_dotenv
import os

# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI and Pinecone API keys used
# by the cells below — confirm against the project's .env template.
load_dotenv()

True

In [32]:
# Embedding model shared by indexing and querying (library defaults).
embedding_model = OpenAIEmbeddings()

# Pinecone index and the namespace inside it that holds the press-release chunks.
index_name = "psa-openai-ada002-embeddings"
namespace = "psa-press-releases"

# Handle onto the existing index; every later cell goes through this object.
vectorstore = PineconeVectorStore(embedding=embedding_model, index_name=index_name, namespace=namespace)

### 1. Chunking the incoming texts

In [42]:
# Collect the press-release text files: names that start with "psa" and end in ".txt".
root_dir = "./data"
# os.listdir returns entries in arbitrary order, so sort them to keep the file
# order (and the chunk order derived from it) reproducible across runs and platforms.
file_paths = [
    os.path.join(root_dir, file_name)
    for file_name in sorted(os.listdir(root_dir))
    if file_name.startswith("psa") and file_name.endswith(".txt")
]
file_paths

['./data\\psa_jurong_expansion.txt', './data\\psa_mersin_milestones.txt']

In [58]:
# Load every press release. `documents` ends up as a list holding one inner
# list of Document objects per input file.
documents = []
for file_path in file_paths:
    file_docs = TextLoader(file_path=file_path, encoding='utf8').load()
    # minor cleaning: swap every newline character for a single space
    for doc in file_docs:
        doc.page_content = doc.page_content.replace('\n', ' ')
    # append (not extend) so that each file keeps its own inner list; the
    # chunking cell iterates over these per-file lists
    documents.append(file_docs)

In [76]:
# Display the loaded documents (one inner list per file) to sanity-check the cleaning step.
documents

[[Document(page_content='PSA Singapore (PSA) has announced a strategic expansion of its Jurong Island Terminal (JIT) to  meet growing demand for sustainable, efficient and resilient supply chain solutions from industries  based on Jurong Island. Located on the northwestern seafront of Jurong Island, JIT offers twice-daily barge sailings that  connect beneficial cargo owners (BCOs) on Jurong Island with PSA’s main hubs at Tuas, Pasir  Panjang and Brani, from where they can leverage Singapore’s connectivity for unparalleled access  to global markets. Jurong Island is the nucleus of Singapore’s Energy and Chemicals sector. Managed by JTC, it spans  3,000 hectares and hosts more than 100 global companies carrying out refining, olefins production  and chemical manufacturing operations.  While containerised raw materials and finished products can be trucked to and from Jurong Island  by road, barging is less labour intensive, does not contribute to road congestion, and generates about 30% le

In [78]:
# Split each file's documents into chunks of up to 500 characters with a
# 50-character overlap, then flatten the per-file results into one list.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = [
    chunk
    for file_docs in documents
    for chunk in text_splitter.split_documents(file_docs)
]

In [79]:
# Display the flattened list of chunks produced by the splitter.
texts

[Document(page_content='PSA Singapore (PSA) has announced a strategic expansion of its Jurong Island Terminal (JIT) to  meet growing demand for sustainable, efficient and resilient supply chain solutions from industries  based on Jurong Island. Located on the northwestern seafront of Jurong Island, JIT offers twice-daily barge sailings that  connect beneficial cargo owners (BCOs) on Jurong Island with PSA’s main hubs at Tuas, Pasir  Panjang and Brani, from where they can leverage Singapore’s connectivity for', metadata={'source': './data\\psa_jurong_expansion.txt'}),
 Document(page_content='they can leverage Singapore’s connectivity for unparalleled access  to global markets. Jurong Island is the nucleus of Singapore’s Energy and Chemicals sector. Managed by JTC, it spans  3,000 hectares and hosts more than 100 global companies carrying out refining, olefins production  and chemical manufacturing operations.  While containerised raw materials and finished products can be trucked to and

### 2. Naively Upserting

In [7]:
# Embed every chunk and write the vectors into the Pinecone index/namespace.
# `from_documents` is a classmethod, so it is called on the class itself.
docsearch = PineconeVectorStore.from_documents(
    texts,
    embedding_model,
    index_name=index_name,
    namespace=namespace,
)

In [8]:
# Re-running the same insert against the same index_name and namespace does
# not overwrite anything: it simply stores a second copy of every chunk under
# a different ID.
docsearch = PineconeVectorStore.from_documents(
    texts,
    embedding_model,
    index_name=index_name,
    namespace=namespace,
)


### 3. Upserting only if the text does not already exist

In [85]:
# to avoid creating a new copy, we can perform a similarity search with top_k=1
# only insert if the top_k result is not an exact match of the incoming text


# perform a similarity search on a candidate:
# query = texts[0].page_content
# similarity = vectorstore.similarity_search(query=query, k=1, namespace=namespace)
# if similarity[0].page_content == query:
#     print("Exact match found, skipping insertion")
# else:
#     print("Inserting new document")

# let's wrap this in a function:
def insert_document(texts, vectorstore: PineconeVectorStore, embedding_model: OpenAIEmbeddings, index_name:str, namespace:str):
    """Insert each chunk unless an identical chunk is already stored.

    For every item in ``texts`` the store is queried for its single nearest
    neighbour; the chunk is inserted only when that neighbour's
    ``page_content`` is not exactly equal to the chunk's own.

    Args:
        texts: Iterable of document chunks exposing a ``page_content`` string.
        vectorstore: Store used for the duplicate lookup and for the insert call.
        embedding_model: Embedding model forwarded to ``from_documents``.
        index_name: Name of the index the new chunks are written to.
        namespace: Namespace used for both the lookup and the insert.

    Returns:
        dict with the number of chunks ``"inserted"``, ``"skipped"`` (already
        present) and ``"failed"`` (insert raised an exception).

    Errors raised by the duplicate lookup propagate to the caller; errors
    raised by an insert are reported, counted, and do not stop the loop.
    """
    summary = {"inserted": 0, "skipped": 0, "failed": 0}
    # Contents confirmed to be in the index during this call. A write may not be
    # visible to queries immediately, so without this a chunk repeated inside
    # the same batch could pass the lookup twice and be stored twice.
    known_contents = set()

    for text in texts:
        content = text.page_content

        if content not in known_contents:
            similarity = vectorstore.similarity_search(query=content, k=1, namespace=namespace)
            if len(similarity) > 0 and similarity[0].page_content == content:
                known_contents.add(content)

        if content in known_contents:
            print("Exact match found, skipping insertion")
            summary["skipped"] += 1
            continue

        print("Inserting new document")
        try:
            vectorstore.from_documents([text], embedding_model, index_name=index_name, namespace=namespace)
        except Exception as e:
            # Best effort: report the failure and carry on with the remaining
            # chunks. The content is not marked as known, so a later identical
            # chunk gets another attempt.
            print(f"Error inserting document: {e}")
            summary["failed"] += 1
        else:
            known_contents.add(content)
            print(f"Document inserted: {content[:50]}...")
            summary["inserted"] += 1

    return summary



In [86]:
# Insert only the chunks that are not already present in the namespace.
insert_document(
    texts=texts,
    vectorstore=vectorstore,
    embedding_model=embedding_model,
    index_name=index_name,
    namespace=namespace,
)

Inserting new document
Document inserted: PSA Singapore (PSA) has announced a strategic expa...
Inserting new document
Document inserted: they can leverage Singapore’s connectivity for unp...
Inserting new document
Document inserted: labour intensive, does not contribute to road cong...
Inserting new document
Document inserted: Goods (DG) handling and cargo transloading facilit...
Inserting new document
Document inserted: key stakeholders and partners, was held at PSA’s c...
Inserting new document
Document inserted: Energy and Chemicals sector will continue to reap ...
Inserting new document
Document inserted: Mersin International Port celebrates milestone of ...
Inserting new document
Document inserted: representatives,  and industry associations.  MIP ...
Inserting new document
Document inserted: its facilities and equipment.  To meet the evolvin...
Inserting new document
Document inserted: of 17,5 metres. This will allow two Ultra Large Co...
Inserting new document
Document inserted

In [23]:
# alternatively, we can convert the text to an embedding first and query with
# the raw vector.
# NOTE: the generic `similarity_search_by_vector` is reported as unsupported for
# Pinecone, so use the Pinecone-specific `similarity_search_by_vector_with_score`,
# which returns (Document, score) pairs.
# The embedding model configured earlier is reused rather than re-created.
embedding = embedding_model.embed_query(texts[0].page_content)
docs_and_scores = vectorstore.similarity_search_by_vector_with_score(embedding, namespace=namespace)
docs = [doc for doc, _score in docs_and_scores]
# An empty namespace yields no matches; avoid an IndexError on docs[0].
print(docs[0].page_content if docs else "No matching document found")

### 4. Managing the index

In [35]:
# Clean-up: ask the store to delete everything (`delete_all=True`) under `namespace`.
# This is destructive — run it only when the namespace should be emptied.
vectorstore.delete(
    delete_all=True,
    namespace=namespace,
    index_name=index_name,
)