In [28]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_pinecone import PineconeVectorStore
import pinecone
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
import os


# Load settings from a local .env file into the process environment;
# OPENAI_API_KEY is read from the environment further down in this notebook.
load_dotenv()

True

### 1. Character splitting, embedding and Upserting

Note that if the texts are the same, upserting will overwrite the existing duplicate vectors.

In [48]:
# Alternative input file:
# loader = TextLoader(file_path='./data/psa_press_release.txt', encoding='utf8')
loader = TextLoader(file_path='./data/psa_jurong_expansion.txt', encoding='utf8')
document = loader.load()  # a list of Document objects

# Light cleaning: swap every newline for a single space, updating each
# Document's page_content in place.
for doc in document:
    doc.page_content = doc.page_content.replace('\n', ' ')

In [49]:
# Alternative splitter: separator=' ' makes the splitter break on spaces, i.e. between words.
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separator=' ')
# Split the cleaned document into chunks of at most 500 characters, letting
# neighbouring chunks share up to 50 characters so context carries across boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(document)

In [50]:
# Sanity check: character count of every chunk (none should exceed the splitter's chunk_size).
[len(text.page_content) for text in texts]

[489, 493, 498, 497, 495, 430]

In [51]:
# Display the chunked Document objects (content plus source metadata) for inspection.
texts

[Document(page_content='PSA Singapore (PSA) has announced a strategic expansion of its Jurong Island Terminal (JIT) to  meet growing demand for sustainable, efficient and resilient supply chain solutions from industries  based on Jurong Island. Located on the northwestern seafront of Jurong Island, JIT offers twice-daily barge sailings that  connect beneficial cargo owners (BCOs) on Jurong Island with PSA’s main hubs at Tuas, Pasir  Panjang and Brani, from where they can leverage Singapore’s connectivity for', metadata={'source': './data/psa_jurong_expansion.txt'}),
 Document(page_content='they can leverage Singapore’s connectivity for unparalleled access  to global markets. Jurong Island is the nucleus of Singapore’s Energy and Chemicals sector. Managed by JTC, it spans  3,000 hectares and hosts more than 100 global companies carrying out refining, olefins production  and chemical manufacturing operations.  While containerised raw materials and finished products can be trucked to and 

In [52]:
# Embedding client; the OpenAI key comes from the environment (None if it is unset).
embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))

# Embed every chunk and insert the resulting vectors into the Pinecone index.
docsearch = PineconeVectorStore.from_documents(
    documents=texts,
    embedding=embeddings,
    index_name='psa-openai-ada002-embeddings',
)

### Delete vectors

In [27]:
# DESTRUCTIVE cleanup, left commented out: uncommenting the lines below issues a
# delete with delete_all=True against the index, i.e. it wipes every stored vector.
# pip install pinecone-client
# from pinecone import Pinecone

# pc = Pinecone()
# index = pc.Index("psa-openai-ada002-embeddings")

# index.delete(delete_all=True)

{}

### 2. Retrieval with Stuffing

In [53]:
# The vector store is exposed through as_retriever() so that the chain talks to a
# generic retriever object rather than to the underlying vector DB and its own
# search method.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type='stuff',
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)

In [58]:
# Other questions tried against the index (swap one in to experiment):
# query="How much container volume has Mersin Port handled since its inception?"
# query="What is the future port capacity expansion?"
# query="list assets under PSA International's ports and cargo solutions portfolio and their locations"
# query="What is the expansion plan for Jurong Port with PSA?"
query = "How much volume did Jurong Island Terminal (JIT) handle in 2023?"

# Run the chain through invoke(): calling the chain object directly (qa({...}))
# goes through Chain.__call__, which is deprecated in LangChain 0.1+.
# The returned dict is the same shape, so 'result' holds the answer text.
result = qa.invoke({"query": query})
print(result['result'])

 149,000 twenty-foot equivalent units of containers (TEU).
