In [5]:
! pip install -U langchain-openai
! pip install langchain-pinecone
! pip install pypdf

[0mCollecting langchain-pinecone
  Using cached langchain_pinecone-0.1.1-py3-none-any.whl.metadata (1.4 kB)
Collecting pinecone-client<4.0.0,>=3.2.2 (from langchain-pinecone)
  Using cached pinecone_client-3.2.2-py3-none-any.whl.metadata (16 kB)
Using cached langchain_pinecone-0.1.1-py3-none-any.whl (8.4 kB)
Using cached pinecone_client-3.2.2-py3-none-any.whl (215 kB)
Installing collected packages: pinecone-client, langchain-pinecone
Successfully installed langchain-pinecone-0.1.1 pinecone-client-3.2.2
[0m

In [66]:
import os
import time
from getpass import getpass

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

In [21]:
# Credentials are taken from the environment so that real keys are never saved
# inside the notebook; when a variable is missing (or empty) the key is asked
# for interactively with a prompt that does not echo the input.
openai_api_key = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')
pinecone_api_key = os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: ')

# Export both keys: later cells build the OpenAI and Pinecone LangChain objects
# without passing a key explicitly.
os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ['PINECONE_API_KEY'] = pinecone_api_key

Initialize documents in the vector DB

In [23]:
# Pinecone object
# Client handle used by the following cells to list, create, describe and open
# indexes; it is configured with the key held in `pinecone_api_key`.
pc = Pinecone(api_key=pinecone_api_key)

In [24]:
# Create index in pinecone (only when it does not exist yet) and open a handle to it
index_name = "edukaone"

# Vector size the index is created with; it has to equal the length of the
# vectors written to the index.
# NOTE(review): 1536 presumably matches the default OpenAI embedding model — confirm.
INDEX_DIMENSION = 1536

# Upper bound on the number of one-second readiness polls after creating the index.
INDEX_READY_MAX_POLLS = 300

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=INDEX_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll until the index reports ready, but give up after a bounded number of
    # attempts instead of blocking the kernel forever if it never becomes ready.
    for _ in range(INDEX_READY_MAX_POLLS):
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)
    else:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after "
            f"{INDEX_READY_MAX_POLLS} polls"
        )

index = pc.Index(index_name)

In [22]:
# Embedding model
# Built with the library defaults: neither a model name nor an API key is passed here.
# NOTE(review): the Pinecone index in this notebook is created with dimension=1536,
# so the default embedding model presumably emits 1536-dimensional vectors — confirm.
embeddings = OpenAIEmbeddings()

In [13]:
# Load data
# The PDF is read with a plain load(); splitting into chunks is left to a later cell.
pdf_path = "HRK Physics vol1.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [29]:
# Chunks
# Cut the loaded documents into overlapping pieces for embedding.
chunk_settings = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = CharacterTextSplitter(**chunk_settings)
docs = text_splitter.split_documents(documents)

# Number of chunks produced
len(docs)

313

In [25]:
# Add documents
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

# Give every chunk a stable id derived from its position. With explicit ids a
# re-run of this cell writes to the same records again, whereas without them
# each run would store the whole document set a second time under new ids.
chunk_ids = [f"{index_name}-chunk-{position}" for position in range(len(docs))]

# `store` keeps whatever add_documents returns
# (presumably the list of ids that were written — confirm against the LangChain API).
store = vectorstore.add_documents(docs, ids=chunk_ids)

Simple Chat

In [100]:
# Chat model, sampling temperature 0.3
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)

# Embedding model used to embed the incoming questions
embeddings = OpenAIEmbeddings()

# Pinecone client
pc = Pinecone(api_key=pinecone_api_key)

# Vector store bound to the existing index
pinecone_index = "edukaone"
vectorstore = PineconeVectorStore(embedding=embeddings, index_name=pinecone_index)

# Retriever: similarity search returning the top 3 matches
retriever_settings = dict(search_type="similarity", search_kwargs={"k": 3})
retriever = vectorstore.as_retriever(**retriever_settings)

# Parser that turns the model output into a plain string
parser = StrOutputParser()

In [101]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values, in order, separated by one blank line.
        An empty iterable gives an empty string.
    """
    page_texts = [doc.page_content for doc in docs]
    blank_line = "\n\n"
    return blank_line.join(page_texts)

In [102]:
# Prompt template
# A single human message with two placeholders: {context} and {question}.
# The template also tells the model which fixed sentence to answer with
# when the context does not contain the solution.
message_template = """
You are physics expert. Please answer my question based on context provide. 
If you cannot find the solution in the context and you dont know the answer, answer with 'I dont have the information'

Context:
{context}

Question:
{question}
"""
human_turn = ("human", message_template)
prompt = ChatPromptTemplate.from_messages([human_turn])

In [103]:
# Chain
# Inputs for the prompt: "context" is the retriever output passed through
# format_docs, "question" is the user input forwarded unchanged.
prompt_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# prompt inputs -> prompt -> chat model -> string parser
chain = prompt_inputs | prompt | llm | parser

In [111]:
# Chat: ask a physics question
user_prompt = "What is physics?"
response = chain.invoke(user_prompt)

# Show the answer as the cell output
response

'Physics is the branch of science that deals with the study of matter, energy, motion, and the interactions between them. It seeks to understand how the universe behaves at a fundamental level through observation, experimentation, and mathematical analysis.'

In [105]:
# Chat: ask a non-physics question; the prompt template tells the model to
# answer 'I dont have the information' when the context has no solution
user_prompt = "What is biology?"
response = chain.invoke(user_prompt)

# Show the answer as the cell output
response

'I dont have the information'