1. Create venv
python3 -m venv chatbot
source chatbot/bin/activate
pip3 install -r requirement.txt

Uninstall all installed packages (reset the venv)
pip3 uninstall -y $(pip freeze | cut -d'=' -f1)

2. File loading

In [2]:
# Load variables from the nearest .env file into the process environment.
# This must run before `openai` is imported and before any key is read below.
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

import os
import openai

# os.environ[...] raises KeyError when OPENAI_API_KEY is not set.
openai.api_key  = os.environ['OPENAI_API_KEY']

# Read the source PDF (path is relative to the notebook's working directory).
# `pages` is presumably one Document per PDF page -- confirm against the loader.
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("docs/MLInterview.pdf")
pages = loader.load()
# Sanity check: number of loaded documents.
print(len(pages))





135


3. Doc splitting
CharacterTextSplitter


In [3]:
from langchain.text_splitter import CharacterTextSplitter

# Split on newlines into chunks of at most 1000 characters (measured with
# len), letting neighbouring chunks share up to 150 characters.
splitter_settings = {
    "separator": "\n",
    "chunk_size": 1000,
    "chunk_overlap": 150,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_settings)

# `pages` comes from the PDF-loading cell above.
chunks = text_splitter.split_documents(pages)
print(len(chunks))

257


4. Connect to PostgreSQL (pgvector) and embed the chunks

In [4]:
import os
import warnings
from urllib.parse import quote

COLLECTION_NAME = "langchain_collection"


def build_connection_string(
    user, password, host, database, port=5432, hide_password=False
):
    """Build the SQLAlchemy URL used to reach the pgvector PostgreSQL database.

    The user name and password are percent-encoded so that reserved
    characters such as ``@``, ``/``, ``:`` or spaces cannot corrupt the URL.

    Args:
        user: Database user name.
        password: Database password.
        host: Database host name.
        database: Database name.
        port: TCP port of the server (defaults to 5432).
        hide_password: When True, the password is replaced by ``***`` so the
            result is safe to display or log. Such a URL cannot be used to
            connect.

    Returns:
        A ``postgresql+psycopg2://user:password@host:port/database`` string.
    """
    secret = "***" if hide_password else quote(str(password), safe="")
    return (
        f"postgresql+psycopg2://{quote(str(user), safe='')}:{secret}"
        f"@{host}:{port}/{database}"
    )


host = os.getenv("PG_VECTOR_HOST")
user = os.getenv("PG_VECTOR_USER")
password = os.getenv("PG_VECTOR_PASSWORD")
database = os.getenv("PGDATABASE")

# An unset variable would otherwise end up as the literal text "None" in the
# URL and only fail later, at connection time, so report it right away.
_unset = [
    name
    for name in ("PG_VECTOR_HOST", "PG_VECTOR_USER", "PG_VECTOR_PASSWORD", "PGDATABASE")
    if os.getenv(name) is None
]
if _unset:
    warnings.warn(
        "Missing environment variables: "
        + ", ".join(_unset)
        + "; the connection string will not be usable."
    )

CONNECTION_STRING = build_connection_string(user, password, host, database)

# Display a redacted URL only: cell outputs are saved with the notebook, so
# echoing CONNECTION_STRING itself would store the password in the file.
build_connection_string(user, password, host, database, hide_password=True)



'postgresql+psycopg2://jli943:***@pgvectordatabase1.postgres.database.azure.com:5432/pgvector'

In [5]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector

# Embedding model; the API key is read from the environment.
embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))

# Vector store bound to the collection and database configured in the
# connection cell above.
vector_store = PGVector(
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
)

In [6]:
from langchain.indexes import SQLRecordManager, index

# The record manager stores its bookkeeping in the same database as the
# vectors, under a namespace derived from the collection name.
namespace = f"pgvector/{COLLECTION_NAME}"
record_manager = SQLRecordManager(namespace, db_url=CONNECTION_STRING)
record_manager.create_schema()

# Index the chunks; cleanup=None requests no automatic clean-up.
# The returned counters are shown as the cell output.
index_stats = index(
    chunks,
    record_manager,
    vector_store,
    cleanup=None,
    source_id_key="source",
)
index_stats

{'num_added': 0, 'num_updated': 0, 'num_skipped': 257, 'num_deleted': 0}

5. Retrieve documents from the database

In [7]:
# Expose the vector store as a retriever so it can be piped into the chain below.
retriever = vector_store.as_retriever()

# Smoke test. `invoke` is the Runnable entry point (the same call used on the
# final chain) and replaces the deprecated `get_relevant_documents`.
retriever.invoke("difference between Supervised and Unsupervised Learning")

[Document(page_content='reward is maximum even though there might be another path for\nwhich the overall reward is more. In Reinforcement Learning, after\nevery few steps, you take a less greedy step to explore the full\nterrain. After much exploration and exploitation, you would know\nthe best way to walk through the terrain so as to maximize your\ntotal reward.\n11 How would you differentiate between Supervised and Unsupervised\nLearning?\nSupervised Learning is where you have both the input variable x and the\noutput variable y and you use an algorithm to learn the mapping function\nfrom x to y and predict the output of the new data. Supervised Learning can\nfurther be classified as a Classification or a Regression technique.\nUnsupervised Learning, on the other hand, is where you only have the input\nvariable x but no corresponding output variable y. The goal in Unsupervised\nLearning is to model the underlying structure and distribution of the data.', metadata={'source': 'docs/MLI

6. Add prompts

In [8]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate

# Prompt that rewrites a follow-up question into a standalone question,
# given the chat history.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

# Prompt that asks for an answer based only on the supplied context.
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

# Each retrieved document is rendered as just its page content.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template("{page_content}")


7. Define the whole chain


In [9]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.runnable import RunnableParallel
from langchain.schema.messages import get_buffer_string
from langchain_openai import ChatOpenAI
from langchain.schema import StrOutputParser

# First stage: turn the message history into plain text, then have the model
# condense history + follow-up into a single standalone question string.
_standalone_question_chain = (
    RunnablePassthrough.assign(
        chat_history=lambda inputs: get_buffer_string(inputs["chat_history"])
    )
    | CONDENSE_QUESTION_PROMPT
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
)

_inputs = RunnableParallel(standalone_question=_standalone_question_chain)

from langchain.schema import format_document

def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    """Render every document with ``document_prompt`` and join the results.

    Args:
        docs: Iterable of documents to render.
        document_prompt: Prompt template applied to each document.
        document_separator: String placed between rendered documents.

    Returns:
        A single string with all rendered documents.
    """
    rendered = (format_document(doc, document_prompt) for doc in docs)
    return document_separator.join(rendered)

from operator import itemgetter
# Second stage: retrieve documents for the standalone question, merge them
# into one context string, and pass the question through unchanged.
_context = {
    "context": itemgetter("standalone_question") | retriever | _combine_documents,
    "question": itemgetter("standalone_question"),
}

# Full pipeline: condense -> retrieve -> answer prompt -> model -> plain text.
conversational_qa_chain = (
    _inputs
    | _context
    | ANSWER_PROMPT
    | ChatOpenAI()
    | StrOutputParser()
)

# Example call with an empty chat history.
sample_input = {
    "question": "what the email of nitinsuri",
    "chat_history": [],
}
conversational_qa_chain.invoke(sample_input)



"Nitin Suri's email address is nitinsuri.705@gmail.com."