In [1]:
! pip install docarray 
! pip install --upgrade langchain
! pip install openai

Collecting docarray
  Obtaining dependency information for docarray from https://files.pythonhosted.org/packages/27/bf/90439e206a5d2df089e3467a703dfa0349f17d73f003ec51367db23bf8de/docarray-0.40.0-py3-none-any.whl.metadata
  Downloading docarray-0.40.0-py3-none-any.whl.metadata (36 kB)
Collecting sentence-transformers
  Obtaining dependency information for sentence-transformers from https://files.pythonhosted.org/packages/9d/36/6d5ee64ed96aef3b1a366574ed1dfe07838304913c74159943363da67807/sentence_transformers-2.4.0-py3-none-any.whl.metadata
  Downloading sentence_transformers-2.4.0-py3-none-any.whl.metadata (11 kB)
Collecting types-requests>=2.28.11.6 (from docarray)
  Obtaining dependency information for types-requests>=2.28.11.6 from https://files.pythonhosted.org/packages/ce/ca/82c7f75616c524856488cece6b37e459de626cad49b2a24a0b571c20be06/types_requests-2.31.0.20240218-py3-none-any.whl.metadata
  Downloading types_requests-2.31.0.20240218-py3-none-any.whl.metadata (1.8 kB)
Collecting 

In [4]:

import openai

from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.docstore.document import Document

from typing import List
from datetime import datetime, timezone
import os
from time import time

In [5]:
# Set up the OpenAI API key.
# The key is taken from the environment when it is already set; otherwise it is
# requested interactively so the secret is never written into the notebook.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')
openai.api_key = os.environ['OPENAI_API_KEY']

In [6]:
# Load every PDF found in the input directory into one flat list of documents.
docs = []
path = "/kaggle/input/python-docs"
# os.listdir returns entries in arbitrary order; sorting keeps the load order
# (and therefore docs[0] and the later de-duplication) reproducible.
for file in sorted(os.listdir(path)):
    # Only hand PDF files to PyPDFLoader; skip anything else in the directory.
    if not file.lower().endswith(".pdf"):
        continue
    loader = PyPDFLoader(file_path=os.path.join(path, file))
    docs.extend(loader.load())

print("Length of doc:",len(docs))
# Preview the first document only when something was loaded, so an empty
# directory does not raise IndexError.
if docs:
    print(docs[0])

In [67]:
def split_docs(docs, chunk_size=512, access_level=5):
    """Split documents into overlapping chunks, tag them and drop duplicates.

    Args:
        docs: Documents to split. Every resulting chunk must expose 'source',
            'page' and 'start_index' metadata entries (the splitter is asked
            to add 'start_index').
        chunk_size: `chunk_size` handed to the splitter; the overlap is one
            tenth of this value.
        access_level: Value stored under the 'access_level' metadata key of
            every chunk.

    Returns:
        The chunks in splitter order, keeping only the first chunk for each
        distinct `page_content`. Each chunk's metadata is replaced by a dict
        with the keys 'source', 'page', 'access_level', 'start_index' and
        'datetime' (UTC epoch seconds of this call).
    """
    # Ordered from coarsest to finest; several entries are regular expressions.
    MARKDOWN_SEPARATORS = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]

    chunk_overlap = int(chunk_size/10)

    spliter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,  # If `True`, includes chunk's start index in metadata
        strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
        separators=MARKDOWN_SEPARATORS,
        # The separators above are regex patterns; without this flag they are
        # matched as literal text and the heading/rule patterns never fire.
        is_separator_regex=True,
    )

    docs_processed = spliter.split_documents(docs)

    # One timezone-aware UTC timestamp for the whole batch. Attaching UTC to a
    # naive local `datetime.now()` would shift the value by the local offset.
    timestamp = datetime.now(timezone.utc).timestamp()

    # update metadata
    for doc in docs_processed:
        doc.metadata = {
            'source': doc.metadata['source'],
            'page': doc.metadata['page'],
            'access_level': access_level,
            'start_index': doc.metadata['start_index'],
            'datetime': timestamp,
        }

    # Remove duplicates: keep the first chunk seen for each distinct text.
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique

In [None]:
# Split the loaded documents into tagged chunks with duplicate texts removed.
docs_processed_unique = split_docs(docs)

In [None]:
# Embedding model shared by the index and by ad-hoc query embedding.
embedding_model = OpenAIEmbeddings()

# Build the in-memory vector index from the de-duplicated chunks.
vectordb = DocArrayInMemorySearch.from_documents(
    documents=docs_processed_unique,
    embedding=embedding_model,
)

# Similarity retriever configured with k=5.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

In [43]:
# Sample question, embedded with the same model that was used to build the index.
user_query = "What is list comprehension in python?"
query_vector = embedding_model.embed_query(user_query)

In [None]:
# System prompt: behavioural instructions followed by the retrieved context,
# which is substituted for {context}. Typos in the instructions are fixed
# because this text is sent verbatim to the model.
general_system_template = r"""Use the below information to answer the question at the end.
You are a helpful, respectful and honest assistant.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Give a comprehensive answer to the question.
Please ensure that your responses are socially unbiased and positive.
{context}
"""
# Human message: just the question, substituted for {question}.
general_user_template = """{question}"""

messages = [
    SystemMessagePromptTemplate.from_template(general_system_template),
    HumanMessagePromptTemplate.from_template(general_user_template),
]

# create Chat prompt
qa_cv_prompt = ChatPromptTemplate.from_messages(messages)
qa_cv_prompt

In [73]:
class FilteredRetriever(VectorStoreRetriever):
    """Retriever that post-filters another retriever's results by 'datetime'.

    The query is delegated to the wrapped retriever; only documents whose
    'datetime' metadata value is strictly less than `filter_prefix` are kept.
    Filtering happens after retrieval, so fewer documents than the wrapped
    retriever returned (possibly none) may come back.
    """

    # The wrapped retriever, stored under the parent's `vectorstore` field name.
    vectorstore: VectorStoreRetriever
    search_type: str = "similarity"
    # `search_kwargs` is deliberately not redeclared: the earlier override
    # `dict(default_factory=dict)` evaluated to the literal dict
    # {'default_factory': dict}. The field declared by VectorStoreRetriever
    # is inherited instead.
    # Cutoff compared against each document's 'datetime' metadata value.
    filter_prefix: int

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return the wrapped retriever's hits whose 'datetime' is below the cutoff.

        Args:
            query: Query string forwarded to the wrapped retriever.

        Returns:
            The retrieved documents, in their original order, for which
            `doc.metadata['datetime'] < self.filter_prefix`.

        Raises:
            KeyError: If a retrieved document has no 'datetime' metadata entry.
        """
        results = self.vectorstore.get_relevant_documents(query=query)
        return [doc for doc in results if doc.metadata['datetime'] < self.filter_prefix]

In [74]:
# Wrap the similarity retriever so results are filtered on their 'datetime' metadata.
# NOTE(review): split_docs stamps chunks with the current epoch time, and the
# filter keeps only 'datetime' < filter_prefix, so a cutoff of 1234567 excludes
# every freshly ingested chunk -- confirm this value is intended.
filtered_retriever = FilteredRetriever(vectorstore=retriever, filter_prefix=1234567)

In [80]:
# Model and document-combination strategy.
# NOTE(review): "gpt-3.5-turbo-0301" is a dated model snapshot -- confirm it is still served.
llm_name = "gpt-3.5-turbo-0301"
chain_type = 'stuff'

# Chat model with temperature 0 and answers capped at 512 tokens.
llm = ChatOpenAI(
    model_name=llm_name,
    temperature=0,
    max_tokens=512,
)

# Conversation buffer; it is created here but not attached to the chain below
# (the `memory` argument is commented out).
memory = ConversationBufferMemory(
    output_key='answer',
    memory_key="chat_history",
    return_messages=True,
)

# Conversational retrieval chain built from the LLM, the filtered retriever
# and the custom QA prompt; it also returns the source documents and the
# generated standalone question.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=filtered_retriever,
    chain_type=chain_type,
    combine_docs_chain_kwargs={'prompt': qa_cv_prompt},
    return_source_documents=True,
    return_generated_question=True,
    # memory=memory,
    # verbose=True,
)

In [83]:
# querying using Conversationl chain
def test(model,query, ch=[], al = 0):
    print(query,'\n')
    time_1 = time()
    result = model({"question": query, "chat_history": ch})
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result['answer'])