In [None]:
! pip install docarray
! pip install --upgrade langchain
! pip install openai

#### Ollama langchain with DocArrayInMemorySearch

In [10]:

import openai

from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.docstore.document import Document

from typing import List
from datetime import datetime, timezone
import os
from time import time

In [11]:
# Load every PDF in the data directory and collect the documents each loader returns.
docs = []
path = "pdf_data/"
# os.listdir() returns entries in arbitrary order and includes non-PDF files,
# so sort the listing for a reproducible document order and skip anything
# that does not have a .pdf extension.
for file in sorted(os.listdir(path)):
    if not file.lower().endswith(".pdf"):
        continue
    loader = PyPDFLoader(file_path=os.path.join(path,file))
    docs.extend(loader.load())

# Quick sanity check: how many documents were loaded and what the first one looks like.
print("Length of doc:",len(docs))
print(docs[0])

Length of doc: 94
page_content='Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
 ____________________________________
FORM 10-K
____________________________________ 
(Mark One)
☒ ANNUAL  REPOR T PURSUANT  TO SECTION 13 OR 15(d) OF  THE SECURITIES EXCHANGE ACT  OF 1934
For the fiscal year ended December 31, 2023
or
☐ TRANSITION REPOR T PURSUANT  TO SECTION 13 OR 15(d) OF  THE SECURITIES EXCHANGE ACT  OF 1934
For the transition period from            to             .
Commission File No. 000-22513
____________________________________
AMAZON .COM, INC.
(Exact name of registrant as specified in its charter)
Delaware  91-1646860
(State or other jurisdiction of
incorporation or organization)  (I.R.S. Employer
Identification No.)
410 Terry Avenue North
Seattle, Washington 98109-5210
(206) 266-1000
(Addr ess and telephone number , including ar ea code, of r egistrant’ s principal executive offices)
Securities registered pursuant to Section 12(b) of the 

In [12]:
def split_docs(docs):
    """Split documents into overlapping chunks, re-tag their metadata and de-duplicate.

    Each document is cut with a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=512`` and ``chunk_overlap=51``. The metadata of every chunk is then
    reduced to ``source``, ``page`` and ``start_index`` plus a ``datetime`` key
    holding the current POSIX timestamp, and chunks whose text repeats an earlier
    chunk are dropped.

    Args:
        docs: Iterable of documents whose metadata contains ``source`` and ``page``.

    Returns:
        list: The chunks in their original order, keeping only the first
        occurrence of each distinct ``page_content``.

    Raises:
        KeyError: If a chunk's metadata lacks ``source``, ``page`` or ``start_index``.
    """
    # NOTE(review): several entries look like regular expressions, but the splitter
    # below is built without `is_separator_regex=True`; presumably they are matched
    # as literal text - confirm against the installed langchain version.
    MARKDOWN_SEPARATORS = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]

    chunk_size = 512
    chunk_overlap = int(chunk_size/10)

    spliter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        add_start_index=True,  # If `True`, includes chunk's start index in metadata
        strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
        separators=MARKDOWN_SEPARATORS,
    )

    docs_processed = []
    for doc in docs:
        docs_processed.extend(spliter.split_documents([doc]))

    # Replace each chunk's metadata with a fixed set of keys plus an ingestion time.
    for doc in docs_processed:
        # `datetime.now(timezone.utc)` is an aware UTC datetime, so `.timestamp()` is
        # the true epoch time. Relabelling a naive local `now()` as UTC (as was done
        # before) shifts the value by the machine's UTC offset.
        timestamp = datetime.now(timezone.utc).timestamp()
        doc.metadata = {'source': doc.metadata['source'],'page': doc.metadata['page'],
                        'start_index': doc.metadata['start_index'], 'datetime':timestamp}

    # Remove duplicates: keep the first chunk seen for each distinct text.
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content in seen_texts:
            continue
        seen_texts.add(doc.page_content)
        docs_processed_unique.append(doc)

    return docs_processed_unique

In [13]:
# Chunk the loaded documents, re-tag their metadata and drop duplicate chunks.
docs_processed_unique = split_docs(docs)

In [15]:
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch

In [16]:
# Embedding model used both to index the chunks and to embed queries.
# NOTE(review): presumably needs a reachable Ollama server with `nomic-embed-text` pulled - confirm.
embedding_model = OllamaEmbeddings(model='nomic-embed-text')

In [17]:
# Plain-text view of the de-duplicated chunks, in the same order.
list_chunks = [chunk.page_content for chunk in docs_processed_unique]

In [18]:
# Build the in-memory vector index from the unique chunks using the embedding model.
vectordb = DocArrayInMemorySearch.from_documents(
    documents=docs_processed_unique,
    embedding=embedding_model,
)

# Expose the index as a similarity-search retriever configured with k=5.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

2024-08-24 22:56:23.683197: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-24 22:56:23.760348: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-24 22:56:23.784432: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-24 22:56:23.920995: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [19]:
# Sample question and its embedding.
# NOTE(review): the document printed after loading is a Form 10-K filing, so this
# Python question is presumably answered from the model's own knowledge rather
# than from retrieved context - confirm that is intended.
user_query = "What is list comprehension in python?"
# NOTE(review): `query_vector` is not used by any cell visible in this review.
query_vector = embedding_model.embed_query(user_query)

In [20]:
# System prompt: behavioural instructions followed by the `{context}` placeholder.
# NOTE(review): "bellow" is a typo for "below"; it is left untouched here because
# the text is sent to the model verbatim.
general_system_template = r"""Use the bellow information to answer the question at the end.
You are a helpful, respectful and honest assistant.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
give a comprehensive answer to the question.
Please ensure that your responses are socially unbiased and positive
{context}
"""
# Human turn: just the `{question}` placeholder.
general_user_template = """{question}"""

# Message templates in order: system message first, then the human message.
messages = [
            SystemMessagePromptTemplate.from_template(general_system_template),
            HumanMessagePromptTemplate.from_template(general_user_template)
]

# create Chat prompt
qa_cv_prompt = ChatPromptTemplate.from_messages( messages )
# Bare last expression so the notebook displays the assembled prompt.
qa_cv_prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template='Use the bellow information to answer the question at the end.\nYou are a helpful, respectful and honest assistant.\nYour answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.\ngive a comprehensive answer to the question.\nPlease ensure that your responses are socially unbiased and positive\n{context}\n')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))])

In [21]:
from langchain_community.chat_models import ChatOllama

In [22]:
# Model name.
# NOTE(review): `llm_name` is not used in this cell; the chat model actually
# created below is "llama3" via ChatOllama.
llm_name = "gpt-3.5-turbo-0301"
chain_type = 'stuff'

# Create the chat model for the chatbot chain (temperature set to 0).
llm = ChatOllama(model = "llama3", temperature=0)

# Conversation memory object.
# NOTE(review): it is created here but not attached to the chain, because the
# `memory=memory` argument below is commented out.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key='answer'
)

# Define the conversational retrieval chain from the LLM and the retriever,
# using the custom chat prompt for the document-combining step.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    chain_type=chain_type,
    retriever= retriever,
    return_source_documents=True,
    return_generated_question=True,
    combine_docs_chain_kwargs={'prompt': qa_cv_prompt},
#     memory=memory,
#     verbose=True,
)

In [23]:
# querying using Conversationl chain
def test(model,query, ch=[], al = 0):
    print(query,'\n')
    time_1 = time()
    result = model({"question": query, "chat_history": ch})
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result['answer'])

In [24]:
# Ask the sample question once; no chat history is passed, so `test` uses its default.
test(qa_chain, query=user_query)

What is list comprehension in python? 



  warn_deprecated(


Inference time: 182.263 sec.

Result:  A great question!

List comprehension is a powerful feature in Python that allows you to create a new list from an existing iterable (such as a list, tuple, or set) by applying a transformation or filtering operation. It's a concise way to create a new list without having to use explicit loops.

The basic syntax of a list comprehension is:
```
[expression for variable in iterable]
```
Here:

* `expression` is the operation you want to perform on each element of the iterable.
* `variable` is the temporary variable that takes on the value of each element in the iterable, one at a time.
* `iterable` is the original list, tuple, or set from which you want to create a new list.

For example, let's say you have a list of numbers and you want to create a new list with only the even numbers:
```
numbers = [1, 2, 3, 4, 5, 6]
even_numbers = [x for x in numbers if x % 2 == 0]
print(even_numbers)  # Output: [2, 4, 6]
```
In this example:

* `expression` is `x