In [31]:
# ! pip3 install langchain
# ! pip3 install pypdf
# ! pip3 install cohere
# ! pip3 install chromadb
# ! pip3 install typing_extensions
# ! pip3 install fsspec==2023.9.2
# ! pip3 install llama_index
# ! pip3 install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ---------------------------------------- 0.0/86.0 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/86.0 kB ? eta -:--:--
     ------------- ------------------------ 30.7/86.0 kB 330.3 kB/s eta 0:00:01
     ------------------ ------------------- 41.0/86.0 kB 326.8 kB/s eta 0:00:01
     ------------------------------- ------ 71.7/86.0 kB 393.8 kB/s eta 0:00:01
     -------------------------------------- 86.0/86.0 kB 440.2 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py): started
  Building wheel for sentence-transformers (setup.py): finished with status 'done'
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125960 sha256=e92a1c4ced1d64b34d5

In [3]:
import requests, json
import gradio as gr

In [7]:
model = 'llama2:latest' #You can replace the model name if needed
context = [] 

# Through the Ollama API

In [8]:
def generate(prompt, context, top_k, top_p, temp):
    r = requests.post('http://localhost:11434/api/generate',
                     json={
                         'model': model,
                         'prompt': prompt,
                         'context': context,
                         'options':{
                             'top_k': top_k,
                             'temperature':top_p,
                             'top_p': temp
                         }
                     },
                     stream=False)
    r.raise_for_status()

 
    response = ""  

    for line in r.iter_lines():
        body = json.loads(line)
        response_part = body.get('response', '')
        print(response_part)
        if 'error' in body:
            raise Exception(body['error'])

        response += response_part

        if body.get('done', False):
            context = body.get('context', [])
            return response, context



def chat(input, chat_history, top_k, top_p, temp):

    chat_history = chat_history or []

    global context
    output, context = generate(input, context, top_k, top_p, temp)

    chat_history.append((input, output))

    return chat_history, chat_history
  #the first history in return history, history is meant to update the 
  #chatbot widget, and the second history is meant to update the state 
  #(which is used to maintain conversation history across interactions)


In [9]:
block = gr.Blocks()


with block:

    gr.Markdown("""<h1><center> Jarvis </center></h1>
    """)

    chatbot = gr.Chatbot()
    message = gr.Textbox(placeholder="Type here")

    state = gr.State()
    with gr.Row():
        top_k = gr.Slider(0.0,100.0, label="top_k", value=40, info="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)")
        top_p = gr.Slider(0.0,1.0, label="top_p", value=0.9, info=" Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)")
        temp = gr.Slider(0.0,2.0, label="temperature", value=0.8, info="The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)")


    submit = gr.Button("SEND")

    submit.click(chat, inputs=[message, state, top_k, top_p, temp], outputs=[chatbot, state])


block.launch(debug=True)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Hello
!
 It
'
s
 nice
 to
 meet
 you
.
 Is
 there
 something
 I
 can
 help
 you
 with
 or
 would
 you
 like
 to
 chat
?

Like
wise
!
 It
'
s
 always
 great
 to
 connect
 with
 new
 people
.
 How
 are
 you
 today
?
 Is
 there
 anything
 on
 your
 mind
 that
 you
'
d
 like
 to
 talk
 about
 or
 ask
?
 I
'
m
 here
 to
 listen
 and
 help
 if
 I
 can
.

Keyboard interruption in main thread... closing server.




# Through LangChain

### Loading a document

### Web page

### PDF

In [13]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader('falcon-paper.pdf')
pages = loader.load()

In [14]:
len(pages)

231

In [15]:
pages = pages[:39]

In [16]:
pages[0].page_content[:100]

'PHD THESIS\nIn Partial Fulﬁlment of the Requirements for the\nDegree of Doctor of Philosophy from Sorb'

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [21]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0, 
)

In [22]:
from langchain_core.documents.base import Document
# docs = r_splitter.split_documents(pages)
paper_content = ' '.join([p.page_content for p in pages])
print(paper_content[:100])
docs = r_splitter.split_documents([Document(page_content=paper_content)])

PHD THESIS
In Partial Fulﬁlment of the Requirements for the
Degree of Doctor of Philosophy from Sorb


In [23]:
print(docs[0])
print(docs[-1])

page_content='PHD THESIS\nIn Partial Fulﬁlment of the Requirements for the\nDegree of Doctor of Philosophy from Sorbonne University\nSpecialization: Data Science\nRepresentation, Information Extraction, and\nSummarization for Automatic Multimedia\nUnderstanding\nIsmail HARRANDO\nDefended on XX/XX/2022 before a committee composed of:\nReviewer Johanna BJÖRKLUND , Umeå University, Umeå Sweden\nReviewer Andreas Lothe OPDAHL , University of Bergen, Bergen, Norway\nExaminer Paolo PAPOTTI , EURECOM, Sophia Antipolis, France'
page_content='we also deﬁne 3 new classes and 10 new properties1. The MeMAD ontology provides mappings\nbetween the legacy metadata models of INA and Yle with the standard EBUCore data model\nand could therefore be used by those industries to improve their metadata interoperability\nsystems. The labels of classes and properties are provided in both English and French.\n1The list can be accessed through the following link: https://data.memad.eu/ontology.\n21'


### embedding model

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding

In [32]:
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L12-v2'
# EMBEDDING_MODEL_NAME = 'OrdalieTech/Solon-embeddings-large-0.1'

print(f"Loading embeddings model: {EMBEDDING_MODEL_NAME} ...")

embedding_model = LangchainEmbedding(HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    encode_kwargs = {"normalize_embeddings": False}
  )
)

Loading embeddings model: sentence-transformers/all-MiniLM-L12-v2 ...


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

### llm

In [33]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import Ollama

llm = Ollama(model="llama2")

llm = Ollama(
    model="llama2",
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)

In [34]:
_ = llm("Tell me about the history of AI")

 Artificial intelligence (AI) has a rich and varied history that spans several decades. Here is a brief overview of some of the key milestones in the development of AI:

1. 1950s: The Dartmouth Conference - The field of AI was founded at a conference held at Dartmouth College in 1956. Attendees included computer scientists, mathematicians, and cognitive scientists who shared an interest in exploring the possibilities of creating machines that could simulate human intelligence.
2. 1951: The Turing Test - British mathematician Alan Turing proposed a test to measure a machine's ability to exhibit intelligent behavior equivalent to, or indistinguishable from, that of a human. The Turing Test has since become a benchmark for measuring the success of AI systems.
3. 1956: The First AI Program - Computer scientist John McCarthy created the first AI program, called the Logical Theorist, which was designed to reason and solve problems using logical deduction.
4. 1960s: Rule-Based Expert Systems 

### llama-index

In [41]:
from llama_index import Document

document = Document(text=paper_content)

In [36]:
from llama_index import VectorStoreIndex
from llama_index import ServiceContext

service_context = ServiceContext.from_defaults(
    llm=llm, embed_model=embedding_model
)
index = VectorStoreIndex.from_documents([document],
                                        service_context=service_context)

In [37]:
query_engine = index.as_query_engine()

In [38]:
response = query_engine.query("What about Hardware scalability?")

  warn_deprecated(


Hardware scalability is an important aspect of building a large-scale knowledge graph. As the size of the graph grows, it becomes increasingly challenging to scale the system using traditional hardware. Here are some strategies that can help improve hardware scalability:

1. Distributed architecture: Design the system to be distributed across multiple nodes, each node handling a subset of the graph. This allows for better scalability and fault tolerance.
2. Parallel processing: Use parallel processing techniques such as map-reduce or multi-threading to process the large amount of data in parallel.
3. Memory optimization: Optimize the use of memory by using compression techniques, reducing the number of redundant data, and using an efficient data structure for storing the graph.
4. Data partitioning: Partition the graph into smaller sub-graphs that can be stored and processed independently, reducing the computational complexity and memory requirements.
5. Graph reduction: Use graph redu

In [42]:
from langchain_core.documents.base import Document
from langchain.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

docs = r_splitter.split_documents([Document(page_content=paper_content)])
vectorstore = Chroma.from_documents(documents=docs, embedding=embedding_function)

In [43]:
query = "What about Hardware scalability?"
docs = vectorstore.similarity_search(query)

# print results
print(docs[0].page_content)

ruled research landscape, it is also a matter of what is our current technology is allowing us
to do: because of the need to parallelize (to somewhat extreme degrees5) all the processing
needed for the backpropagation-fueled deep learning, the Transformers seem to be the perfect
conduit for such convergence.
5Megatron-Turing NLG 530B, the World’s Largest and Most Powerful Generative Language Model
17  Chapter 3
Multimedia Content Representation


In [44]:
from langchain.chains import RetrievalQA
qachain=RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever(), verbose=True)
qachain({"query": query})

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m
Hardware scalability refers to the ability of a system or device to handle increased workload or processing power without significant degradation in performance. In the context of deep learning and specifically Transformers, hardware scalability is crucial for two reasons:

1. Parallelization: Transformers require a large amount of computational resources to parallelize the processing needed for backpropagation-fueled training. As the size of the model increases, so does the need for more powerful hardware to keep up with the processing demands.
2. Scalability of attention: Attention mechanisms in Transformers are shown to be Turing Complete [170], meaning they can handle any computational problem that can be solved by a Turing machine. This means that as the size of the model and the amount of data being processed increases, the hardware scalability of the system becomes even more critical.

In summary, hardware scalability is essential fo

{'query': 'What about Hardware scalability?',
 'result': 'Hardware scalability refers to the ability of a system or device to handle increased workload or processing power without significant degradation in performance. In the context of deep learning and specifically Transformers, hardware scalability is crucial for two reasons:\n\n1. Parallelization: Transformers require a large amount of computational resources to parallelize the processing needed for backpropagation-fueled training. As the size of the model increases, so does the need for more powerful hardware to keep up with the processing demands.\n2. Scalability of attention: Attention mechanisms in Transformers are shown to be Turing Complete [170], meaning they can handle any computational problem that can be solved by a Turing machine. This means that as the size of the model and the amount of data being processed increases, the hardware scalability of the system becomes even more critical.\n\nIn summary, hardware scalabilit

# Chain it together

In [None]:
# https://medium.com/@shrinath.suresh/implementing-streaming-chatbot-with-langchain-callbacks-a-step-by-step-guide-a527a7d65b8b

In [45]:
import gradio as gr
from typing import Any
from queue import Queue, Empty
from langchain.llms import LlamaCpp
from langchain.callbacks.base import BaseCallbackHandler
from langchain.prompts import PromptTemplate
from threading import Thread

In [46]:
q = Queue()
job_done = object()

In [47]:
class QueueCallback(BaseCallbackHandler):
    """Callback handler for streaming LLM responses to a queue."""

    def __init__(self, q):
        self.q = q

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        self.q.put(token)

    def on_llm_end(self, *args, **kwargs: Any) -> None:
        return self.q.empty()

In [48]:
callbacks = [QueueCallback(q)]
template = """Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

prompt = PromptTemplate(template=template, input_variables=["question"])

In [52]:
def answer(question):
    def task():
        response = llm(question)
        q.put(job_done)

    t = Thread(target=task)
    t.start()

In [50]:
llm.callbacks = callbacks

In [53]:
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(history):
        question = history[-1][0]
        print("Question: ", question)
        history[-1][1] = ""
        answer(question=question)
        while True:
            try:
                next_token = q.get(True, timeout=1)
                if next_token is job_done:
                    break
                history[-1][1] += next_token
                yield history
            except Empty:
                continue

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()
demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Question:  Hello


# RAG

In [54]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

In [55]:
template = """You are an smart and helpful assistant of an AI researcher. Given the following context, answer the question:
Context:{context}

Question: {question}
"""

prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [56]:
prompt = ChatPromptTemplate.from_template(template)

model = llm

In [57]:
chain = (
    {"context": vectorstore.as_retriever(), "question": RunnablePassthrough()}
    | prompt
    | model
)

In [60]:
def answer_rag(question):
    def task():
        response = chain.invoke(question)
        q.put(job_done)
  
    t = Thread(target=task)
    t.start()

In [61]:
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(history):
        question = history[-1][0]
        print("Question: ", question)
        history[-1][1] = ""
        answer_rag(question=question)
        while True:
            try:
                next_token = q.get(True, timeout=1)
                if next_token is job_done:
                    break
                history[-1][1] += next_token
                yield history
            except Empty:
                continue

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()
demo.launch()


Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




Question:  What is hardware scalability?
Question:  what is special about the Falcon models?
Question:  which embeddings are used?
