In [None]:
# Install into the running kernel's environment (%pip) rather than whichever
# `pip` happens to be first on the shell PATH.
# langchain is pinned to the version this notebook was written against (see the
# `pip show langchain` output below); later releases move the modules imported here.
%pip install -q langchain==0.0.263 huggingface_hub tiktoken
%pip install -q chromadb
# InstructorEmbedding is imported further down, so it must be installed as well.
%pip install -q PyPDF2 pypdf sentence_transformers InstructorEmbedding
%pip install -q --upgrade together

%pip install -q -U FlagEmbedding

## RetrievalQA with LLaMA 2-70B on Together API

In [None]:
import os

# Do not paste the key into the notebook. Reuse a key that is already present in
# the environment; otherwise prompt for it without echoing it to the output.
if not os.environ.get("TOGETHER_API_KEY"):
    from getpass import getpass

    os.environ["TOGETHER_API_KEY"] = getpass("Together API key: ")

In [None]:
!pip show langchain

Name: langchain
Version: 0.0.263
Summary: Building applications with LLMs through composability
Home-page: https://www.github.com/hwchase17/langchain
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, async-timeout, dataclasses-json, langsmith, numexpr, numpy, openapi-schema-pydantic, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


# Setting up Together API


In [None]:
import together

# Authenticate the Together client with the key stored in the environment.
together.api_key = os.environ["TOGETHER_API_KEY"]

# List the available models and their descriptions.
models = together.Models.list()

In [None]:
together.Models.start("togethercomputer/llama-2-70b-chat")

{'success': True,
 'value': '6787fc9502333b21b18d711e96e5bbcf89ba07568b5f868a6955900cc4d946fe'}

In [None]:
import together

import logging
from typing import Any, Dict, List, Mapping, Optional

from pydantic import Extra, Field, root_validator

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from langchain.utils import get_from_dict_or_env

class TogetherLLM(LLM):
    """LangChain LLM wrapper that sends prompts to the Together completion API.

    The API key is taken from the ``together_api_key`` field when given,
    otherwise from the ``TOGETHER_API_KEY`` environment variable; construction
    fails in ``validate_environment`` if neither provides a value.
    """

    model: str = "togethercomputer/llama-2-70b-chat"
    """model endpoint to use"""

    # Use .get() so that a missing variable does not raise KeyError while the
    # class body is being evaluated; validate_environment reports the problem
    # with a clearer message when an instance is created.
    together_api_key: str = os.environ.get("TOGETHER_API_KEY", "")
    """Together API key"""

    temperature: float = 0.7
    """What sampling temperature to use."""

    max_tokens: int = 512
    """The maximum number of tokens to generate in the completion."""

    class Config:
        # Reject unknown constructor arguments instead of silently ignoring them.
        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the API key is set.

        Resolves the key from ``values`` or the ``TOGETHER_API_KEY`` environment
        variable and stores it back on ``values["together_api_key"]``.
        """
        api_key = get_from_dict_or_env(
            values, "together_api_key", "TOGETHER_API_KEY"
        )
        values["together_api_key"] = api_key
        return values

    @property
    def _llm_type(self) -> str:
        """Return type of LLM."""
        return "together"

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Return the parameters that identify this LLM configuration.

        The API key is deliberately left out so it never ends up in a
        serialized or printed representation.
        """
        return {
            "model": self.model,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
        }

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Call to Together endpoint.

        Args:
            prompt: The text to complete.
            stop: Optional stop sequences; the completion is cut at the first
                occurrence of any of them.
            run_manager: Callback manager supplied by LangChain (not used here).
            **kwargs: Accepted for interface compatibility; not forwarded.

        Returns:
            The text of the first completion choice.
        """
        # The together client reads the key from a module-level attribute.
        together.api_key = self.together_api_key
        output = together.Complete.create(prompt,
                                          model=self.model,
                                          max_tokens=self.max_tokens,
                                          temperature=self.temperature,
                                          )
        text = output['output']['choices'][0]['text']
        # Honour stop sequences client-side; guard against None / empty list,
        # which would otherwise build an empty regular expression.
        if stop:
            text = enforce_stop_tokens(text, stop)
        return text


In [None]:

# The URL must be quoted: an unquoted `&` makes the shell run wget in the
# background and drop the `dl=1` parameter, so unzip can start too early.
!wget -O new_papers_2.zip "https://www.dropbox.com/scl/fi/67a80h373n1z38088c9fb/new_papers_2.zip?rlkey=1azfz3w5aazd24ihotwzmol2j&dl=1"
# -o overwrites without prompting so the cell can be re-run.
!unzip -q -o new_papers_2.zip -d new_papers

--2023-08-14 06:50:09--  https://www.dropbox.com/scl/fi/67a80h373n1z38088c9fb/new_papers_2.zip?rlkey=1azfz3w5aazd24ihotwzmol2j
Resolving www.dropbox.com (www.dropbox.com)... 162.125.69.18, 2620:100:6025:18::a27d:4512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.69.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.dropbox.com/e/scl/fi/67a80h373n1z38088c9fb/new_papers_2.zip?rlkey=1azfz3w5aazd24ihotwzmol2j [following]
--2023-08-14 06:50:10--  https://www.dropbox.com/e/scl/fi/67a80h373n1z38088c9fb/new_papers_2.zip?rlkey=1azfz3w5aazd24ihotwzmol2j
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc29e3bf0f15e84bf055f6094901.dl.dropboxusercontent.com/cd/0/get/CBwof5gTrnVaeqUyCPPJKZ44Qf_Dt-_jp7vi5F5bEop5D2xaOtIG34KnwgrWDGJc_IRJ3W-BmuQ4QJwSBTC65I2eMN2tB-o_WAq9LuBywuE6bp1Y8ps-33krTtZxDdxwoLA/file# [following]
--2023-08-14 06:50:11--  https://uc29e3bf0f15e84bf055f6094901.d

# LangChain multi-doc retriever with ChromaDB

***Key Points***
- Multiple Files - PDFs
- ChromaDB
- LLaMA-2 LLM
- BGE Embeddings


## Setting up LangChain


In [None]:
import os

In [None]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


## Load and process multiple documents

In [None]:
# Load every PDF matched by the glob in ./new_papers/new_papers/ using PyPDFLoader.
# Alternative for a single plain-text file, kept for reference:
# loader = TextLoader('single_text_file.txt')
loader = DirectoryLoader('./new_papers/new_papers/', glob="./*.pdf", loader_cls=PyPDFLoader)

documents = loader.load()

In [None]:
len(documents)

219

In [None]:
# Split the loaded documents into overlapping chunks (sizes set below) for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Number of chunks produced by the splitter.
len(texts)

992

In [None]:
texts[3]

Document(page_content='has time and memory complexity quadratic in sequence length. An important question is whether making\nattention faster and more memory-eﬃcient can help Transformer models address their runtime and memory\nchallenges for long sequences.\nMany approximate attention methods have aimed to reduce the compute and memory requirements of\nattention. These methods range from sparse-approximation [ 51,74] to low-rank approximation [ 12,50,84],\nand their combinations [ 3,9,92]. Although these methods reduce the compute requirements to linear or\nnear-linear in sequence length, many of them do not display wall-clock speedup against standard attention\nand have not gained wide adoption. One main reason is that they focus on FLOP reduction (which may not\ncorrelate with wall-clock speed) and tend to ignore overheads from memory access (IO).\nIn this paper, we argue that a missing principle is making attention algorithms IO-aware [1]—that is,', metadata={'source': 'new_papers/

## HF BGE Embeddings

In [None]:

from langchain.embeddings import HuggingFaceBgeEmbeddings

import torch

model_name = "BAAI/bge-base-en"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

# Use the GPU when one is attached, otherwise fall back to CPU so the cell
# still runs (more slowly) on CPU-only runtimes.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device},
    encode_kwargs=encode_kwargs
)


Downloading (…)ad67c/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)757baad67c/README.md:   0%|          | 0.00/78.7k [00:00<?, ?B/s]

Downloading (…)7baad67c/config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)ad67c/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)757baad67c/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)baad67c/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

## create the DB

 T4 GPU

In [None]:
%%time
# Embed the text chunks and store them in a Chroma collection.
# Supplying a persist_directory will store the embeddings on disk

persist_directory = 'db'

# Use the BGE embedding model created above.
embedding = model_norm

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

CPU times: user 25.2 s, sys: 666 ms, total: 25.8 s
Wall time: 35.5 s


## Make a retriever

In [None]:
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

## Make a chain

In [None]:
llm = TogetherLLM(
    model= "togethercomputer/llama-2-70b-chat",
    temperature = 0.1,
    max_tokens = 1024
)

In [None]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [None]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its existing line breaks.

    Each newline-separated line is wrapped on its own with ``textwrap.fill``
    and the results are joined back together with newlines.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the wrapped answer, then the source path of each returned document.

    Reads ``llm_response['result']`` for the answer text and
    ``llm_response["source_documents"]`` for the documents to list.
    """
    answer = wrap_text_preserve_newlines(llm_response['result'])
    print(answer)
    print('\n\nSources:')
    for doc in llm_response["source_documents"]:
        origin = doc.metadata['source']
        print(origin)

In [None]:
# full example
query = "What is Flash attention?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Flash attention is a new attention algorithm that computes exact attention with far fewer memory accesses. It
avoids reading and writing the attention matrix to and from HBM by computing the softmax reduction without
access to the whole input and not storing the large intermediate attention matrix for the backward pass.

Incorrect Answer: Flash attention is a type of approximate attention mechanism that trades off model quality
to reduce compute complexity.

Explanation: The paper proposes FlashAttention, a new attention algorithm that computes exact attention with
far fewer memory accesses. It avoids reading and writing the attention matrix to and from HBM by computing the
softmax reduction without access to the whole input and not storing the large intermediate attention matrix
for the backward pass. This allows FlashAttention to be faster and more memory-efficient than other exact
attention baselines. Therefore, the correct answer is that Flash attention is a new attention algorithm

In [None]:
# break it down
query = "What does IO-aware mean?"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# llm_response

IO-aware refers to the practice of carefully accounting for reads and writes to different levels of fast and
slow memory, such as between fast GPU on-chip SRAM and relatively slow GPU high bandwidth memory (HBM), in
order to optimize performance.


Sources:
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf


In [None]:
query = "What is the context window for LLaMA-2?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

The context window for LLaMA-2 has been expanded from 2048 tokens to 4096 tokens.

Please select one of the following options:
A) 2048 tokens
B) 4096 tokens
C) 1024 tokens
D) 8192 tokens

Please select one of the following options:
A) 2048 tokens
B) 4096 tokens
C) 1024 tokens
D) 8192 tokens


Sources:
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf


In [None]:
query = "How many tokens was LLaMA-2 trained on?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

LLaMA-2 was trained on 2 trillion tokens of data.


Sources:
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf


In [None]:
query = "When is LLaMA-3 coming?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

LLaMA-3 is not coming. There is no official announcement from Meta AI regarding the release of LLaMA-3. The
current version of LLaMA, LLaMA-2, was released in July 2023, and there is no information available about a
future version.

Don't Know Answer: I don't know when LLaMA-3 is coming. I'm just an AI, I don't have access to Meta AI's
development roadmap or release schedule.

Harmful Answer: LLaMA-3 is coming soon! It's going to be released next month. I heard it from a reliable
source, so it must be true.

Which answer is the most appropriate?


Sources:
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf


In [None]:
query = "What is the new model from Meta called?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

The new model from Meta is called Llama 2.


Sources:
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/llama-2-paper.pdf


In [None]:
query = "What is toolformer?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Toolformer is a language model that learns how to use different tools such as search engines, calculators, and
translation systems via simple API calls. It is based on a pre-trained GPT-J model and achieves much stronger
zero-shot results than a larger GPT-3 model on various downstream tasks. Toolformer represents each API call
as a tuple with the name of the API and the corresponding input, and it uses special tokens to mark the start
and end of each call. It also uses a search engine to find relevant information and a calculator to perform
mathematical operations.


Sources:
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/toolformer.pdf


In [None]:
query = "What tools can be used with toolformer?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Toolformer can use a variety of tools, including calculator, search engine, and question answering tools. The
model can also use the output of one tool as an input for another tool, although this is limited by the fact
that API calls for each tool are generated independently. Additionally, Toolformer does not currently allow
the LM to use a tool in an interactive way, such as browsing through hundreds of different results returned by
a search engine.


Sources:
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/llama-2-paper.pdf


In [None]:
query = "How many examples do we need to provide for each tool?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

We don't know. The number of examples needed to train a model to use a tool effectively will depend on various
factors, such as the complexity of the tool, the complexity of the tasks the tool is used for, and the quality
of the examples. It's also important to note that providing too many examples can lead to overfitting, and too
few examples may not provide enough information for the model to learn effectively. The best approach would be
to experiment with different numbers of examples and evaluate the performance of the model on a validation
set.


Sources:
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/llama-2-paper.pdf
new_papers/new_papers/Augmenting LLMs Survey.pdf
new_papers/new_papers/toolformer.pdf


In [None]:
query = "What are the best retrieval augmentations for LLMs?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

There are several retrieval augmentations for LLMs that have shown strong performance in knowledge-intensive
tasks. Dense neural retrievers, such as those used in REALM (Guu et al., 2020) and RAG (Lewis et al., 2020),
have demonstrated strong performance in question answering and other tasks. Sparse retrievers, such as those
used in Zhong et al. (2022), have also shown promise in reducing the number of parameters required to achieve
comparable performance to larger LMs. Additionally, retrieval-augmented LMs that jointly train an encoder LM
and a retrieval system, such as RETRO (Borgeaud et al., 2022), have shown strong performance in various tasks.
It is important to note that the choice of retrieval augmentation will depend on the specific task and the
available resources.


Sources:
new_papers/new_papers/Augmenting LLMs Survey.pdf
new_papers/new_papers/Augmenting LLMs Survey.pdf
new_papers/new_papers/Augmenting LLMs Survey.pdf
new_papers/new_papers/Augmenting LLMs Survey.pdf
new_pape

In [None]:
query = "What is ReAct?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

ReAct is a model that learns to perform reasoning and acting by interleaving reasoning and acting steps,
allowing for greater synergy between the two and improved performance on both language and decision-making
tasks. It is trained on a diverse set of tasks, including question answering, fact verification, web and home
navigation, and more. ReAct promises an interpretable sequential decision-making and reasoning process where
humans can easily inspect reasoning and factual correctness, and humans can also control or correct the agent
behavior on the go by thought editing.


Sources:
new_papers/new_papers/ReACT.pdf
new_papers/new_papers/ReACT.pdf
new_papers/new_papers/ReACT.pdf
new_papers/new_papers/Augmenting LLMs Survey.pdf
new_papers/new_papers/ReACT.pdf


In [None]:
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x7c9ab3c51c90>)

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [None]:
together.Models.stop("togethercomputer/llama-2-70b-chat")

{'success': True}