This notebook demonstrates Retrieval-Augmented Generation (RAG) with LlamaIndex, using a quantized GGUF build of the Mistral 7B Instruct v0.1 model (https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/blob/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf) to answer questions about a set of local documents.

In [6]:
!pip install -q pypdf
!pip install -q python-dotenv

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/277.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/277.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m143.4/277.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.6/277.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
!pip install -q transformers


In [2]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install  llama-cpp-python --no-cache-dir

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.20.tar.gz (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.20-cp310-cp310-manylinux_2_35_x86_64.whl size=7138150 sha256=9d753e00ce07de182a8d8fe08c933fa97f0dfd62fe006cab13623c7608aa1855
  Stored in directory: /tmp/pip-ephem-wheel-cache-kjhrfn7x/wheels/ef/f2/d2/0becb03047a348d7bd9a5b91ec88f4654d6fa7d67ea4e84d43
Successfully built llama-cpp-python
Installing collected packages: llama-cpp-python
Successfully installed llama-c

In [3]:
!pip install -q llama-index


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m929.3/929.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.5/221.5 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependen

In [4]:
import logging
import sys

# Send INFO-level (and above) log records to stdout so library activity shows
# up in the cell output.
# `force=True` (Python 3.8+) removes any handlers already attached to the root
# logger before installing the stdout handler, so exactly one handler exists
# no matter how many times this cell is re-run. The earlier
# basicConfig + addHandler pair could attach two stdout handlers (every record
# printed twice) and added yet another on each re-run of the cell.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext


In [7]:
# Directory holding the source files to index.
DATA_DIR = "/content/Data"

# Read everything under DATA_DIR into the list of documents used for indexing.
documents = SimpleDirectoryReader(DATA_DIR).load_data()

In [8]:
import torch

from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
# Location of the quantized Mistral-7B-Instruct GGUF weights. When a URL is
# given, the file is downloaded automatically (see the log output below).
MODEL_URL = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf"

llm = LlamaCPP(
    model_url=MODEL_URL,
    # Alternative to model_url: path of an already-downloaded model file.
    model_path=None,
    temperature=0.1,
    max_new_tokens=350,
    context_window=4000,
    # Extra keyword arguments forwarded to __call__().
    generate_kwargs={},
    # Extra keyword arguments forwarded to __init__().
    # NOTE(review): -1 is understood to offload all layers to the GPU --
    # confirm against the llama-cpp-python documentation.
    model_kwargs={"n_gpu_layers": -1},
    # Prompt formatters imported from llama_utils (Llama-2-style templates).
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)


Downloading url https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf to path /tmp/llama_index/models/mistral-7b-instruct-v0.1.Q4_K_M.gguf
total size (MB): 4368.44


4167it [00:22, 188.67it/s]                          
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


In [9]:
!pip -q install sentence-transformers
!pip -q install langchain

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.5/181.5 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import ServiceContext, set_global_service_context

# Hugging Face model id of the embedding model; its files are downloaded on
# first use (see the progress bars below).
EMBED_MODEL_NAME = "thenlper/gte-large"

embed_model = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)



.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

onnx/config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

onnx/special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

onnx/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

onnx/tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

onnx/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/670M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [11]:
# Bundle the local LLM and the embedding model, together with a chunk size of
# 300, into the single context object handed to the index below.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model, chunk_size=300)

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
# Build the vector index over the loaded documents using the configured
# service context (local LLM + embedding model).
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [13]:
# First smoke-test question for the RAG pipeline.
question = "Who are the author of this research?"

# The query engine is reused by the later cells, so keep it in its own name.
query_engine = index.as_query_engine()
response = query_engine.query(question)

In [14]:
# Printing the response object shows the generated answer text.
print(response)

 I apologize, but the provided context information does not contain any information about the authors of the research. It only provides performance metrics of LLMs on "CNN/Daily Mail 3.0".


In [None]:

# Interactive question-answering loop: read a question from stdin, run it
# through the query engine and print the answer.
# Submit a blank line, "exit" or "quit" to leave the loop; end-of-input and
# Ctrl-C at the prompt also end it cleanly. Without these exits the only way
# to stop the cell was to interrupt the kernel and get a traceback.
while True:
    try:
        query = input().strip()
    except (EOFError, KeyboardInterrupt):
        break
    if not query or query.lower() in {"exit", "quit"}:
        break
    response = query_engine.query(query)
    print(response)

what is the top of this research?


Llama.generate: prefix-match hit


 Based on the provided context information, it appears that the research being discussed is related to the performance metrics of large language models (LLMs) on the CNN/Daily Mail 3.0 dataset. The table shows the average word count, ROUGE-1, ROUGE-2, ROUGE-L, and BLEU RT score for several LLM models, including falcon-7b-instruct, mpt-7b-instruct, text-davinci-003, and others. However, it is not clear from the given information what specific aspect of this research is being referred to as "top." Could you please provide more context or clarify your question?
What are the datasets used in this research?


Llama.generate: prefix-match hit


 The datasets used in this research are CNN/Daily Mail 3.0.0 Dataset and XSum dataset.
What are the models used in this research?


Llama.generate: prefix-match hit


 The models used in this research are falcon-7b-instruct, mpt-7b-instruct, and text-davinci-003.
Which model is performing better?


Llama.generate: prefix-match hit


 Based on the provided context information, it appears that the OpenAI model, text-davinci-003, consistently exhibits strong performance in text summarization tasks, achieving high BLEU, ROUGE, and BERT scores. However, when comparing two 7b parameter fine-tuned models, MPT-7b-instruct performed slightly better than Falcon-7b - instruct.
What is the experiment setup for this research?


Llama.generate: prefix-match hit


 Based on the provided context information, it appears that this research involves evaluating the performance of large language models (LLMs) on the CNN/Daily Mail 3.0 dataset using ROUGE metrics and a RT score. The LLMs being evaluated are falcon-7b-instruct, mpt-7b-instruct, text-davinci-003, and potentially others. Each model was tested on both the CNN and XSum tasks with 25 samples per task. The ROUGE metrics evaluated include ROUGE-1, ROUGE-2, and ROUGE-L, while the RT score is also reported.
Who is funding this research?


Llama.generate: prefix-match hit


 Based on the provided context information, it does not mention who is funding the research.
What this research concludes?


Llama.generate: prefix-match hit


 Based on the provided context information, it appears that this research compares the performance of several large language models (LLMs) on two different datasets: CNN/Daily Mail 3.0 and XSum. The performance is measured using various metrics such as average word count, ROUGE-1, ROUGE-2, ROUGE-L, BLEU score, and RT score.

The results show that the falcon-7b-instruct model performs well on both datasets, achieving high scores in terms of ROUGE-1, ROUGE-2, ROUGE-L, and BLEU score. The mpt-7b-instruct model also performs well, particularly on the CNN/Daily Mail 3.0 dataset. The text-davinci-003 model shows promising results on both datasets as well.

Overall, this research concludes that these LLMs are capable of generating high-quality text based on the given prompts and datasets, with falcon-7b-instruct and mpt-7b-instruct models performing particularly well.
Why you hallucinate?


Llama.generate: prefix-match hit


 The provided context information does not mention any instances of "hallucination" or provide any relevant information about why LLMs might hallucinate. It discusses various evaluation metrics for LLM models and their performance on different datasets, including ROUGE-1, ROUGE-2, ROUGE-L, BERT Score, and P/R/F1.
Thank you


Llama.generate: prefix-match hit


 You're welcome! Is there anything else I can help you with?
I am somewhat satisfied with your performance.


Llama.generate: prefix-match hit


 Thank you for letting me know. Is there anything specific that I can help you with or any further information you would like to provide?
