<a href="https://colab.research.google.com/github/kazcfz/LlamaIndex-RAG/blob/main/LLM_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install llama-index pypdf
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip -q install llama-cpp-python

In [2]:
import os
import time

from llama_index import Prompt, StorageContext, load_index_from_storage, ServiceContext, VectorStoreIndex, SimpleDirectoryReader, set_global_tokenizer
from llama_index.llms import LangChainLLM, HuggingFaceLLM, LlamaCPP
from llama_index.embeddings import HuggingFaceEmbedding

from transformers import AutoTokenizer

In [3]:
# Preference settings - change as desired
pdf_path = '/content/rag_data.pdf'
text_embedding_model = 'jinaai/jina-embeddings-v2-base-en'  #Alt: thenlper/gte-base
llm_url = 'https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_K_M.gguf'
set_global_tokenizer(AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf").encode)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [4]:
# Load PDF
filename_fn = lambda filename: {'file_name': os.path.basename(pdf_path)}
loader = SimpleDirectoryReader(input_files=[pdf_path], file_metadata=filename_fn)
documents = loader.load_data()

In [None]:
# Load models and service context
embed_model = HuggingFaceEmbedding(model_name=text_embedding_model)
llm = LlamaCPP(model_url=llm_url, temperature=0.7, max_new_tokens=256, context_window=4096, generate_kwargs={}, model_kwargs={"n_gpu_layers": -1}, verbose=True)
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model, chunk_size=512)

In [5]:
# Indexing
start_time = time.time()

index = VectorStoreIndex.from_documents(documents, service_context=service_context)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed indexing time: {elapsed_time:.2f} s")

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/275M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-base-en and are newly initialized: ['encoder.layer.6.output.dense.weight', 'encoder.layer.8.intermediate.dense.bias', 'encoder.layer.10.intermediate.dense.weight', 'encoder.layer.4.output.LayerNorm.weight', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.3.output.LayerNorm.bias', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.1.output.dense.bias', 'encoder.layer.11.output.LayerNorm.weight', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.7.output.dense.weight', 'encoder.layer.3.intermediate.dense.weight', 'encoder.layer.6.output.LayerNorm.bias', 'embeddings.position_embeddings.weight', 'encoder.layer.9.output.LayerNorm.bias', 'encoder.layer.11.intermediate.dense.weight', 'encoder.layer.8.output.dense.bias', 'encoder.layer.9.intermediate.dense.weight', 'encoder.layer.8.output.LayerNorm.weight', 'encoder.layer.5.

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading url https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_K_M.gguf to path /tmp/llama_index/models/llama-2-7b.Q4_K_M.gguf
total size (MB): 4081.0


3892it [00:41, 94.31it/s]                          
AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


Elapsed indexing and loading time: 57.94 s


In [10]:
text_qa_template = Prompt("""<s>[INST] Following is the context information: {context_str} \n\nGiven this information, please answer the question: {query_str} [/INST] """)
query_engine = index.as_query_engine(text_qa_template=text_qa_template, streaming=True, service_context=service_context)

In [None]:
# Inferencing

user_query = ""
while (user_query != "exit"):
  user_query = input("User: ")

  start_time = time.time()
  response = llm.complete(user_query)
  print(response)
  end_time = time.time()
  elapsed_time = end_time - start_time
  print(f"Elapsed inference time: {elapsed_time:.2f} s")

while (True):
  user_query = input("User: ")

  start_time = time.time()
  response = query_engine.query(user_query)
  response.print_response_stream()
  # print(response)
  end_time = time.time()
  elapsed_time = end_time - start_time
  print(f"Elapsed inference time: {elapsed_time:.2f} s")

