## Import Libraries

In [1]:
import getpass
import os
import random

import torch
from datasets import load_dataset
from llama_index import VectorStoreIndex, StorageContext, Document, ServiceContext, PromptTemplate
from llama_index.prompts import PromptTemplate
from llama_index.llms import HuggingFaceLLM
from llama_index.vector_stores import PineconeVectorStore
from pinecone import Pinecone, PodSpec
from transformers import BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


## Create Hugging Face LLM

In [2]:
# Quantization settings handed to the model loader through `model_kwargs`.
quantization_config = BitsAndBytesConfig(
    # Load the weights in 4-bit form.
    load_in_4bit=True,
    # Use the "nf4" 4-bit quantization type.
    bnb_4bit_quant_type="nf4",
    # Enable double quantization.
    bnb_4bit_use_double_quant=True,
    # Run computation in half precision.
    bnb_4bit_compute_dtype=torch.float16,
)

In [3]:
def messages_to_prompt(messages):
  """Flatten a sequence of chat messages into a single tagged prompt string.

  Each message must expose `.role` and `.content`. Messages whose role equals
  'system', 'user' or 'assistant' are rendered as the matching tag line
  followed by the content and a newline; messages with any other role are
  skipped. If the rendered text does not begin with a system section, an empty
  one is put in front. The result always ends with an open assistant tag.

  Args:
    messages: iterable of objects with `.role` and `.content` attributes.

  Returns:
    The assembled prompt as a `str`.
  """
  # (role, header) pairs; matched with `==` so enum-like roles that compare
  # equal to these strings are handled exactly like plain strings.
  role_headers = (
    ('system', "<|system|>\n"),
    ('user', "<|user|>\n"),
    ('assistant', "<|assistant|>\n"),
  )

  sections = []
  for msg in messages:
    for role_name, header in role_headers:
      if msg.role == role_name:
        sections.append(header + f"{msg.content}\n")
        break

  body = "".join(sections)

  # The prompt must open with a system section; add a blank one when missing.
  if not body.startswith("<|system|>\n"):
    body = "<|system|>\n\n" + body

  # Finish with the assistant tag so the model continues from there.
  return body + "<|assistant|>\n"

In [4]:
# The model weights and the tokenizer are loaded from the same checkpoint id.
llama_checkpoint = "meta-llama/Llama-2-7b-chat-hf"

# Template applied to plain (non-chat) queries; it uses the same tag layout
# that messages_to_prompt produces for chat messages.
# NOTE(review): these <|system|>/<|user|>/<|assistant|> tags are not the
# [INST] ... [/INST] layout usually associated with Llama-2 chat checkpoints —
# confirm this is intentional.
query_wrapper = PromptTemplate("<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n")

llm = HuggingFaceLLM(
    # What to load and how to place it.
    model_name=llama_checkpoint,
    tokenizer_name=llama_checkpoint,
    device_map="auto",
    model_kwargs={"quantization_config": quantization_config},
    # Token budgets.
    context_window=3900,
    max_new_tokens=256,
    # Options forwarded to generation.
    generate_kwargs={"temperature": 0.3, "top_k": 50, "top_p": 0.95},
    # Prompt construction.
    query_wrapper_prompt=query_wrapper,
    messages_to_prompt=messages_to_prompt,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.62s/it]


## Create Pinecone VectorDB Index

In [5]:
# Credentials must not live in the notebook. Use the PINECONE_API_KEY that is
# already exported in the environment; only when it is missing, prompt for it
# without echoing the input and store it for the rest of this session.
# The key that used to be hardcoded in this cell should be treated as leaked
# and rotated in the Pinecone console.
if not os.getenv('PINECONE_API_KEY'):
    os.environ['PINECONE_API_KEY'] = getpass.getpass('Pinecone API key: ')

In [6]:
# Pinecone client authenticated with the key from the PINECONE_API_KEY
# environment variable.
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

index_name = 'my-index'

# Check for the index explicitly instead of wrapping create_index() in a bare
# `except:`. The bare except reported "Index already exists" for every failure
# (bad credentials, network errors, invalid spec, ...), hiding the real cause.
# Any error from the calls below now propagates with its original message.
if index_name in pc.list_indexes().names():
    print('Index already exists')
else:
    pc.create_index(
        name=index_name,
        # presumably the output size of the embedding model used to fill the
        # index — confirm against the embed_model configured for the index
        dimension=1024,
        # NOTE(review): the vector store is later created with
        # add_sparse_vector=True — confirm 'euclidean' is the intended metric
        # for that setup.
        metric='euclidean',
        spec=PodSpec(
            replicas=1,
            shards=1,
            pod_type="p1",
            environment='gcp-starter',
        ),
    )
    print('Index created')

# Handle used for reads and writes against the index.
pinecone_index = pc.Index(index_name)

Index already exists


### Load Dataset

In [7]:
# Fetch the dataset and take the three columns of its training split.
dataset = load_dataset("neural-bridge/rag-dataset-12000")
train_split = dataset['train']

raw_questions = train_split['question']
raw_contexts = train_split['context']
raw_answers = train_split['answer']

# Positions of the rows that have a question. Rows whose question is None are
# dropped from all three columns together so the columns stay aligned.
kept_rows = [row for row, question in enumerate(raw_questions) if question is not None]

questions = [raw_questions[row] for row in kept_rows]
contexts = [raw_contexts[row] for row in kept_rows]
answers = [raw_answers[row] for row in kept_rows]

# One Document per record: the question is the document text and the matching
# context is attached as metadata.
documents = [
    Document(text=question_text, metadata={"context": context_text})
    for question_text, context_text in zip(questions, contexts)
]

### Store Embeddings in Pinecone

In [8]:
# LlamaIndex wrapper around the Pinecone index handle.
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    add_sparse_vector=True,
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Bundles the LLM and the local embedding model used for indexing and querying.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:WhereIsAI/UAE-Large-V1",
    chunk_size=2048,
)

# Set to True only for the initial load, when the document embeddings still
# have to be written to the Pinecone index. Leave False to attach to the
# already-populated index without adding any embeddings.
# This flag replaces switching behavior by commenting code in and out.
LOAD_DOCUMENTS = False

if LOAD_DOCUMENTS:
    # Embed `documents` and store the vectors in Pinecone.
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, service_context=service_context
    )
else:
    # Attach to the existing vector store; nothing new is embedded or stored.
    index = VectorStoreIndex.from_vector_store(
        vector_store, service_context=service_context
    )

query_engine = index.as_query_engine()

## Document Retrieval

In [9]:
# Send one question through the query engine and print only the answer text.
# Another sample question to try:
#   "What are some of the features of Fabiana Filippi's shirts and blouses?"
sample_question = "Who gave the author the pasta salad recipe?"
response = query_engine.query(sample_question)
print(response.response)

Ann Marie gave the author the pasta salad recipe.


In [10]:
# Record to show from the dataset for comparison with the generated answer.
# The original cell drew a random index and overwrote it on the next line, so
# the random draw was dead code. The pinned value is now explicit; set
# SAMPLE_INDEX to None to inspect a randomly chosen record instead.
SAMPLE_INDEX = 1345

rand_num = SAMPLE_INDEX if SAMPLE_INDEX is not None else random.randrange(len(questions))
print(f"index: {rand_num}")
print(questions[rand_num])
# print(contexts[rand_num])  # uncomment to also show the source context
print(answers[rand_num])

index: 1345
Who gave the author the pasta salad recipe?
Ann Marie gave the author the pasta salad recipe.


In [None]:
# Other areas for improvement for this RAG pipeline:
# - Embed the contexts instead of the questions.
# - Deploy using TGIF and serve it through a basic front end (Next.js with a simple readout and search box).
# - Alternatively, just use Postman to demonstrate the API and streaming; work out how to do that with a FastAPI app.