In [None]:
# Install the llama-index core package plus the Hugging Face LLM integration.
# NOTE(review): versions are unpinned, so a fresh run picks up whatever is latest — consider pinning.
%pip install llama-index-llms-huggingface llama-index
# Hugging Face embedding integration (provides HuggingFaceEmbedding, used below).
%pip install llama-index-embeddings-huggingface
# NOTE(review): nothing in the visible cells imports the instructor embeddings — confirm this is still needed.
%pip install llama-index-embeddings-instructor

In [None]:
# The core package is distributed as `llama-index-core`, but it is imported
# through the `llama_index.core` namespace; `llama_index_core` is not a module.
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, PromptTemplate, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
import torch
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [None]:
# Load the source material (Neural Machine Translation / attention) from
# the local ./data/ directory.
document = SimpleDirectoryReader(input_dir="./data/").load_data()

In [None]:
# Echo the loaded `document` object as the cell output for a quick sanity check.
document

Setting up prompts

In [None]:
# Prompt setup for Mistral-7B-Instruct (the model loaded in this notebook).
# Mistral's instruct models are tuned on the "[INST] ... [/INST]" format; the
# StableLM chat markers (<|SYSTEM|>, <|USER|>, <|ASSISTANT|>) are not special
# tokens for this model and would be fed to it as plain text.
system_prompt = """You are a Q&A assistant. Your goal is to answer questions as
accurately as possible based on the instructions and context provided.
"""

# Wraps the query text that llama-index builds internally in Mistral's
# instruction tags.
# NOTE(review): the BOS token is left out on the assumption that the tokenizer
# adds it automatically — confirm against the tokenizer configuration.
query_wrapper_prompt = PromptTemplate("[INST] {query_str} [/INST]")


Loading Model

In [None]:
# Local Hugging Face LLM used to synthesise answers.
llm = HuggingFaceLLM(
    context_window = 4096,
    max_new_tokens = 256,
    # Forwarded to `model.generate`. The key must be spelled "temperature":
    # unknown generate kwargs are rejected by transformers.
    # With do_sample=False decoding is greedy, so the temperature value only
    # takes effect if sampling is switched on.
    generate_kwargs = {"temperature": 0.7, "do_sample": False},
    system_prompt = system_prompt,
    query_wrapper_prompt = query_wrapper_prompt,
    tokenizer_name = "mistralai/Mistral-7B-Instruct-v0.1",
    model_name = "mistralai/Mistral-7B-Instruct-v0.1",
    device_map = "auto",
    # 2 is </s> (EOS) in the Mistral tokenizer. The previous ids
    # (50277-50279, 1, 0) were StableLM's special tokens and do not mark the
    # end of a reply for this model.
    stopping_ids = [2],
    tokenizer_kwargs = {"max_length": 4096},
    # Load the weights in half precision to reduce GPU memory usage;
    # remove this line when running on CPU only.
    model_kwargs = {"torch_dtype": torch.float16}
)

In [None]:
# Embedding model used to vectorise text for retrieval.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [None]:
# Bundle the LLM, the embedding model and the chunk size into one context
# object that is handed to the index below.
# NOTE(review): ServiceContext is deprecated in newer llama-index releases in
# favour of `llama_index.core.Settings` — confirm against the installed version.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=1024,
)

In [None]:
# Build the vector index over the loaded documents, using the LLM and
# embedding model configured in `service_context`.
index = VectorStoreIndex.from_documents(
    documents=document,
    service_context=service_context,
)

In [None]:
# Create a query engine from the index, using as_query_engine()'s default options.
query_engine = index.as_query_engine()

In [None]:
# Ask a sample question; the value returned by query() is shown as the cell output.
query_engine.query("What is attention?")