In [97]:
import glob
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core.node_parser import SemanticSplitterNodeParser, TokenTextSplitter
from llama_index.core import VectorStoreIndex, load_index_from_storage, Settings
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from operator import itemgetter
import torch
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig, pipeline
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

from getpass import getpass
import mistralai
from mistralai.client import MistralClient
from mistralai.models import models
from mistralai.models.chat_completion import ChatMessage

Connect to Mistral client

In [11]:
# Setup your API KEY here
api_key = input("Put your API key here")

client = MistralClient(api_key=api_key)

Load LLM model, prepare retriever and create vector db


In [94]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# LlamaCpp implementation with langchain is slightly diffrent from llama_index
llm = LlamaCpp(
    model_path='/Users/Calu/Library/Caches/llama_index/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf',  # Q6_K was used too but quite slow
    #model_path="langchain_intro/mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    temperature=0.0,
    max_tokens=512,
    n_ctx=8192,
    n_batch=2048,  # How many tokens are processed in parallel
    callback_manager=callback_manager,
    n_gpu_layers=10,  # Gpu layers to use, 10 is the max on M3 base version
    streaming=True,  # Whether to print one by one tokens
    verbose=True  # Verbose is required to pass to the callback manager
)

from langchain.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import TextLoader
from langchain.document_loaders.merge import MergedDataLoader

documents = glob.glob("./data/Arnell**.txt")
docs_to_merge = []
for doc in documents:
    loader = TextLoader(doc)
    docs_to_merge.append(loader)
all_loaders = MergedDataLoader(loaders=docs_to_merge)
all_docs = all_loaders.load()


model_name = "BAAI/bge-small-en-v1.5"

encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

hf_bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'mps'},
    encode_kwargs=encode_kwargs
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(encoding_name="cl100k_base", chunk_size=100, chunk_overlap=0)

docs = text_splitter.split_documents(all_docs)

chroma_db = Chroma()

vectorstore = chroma_db.from_documents(docs, hf_bge_embeddings)

base_retriever = vectorstore.as_retriever(search_kwargs={"k" : 5})

relevant_docs = base_retriever.get_relevant_documents("Who is Shadowheart ?")


llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /Users/Calu/Library/Caches/llama_index/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
l

In [96]:
#print(len(all_docs))
relevant_docs = base_retriever.get_relevant_documents("Who is Shadowheart ?")
relevant_docs[0].page_content

"Shadowheart : Shadowheart is a Companion in Baldur's Gate 3 (BG3). BG3 Shadowheart can be rescued during the Nautiloid segment in Act 1, or found by the locked crypt door on the beach after crashing. Shadowheart is a Half-Elf and a Trickery Domain Cleric, which gives your party heals, buffs and interesting stealth and deception spells. Companions assist the player by joining their party and have their own backstories and unique characteristics. They all have their own Classes and starting equipment, but you"

In [45]:
from langchain.prompts import ChatPromptTemplate

template = """<human>: Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}

\n

<bot>:
"""

prompt = ChatPromptTemplate.from_template(template)

In [47]:
relevant_docs = retriever.get_relevant_documents("What are the challenges in evaluating Retrieval Augmented Generation pipelines?")

AttributeError: 'VectorIndexRetriever' object has no attribute 'get_relevant_documents'

In [48]:
retrieval_augmented_qa_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | llm, "context": itemgetter("context")}
)

TypeError: unsupported operand type(s) for |: 'operator.itemgetter' and 'str'