In [55]:
import glob
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core.node_parser import SemanticSplitterNodeParser, TokenTextSplitter
from llama_index.core import VectorStoreIndex, load_index_from_storage, Settings
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from operator import itemgetter
import torch
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig, pipeline
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

from getpass import getpass
import mistralai
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

Connect to Mistral client

In [11]:
# Setup your API KEY here
api_key = input("Put your API key here")

client = MistralClient(api_key=api_key)

Load LLM model, prepare retriever and create vector db


In [52]:
# Craft questions and context pairs which can be used in the assessment of the RAG system of both Retrieval and Response Evaluations
input_folder = "./data"
documents = SimpleDirectoryReader(input_dir=input_folder, recursive=True).load_data()
print(f"\n\nNumber of documents : {len(documents)}\n\n")

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# LlamaCpp implementation with langchain is slightly diffrent from llama_index
llm = LlamaCpp(
    model_path='/Users/Calu/Library/Caches/llama_index/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf',  # Q6_K was used too but quite slow
    #model_path="langchain_intro/mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    temperature=0.0,
    max_tokens=512,
    n_ctx=8192,
    n_batch=2048,  # How many tokens are processed in parallel
    callback_manager=callback_manager,
    n_gpu_layers=10,  # Gpu layers to use, 10 is the max on M3 base version
    streaming=True,  # Whether to print one by one tokens
    verbose=True  # Verbose is required to pass to the callback manager
)

from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import TextLoader
from langchain.document_loaders.merge import MergedDataLoader

documents = glob.glob("./data/*.txt")
docs_to_merge = []
for doc in documents:
    loader = TextLoader(doc)
    docs_to_merge.append(loader)
all_loaders = MergedDataLoader(loaders=docs_to_merge)
all_docs = all_loaders.load()

print(all_docs)

# model_name = "BAAI/bge-large-en-v1.5"

# encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

# hf_bge_embeddings = HuggingFaceBgeEmbeddings(
#     model_name=model_name,
#     model_kwargs={'device': 'cuda'},
#     encode_kwargs=encode_kwargs
# )

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=512,
#                                                chunk_overlap=51,
#                                                length_function=len)

# docs = text_splitter.split_documents(all_docs)

# vectorstore = Chroma.from_documents(docs, hf_bge_embeddings)

# # parsers = {}

# # Semantic splitter
# embed_model = HuggingFaceEmbedding(
# model_name="BAAI/bge-small-en-v1.5",
# embed_batch_size=128,
# normalize=True)

# Settings.llm = llm
# Settings.embed_model = embed_model

# semantic_splitter = SemanticSplitterNodeParser(
# buffer_size=1, 
# breakpoint_percentile_threshold=95, 
# embed_model=embed_model)
# parsers["semantic_splitter"] = semantic_splitter

# # Token splitter 512
# token_splitter_512 = TokenTextSplitter(chunk_size=512, chunk_overlap=50, separator="\n\n")  # Don't put tokenizer from mistral model as it does not tokenize anything, resulting in a single chunk per document
# parsers["token_splitter_512"] = token_splitter_512

# # Token splitter 1024
# token_splitter_1024 = TokenTextSplitter(chunk_size=1024, chunk_overlap=102, separator="\n\n")  # Don't put tokenizer from mistral model as it does not tokenize anything, resulting in a single chunk per document
# parsers["token_splitter_1024"] = token_splitter_1024


# nodes = semantic_splitter.get_nodes_from_documents(documents=documents)
# vector_index = VectorStoreIndex(nodes)
# retriever = vector_index.as_retriever(similarity_top_k=3)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /Users/Calu/Library/Caches/llama_index/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
l



Number of documents : 2




llama_kv_cache_init:      Metal KV buffer size =   320.00 MiB
llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     0.12 MiB
ggml_backend_metal_buffer_type_alloc_buffer: allocated buffer, size =   560.02 MiB, ( 9711.62 / 10922.67)
llama_new_context_with_model:      Metal compute buffer size =   560.00 MiB
llama_new_context_with_model:        CPU compute buffer size =   560.01 MiB
llama_new_context_with_model: graph nodes  = 1030
llama_new_context_with_model: graph splits = 3
AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | MATMUL_INT8 = 0 | 
Model metadata: {'general.quantization_version': '2', 'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 =

[Document(page_content="Help Kith'rak Voss : Help Kith'rak Voss is a Quest in Baldur's Gate 3. Help Kith'rak Voss can be acquired during Act TBA. This quest is part of the Personal Quests that you can get in the game.    \n\nHelp Kith'rak Voss Objectives : Kith'rak Voss visited our camp and asked us to help him free someone held prisoner within the Astral Prism. We agreed to bring the Prism to Baldur's Gate and meet him there.  \n\nHelp Kith'rak Voss Walkthrough : Help Kith'rak Voss takes place during Act 3. Talk to Lae'zel, she I'll say she wants to speak with Kith'rak Voss at Sharess' Caress. Voss can be found inside a room on the third floor of Sharess' Caress next to Raphael. Once Voss leaves the room, starts the new Deal with the DevilQuest (This doesn't deviate us from our main mission).Once we finish Raphael's Quest, we meet Voss again on the second floor of Sharess' Caress. The outcome of this interaction varies based on whether you accepted Raphael's contract or not. If you ag

In [54]:
all_docs[1]

Document(page_content="Arnell Hallowleaf : Arnell Hallowleaf is an NPC in Baldur's Gate 3. Arnell Hallowleaf can be found at The Chamber of Loss in the House of Grief. Arnell Hallowleaf is the father of Shadowheart whom she has since lost the memory of. There is a massive amount of NPCs in Baldur's Gate 3, and their ideals, needs and way of living are strictly related to the actions they perform during the course of the game. Their attitude towards you and your party may be affected by the deeds and decisions you and your party have taken on.   \n\nWhere to find Arnell Hallowleaf : Can be found at: Act 3 The Chamber of Loss, House of Grief  \n\nArnell Hallowleaf Related Quests : Daughter of Darkness  \n\nArnell Hallowleaf Dialogue Options : Arnell Hallowleaf is one of the two prisoners inside the Chamber of Loss who become part of Shadowheart's trial is she is still aligned with Shar. Your interactions here will be recorded in your journals for the Daughter of Darkness. Shadowheart can

In [45]:
from langchain.prompts import ChatPromptTemplate

template = """<human>: Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}

\n

<bot>:
"""

prompt = ChatPromptTemplate.from_template(template)

In [47]:
relevant_docs = retriever.get_relevant_documents("What are the challenges in evaluating Retrieval Augmented Generation pipelines?")

AttributeError: 'VectorIndexRetriever' object has no attribute 'get_relevant_documents'

In [48]:
retrieval_augmented_qa_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | llm, "context": itemgetter("context")}
)

TypeError: unsupported operand type(s) for |: 'operator.itemgetter' and 'str'