In [1]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from transformers import pipeline

import chromadb
from chromadb.config import Settings

2024-01-22 17:03:28.351205: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#Change to any PDF you like this particular one was NLP book which seemed relevant 
PATH_TO_PDF = 'https://web.stanford.edu/~jurafsky/slp3/ed3book.pdf'
CHROMA_DB_PATH = './db'
LLM_MODEL_ID = 'mistralai/Mistral-7B-Instruct-v0.2'
EMBEDDING_MODEL_ID = 'BAAI/bge-base-en-v1.5'
LLM_TEMPERATURE = 0.1
MAX_TOKENS_INPUT = 1000
DB_SEARCH_TYPE = 'mmr' # this is used to set retrieval algorithm on the db - mmr stands for 
DB_K_RETURNED_DOCS = 5
#Change this to . if running in Colab
MODEL_PATH='/mnt/e/Model/mistral-7b-instruct-v0.2.Q5_K_M.gguf'

In [3]:
def load_pdf_pages(uri):
    loader = PyPDFLoader(uri)
    pages = loader.load_and_split()
    return pages

In [4]:
pages = load_pdf_pages(PATH_TO_PDF)

In [5]:
text_splitter = RecursiveCharacterTextSplitter(separators=['\n',' '])

In [6]:
texts = text_splitter.split_documents(pages)

In [7]:
#Get embedding model - this is one of the top MTEB models and is small enough to run on the laptop 
from langchain.embeddings import HuggingFaceBgeEmbeddings
model_name = EMBEDDING_MODEL_ID
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
embedding_model = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,    
)


In [8]:
#setup or reload db 
def get_db_client():
    
    settings = Settings(persist_directory=CHROMA_DB_PATH,  anonymized_telemetry=False)
    client = chromadb.PersistentClient(settings=settings , path=CHROMA_DB_PATH)
    return client

In [39]:
#Upload new data:
def upload_data(texts, embeddings_model):
    client = get_db_client()   
    db = Chroma.from_documents(texts, embedding_model, client=client)
    return db

In [40]:
db = upload_data(texts, embedding_model)

In [28]:
def load_db(embedding_model):
    
    db = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embedding_model)
    return db
    

In [18]:
def get_llm_model():
    from transformers import AutoModelForCausalLM, AutoTokenizer

    device = "cuda" # the device to load the model onto

    n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.
    n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

    # Callbacks support token-wise streaming
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
    
    # Make sure the model path is correct for your system!
    llm = LlamaCpp(
        model_path=MODEL_PATH,
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
        callback_manager=callback_manager,
        verbose=True,  # Verbose is required to pass to the callback manager
    )

    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, truncation=True)
    
    return llm, tokenizer

In [41]:
db.similarity_search("what is ranked semantic search?")

[Document(page_content='270 CHAPTER 14 • Q UESTION ANSWERING AND INFORMATION RETRIEVAL\nrelevant throughout NLP: information retrieval (a key component of IR-based QA)\nandentity linking (similarly key for knowledge-based QA). We’ll start in the next\nsection by introducing the task of information retrieval.\nThe focus of this chapter is factoid question answering, but there are many\nother QA tasks the interested reader could pursue, including long-form question\nanswering (answering questions like “why” questions that require generating long\nanswers), community question answering , (using datasets of community-created\nquestion-answer pairs like Quora or Stack Overﬂow), or even answering questions\non human exams like the New York Regents Science Exam (Clark et al., 2019) as\nanNLP/AI benchmark to measure progress in the ﬁeld.\n14.1 Information Retrieval\nInformation retrieval orIRis the name of the ﬁeld encompassing the retrieval of allinformation\nretrieval\nIR manner of media bas

In [17]:
llm, tokenizer = get_llm_model()

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 2080 Ti, compute capability 7.5
llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /mnt/e/Model/mistral-7b-instruct-v0.2.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q5_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q5_K     [  4096, 14336,     1,     1 ]
l

In [19]:
def get_prompt():
    
    #note the parameters are not replaced in this string it's just a template for later
    #INST is a instruction token specific to Mistral model if you swap the model make sure you update the query as needed
    text_prompt_template = """
    ### [INST] Instruction: Answer the question based on your knowledge and provided context. If you don't know the answer say you don't know. The context:
    {context}

    ### QUESTION:
    {question} 
    [/INST]
    """
    
    #Langchain prompt for LLM 
    prompt_template = PromptTemplate(input_variables=["context", "question"], template=text_prompt_template)
    return prompt_template

In [20]:
def build_model_pipeline():
    
    llm, tokenizer = get_llm_model()
        
    transformers_pipeline = pipeline(
        model=llm,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=LLM_TEMPERATURE,
        return_full_text=True,
        max_new_tokens=MAX_TOKENS_INPUT,
    
    )

    #HuggingFace in here comes from Langchain confusingly not the HuggingFace transformers package hence double packaging
    pipeline_llm = HuggingFacePipeline(pipeline=transformers_pipeline)
    return pipeline_llm

In [22]:
llm, tokenizer = get_llm_model()
llm_chain = LLMChain(llm=llm, prompt=get_prompt())

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /mnt/e/Model/mistral-7b-instruct-v0.2.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q5_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q5_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q5_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q5_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q6_K     [ 14336,

In [25]:
def get_rag_retriver(db):
    rag_retriever = db.as_retriever(
        search_type=DB_SEARCH_TYPE,
        search_kwargs={'k': DB_K_RETURNED_DOCS}
    )
    return rag_retriever


In [42]:
question = "What techniques can you apply to information retrieval?"

#this applies embedding model to the query and runs chroma db query to retrieve the relevant context 
context = get_rag_retriver(db)
llm_chain.run(question=question, context=context)

Llama.generate: prefix-match hit


 Based on the context provided, some of the techniques that can be applied to information retrieval using the Chroma vector database and Hugging Face BGE embeddings are as follows:

     1. **Semantic Search**: By utilizing semantically relevant vectors for queries, semantic search can provide more accurate results than traditional keyword-based search methods.
      
     2. **Multi-Modal Retrieval (MMR)**: This technique involves retrieving information based on multiple types of modalities such as text, images, and audio. By using MMR, information retrieval systems can provide more comprehensive and accurate results by taking into account the interplay between various modalities.
      
     3. **Diverse Retrieval**: This technique aims to retrieve a diverse set of results rather than just the most relevant ones. This can be achieved by employing techniques such as query expansion, relevance feedback, and diverse ranking methods to retrieve a more comprehensive and representative set


llama_print_timings:        load time =    1742.67 ms
llama_print_timings:      sample time =      80.92 ms /   249 runs   (    0.33 ms per token,  3076.92 tokens per second)
llama_print_timings: prompt eval time =    1311.12 ms /   100 tokens (   13.11 ms per token,    76.27 tokens per second)
llama_print_timings:        eval time =   16952.20 ms /   248 runs   (   68.36 ms per token,    14.63 tokens per second)
llama_print_timings:       total time =   19322.16 ms


' Based on the context provided, some of the techniques that can be applied to information retrieval using the Chroma vector database and Hugging Face BGE embeddings are as follows:\n\n     1. **Semantic Search**: By utilizing semantically relevant vectors for queries, semantic search can provide more accurate results than traditional keyword-based search methods.\n      \n     2. **Multi-Modal Retrieval (MMR)**: This technique involves retrieving information based on multiple types of modalities such as text, images, and audio. By using MMR, information retrieval systems can provide more comprehensive and accurate results by taking into account the interplay between various modalities.\n      \n     3. **Diverse Retrieval**: This technique aims to retrieve a diverse set of results rather than just the most relevant ones. This can be achieved by employing techniques such as query expansion, relevance feedback, and diverse ranking methods to retrieve a more comprehensive and representat