Follows steps in https://www.youtube.com/watch?v=sVcwVQRHIc8&t=2s

Part 1: https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb

In [1]:
import numpy as np
import pandas as pd
import tiktoken
from tiktoken._educational import *

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import bs4

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import WebBaseLoader, YoutubeLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [5]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
from langchain_community.vectorstores import Chroma

In [8]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

In [9]:
## ChatPromptTemplate: Creates a chat template consisting of a single message assumed to be from the human (https://api.python.langchain.com/en/latest/prompts/langchain_core.prompts.chat.ChatPromptTemplate.html)
from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate

In [10]:
from langchain_core.prompts import PromptTemplate

In [11]:
from langchain_community.chat_models import ChatOllama

In [12]:
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

In [13]:
from langchain import hub

In [14]:
from langchain.load import dumps, loads

In [15]:
import langchain

In [16]:
from operator import itemgetter

In [17]:
from typing import Literal, Optional, Tuple
from langchain_core.pydantic_v1 import BaseModel, Field

In [18]:
from langchain.utils.math import cosine_similarity

In [19]:
from langchain_experimental.llms.ollama_functions import OllamaFunctions

In [20]:
from langchain_core.messages import HumanMessage, SystemMessage

In [21]:
import datetime

## Get embeddings model (SentenceBERT) 

In [22]:
## Sentence BERT for sentence embeddings
# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

## Get embeddings model (Nomic embddings)
## Original : https://blog.nomic.ai/posts/nomic-embed-text-v1, https://static.nomic.ai/reports/2024_Nomic_Embed_Text_Technical_Report.pdf
## https://huggingface.co/nomic-ai/nomic-embed-text-v1
## https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.huggingface.HuggingFaceBgeEmbeddings.html#langchain_community.embeddings.huggingface.HuggingFaceBgeEmbeddings

In [23]:
## nomic-embed-text-v1	is 8192 seq len embedder that is open source
model_name = "nomic-ai/nomic-embed-text-v1"
model_kwargs = {
    'device': 'cpu',
    'trust_remote_code':True
    }
encode_kwargs = {'normalize_embeddings': True}

In [24]:
embeddings = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs, query_instruction="search_query:", embed_instruction="search_document:")

  from tqdm.autonotebook import tqdm, trange
<All keys matched successfully>


In [25]:
question = "What kinds of pets do I like?"
document = "My favorite pet is a cat."

In [26]:
question_embedding = embeddings.embed_query(question)
document_embedding = embeddings.embed_query(document)
print(f"Dimensionality of question_embedding: {len(question_embedding)}")
print(f"Dimensionality of document_embedding: {len(document_embedding)}")

Dimensionality of question_embedding: 768
Dimensionality of document_embedding: 768


## Define similarity metric (cosine)

In [27]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product/(norm_vec1*norm_vec2)

In [28]:
sim = cosine_similarity(question_embedding, document_embedding)
print(f"Cosine Similarity between questiona dn document: {sim}")

Cosine Similarity between questiona dn document: 0.7388467122620958


## Get token count (as per BPE implemented in tiktoken library by OpenAI). Does not make sense for SentenceBert embedding 

In [29]:
def count_token_number(string: str, encoding_name: str) -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens 

In [30]:
## cl100k_base is GPT-4 tokenizer
count_token_number(question, "cl100k_base")

8

## Visualize tokenization done by cl100k_base (GPT-4)

In [31]:
enc = SimpleBytePairEncoding.from_tiktoken("cl100k_base")
enc.encode(question)

[48;5;167mW[48;5;179mh[48;5;185ma[48;5;77mt[0m
[48;5;167mW[48;5;179mh[48;5;185mat[0m
[48;5;167mWh[48;5;185mat[0m
[48;5;167mWhat[0m

[48;5;167m [48;5;179mk[48;5;185mi[48;5;77mn[48;5;80md[48;5;68ms[0m
[48;5;167m [48;5;179mk[48;5;185min[48;5;80md[48;5;68ms[0m
[48;5;167m [48;5;179mk[48;5;185mind[48;5;68ms[0m
[48;5;167m k[48;5;185mind[48;5;68ms[0m
[48;5;167m kind[48;5;68ms[0m
[48;5;167m kinds[0m

[48;5;167m [48;5;179mo[48;5;185mf[0m
[48;5;167m o[48;5;185mf[0m
[48;5;167m of[0m

[48;5;167m [48;5;179mp[48;5;185me[48;5;77mt[48;5;80ms[0m
[48;5;167m p[48;5;185me[48;5;77mt[48;5;80ms[0m
[48;5;167m p[48;5;185met[48;5;80ms[0m
[48;5;167m p[48;5;185mets[0m
[48;5;167m pets[0m

[48;5;167m [48;5;179md[48;5;185mo[0m
[48;5;167m d[48;5;185mo[0m
[48;5;167m do[0m

[48;5;167m [48;5;179mI[0m
[48;5;167m I[0m

[48;5;167m [48;5;179ml[48;5;185mi[48;5;77mk[48;5;80me[0m
[48;5;167m l[48;5;185mi[48;5;77mk[48;5;80me[0m
[48;5;1

[3923, 13124, 315, 26159, 656, 358, 1093, 30]

## INDEXING (load, split and embed documents)

## 1. Load data from web page and use Beautifuloup to parse it

In [32]:
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
blog_docs = loader.load()

## 2. Split data using chunking strategies

In [33]:
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap=50)

In [34]:
splits = splitter.split_documents(blog_docs)

In [35]:
print(f"Number of splits from blog_docs: {len(splits)} ")

Number of splits from blog_docs: 52 


## SPlits doc into splits. Each split has 300 tokens with overlap of 50 tokens 

In [None]:
content_length_list = [len(sp.page_content) for sp in splits]

In [None]:
sns.boxplot(x=content_length_list)

In [None]:
## Metadata is the extra information around the doc that is split using recursive splitter
splits[0].metadata

In [None]:
! pwd

## 3. Store embeddings to chromadb

In [None]:
## Initialize chroma db client
vectorstore = Chroma.from_documents(collection_name="rag_store_nomic", persist_directory="notebooks/chroma", documents=splits, embedding=embeddings)

In [None]:
vectorstore.get('043be940-5eb3-4ca9-ae66-6d07e094b493')#.keys()

In [None]:
# vectorstore.delete_collection()

In [None]:
# retreiver = vectorstore.as_retriever(search_type="similarity", search_kwargs = {"k":4})

## This will prevent from getting unsure documents
retreiver = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={'score_threshold': 0.35}
)

In [None]:
relevant_docs = await retreiver.ainvoke("What is task decoposition?")

## These retreived splits are embedded into the context of LLM(GPT or LLama) as prompt to answer the query. 

## 4. Create prompt (https://api.python.langchain.com/en/latest/prompts/langchain_core.prompts.chat.ChatPromptTemplate.html#langchain_core.prompts.chat.ChatPromptTemplate)

In [None]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [None]:
print(prompt)

In [None]:
print(type(prompt))

In [None]:
local_llm = ChatOllama(model="llama2:latest") ## num_predict

In [None]:
rag_chain = {"context":retreiver, "question": RunnablePassthrough()} | prompt | local_llm | StrOutputParser()

In [None]:
# chain = prompt | local_llm

# chain.invoke({"context":relevant_docs,"question":"What is Task Decomposition?"})

In [None]:
rag_chain.invoke("What is task decomposition?")

In [None]:
rag_chain.invoke("Summarize algorithm distillation for me")

In [None]:
rag_chain.invoke("What are the different types of agent memory?")

## Updated prompt template

In [None]:
prompt_hub_rag = hub.pull("rlm/rag-prompt")

In [None]:
updated_rag_chain = {"context":retreiver, "question": RunnablePassthrough()} | prompt_hub_rag | local_llm | StrOutputParser()

In [None]:
## This is from the pretrained knowledge of Llama2. It is not in the context
updated_rag_chain.invoke("What can you say about training of large language models")

In [None]:
prompt_hub_rag

In [None]:
relevant_docs1 = await retreiver.ainvoke("What can you say about Narendra Modi?")

In [None]:
relevant_docs1

In [None]:
updated_rag_chain.invoke("What can you say about Narendra Modi?")

# Part2: https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_5_to_9.ipynb

# QUERY TRANSLATION: Rewriting the query input by the user in a way that makes retreival easier

### Way1: MultiQuery:
1. Break input query into multiple queries from different perspectives (Ask the LLm to do so)
2. Retreive documents for each of the above queries parallely
3. Union of documents returned above

In [None]:
multiquery_template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""

In [None]:
prompt_perspectives = ChatPromptTemplate.from_template(multiquery_template)

In [None]:
prompt_perspectives

In [None]:
generate_related_queries_chain = prompt_perspectives | local_llm  | StrOutputParser() | (lambda x: x.split("\n"))

In [None]:
perspective_questions = await generate_related_queries_chain.ainvoke({"question": "What is task decomposition for LLM agents?"})

In [None]:
perspective_questions

In [None]:
def get_unique_union(documents: list[int]):
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    unique_docs = list(set(flattened_docs))
    return [loads(doc) for doc in unique_docs]

In [None]:
multiquery_retreival_chain = generate_related_queries_chain | retreiver.map() | get_unique_union

In [None]:
multiquery_question = "What is task decomposition for LLM agents?"
multiquery_docs = multiquery_retreival_chain.invoke({"question":multiquery_question})

In [None]:
len(multiquery_docs)

In [None]:
multiquery_rag_chain = {"context":multiquery_retreival_chain, "question": RunnablePassthrough()} | prompt_hub_rag | local_llm | StrOutputParser()

In [None]:
multiquery_rag_chain.invoke(multiquery_question)

## My understanding is mutliquery query translation makes sense when the question is very broad since it breaks initial question into queries from different perspectives. This might be problematic when initial questions are very pointed. 

### Way2: RAG-Fusion
1. Break query into different queries from varying perspectives
2. Ask LLM to answer these queries parallely
3. Combine the answers using a special technique called reciprocal rank fusion (RRF) 
4. Return the answer.

In [None]:
def reciprocal_rank_fusion(results: list[list], k=60):
    fused_scores = {}
    ## results is a list of lists of size 4 (# subqueries). Each element is a list of retreived documents
    for docs in results:
        for rank, doc in enumerate(docs):
            doc_str = dumps(docs) ## Convert doc to string
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            #previous_score = fused_scores[doc_str]
            fused_scores[doc_str]+=1/(rank+k)

    reranked_results = [(loads(doc), rrf_score) for doc, rrf_score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)]
    return reranked_results

In [None]:
template_for_splitting_single_query = """You are a helpful assistant that generates multiple search queries based on single input query.
Generate multiple search queries related to : {question}
Output 4 queries:"""

In [None]:
prompt_splitting_single_query = ChatPromptTemplate.from_template(template_for_splitting_single_query)

In [None]:
#cc = {"question": RunnablePassthrough()} | prompt_splitting_single_query
#cc.invoke("how to train LLM?")

In [None]:
generate_queries_chain = prompt_splitting_single_query | local_llm | StrOutputParser() | (lambda x: x.split("\n"))

In [None]:
## Generates 4 related queries
#generate_queries_chain.invoke({"question": "how to train LLM?"})

In [None]:
rag_fusion_retreival_chain = generate_queries_chain | retreiver.map() | reciprocal_rank_fusion

In [None]:
langchain.debug=True

In [None]:
returned_docs_as_per_rrf = await rag_fusion_retreival_chain.ainvoke({"question": "what can you tell me about chain of thought?"})

In [None]:
template_for_generation = """Answer the following question based on this context:

{context}

Question: {question}"""

In [None]:
prompt_for_generation = ChatPromptTemplate.from_template(template_for_generation)

In [None]:
final_rag_fusion_generation_chain = {"context":rag_fusion_retreival_chain, "question": itemgetter("question")} | prompt_for_generation | local_llm | StrOutputParser()

In [None]:
rag_fusion_response = await final_rag_fusion_generation_chain.ainvoke({"question":"What is task decomposition for LLM agents?"})

In [None]:
print(rag_fusion_response)

### Way 3: Query decomposition using Least to Most prompting and IR-CoT (Information retreival with Chain of Thought)
Note: 
- Chain of Thought prompting combines natural language based rationale with few shot prompt. (https://arxiv.org/pdf/2201.11903)
- CoT is further improved by adding self-consistency decoder (opposed to greedy decoder in vanilla CoT) (https://arxiv.org/pdf/2203.11171)
- However, CoT has limitations that it performs poorly on tasks that require generalization of solving problems harder than few shot prompt examples.This is where Least to Most prompting comes in picture

Least to Most prompting:
1. Decompose a query into easier subqueries
2. Sequentially solve subqueries using the reponse/ answer to previous subqueries
3. Both stages are implemented by few-shot prompting, so that there is no training or
finetuning in either stage

IR-CoT (Information retreival with Chain of Thought): (https://arxiv.org/pdf/2212.10509)
1. How can we augment chain-of-thought prompting for open-domain, knowledge-intensive tasks that require complex, multi-step reasoning?
2. Use retrieval to guide the chain-of-thought (CoT) reasoning steps and use CoT reasoning to guide the retrieval.
Steps:
a. We begin by retrieving a base set of paragraphs using the question as a query.
b. Subsequently, we alternate between the following two steps: (i) extend CoT: use the question, the paragraphs collected thus far, and the CoT sentences generated thus far to generate the next CoT sentence; (ii) expand retrieved information: use the last CoT sentence as a query to retrieve,additional paragraphs to add to the collected set.

## So, Query decomposition means break query into simpler subproblems and then dynamically retreive answers for smaller problem and use to answer next subproblem.Useful only if it makes sense to decompose query into sub problems. e.g. complicated reasoning question where answer of simpler query will help answer more complicated query. else this approach is an overkill

In [None]:
simpler_subproblems_template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. Only return the sub problems, nothing else \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_simpler_subproblems_query = ChatPromptTemplate.from_template(simpler_subproblems_template)

In [None]:
print(simpler_subproblems_template)

In [None]:
generate_queries_decomposition_chain = prompt_simpler_subproblems_query | local_llm | StrOutputParser() | (lambda x: [ss for ss in x.split("\n") if ss.strip() != '' and ss[0].isdigit()])

In [None]:
decomposed_questions = await generate_queries_decomposition_chain.ainvoke({"question": "What are the main components of an LLM-powered autonomous agent system?"})

In [None]:
decomposed_questions

In [None]:
ir_cot_template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""
prompt_ir_cot = ChatPromptTemplate.from_template(ir_cot_template)

In [None]:
def format_qa_pair(question, answer):
    """Format Q and A pair"""
    
    formatted_string = ""
    formatted_string += f"Question: {question}\nAnswer: {answer}\n\n"
    return formatted_string.strip() 

In [None]:
q_a_pairs = ""
for question in decomposed_questions:
    ## Retreive docs relevant to question (context) + all previously answered questions (q_a_pairs) and answer current question
    rag_chain = {"context": itemgetter("question") | retreiver, "question": itemgetter("question"), "q_a_pairs": itemgetter("q_a_pairs")} | prompt_ir_cot | local_llm | StrOutputParser()

    answer = await rag_chain.ainvoke({"question": question, "q_a_pairs": q_a_pairs})
    q_a_pair = format_qa_pair(question, answer)
    q_a_pairs = q_a_pairs + "\n---\n" + q_a_pair

In [None]:
print(answer)

### Way 4: Step back prompting: Useful for knowledge intensive tasks where it will benifit from thinking from first principles/ overall level/ step back to get broader picture.
- Step back question: derived from original question at a higher level of abstraction (This step back question should be much easier to answer)

2 steps:
1. Abstraction: Get step back question and retreive relevant concepts for it.. This step back question is task dependent.
2. Reasoning (Abstraction grounded reasoning): Answer original question but stay grounded on facts obtained from Abstraction phase.

In [None]:
## 2 shot examples
## input: query, output: step back version of query
examples = [{"input": "Could the members of The Police perform lawful arrests?", "output": "what can the members of The Police do?"}, {"input": "Jan Sindel’s was born in what country?", "output": "what is Jan Sindel’s personal history?"}]

In [None]:
example_prompt = ChatPromptTemplate.from_messages([("human", "{input}"), ("ai", "{output}")])

In [None]:
few_shot_prompt = FewShotChatMessagePromptTemplate(example_prompt=example_prompt, examples=examples)

In [None]:
step_back_prompt = ChatPromptTemplate.from_messages([("system", """You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer. Here are a few examples:"""), few_shot_prompt, ("user", "{question}")]) 

In [None]:
step_back_prompt.invoke({"question": "What is task decomposition for LLM agents?"})

In [None]:
generate_step_back_queries_chain = step_back_prompt | local_llm | StrOutputParser()
step_back_query = await generate_step_back_queries_chain.ainvoke({"question": "What is task decomposition for LLM agents?"})

In [None]:
print(step_back_query)

In [None]:
step_back_response_template = """You are an expert of world knowledge. I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

{normal_context}
{step_back_context}

Original Question: {question}
Answer:"""

In [None]:
step_back_response_prompt = ChatPromptTemplate.from_template(step_back_response_template)

In [None]:
step_back_response_prompt

In [None]:
step_back_chain = {"question": itemgetter("question"), "normal_context": itemgetter("question") | retreiver, "step_back_context": generate_step_back_queries_chain | retreiver } | step_back_response_prompt | local_llm | StrOutputParser()

In [None]:
step_back_answer = await step_back_chain.ainvoke({"question": "What is task decomposition for LLM agents?"})

In [None]:
print(step_back_answer)

### Way 5: HyDE (Hypothetical Document Embedding) => https://docs.haystack.deepset.ai/docs/hypothetical-document-embeddings-hyde
1. Given a query, the Hypothetical Document Embeddings (HyDE) first zero-shot prompts an instruction-following language model to generate a “fake” hypothetical document that captures relevant textual patterns from the initial query - in practice, this is done five times. (basically ask Instruction following LLM to generate paragraphs to answer user query 5 times)
2. Embed the 5 results from Step 1 using the same embedder as the one used to embed docs and save in Chroma DB
3. Average the embeddings (for 5 documents) to get a single Hypothetical Document embedding
4. Now, search top_k documents with embedding similar to the hypothetical document created in previous step. 

When to use?
The HyDE method is highly useful when:

1. The performance of the retrieval step in your pipeline is not good enough (for example, low Recall metric).
2. Your retrieval step has a query as input and returns documents from a larger document base.
3. Particularly worth a try if your data (documents or queries) come from a special domain that is very different from the typical datasets that Retrievers are trained on.

In [None]:
hyde_prompt_template = """Given a question, generate a paragraph of text that answers the question
Question: {question}
Passage:"""
hyde_prompt = ChatPromptTemplate.from_template(hyde_prompt_template)

In [None]:
count_hypothetical_docs = 5
hypothetical_doc_list = []
for c in range(count_hypothetical_docs):
    hyde_chain = {"question": itemgetter("question")} | hyde_prompt | local_llm | StrOutputParser()
    hyp_doc = await hyde_chain.ainvoke({"question": "What is task decomposition for LLM agents?"})
    hypothetical_doc_list.append(hyp_doc)

In [None]:
hypothetical_document_embedding_list = await embeddings.aembed_documents(hypothetical_doc_list)

In [None]:
hypothetical_embedding_matrix = np.array(hypothetical_document_embedding_list)
average_hypothetical_document_embedding = np.mean(hypothetical_embedding_matrix, axis=0)

In [None]:
average_hypothetical_document_embedding_list = average_hypothetical_document_embedding.tolist()

In [None]:
## Now find tok_k relvant docs for this query
## https://api.python.langchain.com/en/v0.1/vectorstores/langchain_community.vectorstores.chroma.Chroma.html#langchain_community.vectorstores.chroma.Chroma.asimilarity_search_by_vector
hyde_relevant_documents = await vectorstore.asimilarity_search_by_vector(average_hypothetical_document_embedding_list, k=4)

In [None]:
final_hyde_generation_template = """Answer the following question based on this context:

{context}

Question: {question}"""
final_hyde_generation_prompt = ChatPromptTemplate.from_template(final_hyde_generation_template)

In [None]:
final_hyde_generation_chain = {"context": itemgetter("context"), "question": itemgetter("question")} | final_hyde_generation_prompt | local_llm | StrOutputParser()

In [None]:
hyde_generated_response = await final_hyde_generation_chain.ainvoke({"context": hyde_relevant_documents, "question": "What is task decomposition for LLM agents?"})

In [None]:
print(hyde_generated_response)

# Part3: https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_10_and_11.ipynb

# ROUTING: Get query to the correct source. 2 types:
1. Logical routing: Route query to correct retreiver/ database e.g. vecotr db, graph db etc (via structured functions as LLM output). Consider it as a classification system that given the query, returns the db system to be used for query answering
2. Semantic routing: Embed multiple prompts and query using same embedder and choose the prompt that has highest similarity with query

## 1. Logical routing: Use function-calling for classification (LLM for classification with structured output)

In [None]:
# ## Create a class/ data model that will be the output of LLM

# class RouteQuery(BaseModel):
#     datasource: Literal["python_docs", "js_docs", "golang_docs"] = Field(description="")

In [None]:
## Understandinfg use of pydantic to create Data model to which the output of LLM should confootm to
## https://python.langchain.com/v0.1/docs/modules/model_io/output_parsers/types/pydantic/

class Joke(BaseModel):
    setup: str = Field(description="question to setup a joke")
    punchline: str = Field(description="answer to resolve the joke")

joke_query = "Tell me a joke"
joke_parser = PydanticOutputParser(pydantic_object=Joke)
joke_prompt = PromptTemplate(template = "Answer the user query. \n{format_instructions}\n{query}\n", input_variables=["query"], partial_variables={"format_instructions": joke_parser.get_format_instructions()})
#structured_local_llm = local_llm.with_structured_output(BaseModel)
joke_chain = joke_prompt | local_llm | StrOutputParser()

In [None]:
#joke_prompt.invoke({"query": joke_query})

In [None]:
# joke_template = """You are a great comedian who makes scientific jokes. You tell jokes according to the question asked below.\n
# question: {question}
# Answer:"""

# simple_joke_chain = {"question": RunnablePassthrough()} | ChatPromptTemplate.from_template(joke_template) | local_llm |StrOutputParser()

# simple_joke = simple_joke_chain.invoke("Tell me a joke related to plants")

# print(simple_joke)

In [None]:
res = joke_chain.invoke({"query": joke_query})

In [None]:
print(res)

## LLama 2 model is not able to follow the datamodel. Atleast that is what I could understand. This might work with ChatGPT based LLMs.
## Check https://python.langchain.com/v0.1/docs/modules/model_io/chat/structured_output/#groq

## Doing function calling with open source local models like llama are experimental in langchain till now.
## https://api.python.langchain.com/en/latest/llms/langchain_experimental.llms.ollama_functions.OllamaFunctions.html#langchain_experimental.llms.ollama_functions.OllamaFunctions
## Video to explain function calling in LLama models: https://www.youtube.com/watch?v=Ss_GdU0KqE0

## Experiment with Llama and Phi3 models for function calling: https://export.arxiv.org/pdf/2404.14219 in FunctionCallingWithLocalModels notebook

## Implement logical routing using langchain_experimental.llms.ollama_functions.OllamaFunctions

In [None]:
class RouteQuery(BaseModel):
    datasource: Literal["python_docs", "js_docs", "golang_docs"] = Field(description="Given a user question choose which datasource would be most relevant for answering their question")

In [None]:
experimental_local_llm = OllamaFunctions(model="llama2:latest", format="json", temperature=0)

In [None]:
structured_experimental_local_llm = experimental_local_llm.with_structured_output(RouteQuery)

In [None]:
routing_messages = [
    SystemMessage(content="You are an expert at routing a user question to the appropriate data source. Based on the programming language the question is referring to, route it to the relevant data source."),
    HumanMessage(content="{query}")
]

In [None]:
routing_prompt = ChatPromptTemplate.from_messages(routing_messages)
print(routing_prompt)

In [None]:
router = routing_prompt | structured_experimental_local_llm

In [None]:
question = """Why doesn't the following code work:

from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages(["human", "speak in {language}"])
prompt.invoke("french")
"""

In [None]:
router_result = router.invoke({"query": question})

In [None]:
RouteQuery.parse_obj(router_result)

## It appears as if Llama 2 model is not able to distinguish between coding languages. It is possible that it is not trained on code. But the concept of making LLM output confirm to a json/ data model is very critical concept

## 2. Semantic routing: Choosing between multiple prompts based on similarity with the query

In [None]:
physics_template = """You are a very smart physics professor. \
You are great at answering questions about physics in a concise and easy to understand manner. \
When you don't know the answer to a question you admit that you don't know.

Here is a question:
{query}"""

math_template = """You are a very good mathematician. You are great at answering math questions. \
You are so good because you are able to break down hard problems into their component parts, \
answer the component parts, and then put them together to answer the broader question.

Here is a question:
{query}"""

In [None]:
multiple_prompt_templates = [physics_template, math_template]
multiple_prompt_embeddings = await embeddings.aembed_documents(multiple_prompt_templates)

In [None]:
def prompt_router(input_dict):
    query_embedding = embeddings.embed_query(input_dict["query"])
    similarity = cosine_similarity([query_embedding], multiple_prompt_embeddings)[0]
    most_similar = multiple_prompt_templates[similarity.argmax()]
    print(f"Using Math termplate" if most_similar == math_template else "Using Physics template")
    return ChatPromptTemplate.from_template(most_similar)

In [None]:
semantic_router_chain = {"query": RunnablePassthrough()} | RunnableLambda(prompt_router) | local_llm | StrOutputParser()
semantic_router_result = semantic_router_chain.invoke("Explain to me special theory of relativity")

In [None]:
print(semantic_router_result)

In [None]:
maths_router_result = semantic_router_chain.invoke("Explain me the concept of imaginary numbers")

In [None]:
print(maths_router_result)

# QUERY CONSTRUCTION: Based on the database used (vectorDb/ GraphDb etc), translate human language query into form suitable to be run on dbase.
## e.g. If dbase used is Vector DB, extract metadata filter information from the input query and run them against the vector dbase like Chroma
## https://python.langchain.com/v0.2/docs/tutorials/query_analysis/

## In this example we will use youtube video transcripts etc as document

In [None]:
youtube_urls = [
    "https://www.youtube.com/watch?v=HAn9vnJy6S4",
    "https://www.youtube.com/watch?v=dA1cHGACXCo",
    "https://www.youtube.com/watch?v=ZcEMLz27sL4",
    "https://www.youtube.com/watch?v=hvAPnpSfSGo",
    "https://www.youtube.com/watch?v=EhlPDL4QrWY",
    "https://www.youtube.com/watch?v=mmBo8nlu2j0",
    "https://www.youtube.com/watch?v=rQdibOsL1ps",
    "https://www.youtube.com/watch?v=28lC4fqukoc",
    "https://www.youtube.com/watch?v=es-9MgxB-uc",
    "https://www.youtube.com/watch?v=wLRHwKuKvOE",
    "https://www.youtube.com/watch?v=ObIltMaRJvY",
    "https://www.youtube.com/watch?v=DjuXACWYkkU",
    "https://www.youtube.com/watch?v=o7C9ld6Ln-M",
]
youtube_docs = []
for url in youtube_urls:
    youtube_docs.extend(YoutubeLoader.from_youtube_url(url, add_video_info=True).load())

In [None]:
for doc in youtube_docs:
    doc.metadata["publish_year"] = int(datetime.datetime.strptime(doc.metadata["publish_date"], "%Y-%m-%d %H:%M:%S").strftime("%Y"))

In [None]:
[doc.metadata["title"] for doc in youtube_docs]

In [None]:
[doc.metadata["view_count"] for doc in youtube_docs]

In [None]:
youtube_docs[0].metadata

In [None]:
## Chunk youtube docs using RecursiveCharacterTextSplitter, use nomic embedder to get embeddings and store it into a vector database (Chroma)
youtube_splits = splitter.split_documents(youtube_docs)

In [None]:
youtube_vectorstore = Chroma.from_documents(collection_name="youtube_store_nomic", documents=youtube_splits, embedding=embeddings, persist_directory="./chroma")

## Now create a BaseModel that helps extract relevant metadata from input query that can then be applied on top on chroma db (youtube_vectorstore)

In [None]:
class TutorialSearch(BaseModel):
    content_search: str = Field(description="Similarity search queries that can be applied to video transcripts")
    title_search: str = Field(description="Succinct title with only the keywords")
    min_view_count: Optional[int] = Field(description="Minimum view count filter, inclusive. Only use if explicitly specified.")
    max_view_count: Optional[int] = Field(description="Maximum view count filter, exclusive. Only use if explicitly specified.")
    publish_year: Optional[int] = Field(description="Year when the video was published. Use only if explicitly specified")
    min_length_sec: Optional[int] = Field(
        None,
        description="Minimum video length in seconds, inclusive. Only use if explicitly specified.",
    )
    max_length_sec: Optional[int] = Field(
        None,
        description="Maximum video length in seconds, exclusive. Only use if explicitly specified.",
    )

In [None]:
structured_experimental_youtube_local_llm = experimental_local_llm.with_structured_output(TutorialSearch)

In [None]:
# youtube_messages = [
#     SystemMessage(content="""You are an expert at converting user questions into database queries.
#     You have access to a database of tutorial videos about a software library for building LLM-powered applications.
#     Given a question, return a database query optimized to retrieve the most relevant results.
#     If there are acronyms or words you are not familiar with, do not try to rephrase them."""),
#     HumanMessage(content="{question}")
# ]

# youtube_prompt = ChatPromptTemplate.from_messages(youtube_messages)

## Need to ensure that the prompt matches the template that was used during LLama2 training
## Check https://ollama.com/library/llama2/blobs/2e0493f67d0c for more details

In [None]:
youtube_prompt = PromptTemplate.from_template(
    """[INST] <<SYS>>
    {system_prompt}
    <</SYS>>
    {question} [/INST]"""
)

In [None]:
youtube_prompt

In [None]:
query_constructor_chain = youtube_prompt | structured_experimental_youtube_local_llm

In [None]:
youtube_llama2_system_prompt = """You are an expert at converting user questions into database queries.
You have access to a database of tutorial videos about a software library for building LLM-powered applications.
Given a question, return a database query optimized to retrieve the most relevant results.

If there are acronyms or words you are not familiar with, do not try to rephrase them."""

In [None]:
youtube_resp = query_constructor_chain.invoke({"system_prompt": youtube_llama2_system_prompt, "question": "how to use multi-modal models in an agent, only videos under 5 minutes published in 2021"})

In [None]:
youtube_resp

## LLama 2 does not appear to have the capability to extract elevant information from the query in the above case. It is worth experimenting with other LLMs that are trained specifically for function calling. For example:
1. Phi models
2. Nexusravn: https://ollama.com/library/nexusraven/blobs/cf200ab0155f

## https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.chroma.Chroma.html#langchain_community.vectorstores.chroma.Chroma.as_retriever

## The above can also be done using json instead of DataModel using Pydantic

# INDEXING: Ways to improve indexing in vector dbases
### Way 1: Multi representation indexing => Take a document, ask LLM to summarize it and then index this summary in the vectorDB. It is possible only in case of LLMs with large context window. Otherwise, it is not possible.
e.g. Llama2 model with 32K context (llama-2-7B-32K), Original LLama2 has 4K tokens as input context

Basically change the granularity at which information is stored in Vector DB

Paper: https://arxiv.org/pdf/2312.06648