# Install Ollama on Colab


In [None]:
!curl -fsSL https://ollama.com/install.sh | sh

# Install dependencies

In [None]:
!pip install -qU pinecone-client pinecone-datasets langchain-pinecone

In [None]:
! pip install --quiet langchain_community tiktoken langchainhub chromadb langchain langgraph tavily-python langchain-mistralai gpt4all

# Load libraries

In [47]:
import os
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_community.chat_models import ChatOllama
from langchain.load import dumps, loads
from operator import itemgetter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.load import dumps, loads
import pinecone
from pinecone import Pinecone, ServerlessSpec, PodSpec
import time

In [48]:
import getpass

# LangSmith tracing configuration
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# API keys are taken from the environment when already set; otherwise they are
# requested interactively so that no secret is ever written into the notebook.
for _secret_name in ('LANGCHAIN_API_KEY', 'PINECONE_API_KEY'):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass.getpass(f'{_secret_name}: ')

pinecone_api_key = os.environ['PINECONE_API_KEY']

# True selects a serverless index spec, False a pod-based one (used when the index is created)
use_serverless = True

In [49]:
# Load the blog post, parsing only the elements that carry the post's
# title, header and body (selected by CSS class).
post_sections = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",),
    bs_kwargs={"parse_only": post_sections},
)
blog_docs = loader.load()

In [50]:
# Build a splitter whose chunk size and overlap are measured with the tiktoken encoder
splitter_options = {"chunk_size": 100, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_options)

# Cut the loaded blog documents into overlapping chunks
splits = text_splitter.split_documents(blog_docs)

In [51]:
# Connect to Pinecone with the API key configured earlier
pc = Pinecone(api_key=pinecone_api_key)

if use_serverless:
    spec = ServerlessSpec(cloud='aws', region='us-west-2')
else:
    # PodSpec needs an explicit environment; calling it with no arguments raises.
    # NOTE(review): 'gcp-starter' is only a fallback -- set PINECONE_ENVIRONMENT
    # to your project's environment. If not using a starter index, you should
    # specify a pod_type too.
    spec = PodSpec(
        environment=os.environ.get('PINECONE_ENVIRONMENT', 'gcp-starter')
    )

# check for and delete index if already exists, so the notebook starts from an empty index
index_name = 'langchain-retrieval-augmentation-fast'
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# create a new index
pc.create_index(
    index_name,
    dimension=384,  # dimensionality of GPT4ALLEmbeddings
    metric='dotproduct',
    spec=spec
)

# wait for index to be initialized, polling once per second
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [52]:
# Open a handle to the index and display its current statistics
index = pc.Index(index_name)
index_stats = index.describe_index_stats()
index_stats

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [53]:
# Import the LangChain vector store under an alias so it does not rebind the
# `Pinecone` client class imported from the `pinecone` package; otherwise
# re-running the index-creation cell would call the wrong `Pinecone`.
from langchain.vectorstores import Pinecone as PineconeVectorStore

# Embed the chunks with GPT4All and upload them to the Pinecone index
embedding = GPT4AllEmbeddings()
vectorstore = PineconeVectorStore.from_documents(splits, embedding, index_name=index_name)

# Expose the vector store as a retriever for the query step
retriever = vectorstore.as_retriever()

In [55]:
# RAG-Fusion: prompt that asks the model for several search queries
# related to the user's question.
template = (
    "You are a helpful assistant that generates multiple search queries based on a single input query. \n\n"
    "Generate multiple search queries related to: {question} \n\n"
    "Output (4 queries):"
)
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [56]:
# LLM: local model served through Ollama
local_llm = "gemma:7b"

llm = ChatOllama(
    model=local_llm,
    format="json",   # request JSON-formatted output
    temperature=0,
)

In [57]:
def _values_as_list(parsed):
    """Return the values of the parsed JSON object as a plain list."""
    return list(parsed.values())


# Prompt -> LLM -> JSON parsing -> list of the generated queries
generate_queries = prompt_rag_fusion | llm | JsonOutputParser() | _values_as_list

In [58]:
# User question: input to query generation and to the final RAG chain
question = "What is prompt engineering?"

In [59]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists into one ranking (Reciprocal Rank Fusion).

    Every occurrence of a document adds ``1 / (rank + k)`` to that document's
    fused score, where ``rank`` is its zero-based position in the list it
    appears in. Documents are identified by their serialized form, so the
    same document returned for different queries accumulates a single score.

    Args:
        results: One ranked list of documents per query. Each document must
            be serializable with ``dumps`` and restorable with ``loads``.
        k: Constant added to the rank in the RRF formula; larger values
            flatten the difference between top and lower ranks.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by descending
        fused score. An empty ``results`` yields an empty list.
    """
    # Serialized document -> accumulated RRF score
    fused_scores = {}

    for ranked_docs in results:
        for rank, doc in enumerate(ranked_docs):
            # The serialized string serves as a hashable identity for the document
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # Highest fused score first; ties keep first-seen order (stable sort)
    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)

    # Restore the original document objects alongside their scores
    return [(loads(doc_key), score) for doc_key, score in ranked]

In [61]:
# Expand the user question into several related search queries and show them
queries = generate_queries.invoke(dict(question=question))
print(queries)

['Define prompt engineering', 'How does prompt engineering work in AI models?', 'What are the benefits of effective prompt engineering?', 'Examples of successful prompt engineering in different applications']


In [62]:
# Retrieving the relevant documents for the queries.
# Keep ONE ranked list per query (do not flatten them into a single list):
# reciprocal_rank_fusion scores each document by its position within its own
# result list, so it needs a list of lists.
total_retrieved_docs = [
    retriever.get_relevant_documents(query) for query in queries
]

In [64]:
# Ranking the documents: fuse the retrieved results into a single list of
# (document, fused_score) tuples, ordered by descending fused score
docs = reciprocal_rank_fusion(total_retrieved_docs)

In [66]:
# RAG: answer the question using the fused retrieval results as context
template = (
    "Answer the following question based on this context:\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

# Map the "docs" and "question" fields of the chain input onto the prompt variables
rag_inputs = {
    "context": itemgetter("docs"),
    "question": itemgetter("question"),
}

# NOTE: `llm` was created with format="json", so the answer text comes back JSON-formatted
final_rag_chain = rag_inputs | prompt | llm | StrOutputParser()

final_rag_chain.invoke({"question": question, "docs": docs})

'{ "Prompt Engineering" : "Prompt Engineering refers to methods for how to communicate with LLM to steer its behavior for desired outcomes without updating the model weights. It is an empirical science and the effect of prompt engineering methods can vary a lot among models, thus requiring heavy experimentation and heuristics." }'