# **Generation Experiment**

In [1]:
%pip install --quiet --upgrade bitsandbytes langchain langchain-community langchain-huggingface transformers beautifulsoup4 faiss-gpu rank_bm25 lark qdrant-client langchain-chroma langchain_groq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m123.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.8 MB/s[0m eta [36m0:0

In [31]:
from langchain_core.documents import Document
from langchain.chains.query_constructor.base import AttributeInfo
import torch
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import BitsAndBytesConfig
from langchain_core.output_parsers import StrOutputParser
import re
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.storage import InMemoryStore
from operator import itemgetter
from langchain import hub
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain.retrievers import EnsembleRetriever # Supports Ensembling of results from multiple retrievers
from langchain_community.retrievers import BM25Retriever
from pydantic import BaseModel, Field
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from google.colab import userdata
from langchain import PromptTemplate
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import os
import json
from google.colab import files
import time
from langchain_groq import ChatGroq
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.callbacks.manager import CallbackManagerForRetrieverRun

## **User Action Required**

1. Run the code below to create the ```data``` folder

2. Choose to upload the following files
- ```iceland_articles_updated.csv```
- ```finland_articles_updated.csv```

In [3]:
data_folder = os.path.join(os.getcwd(), 'data')
os.makedirs(data_folder, exist_ok=True)

In [4]:
uploaded_files = files.upload()

Saving iceland_articles_updated.csv to iceland_articles_updated.csv
Saving finland_articles_updated.csv to finland_articles_updated.csv


In [5]:
for file_name in uploaded_files.keys():
    os.rename(file_name, os.path.join(data_folder, file_name))

Your folder structure should now look as such:

```
data
  - iceland_articles_updated.csv
  - finland_articles_updated.csv
```

In [6]:
article_names = ['finland_articles_updated.csv', 'iceland_articles_updated.csv']
article_fps = [os.path.join(data_folder, article_name) for article_name in article_names]

docs = []
for article_fp in article_fps:
  df = pd.read_csv(article_fp)
  for _, row in df.iterrows():
    text = row['Title'] + " " + row['Content']

    doc = Document(
        page_content=text,
        metadata={'country': row['Country'], 'source': row['Source'], 'link': row['Article Links']}
    )

    docs.append(doc)

## **Query Decomposition:**

Decomposition is a query re-writing technique that focuses on decomposing a question into a set of subquestions.

This is applicable and effective for our use case as users planning a holiday tend to string together many requests in a single query. By breaking down a large queries into sub-queries, the retriever can retrieve more relevant documents to each sub-query and therefore, support the LLM in answering the whole query better

In [17]:
# Prompt Decomposition template used by the LLM to help break a question into sub questions

template = """You are an expert at converting user travel questions into sub-questions. \
You have access to several documents about the different travel destinations. \

Perform query decomposition. Given a user question, break it down into the most specific sub questions you can \
which will help you answer the original question. Each sub question should be about a single concept/fact/idea.

If there are acronyms or words you are not familiar with, do not try to rephrase them.
"""

template_2 = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries only):"""

prompt_decomposition = PromptTemplate.from_template(template_2)

In [18]:
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')
llm = ChatGroq()

In [19]:
query_decomp = ( prompt_decomposition | llm | StrOutputParser())

**Simple experiment test queries for query decomposition**

In [20]:
test_queries = ["When is the best time to go Finland and what is there to do", "Help me plan a trip to Iceland, I am adventurous and like activities such as hiking", "How do I get around Iceland"]

In [21]:
# Apply the decompsition template and break down the questions into sub questions using the prompt decompsition pipeline
for q in test_queries:
  print(f'Query: {q}')
  print('Query Decomposed:')
  print(query_decomp.invoke({"question":q}))
  print('###################################################################')

Query: When is the best time to go Finland and what is there to do
Query Decomposed:
1. "What is the best time to visit Finland based on weather?"
2. "What are the popular activities to do in Finland during summer?"
3. "What are the recommended winter activities in Finland?"
###################################################################
Query: Help me plan a trip to Iceland, I am adventurous and like activities such as hiking
Query Decomposed:
1. "What are the best hiking trails in Iceland?"
2. "What is the weather like in Iceland during hiking season?"
3. "What permits or safety equipment are required for hiking in Iceland?"
###################################################################
Query: How do I get around Iceland
Query Decomposed:
1. "What are the transportation options available in Iceland?"
2. "How do I rent a car in Iceland?"
3. "Is there public transportation in Iceland?"
###################################################################


<br/>
<br/>

## **Use Best Retriever from Retreiver Evaluation: FAISS (Index Flat L2) with Re-ranking+ParentDocumentRetriever**

- Bi-Encoder/Embeddings model: all-mpnet-base-v2
- Retriever: FAISS with Index Flat L2
- Documents retrieved: 20
- Documents after re-ranking: 10
- Text Split Method: Recursive Character Text Splitter
- Child Chunk Size: 250
- Child Chunk Overlap: 50
- Parent Chunk Size: 650
- Parent Chunk Overlap: 160
- Cross-Encoder/Re-ranking model: BAAI/bge-reranker-large
- Similarity Search type: MMR

In [45]:
bi_encoder_embeddings_model_name = "sentence-transformers/all-mpnet-base-v2"

embeddings_model = HuggingFaceEmbeddings(model_name=bi_encoder_embeddings_model_name)
vector_store_index = faiss.IndexFlatL2(len(embeddings_model.embed_query("hello world")))
index = faiss.IndexFlatL2(len(embeddings_model.embed_query("hello world")))
faiss_vector_store = FAISS(
    embedding_function=embeddings_model,
    index=vector_store_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
store = InMemoryStore()

num_docs_retrieved = 20

top_k = 10

child_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250, chunk_overlap=50, add_start_index=True
)

parent_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=650, chunk_overlap=160, add_start_index=True
)

cross_encoder_embedings_model_name = "BAAI/bge-reranker-large"

cross_encoder_model = HuggingFaceCrossEncoder(model_name=cross_encoder_embedings_model_name)

search_type = 'mmr'

In [80]:
class ParentDocumentReranker(ParentDocumentRetriever):
    model_config = {"extra": "allow"}

    def __init__(self, cross_encoder_model, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.cross_encoder_model = cross_encoder_model

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Get documents relevant to a query.
        Args:
            query: String to find relevant documents for
            run_manager: The callbacks handler to use
        Returns:
            List of relevant documents
        """
        if self.search_kwargs['search_type'] == 'mmr':
            print('mmr search type')
            sub_docs = self.vectorstore.max_marginal_relevance_search(
                query, **self.search_kwargs
            )
        else:
            print('non-mmr search type')
            sub_docs = self.vectorstore.similarity_search(query, **self.search_kwargs)

        # Reranking logic here with sub_docs (child documents)
        for r in sub_docs:
          print(r)
        print('')
        print('')
        sub_docs_rerank_vals = []
        for r in sub_docs:
          sub_docs_rerank_vals.append((r,self.cross_encoder_model.score((r.page_content,query))))

        sub_docs = [key for key, value in sorted(sub_docs_rerank_vals, key=lambda x: x[1], reverse=True)]

        # We do this to maintain the order of the ids that are returned
        ids = []
        for d in sub_docs:
            if self.id_key in d.metadata and d.metadata[self.id_key] not in ids:
                ids.append(d.metadata[self.id_key])
        docs = self.docstore.mget(ids)
        return [d for d in docs if d is not None]

In [81]:
reranked_retriever = ParentDocumentReranker(
    cross_encoder_model=cross_encoder_model,
    vectorstore=faiss_vector_store,
    docstore=store,
    child_splitter=child_text_splitter,
    parent_splitter=parent_text_splitter,
    search_kwargs={"k": num_docs_retrieved, "fetch_k": num_docs_retrieved, "search_type": search_type}
)

reranked_retriever.add_documents(docs)

In [82]:
reranked_retriever.invoke('best food in iceland')

mmr search type
page_content='ENJOY ICELANDIC FOOD Iceland has emerged as one of Europe's most dynamic gastronomic destinations, full of exciting places to taste thrilling new recipes. Chefs create modern dishes with traditional ingredients, influenced by the philosophy of the' metadata={'country': 'Iceland', 'source': 'visiticeland', 'link': 'https://www.visiticeland.com/article/enjoy-icelandic-food', 'start_index': 0, 'doc_id': '46eb038b-9808-4dcb-9f71-61ea9ab7ab51'}
page_content='"íslenskt grænmeti"), flatbrauð (flat rye-bread) with Icelandic cheese and kleinur (donut-like pastry) are perfect for any adventure!  THE ICELANDIC HOTDOG The Icelandic hotdog has a lot of fans. Order "Eina með öllu" to learn what the fuss is about' metadata={'country': 'Iceland', 'source': 'visiticeland', 'link': 'https://www.visiticeland.com/article/iceland-on-a-budget', 'start_index': 198, 'doc_id': '868ff055-e085-4b7a-8856-a192bb6f4b16'}
page_content='SEAFOOD - ICELAND’S TOP PRIDE Local fish of the day

[Document(metadata={'country': 'Iceland', 'source': 'visiticeland', 'link': 'https://www.visiticeland.com/article/enjoy-icelandic-food', 'start_index': 976}, page_content='SEAFOOD - ICELAND’S TOP PRIDE Local fish of the day is a must for any foodie. A staple of Icelandic cuisine is freshly caught fish, as fishing has been an essential part of Icelandic history and culture for centuries. Fish is the main export product of Iceland, and sustainable fishing and optimal treatment of the cath are a priority. Try one of the many Icelandic fish restaurants, and you will understand why the locals will have you believe they invented the thing. The Icelandic “lobster,” or langoustine, is also delicious and many Icelanders’ favorite food. Most restaurants offer langoustine, either in soup or roasted in garlic butter - a'),
 Document(metadata={'country': 'Iceland', 'source': 'visiticeland', 'link': 'https://www.visiticeland.com/article/enjoy-icelandic-food', 'start_index': 976}, page_content='SEAFO

<br/>
<br/>

## **Re-order re-ranked results to mitigate "lost-in-the-middle" effect**

<u>Build the Final Answer Individually</u>

In [None]:
# Answer each sub-question individually
# RAG prompt
'''
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question}

Context: {context}

Answer:
'''
prompt_rag = hub.pull("rlm/rag-prompt")
def retrieve_and_rag(question,prompt_rag,sub_question_generator_chain):
    """Perform RAG on each sub-question"""
    # Generate the sub questions using the chain
    sub_questions = sub_question_generator_chain.invoke({"question":question})
    # Initialize a list to hold RAG results of each sub-question
    rag_results = []
    for sub_question in sub_questions:
        # Retrieve documents for each sub-question
        retrieved_docs = retriever.get_relevant_documents(sub_question)
        # Use retrieved documents and sub-question to answer the sub question
        answer = (prompt_rag | llm.bind(skip_prompt=True) | StrOutputParser()).invoke({"context": retrieved_docs,
                                                                "question": sub_question})
        # Append the answer to the sub question
        rag_results.append(answer)
    # Return the list of sub questions and their answers
    return rag_results,sub_questions

answers, questions = retrieve_and_rag(question, prompt_rag, generate_queries_decomposition)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
  retrieved_docs = retriever.get_relevant_documents(sub_question)


In [None]:
# Utility function to format a given question and answer
def format_qa_pairs(questions, answers):
    """Format Q and A pairs"""
    formatted_string = ""
    for i, (question, answer) in enumerate(zip(questions, answers), start=1):
        formatted_string += f"Question {i}: {question}\nAnswer {i}: {answer}\n\n"
    return formatted_string.strip()

# Format the list of sub questions and their answers from just now
context = format_qa_pairs(questions, answers)

# Prompt template to use each individual sub-question and answer, as well as the main question
template = """Here is a set of Q+A pairs:

{context}

Use these to synthesize an answer to the question: {question}
"""

prompt = PromptTemplate.from_template(template)

final_rag_chain = (
    prompt
    | llm.bind(skip_prompt=True)
    | StrOutputParser()
)

final_rag_chain.invoke({"context":context,"question":question})

"To answer your question, the best time to visit Finland for tourism is during winter months like December through February when the Northern Lights (auroras) are most visible due to long nights and cold temperatures. This period provides opportunities for activities like snowshoeing, skiing, and aurora watching. However, summer from June to August also offers unique experiences such as midnight sun and vibrant nature with activities including hiking, kayaking, and berry-picking. Additionally, there's a chance to experience the country's unique glass huts in Lapland during the winter season. Overall, both winter and summer offer distinct and memorable experiences in Finland. To get the most out of your trip, it's recommended to check local forecasts and plan accordingly based on your interests and preferences. Here’s a summary of the key points:\n\n- **Best Time to Visit:** Winter (December through February) for Northern Lights and snowy landscapes.\n- **Summer Activities:** Hiking, ka

**Conclusion**: To be filled in