# **Generation Experiment**

In [1]:
%pip install --quiet --upgrade bitsandbytes langchain langchain-community langchain-huggingface transformers beautifulsoup4 faiss-gpu rank_bm25 lark qdrant-client langchain-chroma langchain_groq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m123.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.8 MB/s[0m eta [36m0:0

In [31]:
from langchain_core.documents import Document
from langchain.chains.query_constructor.base import AttributeInfo
import torch
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import BitsAndBytesConfig
from langchain_core.output_parsers import StrOutputParser
import re
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.storage import InMemoryStore
from operator import itemgetter
from langchain import hub
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain.retrievers import EnsembleRetriever # Supports Ensembling of results from multiple retrievers
from langchain_community.retrievers import BM25Retriever
from pydantic import BaseModel, Field
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from google.colab import userdata
from langchain import PromptTemplate
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import os
import json
from google.colab import files
import time
from langchain_groq import ChatGroq
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.callbacks.manager import CallbackManagerForRetrieverRun

## **User Action Required**

1. Run the code below to create the ```data``` folder

2. Choose to upload the following files
- ```iceland_articles_updated.csv```
- ```finland_articles_updated.csv```

In [3]:
data_folder = os.path.join(os.getcwd(), 'data')
os.makedirs(data_folder, exist_ok=True)

In [4]:
uploaded_files = files.upload()

Saving iceland_articles_updated.csv to iceland_articles_updated.csv
Saving finland_articles_updated.csv to finland_articles_updated.csv


In [5]:
for file_name in uploaded_files.keys():
    os.rename(file_name, os.path.join(data_folder, file_name))

Your folder structure should now look as such:

```
data
  - iceland_articles_updated.csv
  - finland_articles_updated.csv
```

In [6]:
article_names = ['finland_articles_updated.csv', 'iceland_articles_updated.csv']
article_fps = [os.path.join(data_folder, article_name) for article_name in article_names]

docs = []
for article_fp in article_fps:
  df = pd.read_csv(article_fp)
  for _, row in df.iterrows():
    text = row['Title'] + " " + row['Content']

    doc = Document(
        page_content=text,
        metadata={'country': row['Country'], 'source': row['Source'], 'link': row['Article Links']}
    )

    docs.append(doc)

## **Query Decomposition:**

Decomposition is a query re-writing technique that focuses on decomposing a question into a set of subquestions.

This is applicable and effective for our use case as users planning a holiday tend to string together many requests in a single query. By breaking down a large queries into sub-queries, the retriever can retrieve more relevant documents to each sub-query and therefore, support the LLM in answering the whole query better

In [129]:
# Prompt Decomposition template used by the LLM to help break a question into sub questions

template = """You are an expert at converting user travel questions into sub-questions. \
You have access to several documents about the different travel destinations. \

Perform query decomposition. Given a user question, break it down into the most specific sub questions you can \
which will help you answer the original question. Each sub question should be about a single concept/fact/idea.

If there are acronyms or words you are not familiar with, do not try to rephrase them.
"""

template_2 = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries only):""" # add few shot?

prompt_decomposition = PromptTemplate.from_template(template_2)

In [130]:
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')
llm = ChatGroq()

In [131]:
def clean_questions(questions):
  questions = questions.strip()
  questions = questions.split('\n')
  return questions

In [132]:
query_decomp = ( prompt_decomposition | llm | StrOutputParser() | clean_questions)

**Simple experiment test queries for query decomposition**

In [133]:
test_queries = ["When is the best time to go Finland and what is there to do", "Help me plan a trip to Iceland, I am adventurous and like activities such as hiking", "How do I get around Iceland"]

In [134]:
# Apply the decompsition template and break down the questions into sub questions using the prompt decompsition pipeline
for q in test_queries:
  print(f'Query: {q}')
  print('Query Decomposed:')
  print(query_decomp.invoke({"question":q}))
  print('###################################################################')

Query: When is the best time to go Finland and what is there to do
Query Decomposed:
['1. "What is the best time to visit Finland for good weather?"', '2. "What are the popular activities to do in Finland during summer?"', '3. "Are there any festivals or events in Finland during winter months?"']
###################################################################
Query: Help me plan a trip to Iceland, I am adventurous and like activities such as hiking
Query Decomposed:
['1. "What are the best hiking trails in Iceland?"', '2. "What is the climate like in Iceland during (specific month of trip) for hiking?"', '3. "What permits or guidelines should I follow for hiking in Iceland?"']
###################################################################
Query: How do I get around Iceland
Query Decomposed:
['1. "What are the transportation options available in Iceland?"', '2. "How can I rent a car in Iceland?"', '3. "Is there public transportation in Iceland?"']
##############################

<br/>
<br/>

## **Initialise Best Retriever from Retreiver Evaluation: FAISS (Index Flat L2)+Re-ranking+ParentDocumentRetriever**

- Bi-Encoder/Embeddings model: all-mpnet-base-v2
- Retriever: FAISS with Index Flat L2
- Documents retrieved: 20
- Documents to keep after re-ranking: 10
- Text Split Method: Recursive Character Text Splitter
- Child Chunk Size: 250
- Child Chunk Overlap: 50
- Parent Chunk Size: 650
- Parent Chunk Overlap: 160
- Cross-Encoder/Re-ranking model: BAAI/bge-reranker-large
- Similarity Search type: MMR

In [45]:
bi_encoder_embeddings_model_name = "sentence-transformers/all-mpnet-base-v2"

embeddings_model = HuggingFaceEmbeddings(model_name=bi_encoder_embeddings_model_name)
vector_store_index = faiss.IndexFlatL2(len(embeddings_model.embed_query("hello world")))
index = faiss.IndexFlatL2(len(embeddings_model.embed_query("hello world")))
faiss_vector_store = FAISS(
    embedding_function=embeddings_model,
    index=vector_store_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
store = InMemoryStore()

num_docs_retrieved = 20

top_k = 10

child_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250, chunk_overlap=50, add_start_index=True
)

parent_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=650, chunk_overlap=160, add_start_index=True
)

cross_encoder_embedings_model_name = "BAAI/bge-reranker-large"

cross_encoder_model = HuggingFaceCrossEncoder(model_name=cross_encoder_embedings_model_name)

search_type = 'mmr'

In [84]:
'''
Custom ParentDocumentRetriever object (ParentDocumentReranker) to do re-ranking of child chunks and return the parent chunks in the same order
'''
class ParentDocumentReranker(ParentDocumentRetriever):
    model_config = {"extra": "allow"}

    def __init__(self, cross_encoder_model, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.cross_encoder_model = cross_encoder_model

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Get documents relevant to a query.
        Args:
            query: String to find relevant documents for
            run_manager: The callbacks handler to use
        Returns:
            List of relevant documents
        """
        if self.search_kwargs['search_type'] == 'mmr':
            sub_docs = self.vectorstore.max_marginal_relevance_search(
                query, **self.search_kwargs
            )
        else:
            sub_docs = self.vectorstore.similarity_search(query, **self.search_kwargs)

        # Reranking logic here with sub_docs (child documents)
        sub_docs_rerank_vals = []
        for r in sub_docs:
          sub_docs_rerank_vals.append((r,self.cross_encoder_model.score((r.page_content,query))))

        sub_docs = [key for key, value in sorted(sub_docs_rerank_vals, key=lambda x: x[1], reverse=True)]

        # We do this to maintain the order of the ids that are returned
        ids = []
        for d in sub_docs:
            if self.id_key in d.metadata and d.metadata[self.id_key] not in ids:
                ids.append(d.metadata[self.id_key])
        docs = self.docstore.mget(ids)
        return [d for d in docs if d is not None]

In [96]:
custom_retriever = ParentDocumentReranker(
    cross_encoder_model=cross_encoder_model,
    vectorstore=faiss_vector_store,
    docstore=store,
    child_splitter=child_text_splitter,
    parent_splitter=parent_text_splitter,
    search_kwargs={"k": num_docs_retrieved, "fetch_k": num_docs_retrieved*2, "search_type": search_type}
)

custom_retriever.add_documents(docs)

In [97]:
res = custom_retriever.invoke('best things to do in iceland')

In [98]:
res

[Document(metadata={'country': 'Iceland', 'source': 'visiticeland', 'link': 'https://www.visiticeland.com/article/how-to-practice-your-hobby-in-iceland', 'start_index': 1966}, page_content='tours are available as well. For the ones who don’t want to use their own body strength: Snowmobiling tours are a popular pastime in North Iceland as well as dogsledding. For even more adventurous back-country skiing, heli-skiing and glacial mountain skiing make for an incredible experience in Iceland’s winter wonderland! FISHING Iceland is a mecca for river fishing, fly fishing, sea angling, and even ice fishing. Some of the most famous salmon fishing rivers in Iceland are Vatnsdalsá, Láxá í Aðaldal and Laxá í Kjós,Hofsá and Selá and Jökulsá á Brú, amongst many others. What they all have in common is their beautiful setting in a tranquil'),
 Document(metadata={'country': 'Iceland', 'source': 'visiticeland', 'link': 'https://www.visiticeland.com/article/christmas-in-iceland', 'start_index': 5428}, p

<br/>
<br/>

## **Re-order re-ranked results such that extremas have the most relevent documents to mitigate "lost-in-the-middle" effect**

In [99]:
def reorder_docs(res):
  res_reordered = [None] * len(res)
  start = 0
  end = len(res) - 1
  place_at_start = True

  for doc in res:
      if place_at_start:
          res_reordered[start] = doc
          start += 1
      else:
          res_reordered[end] = doc
          end -= 1

      place_at_start = not place_at_start
  return res_reordered

In [100]:
reorder_docs(res)

[Document(metadata={'country': 'Iceland', 'source': 'visiticeland', 'link': 'https://www.visiticeland.com/article/how-to-practice-your-hobby-in-iceland', 'start_index': 1966}, page_content='tours are available as well. For the ones who don’t want to use their own body strength: Snowmobiling tours are a popular pastime in North Iceland as well as dogsledding. For even more adventurous back-country skiing, heli-skiing and glacial mountain skiing make for an incredible experience in Iceland’s winter wonderland! FISHING Iceland is a mecca for river fishing, fly fishing, sea angling, and even ice fishing. Some of the most famous salmon fishing rivers in Iceland are Vatnsdalsá, Láxá í Aðaldal and Laxá í Kjós,Hofsá and Selá and Jökulsá á Brú, amongst many others. What they all have in common is their beautiful setting in a tranquil'),
 Document(metadata={'country': 'Iceland', 'source': 'visiticeland', 'link': 'https://www.visiticeland.com/article/christmas-in-iceland', 'start_index': 5428}, p

<br/>
<br/>
<br/>

### **Build the Final Answer By Answering Each Sub-Question**

In [104]:
qna_prompt = hub.pull("rlm/rag-prompt")
synthesize_template = """Here is a set of Q+A pairs:

  {context}

  Use these to synthesize an answer to the question: {question}
"""
synthesize_prompt = PromptTemplate.from_template(template)

def build_answer(question,qna_prompt,synthesize_prompt,sub_question_generator_chain,retriever):
  sub_questions = sub_question_generator_chain.invoke({"question":question})
  print('Sub Questions: ')
  print(sub_questions)
  print('###################################################################')
  print('###################################################################')
  sub_answers = []
  for sub_question in sub_questions:
      print(f'Sub Question: {sub_question}')
      retrieved_docs = retriever.invoke(sub_question)[:top_k]
      print('Retrieved Docs:')
      print(retrieved_docs)
      retreived_docs = reorder_docs(retrieved_docs)
      answer = (qna_prompt | llm | StrOutputParser()).invoke({"question": sub_question,"context": retrieved_docs})
      print('LLM Answer:')
      print(answer)
      print('###################################################################')
      sub_answers.append(answer)

  formatted_string = ""
  for i, (question, answer) in enumerate(zip(sub_questions, sub_answers), start=1):
      formatted_string += f"Question {i}: {question}\nAnswer {i}: {answer}\n\n"

  context = formatted_string.strip()

  final_rag_chain = (
      synthesize_prompt
      | llm
      | StrOutputParser()
  )

  final_rag_chain.invoke({"question":question, "context":context})



In [105]:
build_answer('best things to do in iceland',qna_prompt,synthesize_prompt,query_decomp,custom_retriever)

Sub Questions: 
1. "Top rated attractions in Iceland"
2. "Must-visit natural wonders in Iceland"
3. "Exciting activities to do in Iceland"
###################################################################
###################################################################
Sub Question: 1
Retrieved Docs:
[Document(metadata={'country': 'Finland', 'source': 'visitfinland', 'link': 'https://www.visitfinland.com/en/articles/finnish-food-culture/', 'start_index': 8831}, page_content='consumed, these savoury delights go great with heaps of butter and scorching black coffee. Tip: Try a different flatbread in each place you visit. Hopefully, you’ll have the chance to taste the local Savo variety, which is made with sour milk.; Korvapuusti – a pastry enjoyed with a cup of coffee: Korvapuusti translates to “slapped ears” in English, but these pastries are essentially cinnamon buns. And while Finland doesn’t hold a patent on cinnamon buns, it probably should. Usually enjoyed with a cup of coffee

KeyboardInterrupt: 

**Conclusion**: To be filled in