# **Generation Experiment**

In [1]:
%pip install --quiet --upgrade bitsandbytes langchain langchain-community langchain-huggingface transformers beautifulsoup4 faiss-gpu rank_bm25 lark qdrant-client langchain-chroma langchain_groq ragas

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m115.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.3 MB/s[0m eta [36m0:00

In [2]:
from langchain_core.documents import Document
from langchain.chains.query_constructor.base import AttributeInfo
import torch
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import BitsAndBytesConfig
from langchain_core.output_parsers import StrOutputParser
import re
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.storage import InMemoryStore
from operator import itemgetter
from langchain import hub
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain.retrievers import EnsembleRetriever # Supports Ensembling of results from multiple retrievers
from langchain_community.retrievers import BM25Retriever
from pydantic import BaseModel, Field
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from google.colab import userdata
from langchain import PromptTemplate
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import os
import json
from google.colab import files
import time
from langchain_groq import ChatGroq
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from sentence_transformers import CrossEncoder
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from ragas import SingleTurnSample
from ragas.metrics import ResponseRelevancy, LLMContextRecall
from ragas.llms import LangchainLLMWrapper

## **User Action Required**

1. Run the code below to create the ```data``` folder

2. Choose to upload the following files
- ```iceland_articles.csv```
- ```finland_articles.csv```
- ```sweden_articles.csv```

In [3]:
# Directory that will hold the uploaded article CSVs: <current working dir>/data.
data_folder = os.path.join(os.getcwd(), 'data')

# exist_ok=True keeps this cell safe to re-run when the folder is already there.
os.makedirs(name=data_folder, exist_ok=True)

In [4]:
# Prompt the user to upload the three article CSVs through google.colab's `files` helper.
# NOTE(review): presumably returns a mapping keyed by uploaded file name (the next
# cell iterates over its keys) - confirm against the google.colab documentation.
uploaded_files = files.upload()

Saving sweden_articles.csv to sweden_articles.csv
Saving iceland_articles.csv to iceland_articles.csv
Saving finland_articles.csv to finland_articles.csv


In [5]:
# Move every uploaded file from the working directory into the data folder.
for file_name in uploaded_files:
    destination = os.path.join(data_folder, file_name)
    os.rename(file_name, destination)

Your folder structure should now look like this:

```
data
- iceland_articles.csv
- finland_articles.csv
- sweden_articles.csv
```

## **Create document objects from the data and store in Docs**

In [6]:
# One CSV of scraped articles per country, all located in the data folder.
article_names = ['finland_articles.csv', 'iceland_articles.csv', 'sweden_articles.csv']
article_fps = [os.path.join(data_folder, article_name) for article_name in article_names]

# Turn every CSV row into a Document. Only the article body ('Content') is used as
# the page content; country, source and link travel along as metadata.
docs = []
for article_fp in article_fps:
    df = pd.read_csv(article_fp)
    docs.extend(
        Document(
            page_content=row['Content'],
            metadata={
                'country': row['Country'],
                'source': row['Source'],
                'link': row['Article Links'],
            },
        )
        for _, row in df.iterrows()
    )

## **Define Query Decomposition Chain**

Decomposition is a query re-writing technique that focuses on decomposing a question into a set of subquestions.

This is applicable and effective for our use case because users planning a holiday tend to string together many requests in a single query. By breaking a large or complex query down into sub-queries ('atomic' queries), the retriever can retrieve documents that are more relevant to each sub-query and therefore support the LLM in answering the whole query better.

In [7]:
# Query-decomposition prompt: instructs the LLM to break a travel question into
# three independent sub-questions returned as a numbered list ("1. ...", "2. ...",
# "3. ..."), followed by three worked examples. The only template variable is
# {question}.
#
# References:
# https://python.langchain.com/v0.1/docs/use_cases/query_analysis/techniques/decomposition/
# https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_5_to_9.ipynb
decomp_template = """You are an expert at breaking down travel-related questions into smaller, manageable sub-questions.
You have access to a collection of documents about various travel destinations, including information on activities, accommodations, local culture, and transportation.

Perform query decomposition. Given a user travel query, break it down into **distinct and independent** sub-questions that
you need to answer in order to comprehensively address the original query.

**Important:**
1. Ensure that the sub-questions are independent of each other and do not rely on answers from other sub-questions.
2. Cover diverse aspects of the query, ensuring that no sub-question overlaps or repeats similar information.
3. If there are location names, abbreviations, or specific terms you are unfamiliar with, do not attempt to rephrase them or make assumptions about their meaning.
4. Ensure your response adheres strictly to the format provided below. Do not add any additional text, explanations, or information outside of this structure.

Generate sub-questions related to: {question} \n

**Output Format (3 queries only):**
1. Sub-question 1
2. Sub-question 2
3. Sub-question 3

Here are a few examples:

---
Question: When is the best time to go Finland and what is there to do

Response:
1. What is the climate and weather like in Finland during different times of the year?
2. What are the popular activities and attractions to visit in Finland?
3. Are there any cultural events or festivals in Finland that would affect the best time to visit?
---

---
Question: Help me plan a trip to Iceland, I am adventurous and like activities such as hiking

Response:
1. What are some popular hiking trails and locations in Iceland?
2. What is the best time of year to visit Iceland for hiking and outdoor activities?
3. What types of accommodations are available near the hiking areas in Iceland?
---

---
Question: How do I get around Iceland?

Response:
1. What are the different transportation options available for traveling within Iceland?
2. How can I travel between the international airport and Reykjavik, the capital city of Iceland?
3. Are there any notable road conditions or driving rules to be aware of when renting a car in Iceland?
---

"""

# Wrap the raw string in a PromptTemplate so it can be piped into a chain.
decomp_prompt_template = PromptTemplate.from_template(decomp_template)

In [8]:
# Copy the Groq API key from the Colab secrets store into the process environment
# (presumably where ChatGroq looks for it - verify against the langchain_groq docs).
os.environ.update({"GROQ_API_KEY": userdata.get('GROQ_API_KEY')})

# Chat model shared by the decomposition and answering chains; library defaults are used.
llm = ChatGroq()

In [9]:
def clean_questions(raw_questions):
    """Extract the sub-questions from a numbered-list LLM response.

    Args:
        raw_questions: Raw model output, expected to contain one item per line
            in the form ``"<number>. <question>"``.

    Returns:
        List of question strings in their original order, with the numeric
        prefix and surrounding whitespace removed. Lines that do not start
        with ``<number>.`` and items that are empty after the prefix are
        skipped.
    """
    questions = []
    for line in raw_questions.split('\n'):
        match = re.match(r"^\d+\.\s?", line)
        if not match:
            continue
        # Slice off only the leading prefix. Using str.replace here would also
        # delete any later occurrence of the same "<n>. " text inside the
        # question itself (e.g. "1. Is 11. ok" would lose part of "11. ").
        content = line[match.end():].strip()
        if content:
            questions.append(content)
    return questions

In [10]:
# Decomposition pipeline: fill the prompt -> call the LLM -> take the text output
# -> parse the numbered list into a Python list of sub-questions.
decomp_chain = (
    decomp_prompt_template
    | llm
    | StrOutputParser()
    | clean_questions
)

**Simple experiment test queries for query decomposition**

In [11]:
# Sample user queries used to eyeball the quality of the decomposition.
test_queries = [
    "When is the best time to go Iceland and what is there to do",
    "Help me plan a trip to Sweden, I love culture and food",
    "What are fun things to do in Finland?",
    "How is transportation like in Iceland?",
]

In [12]:
# Run each test query through the decomposition chain and print the resulting
# list of sub-questions, separated by a ruler line.
for q in test_queries:
    print(f'Query: {q}', 'Query Decomposed:', sep='\n')
    sub_questions = decomp_chain.invoke({"question": q})
    print(sub_questions)
    print('###################################################################')

Query: When is the best time to go Iceland and what is there to do
Query Decomposed:
['What is the climate and weather like in Iceland during different times of the year?', 'What are the popular activities and attractions for adventure seekers in Iceland, particularly those involving hiking?', 'Are there any cultural events or festivals in Iceland that would be of interest to an adventurous traveler?']
###################################################################
Query: Help me plan a trip to Sweden, I love culture and food
Query Decomposed:
['What are the cultural attractions and events in Sweden that might interest someone who loves culture?', 'What are the recommended Swedish foods and restaurants that I should try during my trip?', 'Which cities or regions in Sweden are known for their cultural and food experiences?']
###################################################################
Query: What are fun things to do in Finland?
Query Decomposed:
['What are some popular and f

<br/>
<br/>

## **Define Best Retriever from Retriever Evaluation**

**Control Variables**

Re-ranked retriever with FAISS Index Flat L2

- Retriever: FAISS with Index Flat L2
- Bi-Encoder/Embeddings model: all-mpnet-base-v2
- Text Split Method: Recursive Character Text Splitter
- Child Chunk Size: 250
- Child Chunk Overlap: 50
- Cross-Encoder/Re-ranking model: BAAI/bge-reranker-large
- Similarity Search type: MMR
- Number of docs to be fetched for MMR: 25
- Number of docs to be retrieved and reranked: 15

**Experimental Variables**

- Number of docs to be fed to LLM: 10, 15
  - Parent Chunk Size: 300 (50 words), 450 (75 words), 600 (100 words)
    - Parent Chunk Overlap: 20%


In [17]:
# ---- Control variables (held fixed across runs) ----
fetch_k = 25          # candidates fetched before MMR selection
top_k = 15            # child chunks returned by the vector store and then re-ranked
search_type = 'mmr'

bi_encoder_embeddings_model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings_model = HuggingFaceEmbeddings(model_name=bi_encoder_embeddings_model_name)
cross_encoder_embedings_model_name = "BAAI/bge-reranker-large"
cross_encoder_model = HuggingFaceCrossEncoder(model_name=cross_encoder_embedings_model_name)

child_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250, chunk_overlap=50, add_start_index=True
)

# ---- Experimental variables ----
num_docs_feed = [10,15]              # candidate numbers of parent chunks fed to the LLM
parent_chunk_sizes = [300,450,600]   # candidate parent chunk sizes (characters)
parent_chunk_overlap_pct = 20        # parent overlap as a percentage of the parent chunk size

# Pick the combination to run by index into the candidate lists above.
idx_docs_feed = 0
idx_chunk_size = 0

docs_feed = num_docs_feed[idx_docs_feed]
# The candidate list has its own name so `parent_chunk_size` is always the selected
# integer (previously the same name was first a list and then rebound to an int).
parent_chunk_size = parent_chunk_sizes[idx_chunk_size]
# Derive the overlap from the selected size so it stays at 20% for every candidate
# (a fixed 60 is only 20% of the 300 setting). Integer arithmetic avoids float rounding.
parent_chunk_overlap = parent_chunk_size * parent_chunk_overlap_pct // 100

parent_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=parent_chunk_size, chunk_overlap=parent_chunk_overlap, add_start_index=True
)

In [14]:
class ParentDocumentReranker(ParentDocumentRetriever):
    """ParentDocumentRetriever that re-ranks child chunks with a cross-encoder.

    Child chunks are fetched from the vector store, scored against the query by
    ``cross_encoder_model``, sorted best-first, and mapped back to their parent
    chunks, which are returned in that same order.

    Two ``search_kwargs`` entries are interpreted by this class and are NOT
    forwarded to the vector store:

    * ``search_type``: ``'mmr'`` selects max-marginal-relevance search; any
      other value (or no value) selects plain similarity search.
    * ``docs_feed``: maximum number of parent chunks to return; when absent,
      all matching parents are returned.

    Every other entry (e.g. ``k``, ``fetch_k``) is passed through to the
    vector store search call.
    """

    # Lets __init__ attach `cross_encoder_model`, which is not a declared field
    # (presumably pydantic model configuration - TODO confirm).
    model_config = {"extra": "allow"}

    def __init__(self, cross_encoder_model, *args, **kwargs):
        """Build the retriever.

        Args:
            cross_encoder_model: Object exposing ``score(text_pairs)``, used to
                re-rank the child chunks.
            *args, **kwargs: Forwarded unchanged to ParentDocumentRetriever.
        """
        super().__init__(*args, **kwargs)
        self.cross_encoder_model = cross_encoder_model

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Get documents relevant to a query.
        Args:
            query: String to find relevant documents for
            run_manager: The callbacks handler to use
        Returns:
            List of parent documents ordered by the cross-encoder score of
            their best child chunk, truncated to ``docs_feed`` when it is set
        """
        # Split the retriever-level options from the ones meant for the vector store.
        local_keys = ("search_type", "docs_feed")
        store_kwargs = {
            key: value for key, value in self.search_kwargs.items() if key not in local_keys
        }
        search_type = self.search_kwargs.get("search_type")
        docs_feed = self.search_kwargs.get("docs_feed")  # None -> no cap

        if search_type == 'mmr':
            sub_docs = self.vectorstore.max_marginal_relevance_search(query, **store_kwargs)
        else:
            sub_docs = self.vectorstore.similarity_search(query, **store_kwargs)

        if not sub_docs:
            return []

        # Score all child chunks in a single batch. Pairs are (query, passage):
        # that is the input order cross-encoder rerankers are used with, whereas
        # the earlier (passage, query) order fed the model its inputs swapped.
        scores = self.cross_encoder_model.score(
            [(query, sub_doc.page_content) for sub_doc in sub_docs]
        )
        ranked_sub_docs = [
            sub_doc
            for sub_doc, _ in sorted(
                zip(sub_docs, scores), key=lambda pair: pair[1], reverse=True
            )
        ]

        # Unique parent ids in ranked order (dict keys preserve insertion order).
        ids = list(
            dict.fromkeys(
                sub_doc.metadata[self.id_key]
                for sub_doc in ranked_sub_docs
                if self.id_key in sub_doc.metadata
            )
        )
        parent_docs = [doc for doc in self.docstore.mget(ids) if doc is not None]
        return parent_docs[:docs_feed]

In [18]:
# Embed a probe string once to discover the embedding dimensionality
# (previously the same probe was embedded twice).
embedding_dim = len(embeddings_model.embed_query("hello world"))

# Empty flat L2 index that will hold the child-chunk vectors.
vector_store_index = faiss.IndexFlatL2(embedding_dim)
# NOTE(review): `index` is not used anywhere in the visible notebook; it is kept
# only in case a later cell references it.
index = faiss.IndexFlatL2(embedding_dim)

faiss_vector_store = FAISS(
    embedding_function=embeddings_model,
    index=vector_store_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Key-value store passed to the retriever as its docstore (parent chunks).
store = InMemoryStore()

custom_retriever = ParentDocumentReranker(
    cross_encoder_model=cross_encoder_model,
    vectorstore=faiss_vector_store,
    docstore=store,
    child_splitter=child_text_splitter,
    parent_splitter=parent_text_splitter,
    search_kwargs={"k": top_k, "fetch_k": fetch_k, "docs_feed": docs_feed, "search_type": search_type}
)

# Index the article documents through the retriever, which was configured with
# the parent and child splitters above.
custom_retriever.add_documents(docs)

<br/>
<br/>

## **Define Re-ordering Function**

Re-order the re-ranked results so that the most relevant documents sit at the two ends of the list (the extremes), which mitigates the "lost-in-the-middle" effect.

In [19]:
def reorder_docs(res):
    """Interleave a best-first ranking so the strongest items sit at both ends.

    Items at even positions of ``res`` (1st, 3rd, 5th, ...) fill the output from
    the front; items at odd positions (2nd, 4th, ...) fill it from the back.
    For example ``[1, 2, 3, 4, 5]`` becomes ``[1, 3, 5, 4, 2]``.

    Args:
        res: Sequence of documents ordered from most to least relevant.

    Returns:
        A new list with the same items in the interleaved order.
    """
    ranked = list(res)
    front = ranked[0::2]       # goes to the head, in ranking order
    back = ranked[1::2][::-1]  # goes to the tail, best item last
    return front + back

<br/>
<br/>
<br/>

## **Build the Final Answer By Answering Each Sub-Question**

In [20]:
# Answering prompt: asks the LLM for a single-paragraph answer in which every
# statement carries inline numbered citations, followed by a "Citations:" list
# quoting the page content of the cited context. Three worked examples are
# included. Template variables: {question} and {context}.
qna_template = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question. Each statement in your answer must be **entailable** by at least one of the provided citations, meaning the information in the statement must align directly and logically with the cited context.

### **IMPORTANT REQUIREMENTS**:
1. **Inline Numbered Citations**: For each statement in your answer, **you must include at least one inline numbered citation** (e.g., [1], [2]). Statements without sufficient support from the context should not be included in the answer.
2. **Entailment Verification**: Ensure that every statement (hypothesis) is directly entailed by the cited context (premise). Use only information explicitly stated or strongly implied by the citation. Do not include unsupported information or make up details.
3. **Sequential Citation Numbering**: Reset the numbering for citations to start from **1** for each response, and ensure the numbering increases sequentially.
4. **Single Paragraph**: Write your response as **a single paragraph**. Avoid using any new line characters in the response. All statements should flow naturally and seamlessly into one another.

At the bottom, provide the full citations corresponding to each number, but include **only the page content** of the Document Object (exclude metadata). Use citations exactly as provided in the context.

Respond in the following format:
---
Statement 1 [1]. Statement 2 [2, 3].

Citations:
[1]: <Page Content 1> //Only the page content of the document
[2]: <Page Content 2>
[3]: <Page Content 3>
---

### **HOW TO APPROACH**:
- Before writing your answer, check if each statement is **entailable** by verifying that it is supported by one or more citations.
- If the context does not support a statement, exclude it from your answer and say "I don't know."

Here are a few examples:

---
The best hikes in Norway include the Reinebringen hike in the Lofoten islands, Preikestolen, Kjeragbolten, and Trolltunga [1, 2]. The Reinebringen hike, although not one of the highest peaks, offers an iconic view of the Reine fjord from its summit [1]. Preikestolen, Kjeragbolten, and Trolltunga are famous for their stunning fjord views and unique geological formations, such as a boulder stuck between a mountain crevasse and a tongue-shaped rock [2].

Citations:
[1]: "The best hikes in Norway include the Reinebringen hike in the Lofoten islands. At a modest 448 meters high, Reinebringen is far from one of the highest peaks on the Lofoten islands. Yet this is more than made up for by the iconic view from the summit of Reine. It is not suitable for winter! Also, the trail can be quite demanding as the steps are quite steep."
[2]: "The most famous hikes in Norway include Preikestolen (a beautiful fjord), Kjeragbolten (with a famous boulder stuck between a mountain crevasse) as well as Trolltunga which resembles a tongue."
---

---
In Switzerland, you can embark on several scenic hikes [1]. One such hike is at Grindelwald, where at the summit, you will find a stunning lake, but it's only visible during the summer [1]. Other notable hikes include Zermatt, also known as the Matterhorn [1], and Lauterbrunnen [1]. For those seeking a challenging hike, the Matterhorn at Zermatt is a must-go [2]. This iconic peak is featured on the Toblerone chocolate and is best explored with a mountain guide due to the inherent dangers [2].

Citations:
[1]: "One of the most scenic hikes in Switzerland can be done at Grindelwald. At the summit of Grindelwald, a beautiful lake awaits you. However, you can only see this lake during summer time. Other notable hikes include Zermatt, i.e. the matterhorn and Lauterbrunnen."
[2]: "The matterhorn at zermatt is a must-go for hiking enthusiasts. It is the icon of the famous chocolate: Toblerone. However, it is recommended to hire a mountain guide to go with you as it can be very dangerous!"
---

---
In Iceland, you can participate in a variety of adventurous activities [1]. For instance, you can hike active volcanoes and explore a natural ice cave, offering unique geological experiences [1]. Driving in Iceland is also an amazing adventure, with open roads, majestic volcanoes, and towering mountains as your backdrop, and the possibility of encountering sheep and arctic foxes along the way [2]. Additionally, Iceland is known for its exceptional diving sites [3]. One of the most famous in the world, Silfra, is located in Iceland [3]. It is the only diving site where you can dive between two tectonic plates, and the water is so fresh that you can drink it, promising an unparalleled tasting experience [3].

Citations:
[1]: "Iceland is a must-go to place for adventurous people! You can hike active volcanoes, drive a jeep through the volcanic ash, explore a natural ice cave, see waterfalls. There are so many opportunities for an adventurer."
[2]: "Driving in Iceland is an amazing experience - open roads, majestic volcanos and towering mountains along the way, sheep and arctic foxes make it a great experience. All you need is an international driving license. And, please drive slowly during the winter!"
[3]: "One of the most famous diving sites in the world, Silfra, is located in Iceland! It is the only diving site in the world where you can dive between 2 tectonic plates. The water is also so fresh that you can drink from it, it is the best water that you will ever taste."
---

Question: {question}

Context: {context}

Helpful Answer:
"""

# Wrap the raw string in a PromptTemplate so it can be piped into a chain.
qna_prompt_template = PromptTemplate.from_template(qna_template)

In [22]:
def get_answer(question,decomp_chain,retriever,answer_chain):
    """Answer a user query by answering each of its sub-questions separately.

    Args:
        question: The original user query.
        decomp_chain: Runnable invoked with ``{"question": question}`` that
            returns an iterable of sub-question strings.
        retriever: Runnable invoked with a sub-question that returns the
            retrieved documents, most relevant first.
        answer_chain: Runnable invoked with ``{"question": ..., "context": ...}``
            that returns the answer for one sub-question.

    Returns:
        Dict mapping each sub-question to
        ``{"answer": <answer>, "retrieved_docs": <documents given to the LLM>}``.
    """
    final_response = {}

    for sub_question in decomp_chain.invoke({"question": question}):
        # Re-order the ranked documents before they reach the LLM. The result
        # used to be assigned to a misspelt name (`retreived_docs`), so the
        # re-ordering was silently discarded.
        retrieved_docs = reorder_docs(retriever.invoke(sub_question))
        answer = answer_chain.invoke({"question": sub_question, "context": retrieved_docs})
        final_response[sub_question] = {
            "answer": answer,
            "retrieved_docs": retrieved_docs,
        }

    return final_response

In [23]:
# Answering pipeline: fill the Q&A prompt -> call the LLM -> take the text output.
answer_chain = (
    qna_prompt_template
    | llm
    | StrOutputParser()
)

In [24]:
# User queries to run through the full decomposition + retrieval + answering pipeline.
questions_answer = [
    "When is the best time to go Iceland and what is there to do",
    "Help me plan a trip to Sweden, I love culture and food",
    "What are fun things to do in Finland?",
    "How is transportation like in Iceland?",
]

In [25]:
# Answer every query and collect the per-sub-question results, one dict per query.
responses = []
for qn in questions_answer:
    response = get_answer(qn, decomp_chain, custom_retriever, answer_chain)
    print('\n\n')  # three blank lines between queries in the cell output
    responses.append(response)

Sub Questions: 
['What is the climate and weather like in Iceland during different times of the year?', 'What are the popular activities and attractions for adventure seekers in Iceland, specifically related to hiking and outdoor experiences?', 'Are there any cultural events or festivals in Iceland that would affect the best time to visit for an adventurous traveler?']
###################################################################
###################################################################
Sub Question: What is the climate and weather like in Iceland during different times of the year?
LLM Answer:
The climate in Iceland is sub-arctic, meaning long and cold winters [4]. During the winter months, temperatures typically stay around medium -3 Celsius due to the moderating influence of the Gulfstream [4]. However, North-Iceland and the Westfjords are known for heavy snowfall, with several meters deep in the winters, making them ideal for winter sports and playing in the snow [5