# **Generation Experiment**

## **User Action Required**

- Set the number of documents to feed the LLM and the size of each document

In [1]:
'''
num_docs_feed = [5,10,15]
parent_chunk_size = [360,500]
'''

docs_feed = 5
parent_chunk_size = 360

In [2]:
%pip install --quiet --upgrade bitsandbytes langchain langchain-community langchain-huggingface transformers beautifulsoup4 faiss-gpu rank_bm25 lark qdrant-client langchain-chroma langchain_groq ragas

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m104.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.0 MB/s[0m eta [36m0:00:

In [3]:
from langchain_core.documents import Document
from langchain.chains.query_constructor.base import AttributeInfo
import torch
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import BitsAndBytesConfig
from langchain_core.output_parsers import StrOutputParser
import re
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.storage import InMemoryStore
from operator import itemgetter
from langchain import hub
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain.retrievers import EnsembleRetriever # Supports Ensembling of results from multiple retrievers
from langchain_community.retrievers import BM25Retriever
from pydantic import BaseModel, Field
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from google.colab import userdata
from langchain import PromptTemplate
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import os
import json
from google.colab import files
import time
from langchain_groq import ChatGroq
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from sentence_transformers import CrossEncoder
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from ragas import SingleTurnSample
from ragas.metrics import ResponseRelevancy, LLMContextRecall
from ragas.llms import LangchainLLMWrapper
import math
import numpy as np

<br/>
<br/>
<br/>

## **User Action Required**

1. Run the code below to create the ```data``` folder

2. Choose to upload the following files
- ```iceland_articles.csv```
- ```finland_articles.csv```
- ```sweden_articles.csv```
- ```generation_eval.json```
- ```sub_questions.json``` (if available)
- ```sub_questions_refusal.json``` (if available)
- ```model_response_docs_feed_X_parent_chunk_size_Y``` (if available)

In [4]:
data_folder = os.path.join(os.getcwd(), 'data')
os.makedirs(data_folder, exist_ok=True)

In [5]:
uploaded_files = files.upload()

Saving finland_articles.csv to finland_articles.csv
Saving generation_eval.json to generation_eval.json
Saving iceland_articles.csv to iceland_articles.csv
Saving sub_questions.json to sub_questions.json
Saving sub_questions_refusal.json to sub_questions_refusal.json
Saving sweden_articles.csv to sweden_articles.csv


In [6]:
for file_name in uploaded_files.keys():
    os.rename(file_name, os.path.join(data_folder, file_name))

Your folder structure should now look as such:

```
data
- iceland_articles.csv
- finland_articles.csv
- sweden_articles.csv
- generation_eval.json
- sub_questions.json (if available)
- sub_questions_refusal.json (if available)
- model_response_docs_feed_X_parent_chunk_size_Y (if available)
```

<br/>
<br/>
<br/>

## **Getting the average number of characters per word from the dataset**

```
article_names = ['finland_articles.csv', 'iceland_articles.csv', 'sweden_articles.csv']
article_fps = [os.path.join(data_folder, article_name) for article_name in article_names]
all_content = []
for article_fp in article_fps:
  df = pd.read_csv(article_fp)
  for _, row in df.iterrows():
    all_content.append(row['Content'])

total_chars = 0
total_words = 0
for para in all_content:
  words = para.split()
  total_chars += sum(len(word) for word in words)
  total_words += len(words)
total_chars/total_words

5.141210817455439
```

<br/>
<br/>
<br/>

## **Create document objects from the data and store in Docs**

In [7]:
article_names = ['finland_articles.csv', 'iceland_articles.csv', 'sweden_articles.csv']
article_fps = [os.path.join(data_folder, article_name) for article_name in article_names]

docs = []
for article_fp in article_fps:
  df = pd.read_csv(article_fp)
  for _, row in df.iterrows():
    #text = row['Title'] + " " + row['Content']
    text = row['Content']

    doc = Document(
        page_content=text,
        metadata={'country': row['Country'], 'source': row['Source'], 'link': row['Article Links']}
    )

    docs.append(doc)

<br/>
<br/>
<br/>

## **Initialise LLM**

In [8]:
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')
# https://console.groq.com/settings/limits
llm = ChatGroq(model="mixtral-8x7b-32768")

<br/>
<br/>
<br/>

## **Get the answerable questions, model answers and the refusal questions**

In [9]:
with open(os.path.join(data_folder,'generation_eval.json'), 'r') as file:
    generation_eval = json.load(file)

question_names = ['question_1', 'question_2', 'question_3', 'question_4', 'question_5']
answerable_questions = []
for question_name in question_names:
  answerable_questions.append(generation_eval[question_name]['question'])

refusal_questions = generation_eval['refusal_questions']

<br/>
<br/>
<br/>

## **If no existing sub questions: Define Query Decomposition Chain**

Decomposition is a query re-writing technique that focuses on decomposing a question into a set of subquestions.

This is applicable and effective for our use case as users planning a holiday tend to string together many requests in a single query. By breaking down a large/complex queries into sub-queries ('atomic' queries), the retriever can retrieve more relevant documents to each sub-query and therefore, support the LLM in answering the whole query better

In [None]:
# Prompt Decomposition template used by the LLM to help break a question into sub questions

# https://python.langchain.com/v0.1/docs/use_cases/query_analysis/techniques/decomposition/
# https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_5_to_9.ipynb
decomp_template = """You are an expert at breaking down travel-related questions into smaller, manageable sub-questions.
You have access to a collection of documents about various travel destinations, including information on activities, accommodations, local culture, and transportation.

Perform query decomposition. Given a user travel query, break it down into **distinct and independent** sub-questions that
you need to answer in order to comprehensively address the original query.

**Important:**
1. Ensure that the sub-questions are unambiguous, clear, and explicit. Avoid generic phrases such as "these activities" or "specific regions". Instead, directly refer to the activities, locations, or topics mentioned in the original query.
2. Ensure that the sub-questions are independent of each other and do not rely on answers from other sub-questions.
3. Cover diverse aspects of the query, ensuring that no sub-question overlaps or repeats similar information.
4. If there are location names, abbreviations, or specific terms you are unfamiliar with, do not attempt to rephrase them or make assumptions about their meaning.
5. **STRICTLY REFUSE to process non-travel-related queries.** If the query is unrelated to travel, explicitly respond with the following message and provide no other explanation:
   **"This query is not related to travel and cannot be broken down."**
6. Ensure your response adheres strictly to the format provided below. Do not add any additional text, explanations, or information outside of this structure.

Generate sub-questions related to: {question} \n

**Output Format (3 queries only):**
1. Sub-question 1
2. Sub-question 2
3. Sub-question 3

Here are a few examples:

---
Question: When is the best time to go Finland and what is there to do

Response:
1. What is the weather like in Finland during different times of the year?
2. What are the best and most suitable activities to do in Finland during each season?
3. Are there any cultural events or festivals in Finland that would influence the best time to visit?
---

---
Question: Help me plan a trip to Iceland, I am adventurous and like activities such as hiking

Response:
1. What are some popular hiking trails and locations in Iceland?
2. What is the best time of year to visit Iceland for hiking and outdoor activities?
3. What types of accommodations are available near the hiking areas in Iceland?
---

---
Question: How do I get around Iceland?

Response:
1. What are the different transportation options available for traveling within Iceland?
2. How can I travel between the international airport and Reykjavik, the capital city of Iceland?
3. Are there any notable road conditions or driving rules to be aware of when renting a car in Iceland?
---

---
Question: How old is Earth?

Response:
**This query is not related to travel and cannot be broken down.**
---

"""

decomp_prompt_template = PromptTemplate.from_template(decomp_template)

In [None]:
def clean_questions(raw_questions):
  questions = []
  raw_questions = raw_questions.split('\n')
  for qn in raw_questions:
    match = re.match(r"^\d+\.\s?", qn)
    if match:
      qn_number = match.group(0)
      qn_content = qn.replace(qn_number, '').strip()
      questions.append(qn_content)
  return questions

In [None]:
decomp_chain = ( decomp_prompt_template | llm | StrOutputParser() | clean_questions)

### **Decompose the answerable test queries**

In [None]:
# Apply the decompsition template and break down the questions into sub questions using the prompt decompsition pipeline
questions_decomposed = {}

for q in answerable_questions:
  print(f'Query: {q}')
  print('Query Decomposed:')
  sub_questions = decomp_chain.invoke({"question":q})
  print(sub_questions)
  questions_decomposed[q] = sub_questions
  print('###################################################################')

Query: Help me plan a trip to Iceland, I love outdoor activities!
Query Decomposed:
['What are some popular outdoor activities and locations in Iceland for adventure seekers?', 'What is the climate and weather like in Iceland during popular months for outdoor activities?', 'What types of accommodations are available near the outdoor activity locations in Iceland?']
###################################################################
Query: What are some special places that I can stay in Finland?
Query Decomposed:
['What types of unique accommodations are available in Finland, such as hotels, resorts, or rentals?', 'Are there any notable resorts or hotels in Finland known for their distinctive architecture or natural settings?', 'Are there any traditional Finnish accommodations, like log cabins or igloos, that offer a special experience for travelers?']
###################################################################
Query: Help me plan a trip to Sweden, I love culture and food!
Query

### **Save the answerable test queries for reproducibility**

In [None]:
with open(f'sub_questions.json', 'w') as file:
    json.dump(questions_decomposed, file, indent=4)

In [None]:
questions_decomposed

{'Help me plan a trip to Iceland, I love outdoor activities!': ['What are some popular outdoor activities and locations in Iceland for adventure seekers?',
  'What is the climate and weather like in Iceland during popular months for outdoor activities?',
  'What types of accommodations are available near the outdoor activity locations in Iceland?'],
 'What are some special places that I can stay in Finland?': ['What types of unique accommodations are available in Finland, such as hotels, resorts, or rentals?',
  'Are there any notable resorts or hotels in Finland known for their distinctive architecture or natural settings?',
  'Are there any traditional Finnish accommodations, like log cabins or igloos, that offer a special experience for travelers?'],
 'Help me plan a trip to Sweden, I love culture and food!': ['What are some culturally significant sites and attractions in Sweden that I should visit?',
  'What are the recommended Swedish dishes and local food specialties that I shoul

### **Decompose the refusal test queries**

In [None]:
# Apply the decompsition template and break down the questions into sub questions using the prompt decompsition pipeline
questions_decomposed_refusal = {}

for q in refusal_questions:
  print(f'Query: {q}')
  print('Query Decomposed:')
  sub_questions = decomp_chain.invoke({"question":q})
  print(sub_questions)
  questions_decomposed_refusal[q] = sub_questions
  print('###################################################################')

Query: How old is Barack Obama?
Query Decomposed:
[]
###################################################################
Query: How fast does a rocket fly?
Query Decomposed:
[]
###################################################################
Query: Who discovered Gravity?
Query Decomposed:
[]
###################################################################
Query: What is the fastest fish in the world?
Query Decomposed:
['What is the top speed recorded for any species of fish in the world?', 'In which ocean or sea does the fastest fish species typically inhabit?', 'What are some notable characteristics and features of the fastest fish species?']
###################################################################
Query: How hot is the Sun?
Query Decomposed:
[]
###################################################################


### **Save the refusal test queries for reproducibility**

In [None]:
with open(f'sub_questions_refusal.json', 'w') as file:
    json.dump(questions_decomposed_refusal, file, indent=4)

In [None]:
questions_decomposed_refusal

{'How old is Barack Obama?': [],
 'How fast does a rocket fly?': [],
 'Who discovered Gravity?': [],
 'What is the fastest fish in the world?': ['What is the top speed recorded for any species of fish in the world?',
  'In which ocean or sea does the fastest fish species typically inhabit?',
  'What are some notable characteristics and features of the fastest fish species?'],
 'How hot is the Sun?': []}

<br/>
<br/>
<br/>

## **If there are existing sub questions and refusal questions: Load them**

In [10]:
with open(os.path.join(data_folder,'sub_questions.json'), 'r') as file:
    questions_decomposed = json.load(file)

In [11]:
with open(os.path.join(data_folder,'sub_questions_refusal.json'), 'r') as file:
    questions_decomposed_refusal = json.load(file)

<br/>
<br/>
<br/>

## **Define Best Retriever from Retreiver Evaluation**

**Control Variables**

- Sub questions
- Retriever: FAISS with Index Flat L2
- Bi-Encoder/Embeddings model: all-mpnet-base-v2
- Text Split Method: Recursive Character Text Splitter
- Child Chunk Size: 250
- Child Chunk Overlap: 50
- Cross-Encoder/Re-ranking model: BAAI/bge-reranker-large
- Similarity Search type: MMR
- Number of docs to be fetched for MMR: 35
- Number of docs to be retrieved and reranked: 20

**Experimental Variables**

- Number of docs to be fed to LLM: 10, 15
  - Parent Chunk Size: 300 (50 words), 600 (100 words)
    - Parent Chunk Overlap: 20%


In [12]:
fetch_k = 35
top_k = 20
bi_encoder_embeddings_model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings_model = HuggingFaceEmbeddings(model_name=bi_encoder_embeddings_model_name)
cross_encoder_embedings_model_name = "BAAI/bge-reranker-large"
cross_encoder_model = HuggingFaceCrossEncoder(model_name=cross_encoder_embedings_model_name)
search_type = 'mmr'
child_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250, chunk_overlap=50, add_start_index=True
)

parent_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=parent_chunk_size, chunk_overlap=math.floor(0.2*parent_chunk_size), add_start_index=True
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [13]:
'''
Custom ParentDocumentRetriever object (ParentDocumentReranker) to do re-ranking of child chunks and return the parent chunks in the same order
'''
class ParentDocumentReranker(ParentDocumentRetriever):
    model_config = {"extra": "allow"}

    def __init__(self, cross_encoder_model, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.cross_encoder_model = cross_encoder_model

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Get documents relevant to a query.
        Args:
            query: String to find relevant documents for
            run_manager: The callbacks handler to use
        Returns:
            List of relevant documents
        """
        if self.search_kwargs['search_type'] == 'mmr':
            sub_docs = self.vectorstore.max_marginal_relevance_search(
                query, **self.search_kwargs
            )
        else:
            sub_docs = self.vectorstore.similarity_search(query, **self.search_kwargs)

        # Reranking logic here with sub_docs (child documents)
        sub_docs_rerank_vals = []
        for r in sub_docs:
          sub_docs_rerank_vals.append((r,self.cross_encoder_model.score((r.page_content,query))))
        sub_docs = [key for key, value in sorted(sub_docs_rerank_vals, key=lambda x: x[1], reverse=True)]

        # We do this to maintain the order of the ids that are returned
        ids = []
        for d in sub_docs:
            if self.id_key in d.metadata and d.metadata[self.id_key] not in ids:
                ids.append(d.metadata[self.id_key])
        docs = self.docstore.mget(ids)
        res = [d for d in docs if d is not None]
        if len(res) > self.search_kwargs['docs_feed']:
            return res[: self.search_kwargs['docs_feed']]
        return res

In [14]:
vector_store_index = faiss.IndexFlatL2(len(embeddings_model.embed_query("hello world")))
index = faiss.IndexFlatL2(len(embeddings_model.embed_query("hello world")))
faiss_vector_store = FAISS(
    embedding_function=embeddings_model,
    index=vector_store_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

store = InMemoryStore()

custom_retriever = ParentDocumentReranker(
    cross_encoder_model=cross_encoder_model,
    vectorstore=faiss_vector_store,
    docstore=store,
    child_splitter=child_text_splitter,
    parent_splitter=parent_text_splitter,
    search_kwargs={"fetch_k": fetch_k, "k": top_k, "docs_feed": docs_feed, "search_type": search_type}
)

custom_retriever.add_documents(docs)

<br/>
<br/>
<br/>

## **Define Re-ordering Function**

Re-order re-ranked results such that extremas have the most relevent documents to mitigate "lost-in-the-middle" effect

In [15]:
def reorder_docs(res):
  res_reordered = [None] * len(res)
  start = 0
  end = len(res) - 1
  place_at_start = True

  for doc in res:
      if place_at_start:
          res_reordered[start] = doc
          start += 1
      else:
          res_reordered[end] = doc
          end -= 1

      place_at_start = not place_at_start
  return res_reordered

<br/>
<br/>
<br/>

## **Answer the Answerable Questions**

In [16]:
qna_template = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question. Each statement in your answer must be **entailable** by at least one of the provided citations, meaning the information in the statement must align directly and logically with the cited context.

### **IMPORTANT REQUIREMENTS**:
1. **Inline Numbered Citations**: For each statement in your answer, **you must include at least one inline numbered citation** (e.g., [1], [2]). Statements without sufficient support from the context should not be included in the answer.
2. **Entailment Verification**: Ensure that every statement (hypothesis) is directly entailed by the cited context (premise). Use only information explicitly stated or strongly implied by the citation. Do not include unsupported information or make up details.
3. **Refusal for Unsupported Queries**: If the retrieved context does not provide sufficient information to answer the question, explicitly respond with: **"I don't know based on the provided context."** Do not attempt to generate an answer.
4. **Sequential Citation Numbering**: Reset the numbering for citations to start from **1** for each response, and ensure the numbering increases sequentially.
5. **Single Paragraph**: Write your response as **a single paragraph**. Avoid using any new line characters in the response. All statements should flow naturally and seamlessly into one another.

At the bottom, provide the full citations corresponding to each number, but include **only the page content** of the Document Object (exclude metadata). Use citations exactly as provided in the context.

Respond in the following format:
---
Statement 1 [1]. Statement 2 [2, 3].

Citations:
[1]: <Page Content 1> //Only the page content of the document
[2]: <Page Content 2>
[3]: <Page Content 3>
---

### **HOW TO APPROACH**:
- Before writing your answer, check if each statement is **entailable** by verifying that it is supported by one or more citations.
- If the context does not support a statement or the entire question, respond with: **"I don't know based on the provided context."**
- Avoid using unsupported statements, ambiguous language, or making up details.

Here are a few examples:

---
The best hikes in Norway include the Reinebringen hike in the Lofoten islands, Preikestolen, Kjeragbolten, and Trolltunga [1, 2]. The Reinebringen hike, although not one of the highest peaks, offers an iconic view of the Reine fjord from its summit [1]. Preikestolen, Kjeragbolten, and Trolltunga are famous for their stunning fjord views and unique geological formations, such as a boulder stuck between a mountain crevasse and a tongue-shaped rock [2].

Citations:
[1]: "The best hikes in Norway include the Reinebringen hike in the Lofoten islands. At a modest 448 meters high, Reinebringen is far from one of the highest peaks on the Lofoten islands. Yet this is more than made up for by the iconic view from the summit of Reine. It is not suitable for winter! Also, the trail can be quite demanding as the steps are quite steep."
[2]: "The most famous hikes in Norway include Preikestolen (a beautiful fjord), Kjeragbolten (with a famous boulder stuck between a mountain crevasse) as well as Trolltunga which resembles a tongue."
---

---
In Switzerland, you can embark on several scenic hikes [1]. One such hike is at Grindelwald, where at the summit, you will find a stunning lake, but it's only visible during the summer [1]. Other notable hikes include Zermatt, also known as the Matterhorn [1], and Lauterbrunnen [1]. For those seeking a challenging hike, the Matterhorn at Zermatt is a must-go [2]. This iconic peak is featured on the Toblerone chocolate and is best explored with a mountain guide due to the inherent dangers [2].

Citations:
[1]: "One of the most scenic hikes in Switzerland can be done at Grindelwald. At the summit of Grindelwald, a beautiful lake awaits you. However, you can only see this lake during summer time. Other notable hikes include Zermatt, i.e. the matterhorn and Lauterbrunnen."
[2]: "The matterhorn at zermatt is a must-go for hiking enthusiasts. It is the icon of the famous chocolate: Toblerone. However, it is recommended to hire a mountain guide to go with you as it can be very dangerous!"
---

---
In Iceland, you can participate in a variety of adventurous activities [1]. For instance, you can hike active volcanoes and explore a natural ice cave, offering unique geological experiences [1]. Driving in Iceland is also an amazing adventure, with open roads, majestic volcanoes, and towering mountains as your backdrop, and the possibility of encountering sheep and arctic foxes along the way [2]. Additionally, Iceland is known for its exceptional diving sites [3]. One of the most famous in the world, Silfra, is located in Iceland [3]. It is the only diving site where you can dive between two tectonic plates, and the water is so fresh that you can drink it, promising an unparalleled tasting experience [3].

Citations:
[1]: "Iceland is a must-go to place for adventurous people! You can hike active volcanoes, drive a jeep through the volcanic ash, explore a natural ice cave, see waterfalls. There are so many opportunities for an adventurer."
[2]: "Driving in Iceland is an amazing experience - open roads, majestic volcanos and towering mountains along the way, sheep and arctic foxes make it a great experience. All you need is an international driving license. And, please drive slowly during the winter!"
[3]: "One of the most famous diving sites in the world, Silfra, is located in Iceland! It is the only diving site in the world where you can dive between 2 tectonic plates. The water is also so fresh that you can drink from it, it is the best water that you will ever taste."
---

Question: {question}

Context: {context}

Helpful Answer:
"""

qna_template_shortened = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question. Each statement must be **entailable** by at least one of the provided citations.

### **REQUIREMENTS**:
1. **Inline Numbered Citations**: Include at least one inline numbered citation (e.g., [1], [2]) for each statement. Exclude unsupported statements.
2. **Entailment Only**: Ensure every statement is directly supported by the cited context. Do not include fabricated or ambiguous information.
3. **Single Paragraph**: Write your answer in one continuous paragraph with sequential citation numbering starting from **1**.
4. **Refuse Unsupported Queries**: If the context does not provide sufficient information, respond with: **"I don't know based on the provided context."**

At the bottom, provide the full citations for each number, using only the page content of the Document Object (exclude metadata).

**Format**:
---
Statement 1 [1]. Statement 2 [2, 3].

Citations:
[1]: <Page Content 1>
[2]: <Page Content 2>
[3]: <Page Content 3>
---

**Example**:
---
The best hikes in Norway include the Reinebringen hike in the Lofoten islands, Preikestolen, and Trolltunga [1, 2]. The Reinebringen hike offers an iconic view of the Reine fjord from its summit [1].

Citations:
[1]: "The best hikes in Norway include the Reinebringen hike in the Lofoten islands. At a modest 448 meters high, Reinebringen is far from one of the highest peaks on the Lofoten islands."
[2]: "The most famous hikes in Norway include Preikestolen and Trolltunga."
---

Question: {question}

Context: {context}

Helpful Answer:
"""

qna_prompt_template = PromptTemplate.from_template(qna_template)

In [17]:
'''
def get_answer(question,decomp_chain,retriever,answer_chain):
  final_response = {}
  print(f'Main Question: {question}')
  print('')
  sub_questions = decomp_chain.invoke({"question":question})
  for sub_question in sub_questions:
      print(f'Sub Question: {sub_question}')
      start_retrieve_time = time.time()
      retrieved_docs = retriever.invoke(sub_question)
      end_retrieve_time = time.time()
      print(f'Retrieve and rerank time: {end_retrieve_time - start_retrieve_time}')
      retreived_docs = reorder_docs(retrieved_docs)
      start_generate_time = time.time()
      try:
        answer = answer_chain.invoke({"question": sub_question,"context": retrieved_docs})
      except Exception as e:
        print(f'Error: {e}')
        answer = None
      end_generate_time = time.time()
      print(f'Generate time: {end_generate_time - start_generate_time}')
      print(answer)
      print('')
      final_response[sub_question] = {
          "answer": answer,
          "retrieved_docs": [doc.page_content for doc in retrieved_docs]
      }
  return final_response
'''

def get_answer(question,sub_questions,retriever,answer_chain,sleep_time_s):
  final_response = {}
  print(f'Main Question: {question}')
  print('')

  if len(sub_questions)==0:
    print('This query cannot be answered')
    return 'This query cannot be answered'

  for sub_question in sub_questions:
      print(f'Sub Question: {sub_question}')
      start_retrieve_time = time.time()
      retrieved_docs = retriever.invoke(sub_question)
      end_retrieve_time = time.time()
      print(f'Retrieve and rerank time: {end_retrieve_time - start_retrieve_time}')
      retreived_docs = reorder_docs(retrieved_docs)
      start_generate_time = time.time()
      try:
        answer = answer_chain.invoke({"question": sub_question,"context": retrieved_docs})
      except Exception as e:
        print(f'Error: {e}')
        answer = None
      end_generate_time = time.time()
      print(f'Generate time: {end_generate_time - start_generate_time}')
      print(answer)
      print('')
      final_response[sub_question] = {
          "parent_question": question,
          "answer": answer,
          "retrieved_docs": [doc.page_content for doc in retrieved_docs]
      }
      print('Start sleep time')
      time.sleep(sleep_time_s)
      print('End sleep time')
      print('')
      print('')
  return final_response

In [18]:
answer_chain = ( qna_prompt_template | llm | StrOutputParser() )

In [19]:
responses = []
for qn,sub_qns in questions_decomposed.items():
  print('')
  response = get_answer(qn,sub_qns,custom_retriever,answer_chain,60)
  print('')
  print('')
  print('')
  print('##############################################################')
  responses.append(response)


Main Question: Help me plan a trip to Iceland, I love outdoor activities!

Sub Question: What are some popular outdoor activities and locations in Iceland for adventure seekers?
Retrieve and rerank time: 1.4827911853790283
Generate time: 1.2364346981048584
In Iceland, adventure seekers can participate in a variety of outdoor activities, such as hiking, trail running, and even alpine skiing [1, 2, 4]. For hiking enthusiasts, there are numerous trails to choose from, some of which are multi-day adventures in remote wilderness areas [1]. Popular trails include the Laugavegur trail between Landmannalaugar and Þórsmörk, Fimmvörðuháls, Lónsöræfi, Hornstrandir, and Vatnaleið [1]. These trails offer unique experiences, such as exploring waterfalls, hot springs, volcanoes, and glaciers [1]. Moreover, Iceland is an ideal destination for trail running due to its mountainous and rugged terrain [3].

Apart from hiking and trail running, Iceland also offers alpine skiing opportunities at Hlíðarfjal

KeyboardInterrupt: 

## **Save Model Responses for Answerable Questions**

In [None]:
with open(f'model_response_docs_feed_{docs_feed}_parent_chunk_size_{parent_chunk_size}.json', 'w') as file:
    json.dump(responses, file, indent=4)

<br/>
<br/>
<br/>

## **Load Model Responses for Answerable Questions if not ran**

In [None]:
with open(os.path.join(data_folder,'model_response_docs_feed_10_parent_chunk_size_500.json'), 'r') as file:
    responses = json.load(file)

<br/>
<br/>

## **Evaluate Model Answers**

In [None]:
def process_response_answer_only(response):
  answer_only = response.split('Citations')[0].replace("\n", " ").strip()
  return answer_only

### **Response Relevancy of the concatenation of the sub-answers to the main-question**

In [None]:
response_relevancy_scores = []

for i in range(len(responses)):
  sub_answers = []
  response = responses[i]
  for sub_question in response:
    sub_answers.append(process_response_answer_only(response[sub_question]['answer']))

  answer = '.'.join(sub_answers)
  parent_question = response[sub_question]['parent_question']
  sample = SingleTurnSample(
        user_input=parent_question,
        response=answer
    )
  scorer = ResponseRelevancy(llm=LangchainLLMWrapper(llm), embeddings = embeddings_model)
  response_relevancy_scores.append(await scorer.single_turn_ascore(sample))
  print(f'Score for question: {parent_question}')
  print(f'Concatenated answer: {answer}')
  print(response_relevancy_scores[-1])
  print('')
  print('Start sleep time')
  time.sleep(60)
  print('End sleep time')
  print('')
  print('')

Score for question: Help me plan a trip to Iceland, I love outdoor activities!
Concatenated answer: Iceland offers a variety of outdoor activities for adventure seekers, including hiking, trail running, and winter sports [1, 2, 4]. The country is known for its comfortable nature trails and challenging multi-day adventures in remote wilderness, with trails running across mountains, valleys, rugged coastlines, moss-grown lava fields, and featuring waterfalls, hot springs, volcanoes, and glaciers [1]. Some popular hiking routes in Iceland are the Laugavegur trail between Landmannalaugar and Þórsmörk, Fimmvörðuháls, Lónsöræfi, Hornstrandir, and Vatnaleið [2].  In addition to hiking, trail running is popular in Iceland due to its mountainous and rugged terrain [2]. For winter sports, Hlíðarfjall in Akureyri and Bláfjall in Reykjavík invite Alpine skiers, snowboarders, and cross-country trackers [4]. Snowshoeing, guided off-track cross-country tours, snowmobiling, and dogsledding are also av

In [None]:
response_relevancy_scores

[0.790250787912376,
 0.8207326317372429,
 0.8088231349402563,
 0.8691180978311509,
 0.9265644425007363]

In [None]:
mean_response_relevancy_score = np.mean(response_relevancy_scores)

In [None]:
print(f'The average response relevancy score of the sub-answers to the main-question is {mean_response_relevancy_score}')

The average response relevancy score of the sub-answers to the main-question is 0.8430978189843525


<br/>
<br/>
<br/>

## **Evaluate Citations**

In [None]:
# https://www.sbert.net/docs/cross_encoder/pretrained_models.html#nli
nli_model = CrossEncoder("cross-encoder/nli-deberta-v3-base")

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/417 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]



In [None]:
all_answers = []
for response in responses:
  for sub_question in response:
    all_answers.append(response[sub_question]['answer'])

In [None]:
def process_response(response):
  # Get the answer and citations
  response_split = response.split('Citations')
  answer = response_split[0].split('\n')
  citations = response_split[-1].split('\n')

  # For each citation, store its number (key) and text (value) in a dict
  citations_dict = {}
  for citation in citations:
      match = re.match(r"\[(.*?)\]", citation)
      if match:
          citation_number = match.group(1)
          citation_content = ''.join(citation.split(f'[{citation_number}]:')).strip()
          citations_dict[citation_number] = citation_content

  # Extract the statements from the answer
  statements = []
  # For each element in the answer
  for a in answer:
    # If the element's length is > 0
    if len(a.strip())>0:
      # Split the element by '.' to get each statement
      for b in a.split('.'):
        # If the statement's length is > 0, add it to the overall statements
        if len(b.strip())>0:
          statements.append(b.strip())

  # For each statement, store it as a key and it's corresponding citations as value
  statements_citations = {}
  for statement in statements:
      statement_citations = []
      inline_citations = re.findall(r"\[(.*?)\]", statement)
      # For each citation group found in the statement
      for citation_group in inline_citations:
          # To handle cases such as [1, 2]
          for citation in citation_group.split(','):
            citation = citation.strip()
            # If it's a 'valid' citation
            if citation in citations_dict:
                # Clean the text to be free of that citation
                statement = statement.replace(f"[{citation_group}]", "").strip()
                # Append that citation to the statement's citations
                statement_citations.append(citations_dict[citation])
      # Add the statement and its corresponding citations to the dictionary
      statements_citations[statement] = statement_citations

  return statements_citations

### **Citation Recall**

In [None]:
def response_citation_recall(response):
  statement_citations = process_response(response)
  entailment_count = 0
  # For each statement, calculate if there is an entailment by feeding the (concatenated citations i.e. premise, statement i.e. hypothesis)
  for k,v in statement_citations.items():
    # Premise should come first
    # https://towardsdatascience.com/natural-language-inference-an-overview-57c0eecf6517
    logits = nli_model.predict(['.'.join(v),k])
    probabilities = torch.softmax(torch.tensor(logits), dim=0)
    formatted_probabilities = [float(f"{val:.4f}") for val in probabilities]
    # If there is an entailment, increment the entailment count for entire response
    if logits.argmax()==1:
      entailment_count+=1
  # To calculate the citation recall for the response, normalise the total entailment count by the number of statements in the response
  # https://ar5iv.labs.arxiv.org/html/2305.14627
  response_citation_recall_val = entailment_count/len(statement_citations)
  return response_citation_recall_val

In [None]:
def overall_citation_recall(responses):
  cumulative_response_citation_recall = 0
  # For each response/set of statements, get its citation recall value
  for response in responses:
    response_citation_recall_val = response_citation_recall(response)
    cumulative_response_citation_recall+=response_citation_recall_val
  # To get the citation recall over the set of responses, sum the citation recall value of each response and normalise it by the number of responses
  return cumulative_response_citation_recall/len(responses)

In [None]:
overall_citation_recall_val = overall_citation_recall(all_answers)

In [None]:
print(f'The overall citation recall is {overall_citation_recall_val}')

The overall citation recall is 0.4389417989417989


### **Citation Precision**

In [None]:
def response_citation_precision(response):
  label_mapping = ['contradiction', 'entailment', 'neutral']
  statement_citations = process_response(response)
  precision_count = 0
  citation_count = 0
  # For each statement,
  for k,v in statement_citations.items():
    # For each citation in the statement,
    for i in range(len(v)):
      # Increment the total citation count for the response
      citation_count+=1
      candidate_citation = v[i]
      other_citations = v[:i] + v[i+1:]
      logits_candidate = nli_model.predict([candidate_citation,k])
      logits_other = nli_model.predict(['.'.join(other_citations),k])
      # If the citation fully supports the statement or the other set of citations (excluding the curr one) do not support the statment,
      # increment the precision count
      if logits_candidate.argmax()==1 or logits_other.argmax()!=1:
        precision_count+=1
  # https://ar5iv.labs.arxiv.org/html/2305.14627
  return precision_count/citation_count

In [None]:
def overall_citation_precision(responses):
  cumulative_response_citation_precision = 0
  # For each response/set of statements, get its citation precision value
  for response in responses:
    response_citation_precision_val = response_citation_precision(response)
    cumulative_response_citation_precision +=response_citation_precision_val
  # To get the citation precision over the set of responses, sum the citation response value of each response and normalise it by the number of responses
  return cumulative_response_citation_precision/len(responses)

In [None]:
overall_citation_precision_val = overall_citation_precision(all_answers)

In [None]:
print(f'The overall citation precision is {overall_citation_precision_val}')

The overall citation precision is 0.9418229918229919


### **Calculate Attribution Groundedness/Citations Grounded (F1_CG)**

In [None]:
f1_cg = (2 * (overall_citation_precision_val * overall_citation_recall_val)) / (overall_citation_precision_val + overall_citation_recall_val)

In [None]:
print(f'The overall citation recall is {overall_citation_recall_val}')
print(f'The overall citation precision is {overall_citation_precision_val}')
print(f'The citations grounded value is: {f1_cg}')

The overall citation recall is 0.4389417989417989
The overall citation precision is 0.9418229918229919
The citations grounded value is: 0.5988065180696711


<br/>
<br/>

## **Evaluate Response Truthfulness (Grounded Refusal)**

### **Quality of Answering**

In [None]:
# Manually add or just set to the same if all answerable questions were answered
model_answered_questions = answerable_questions
recall_answer = len(set.intersection(set(answerable_questions),set(model_answered_questions))) / len(answerable_questions)
precision_answer = len(set.intersection(set(answerable_questions),set(model_answered_questions))) / len(model_answered_questions)
f1_answer = (2 * (precision_answer*recall_answer)) / (precision_answer + recall_answer)
print(f'Recall answer: {recall_answer}')
print(f'Precision answer: {precision_answer}')
print(f'F1 answer: {f1_answer}')

Recall answer: 1.0
Precision answer: 1.0
F1 answer: 1.0


### **Quality of Refusals**

In [None]:
for qn,sub_qns in questions_decomposed_refusal.items():
  print('')
  response = get_answer(qn,sub_qns,custom_retriever,answer_chain,60)
  print('')
  print('')
  print('')
  print('##############################################################')


Main Question: How old is Barack Obama?

This query cannot be answered



##############################################################

Main Question: How fast does a rocket fly?

This query cannot be answered



##############################################################

Main Question: Who discovered Gravity?

This query cannot be answered



##############################################################

Main Question: What is the fastest fish in the world?

Sub Question: What is the top speed recorded for any species of fish in the world?
Retrieve and rerank time: 0.7002701759338379
Generate time: 0.4521656036376953
I don't know based on the provided context whether a top speed has been recorded for any species of fish in the world. The context includes information about fishing, lake activities, and the Icelandic horse, but it does not provide data on the top speed of fish species.

Start sleep time
End sleep time


Sub Question: In which ocean or sea does the fastest fish s

In [None]:
refusal_questions

['How old is Barack Obama?',
 'How fast does a rocket fly?',
 'Who discovered Gravity?',
 'What is the fastest fish in the world?',
 'How hot is the Sun?']

In [None]:
# Manually add or just set to the same if all refusal questions were refused
model_refused_questions = ['How old is Barack Obama?', 'How fast does a rocket fly?', 'Who discovered Gravity?', 'How hot is the Sun?']
recall_refusal = len(set.intersection(set(refusal_questions),set(model_refused_questions))) / len(refusal_questions)
precision_refusal = len(set.intersection(set(refusal_questions),set(model_refused_questions))) / len(model_refused_questions)
f1_refusal = (2 * (precision_refusal*recall_refusal)) / (precision_refusal + recall_refusal)
print(f'Recall refusal: {recall_refusal}')
print(f'Precision refusal: {precision_refusal}')
print(f'F1 refusal: {f1_refusal}')

Recall refusal: 0.8
Precision refusal: 1.0
F1 refusal: 0.888888888888889


### **Grounded Refusal F1_RG**

In [None]:
f1_rg = 0.5*(f1_refusal + f1_answer)
print(f'The grounded refusal value is: {f1_rg}')

The grounded refusal value is: 0.9444444444444444


</br>
</br>
</br>

## **Summary of Results**

In [None]:
res_summary = {
    'mean_response_relevancy_score': mean_response_relevancy_score,
    'overall_citation_recall_val': overall_citation_recall_val,
    'overall_citation_precision_val': overall_citation_precision_val,
    'f1_cg': f1_cg,
    'recall_answer': recall_answer,
    'precision_answer': precision_answer,
    'f1_answer': f1_answer,
    'recall_refusal': recall_refusal,
    'precision_refusal': precision_refusal,
    'f1_refusal': f1_refusal,
    'f1_rg': f1_rg
}

In [None]:
res_summary

{'mean_response_relevancy_score': 0.8430978189843525,
 'overall_citation_recall_val': 0.4389417989417989,
 'overall_citation_precision_val': 0.9418229918229919,
 'f1_cg': 0.5988065180696711,
 'recall_answer': 1.0,
 'precision_answer': 1.0,
 'f1_answer': 1.0,
 'recall_refusal': 0.8,
 'precision_refusal': 1.0,
 'f1_refusal': 0.888888888888889,
 'f1_rg': 0.9444444444444444}

In [None]:
with open(f'model_results_docs_feed_{docs_feed}_parent_chunk_size_{parent_chunk_size}.json', 'w') as file:
    json.dump(res_summary, file, indent=4)