# Question Answering with RAG Pipeline

## Overview

This notebook builds upon the **filtered questions** obtained from the previous stage and generates corresponding **answers** using a **Retrieval-Augmented Generation (RAG)** pipeline.

## Configuration

At the beginning of the notebook, update the **variables** and **path definitions** to specify the input data, model configuration, and output directories used throughout the workflow.


In [None]:
import os 
import json 


# Base name shared by every input and output file of this run.
file_name = "Pakistan_Monsoon floods and rains in Pakistan-Week 31 2024-prompt-1"

# Per-cluster data produced by the question-generation stage (read as `qc_data`).
cluster_path_one_file = f"./Results/Questions/Test questions different prompts/1-Questions generated/Dev set/questions-{file_name}.json"
# Filtered questions per cluster produced by the filtering stage (read as `questions_data`).
questions_path_one_file = f"./Results/Questions/Test questions different prompts/3-Filtered questions/Dev set/final_questions-{file_name}.json"

# Destination of the generated answers.
# NOTE(review): "answes-" looks like a typo for "answers-"; it is kept unchanged
# because other stages may already read files with this name — confirm before renaming.
answers_path_one_file = f"./Results/Answers/Answers-Subtopics/Dev set/answes-{file_name}.json"

# API keys for the LLM providers. Prefer exporting them in the environment
# instead of pasting them here. A non-empty value below always wins; an empty
# placeholder is only written when the variable is not defined yet, so a key
# that is already exported is never blanked out.
_api_keys = {
    'OPENAI_API_KEY': "",
    'GOOGLE_API_KEY': "",
}
for _env_name, _env_value in _api_keys.items():
    if _env_value or _env_name not in os.environ:
        os.environ[_env_name] = _env_value



# Load files

In [None]:


# Read both inputs as UTF-8 explicitly: without `encoding`, `open` falls back
# to the platform default, which can mis-decode non-ASCII text.

# Per-cluster data; later cells iterate it with `.items()` and read each
# cluster's 'article_titles'.
with open(cluster_path_one_file, 'r', encoding='utf-8') as cluster_file:
    qc_data = json.load(cluster_file)

# Filtered questions, looked up later by the same cluster ids as `qc_data`.
with open(questions_path_one_file, 'r', encoding='utf-8') as questions_file:
    questions_data = json.load(questions_file)

     

In [None]:
# Display the loaded filtered questions as a quick sanity check of the input.
questions_data

Functions adapted from the following GitHub repository: https://github.com/canghongjian/beam_retriever/blob/main/test_model_tmp.py

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter
from langchain_core.documents import Document
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate 
from langchain.schema.runnable import RunnableLambda
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
import numpy as np
from langchain.embeddings import HuggingFaceBgeEmbeddings 

from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.prompts import ChatMessagePromptTemplate, PromptTemplate, ChatPromptTemplate
import langchain


# Pipeline with ColBERT from RAGatouille + EM and F1 for retrieval

In [None]:
from ragatouille import RAGPretrainedModel
from langchain_google_genai import ChatGoogleGenerativeAI

# Retriever backbone: the pretrained ColBERTv2 checkpoint, loaded through RAGatouille.
colbert_checkpoint = "colbert-ir/colbertv2.0"
colbert = RAGPretrainedModel.from_pretrained(colbert_checkpoint)

# Chat model used by the rest of the notebook; temperature 0 is requested.
# Alternative backend kept for reference:
#model = ChatOpenAI(model = 'gpt-4o', temperature=0)
model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
)


In [None]:
from langchain.load import dumps, loads
def reciprocal_rank_fusion(results: list[list], k=60):
    """Merge several ranked document lists into one ranking (Reciprocal Rank Fusion).

    Every document receives ``1 / (rank + k)`` for each list it appears in,
    where ``rank`` is its 0-based position in that list; the contributions are
    summed across lists. Documents are identified by their serialized form
    (``dumps``), so the same document returned for several queries is merged
    into a single entry.

    Args:
        results: One list of documents per query, each assumed to be sorted
            from most to least relevant.
        k: Constant added to the rank in the denominator; larger values
            flatten the difference between top and lower ranks.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by descending
        fused score. Ties keep the order in which documents were first seen.
        An empty ``results`` yields an empty list.
    """
    fused_scores = {}
    for docs in results:
        for rank, doc in enumerate(docs):
            # Serialize so the document can be used as a dictionary key.
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_str), score) for doc_str, score in ranked]
     


# Pipeline with RAG Fusion 

In [None]:
# Show the keys of `questions_data`; the answering loop only processes
# clusters whose id appears among these keys.
questions_data.keys()

Working: 

Add improvement: citations


In [None]:
from langchain.docstore.document import Document
from typing import List

def format_docs_for_prompt(docs: List[Document]) -> str:
    """Render documents as a numbered ``Source N:`` listing for the answer prompt.

    Sources are numbered from 1 in the order given. Only each document's
    ``page_content`` is included (no URL or other metadata). Leading and
    trailing whitespace of the final string is removed.

    Args:
        docs: Documents to list, in the order they should be numbered.

    Returns:
        The concatenated listing, or an empty string when ``docs`` is empty.
    """
    numbered_sections = (
        f"Source {source_id}:\n{doc.page_content}\n\n"
        for source_id, doc in enumerate(docs, start=1)
    )
    return "".join(numbered_sections).strip()

In [None]:
from typing import List, Tuple
def extract_documents_from_scored_tuples(scored_docs: List[Tuple[Document, float]]) -> List[Document]:
    """
    Drop the scores from ``(Document, score)`` pairs and keep the documents.

    Intended for the output of ``reciprocal_rank_fusion``; the input order is
    preserved.
    """
    documents = []
    for document, _score in scored_docs:
        documents.append(document)
    return documents

In [None]:
# Number of passages fetched per generated query and number of fused passages
# handed to the model. The answer prompt tells the model that sources are
# numbered 1..10, so the fused ranking is truncated to this many documents.
TOP_K = 10
# Directory in which RAGatouille stores its ColBERT indexes.
BASE_INDEX_DIR = ".ragatouille/colbert/indexes/"

scores = {}
data = {'cluster_idx': [],'question': [], 'ground_truth_answer': [], 'retrieved_answer': [], 'retrieved_contexts': []}

# ---------------------------------------------------------------------------
# Pieces that do not depend on the cluster or the question are built once.
# ---------------------------------------------------------------------------

# Prompt that expands one question into several search queries. The single
# template variable is `question`.
prompt_dq = ChatPromptTemplate.from_messages([
    ("system", 'You are a helpful assistant that generates multiple search queries based on a single input query.'),
    ("human", 'Generate multiple search queries related to: {question} \n OUTPUT (4 queries):'),
])

# One query per non-blank output line; blank lines are dropped so that empty
# strings are never sent to the retriever.
generate_queries = (
    prompt_dq
    | model
    | StrOutputParser()
    | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
)

template = ''' 
            **Role:** You are an expert AI assistant specializing in answering questions *strictly* from provided source documents. Your primary directive is to provide accurate, concise answers, meticulously citing every piece of information.

**Task:**
1.  Carefully analyze the numbered source documents provided in the "Context" section.
2.  Answer the user's "Query" *exclusively* using information found within these sources.
3.  Every factual statement, phrase, or piece of data in your answer MUST be supported by a citation.

**Instructions for Citation:**
* **Granular Citation:** Place citations `[Source Number]` immediately after the *specific sentence or clause* that the information comes from. Aim for the smallest possible unit of text that can be attributed.
* **Multiple Sources:** If a single piece of information is supported by content from multiple sources, list all relevant source numbers together without spaces, e.g., `[1][2][5]`.
* **No Source, No Statement:** If you cannot find direct support for a piece of information in the provided sources, **DO NOT include that information** in your answer.
* **Source Range:** Your citations *must* correspond to the provided source numbers (1, 2, 3... up to 10). Do not generate citation numbers outside this range.
* **Prioritize Directness:** If multiple sources provide the same information, prioritize the most direct and clear phrasing, and cite all relevant sources.

**Instructions for Answer Content:**
* **Conciseness & Precision:** Keep your answer as concise and direct as possible, while fully addressing the query. Avoid conversational fillers or unnecessary elaboration.
* **No Outside Knowledge:** Do NOT use any information, assumptions, or interpretations not explicitly stated in the provided sources. This is crucial for preventing hallucinations.
* **Handling Missing Information:**
    * If the provided sources contain *relevant* information but *do not definitively answer* the specific question, respond: `No clear answer.`
    * If *none* of the sources contain *any* information relevant to the query, respond: `The provided sources do not contain information relevant to this query.`

**Example:**
Source 1: The sky is red in the evening and blue in the morning.
Source 2: Water is wet when the sky is red.
Source 3: Red skies often appear during sunset.

Query: When is water wet?
Answer: Water will be wet when the sky is red [2]. This phenomenon occurs in the evening [1], frequently during sunset [3].

---

**Context:**
{context}

**Query:** {question}

**Answer:**
            '''
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

# Answer chain; invoked with a dict holding "context" and "question".
full_rag_fusion_chain = prompt | model | StrOutputParser()

# ---------------------------------------------------------------------------
# Answer every filtered question, cluster by cluster.
# ---------------------------------------------------------------------------
for idx, cluster in qc_data.items():
    # Only clusters that still have questions after filtering are answered.
    if idx not in questions_data:
        continue

    questions = questions_data[idx]
    trueanswer = "there is no true answer"  # placeholder: no gold answers at this stage
    print(questions)
    if not questions:
        # Nothing to answer, so do not load or build an index for this cluster.
        continue

    # The index depends only on the cluster, so it is prepared once per
    # cluster rather than once per question.
    index_name = f"{file_name}-{idx}"
    index_path = os.path.join(BASE_INDEX_DIR, index_name)

    if os.path.exists(index_path):
        # Reuse the index already stored on disk.
        colbert = RAGPretrainedModel.from_index(index_path)
    else:
        # Build a new index from the entries under the cluster's 'article_titles' key.
        documents = list(cluster.get('article_titles', []))
        print(f"cluster_articles: {documents}")
        index_path = colbert.index(index_name=index_name, collection=documents)

    retriever = colbert.as_langchain_retriever(k=TOP_K)

    # question -> generated queries -> one retrieval per query -> fused ranking
    ragfusion_chain = generate_queries | retriever.map() | reciprocal_rank_fusion

    for question in questions:
        # Keep the TOP_K best fused documents. The very same list is formatted
        # for the prompt and stored with the answer, so a citation [N] in the
        # answer refers to retrieved_docs[N - 1].
        fused_results = ragfusion_chain.invoke({"question": question})
        fused_docs = extract_documents_from_scored_tuples(fused_results)[:TOP_K]
        retrieved_context_for_prompt = format_docs_for_prompt(fused_docs)
        print("--- RETRIEVED CONTEXT PASSED TO THE MODEL ---")
        print(retrieved_context_for_prompt)
        print("------------------------------------------")

        # Execute the RAG chain to get the answer
        answer = full_rag_fusion_chain.invoke(
            {"context": retrieved_context_for_prompt, "question": question}
        )
        retrieved_docs = [doc.page_content for doc in fused_docs]
        print("Retrieved answer:",  answer)
        print("Real answer:", trueanswer )

        # Append the results to the dictionary
        data["question"].append(question)
        data["ground_truth_answer"].append(trueanswer)
        data["retrieved_answer"].append(answer)
        data["retrieved_contexts"].append(retrieved_docs)
        data['cluster_idx'].append(idx)

#         em_answer = int(set(trueanswer) == set(answer))
#         f1_answer = compute_f1_answers(trueanswer, answer)

        
        
#         scores[i] = { "em_answer": em_answer, "f1_answer": f1_answer}




In [None]:
# Ad-hoc inspection cell: runs a plain retrieval for `question` with `retriever`.
# Both names are leftovers of the answering loop above (its last iteration),
# so this cell only works after that loop has processed at least one question.
retriever.get_relevant_documents(question)

# Save file 

In [None]:
# One record per answered question. The placeholder ground-truth answer kept
# in `data` is deliberately not written to the output file.
output_data = [
    {
        'cluster_id': cluster_id,
        'question': question_text,
        'retrieved_answer': retrieved_answer,
        'retrieved_contexts': retrieved_contexts,
    }
    for cluster_id, question_text, retrieved_answer, retrieved_contexts in zip(
        data["cluster_idx"],
        data['question'],
        data['retrieved_answer'],
        data['retrieved_contexts'],
    )
]

# Create the destination folder first: `open(..., 'w')` raises if it does not
# exist, which would lose the results of the whole answering run.
output_dir = os.path.dirname(answers_path_one_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(answers_path_one_file, 'w', encoding='utf-8') as f:
    json.dump(output_data, f)


# Print RAG results

In [None]:
import textwrap

# Pretty-print every collected entry: question, placeholder ground truth,
# generated answer and the stored contexts wrapped for readability.
entry_separator = "-" * 50 + "\n"  # Separator and extra spacing between entries

for entry_idx, asked_question in enumerate(data["question"]):
    print(f"Question: {asked_question}")
    print(f"Answer: {data['ground_truth_answer'][entry_idx]}")
    print(f"Retrieved Answer: {data['retrieved_answer'][entry_idx]}")

    print("Retrieved Contexts:")
    for passage in data["retrieved_contexts"][entry_idx]:
        # Wrap each context to 80 characters per line.
        print("- " + textwrap.fill(passage, width=80))

    print(entry_separator)
