# Executive Summary Generation

## Overview

This notebook generates **executive summaries** for all files within a specified folder.  
Starting from the **context used in the answers**, the summaries are produced using a **Retrieval-Augmented Generation (RAG)** pipeline.

## Configuration

At the beginning of the notebook, update the **path variables** to define the input data, model configuration, and output directories used throughout the workflow.



In [None]:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import SpacyTextSplitter
from langchain_core.documents import Document
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate 
from langchain.schema.runnable import RunnableLambda
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
import numpy as np
from langchain.embeddings import HuggingFaceBgeEmbeddings 

from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.prompts import ChatMessagePromptTemplate, PromptTemplate, ChatPromptTemplate
import langchain
from langchain_google_genai import ChatGoogleGenerativeAI
from datasets import load_dataset
import os
import json
import numpy as np
import re 
# API keys for the OpenAI and Google clients. Paste a key between the quotes
# to use it for this session. A blank placeholder never overwrites a key that
# is already exported in the environment; it only guarantees the variable
# exists (as an empty string), which is what this cell has always done.
for _env_name, _pasted_key in (('OPENAI_API_KEY', ""), ('GOOGLE_API_KEY', "")):
    if _pasted_key:
        os.environ[_env_name] = _pasted_key
    else:
        os.environ.setdefault(_env_name, "")

In [None]:

# Workflow locations (edit these before running the notebook).
# input_path:     folder holding the per-file answer JSONs to summarise.
# output_path:    folder that receives one summary JSON per input file.
# base_index_dir: directory under which the ColBERT indexes are looked up.
input_path = "./Results/Answers/Answers-subtopics/Dev set/Updated_citations"
output_path = "./Results/Executive Summary/Dev set"
base_index_dir = ".ragatouille/colbert/indexes/"

In [None]:
from ragatouille import RAGPretrainedModel

# Retrieval model: the pretrained ColBERTv2 checkpoint, loaded via RAGatouille.
colbert = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

# Chat model used for both query generation and summary writing; the sampling
# temperature is pinned to 0.
model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
)

In [None]:
from langchain.load import dumps, loads
def reciprocal_rank_fusion(results: list[list], k=60):
    """Merge several ranked document lists into one ranking.

    Every appearance of a document in one of the lists adds
    ``1 / (rank + k)`` to that document's fused score, where ``rank`` is the
    document's 0-based position in the list. Documents are identified by
    their serialized form (``dumps``), so the same document returned for
    different queries accumulates a single score.

    Args:
        results: One list of documents per query, each assumed to be ordered
            from most to least relevant.
        k: Constant added to the rank in the denominator; larger values
            reduce the score gap between neighbouring ranks.

    Returns:
        A list of ``(document, fused_score)`` tuples ordered by descending
        fused score. Documents with equal scores keep first-seen order.
    """
    fused_scores = {}
    for docs in results:
        # Assumes the docs are returned in sorted order of relevance
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    # Deserialize only once, after fusion, so each unique document is rebuilt
    # a single time.
    return [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

# Create a summary for each file in the input folder

In [None]:



# Ensure the output directory exists
os.makedirs(output_path, exist_ok=True)

# Get all JSON files in the input directory, in name order. The built-in
# sorted() returns a plain list of str (also when the folder holds no JSON
# files), so the file names behave like ordinary strings downstream.
json_files = sorted(f for f in os.listdir(input_path) if f.endswith('.json'))

# Define the RAG Fusion components outside the loop if they don't change per file

# Query-expansion prompt. The declared input variable must match the
# placeholder used in the human message ({question}); a bare string passed to
# the chain is bound to that single variable.
prompt_dq = ChatPromptTemplate(input_variables=['question'],
                               messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[],template='You are a helpful assistant that generates multiple search queries based on a single input query.')),
                                         HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='Generate multiple search queries related to: {question} \n OUTPUT (4 queries):'))])

# Turns one input query into a list of search queries, one per non-blank line
# of the model output. Blank lines are dropped so they are never sent to the
# retriever as empty queries.
generate_queries = (
    prompt_dq
    | model
    | StrOutputParser()
    | (lambda x: [line for line in x.split("\n") if line.strip()])
)

# Summary prompt. {context} must appear exactly once: every occurrence of the
# placeholder is replaced by the full source text, so mentioning it inside the
# instructions would paste all sources into the middle of the instruction list.
template = '''
**You are an AI Assistant specialized in summarizing information based *strictly* on provided source documents.**

**Your Task:**
Carefully analyze the source documents provided in the context below. Then, generate a concise, one-paragraph summary that provides an overview of the situation described in the documents, focusing on the most important information happening in the country or situation at the given moment.

**Instructions & Constraints:**

1.  **Source Adherence:** Your summary MUST be based *solely* on the information explicitly stated or clearly inferable from the provided numbered sources. Do not include any information, assumptions, or interpretations not explicitly stated in the context.
2.  **Conciseness & Overview:** Keep your summary concise, aiming for approximately one paragraph, and focused on providing a high-level understanding of the situation and its most important aspects. Aim for a narrative flow that integrates information from different sources where appropriate.
3.  **Citation Requirement:** Every piece of information or statement in your summary should be attributed to the source(s) it came from.
4.  **Citation Format:**
    * Use bracketed numbers corresponding to the source number(s), like `[1]`, `[2]`, etc.
    * Place citations *immediately* after the information they support.
    * **If multiple sources support the *same* specific piece of information or statement, list all relevant source numbers together without spaces, like `[1][2]` or `[3][5][7]`.**
    * Aim to associate citations with the specific sentence or clause they verify.
    * Every summary **must** contain at least one citation.
5.  **Handling Lack of Information:**
    * If the provided sources contain relevant information but do not offer enough detail to form a comprehensive overview, state: `The provided sources offer limited information for a comprehensive overview.`
    * If *none* of the sources contain information relevant to describing a situation, state: `The provided sources do not contain information relevant to describing a situation.`
6.  Your citation numbers should correspond **accurately** to the sources provided in the context block below. Do not put numbers to citations that are greater than the number of paragraphs you receive.

---

**Now, fulfill the request based on the context below:**


{context}
------

**Summary:**
'''
# "question" stays declared for the callers that pass it, although the
# template itself only consumes {context}.
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

# Seed query that is expanded into several search queries before retrieval.
query = "Generate a summary of the current situation. "

# Answer stage shared by every file: numbered context -> prompt -> LLM -> text.
answer_chain = prompt | model | StrOutputParser()

# Matches bracketed citation numbers such as [3] in the generated summary.
citation_pattern = re.compile(r'\[(\d+)\]')

# Summarise every JSON file found in the input folder.
for file in json_files:
    # NOTE(review): the 'answes' spelling presumably mirrors the on-disk file
    # names — confirm before changing it.
    file_name = file.replace('answers_new_cit-answes-', '').replace('-prompt-1.json', '')
    print(f"Processing file: {file_name}")

    # Load JSON data
    with open(os.path.join(input_path, file), 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Collect every distinct context used by the answers; sorting keeps the
    # collection order stable between runs.
    full_used_paragraphs = set()
    for item in json_data:
        full_used_paragraphs.update(item['new_used_contexts'])
    full_used_paragraphs_list = sorted(full_used_paragraphs)

    if not full_used_paragraphs_list:
        # Nothing to index or summarise for this file.
        print(f"Warning: no used contexts found for {file_name}; skipping.")
        continue

    index_name = f"{file_name}-summary"
    index_path = os.path.join(base_index_dir, index_name)

    # Reuse an existing index when there is one; otherwise load the pretrained
    # checkpoint and index this file's paragraphs. The checkpoint is loaded
    # only in the branch that needs it.
    if os.path.exists(index_path):
        colbert = RAGPretrainedModel.from_index(index_path)
    else:
        colbert = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
        index_path = colbert.index(index_name=index_name, collection=full_used_paragraphs_list)
        print(f"Index '{index_name}' created/updated successfully at '{index_path}'.")

    retriever = colbert.as_langchain_retriever(k=20)

    # Query expansion -> one retrieval per generated query -> rank fusion.
    ragfusion_chain = generate_queries | retriever.map() | reciprocal_rank_fusion

    # Retrieve first and keep the ranked passages, so that the numbers shown to
    # the model and the numbers resolved below refer to the same list.
    # NOTE(review): retrieved passages may be chunks of the input paragraphs,
    # presumably depending on RAGatouille's indexing settings — confirm.
    ranked_docs = ragfusion_chain.invoke(query)
    context_paragraphs = [doc.page_content for doc, _score in ranked_docs]

    # The prompt asks for citations of numbered sources, so number them here.
    numbered_context = "\n\n".join(
        f"[{number}] {paragraph}"
        for number, paragraph in enumerate(context_paragraphs, start=1)
    )

    # Generate the summary from the numbered context
    answer = answer_chain.invoke({"context": numbered_context, "question": query})

    # Extract cited numbers (deduplicated, ascending)
    extracted_numbers = sorted({int(num) for num in citation_pattern.findall(answer)})

    # Resolve each citation against the passages the model actually received
    cited_paragraphs = []
    for number in extracted_numbers:
        idx = number - 1  # Adjust for 0-based indexing
        if 0 <= idx < len(context_paragraphs):
            cited_paragraphs.append(context_paragraphs[idx])
        else:
            print(f"Warning: Citation number {number} out of bounds for {file_name}. Max index is {len(context_paragraphs)}.")

    # Prepare data for saving. "context_paragraphs" is the numbered source list
    # ([1] is its first element), which makes the citations in "summary"
    # resolvable from the saved file alone.
    results_to_save = {
        "summary": answer,
        "cited_paragraphs": cited_paragraphs,
        "context_paragraphs": context_paragraphs,
        "full_paragraphs": full_used_paragraphs_list
    }
    print(f"Number of docs {len(full_used_paragraphs_list)} for {file_name}")

    # Define output file path
    output_file_name = f"summary-{file_name}.json"
    output_file_path = os.path.join(output_path, output_file_name)

    # Save the results
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(results_to_save, outfile, indent=4)
    print(f"Summary and cited paragraphs saved to {output_file_path}\n")

print("All files processed!")