# Cluster Citation Post-processing and Question Summarization

## Overview

This notebook performs two main tasks:

1. **Post-processing of citations** within each cluster, shifting the citation numbers of every answer into its own range so that they stay **unique** across the cluster (e.g., 1–10 for the first answer in a cluster, 11–20 for the second, etc.).
2. **Summarization of the answers** to the questions associated with each **sub-topic** (cluster), merging them into one cohesive, citation-preserving overview of the clustered content.

## Configuration

At the beginning of the notebook, update the **path variables** to specify the input data, output directories, and any other required resources.



In [None]:
import json 
import numpy as np
import os 
import re

# --- Configuration: adjust these paths before running the notebook ---
input_path = "./Results/Answers/Answers-subtopics/Dev set/Updated_citations"
output_path = "./Results/Summaries/UniqueSummary-EachCluster/Dev set"

# Read the OpenAI key from the environment instead of pasting it into the
# notebook (a saved notebook would leak it). Falls back to an empty string,
# which is what this variable held before, when the variable is not set.
openaikey = os.environ.get("OPENAI_API_KEY", "")

# All JSON inputs, in a deterministic (sorted) order.
json_files = np.sort([f for f in os.listdir(input_path) if f.endswith('.json')])

# Position (in the sorted listing) of the single file inspected interactively
# by the next few cells.
file_index = 2
if not 0 <= file_index < len(json_files):
    raise IndexError(
        f"file_index={file_index} is out of range: only {len(json_files)} "
        f"JSON file(s) found in '{input_path}'"
    )

file = json_files[file_index]
# Strip the fixed prefix/suffix of the input naming scheme to get a short name.
file_name = file.replace('answers_new_cit-answes-', '').replace('-prompt-1.json', '')

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (the outputs of this notebook are written as UTF-8 too).
with open(os.path.join(input_path, file), 'r', encoding='utf-8') as f:
    json_data = json.load(f)


In [None]:
# Display the name of the input file that was selected and loaded above.
file

In [None]:
# Group the answers of the loaded file by cluster, preserving their order.
clusters = {}
for item in json_data:
    cluster_id = item.get('cluster_id')
    retrieved_answer = item.get('updated_retrieved_answer')
    # Test for "missing" explicitly: a plain truthiness check would silently
    # drop a perfectly valid cluster whose id is the integer 0.
    if cluster_id is None or cluster_id == "" or retrieved_answer is None:
        continue
    clusters.setdefault(cluster_id, []).append(retrieved_answer)

# Matches a numeric citation marker such as "[12]" and captures the number.
citation_pattern = re.compile(r'\[(\d+)\]')
# Size of the citation range reserved for each answer (1-10, 11-20, ...).
default_stride = 10

# cluster_id -> list of answers whose citation numbers are unique in the cluster
modified_answers = {}
for cluster_id in sorted(clusters.keys()):  # sorted for a consistent order
    answers_in_cluster = clusters[cluster_id]

    # With a fixed stride of 10, an answer citing e.g. [11] would collide with
    # [1] of the following answer. Widen the stride only when the cluster
    # actually contains a larger citation number; otherwise it stays at 10.
    highest_citation = max(
        (int(num) for answer in answers_in_cluster
         for num in citation_pattern.findall(answer)),
        default=0,
    )
    stride = max(default_stride, highest_citation)

    modified_answers[cluster_id] = []
    for i, original_answer in enumerate(answers_in_cluster):
        # The first answer keeps its numbers (offset 0); later ones are shifted.
        offset = i * stride

        # Single pass over the text: every marker is rewritten exactly once,
        # so a freshly written number can never be picked up and shifted again.
        modified_answer = citation_pattern.sub(
            lambda match: '[{}]'.format(int(match.group(1)) + offset),
            original_answer,
        )

        modified_answers[cluster_id].append(modified_answer)

In [None]:
# Display the renumbered answers, keyed by cluster id.
modified_answers

In [None]:
from openai import OpenAI

# Client used for every completion request issued by this cell.
client = OpenAI(api_key=openaikey)

# Instruction template; the joined answers of one cluster replace
# the {text_to_integrate} placeholder.
prompt = """
Your task is to integrate the following pieces of text into a single, cohesive, and flowing narrative. The goal is to present as much of the original information as possible, not to summarize it briefly.

The text contains information with citations, formatted as `[number]`. It is crucial that you adhere to the following rules:

1.  **Integrate all key information**: Combine sentences and ideas from the input to form a comprehensive and coherent text. Aim to include a good portion, if not all, of the provided details.
2.  **Maintain original citations**: Every piece of information you include in the integrated text must retain its original citation(s).
3.  **Handle citations when combining**: If you rephrase or combine sentences containing information from multiple sources, ensure that *all* relevant original citation numbers for that combined information are included at the end of the new sentence. For example, if a new sentence merges details from original sentences cited `[1]` and `[5]`, the new sentence should be followed by `[1][5]`.
4.  **Ensure logical flow**: Arrange the information in a way that creates a natural and readable progression of ideas, even if it means reordering content from the original input.
5.  **Avoid external knowledge**: Your output must be based *solely* on the information provided in the input text. Do not introduce any outside facts or personal opinions.

Here is the text to integrate:
{text_to_integrate}
"""

# cluster_id -> integrated text, or an error note when the request failed.
summarized_answers = {}

for cluster_id, answers in modified_answers.items():
    # Answers of one cluster are separated by a blank line inside the prompt.
    text_to_summarize = "\n\n".join(answers)
    custom_prompt = prompt.format(text_to_integrate=text_to_summarize)

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant specialized in combining text."},
        {"role": "user", "content": custom_prompt},
    ]

    try:
        # temperature=0.0 is passed on every request.
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=chat_messages,
            temperature=0.0,
        )
        summary = response.choices[0].message.content.strip()
        summarized_answers[cluster_id] = summary
        print(f"Summary for {cluster_id}:\n{summary}\n")
    except Exception as e:
        # Keep going with the remaining clusters; record what went wrong.
        print(f"An error occurred while summarizing cluster {cluster_id}: {e}")
        summarized_answers[cluster_id] = f"Error: Could not summarize due to {e}"


print("\n--- All Summaries ---")
print(json.dumps(summarized_answers, indent=2))

In [None]:
# Display the integrated text (or error note) stored for each cluster.
summarized_answers

In [None]:
import textwrap

# Print the integrated text of cluster '0', wrapped at 100 characters per line.
# NOTE(review): assumes summarized_answers has the string key '0' — this raises
# KeyError if the file has no such cluster or if cluster ids are not strings.
print(textwrap.fill(summarized_answers['0'], width=100))

In [None]:
# Display the renumbered answers stored for cluster '0'.
modified_answers['0']

# Perform summarization of each cluster in each file 

In [None]:
import os
import json
import re
import numpy as np
from openai import OpenAI




client = OpenAI(api_key=openaikey)


# Instruction template; the joined answers of one cluster replace
# the {text_to_integrate} placeholder.
prompt_template = """
Your task is to integrate the following pieces of text into a single, cohesive, and flowing narrative. The goal is to present as much of the original information as possible, not to summarize it briefly.

The text contains information with citations, formatted as `[number]`. It is crucial that you adhere to the following rules:

1.  **Integrate all key information**: Combine sentences and ideas from the input to form a comprehensive and coherent text. Aim to include a good portion, if not all, of the provided details.
2.  **Maintain original citations**: Every piece of information you include in the integrated text must retain its original citation(s).
3.  **Handle citations when combining**: If you rephrase or combine sentences containing information from multiple sources, ensure that *all* relevant original citation numbers for that combined information are included at the end of the new sentence. For example, if a new sentence merges details from original sentences cited `[1]` and `[5]`, the new sentence should be followed by `[1][5]`.
4.  **Ensure logical flow**: Arrange the information in a way that creates a natural and readable progression of ideas, even if it means reordering content from the original input.
5.  **Avoid external knowledge**: Your output must be based *solely* on the information provided in the input text. Do not introduce any outside facts or personal opinions.

Here is the text to integrate:
{text_to_integrate}
"""

# Start of the placeholder text stored as "summary" when a request fails.
error_prefix = "Error: Could not summarize"
# Matches a numeric citation marker such as "[12]" and captures the number.
citation_pattern = re.compile(r'\[(\d+)\]')
# Size of the citation range reserved for each answer (1-10, 11-20, ...).
default_stride = 10


def _summary_failed(entry):
    """Return True when a stored cluster entry does not hold a usable summary."""
    if not isinstance(entry, dict):
        return True
    stored_summary = entry.get("summary")
    return not isinstance(stored_summary, str) or stored_summary.startswith(error_prefix)


os.makedirs(output_path, exist_ok=True)


try:
    json_files = np.sort([f for f in os.listdir(input_path) if f.endswith('.json')])
except FileNotFoundError:
    print(f"Error: The directory '{input_path}' was not found.")
    json_files = []

# --- Process each file ---
for file in json_files:
    file_name = file.replace('answers_new_cit-answes-', '').replace('-prompt-1.json', '')
    output_filename = os.path.join(output_path, f"summary_{file_name}.json")

    # A finished output is skipped. An output that still contains failed
    # summaries (e.g. after a transient API error) is NOT considered finished:
    # the good clusters are reused and only the failed ones are requested again.
    previous_output = {}
    if os.path.exists(output_filename):
        try:
            with open(output_filename, 'r', encoding='utf-8') as f:
                previous_output = json.load(f)
        except (OSError, ValueError):
            previous_output = None  # unreadable or not valid JSON

        if isinstance(previous_output, dict):
            if not any(_summary_failed(entry) for entry in previous_output.values()):
                print(f"--- Skipping file: {file_name} (output already exists) ---\n")
                continue
            print(f"--- Retrying file: {file_name} (existing output contains failed summaries) ---")
        else:
            print(f"--- Regenerating file: {file_name} (existing output could not be read) ---")
            previous_output = {}

    print(f"--- Processing file: {file_name} ---")

    try:
        # Explicit UTF-8 keeps reading independent of the platform default and
        # consistent with how the outputs are written below.
        with open(os.path.join(input_path, file), 'r', encoding='utf-8') as f:
            json_data = json.load(f)
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from file: {file}")
        continue
    except Exception as e:
        print(f"An error occurred while reading {file}: {e}")
        continue

    # Group the items by cluster, preserving their order inside each cluster.
    clusters = {}
    for item in json_data:
        cluster_id = item.get('cluster_id')
        retrieved_answer = item.get('updated_retrieved_answer')
        # Test for "missing" explicitly: a plain truthiness check would
        # silently drop a valid cluster whose id is the integer 0.
        if cluster_id is None or cluster_id == "" or retrieved_answer is None:
            continue
        clusters.setdefault(cluster_id, []).append(item)

    modified_answers = {}            # cluster_id -> renumbered answer texts
    updated_citation_contexts = {}   # cluster_id -> {new citation number: context}

    for cluster_id in sorted(clusters.keys()):
        cluster_items = clusters[cluster_id]

        # With a fixed stride of 10, an answer citing e.g. [11] would collide
        # with [1] of the next answer and overwrite its context below. Widen
        # the stride only when the cluster really contains a larger number.
        highest_citation = max(
            (int(num) for entry in cluster_items
             for num in citation_pattern.findall(entry['updated_retrieved_answer'])),
            default=0,
        )
        stride = max(default_stride, highest_citation)

        modified_answers[cluster_id] = []
        updated_citation_contexts[cluster_id] = {}

        for i, item in enumerate(cluster_items):
            original_answer = item['updated_retrieved_answer']
            # The first answer keeps its numbers (offset 0); later ones shift.
            offset = i * stride

            # Single pass: every marker is rewritten exactly once.
            modified_answer = citation_pattern.sub(
                lambda match: '[{}]'.format(int(match.group(1)) + offset),
                original_answer,
            )

            # Carry each cited context over to its new citation number.
            # `or []` also covers keys that are present but null.
            known_citations = item.get('new_citations') or []
            known_contexts = item.get('new_used_contexts') or []
            cited_numbers = {int(num) for num in citation_pattern.findall(original_answer)}
            for old_citation_num in sorted(cited_numbers, reverse=True):
                if old_citation_num not in known_citations:
                    continue
                idx = known_citations.index(old_citation_num)
                # Guard against a context list shorter than the citation list.
                if idx < len(known_contexts):
                    updated_citation_contexts[cluster_id][old_citation_num + offset] = known_contexts[idx]

            modified_answers[cluster_id].append(modified_answer)

    final_output = {}

    for cluster_id, answers in modified_answers.items():
        # JSON object keys are always strings, hence the str() for the lookup.
        earlier_entry = previous_output.get(str(cluster_id))
        if earlier_entry is not None and not _summary_failed(earlier_entry):
            final_output[cluster_id] = earlier_entry
            print(f"Reusing existing summary of cluster {cluster_id} for {file_name}")
            continue

        text_to_summarize = "\n\n".join(answers)
        custom_prompt = prompt_template.format(text_to_integrate=text_to_summarize)

        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant specialized in combining text."},
                    {"role": "user", "content": custom_prompt}
                ],
                temperature=0.0
            )
            summary = response.choices[0].message.content.strip()

            final_output[cluster_id] = {
                "summary": summary,
                "used_contexts": updated_citation_contexts[cluster_id]
            }
            print(f"Successfully summarized cluster {cluster_id} for {file_name}")

        except Exception as e:
            # Record the failure so the other clusters are still saved; the
            # error prefix makes a later run retry this cluster.
            error_message = f"Error: Could not summarize due to {e}"
            final_output[cluster_id] = {
                "summary": error_message,
                "used_contexts": updated_citation_contexts[cluster_id]
            }
            print(f"An error occurred while summarizing cluster {cluster_id} for {file_name}: {e}")

    try:
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(final_output, f, indent=2, ensure_ascii=False)
        print(f"Successfully saved summary for {file_name} to {output_filename}\n")
    except Exception as e:
        print(f"Could not write to file {output_filename}: {e}\n")

# Generate headlines for the summaries

In [4]:
import os
import json
from openai import OpenAI


client = OpenAI(api_key=openaikey)

# Instruction template; the cluster summary replaces {summary_text}.
title_prompt = """You are creating a title for a situational report.

Read the following summary and generate a title that:
- Clearly identifies the situation, topic, or issue being reported
- Is appropriate for a professional situational report (SITREP style)
- Is direct and informative (8-12 words)
- Uses clear, actionable language suitable for decision-makers
- Does not include meta-phrases like "Report on" or "Summary of"

Summary:
{summary_text}

Return only the title, nothing else."""

json_files = [f for f in os.listdir(output_path) if f.startswith('summary_') and f.endswith('.json')]

for file in json_files:
    filepath = os.path.join(output_path, file)

    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    titles_added = 0
    for cluster_id, cluster_data in data.items():
        # Only cluster entries that do not have a title yet are processed.
        if not isinstance(cluster_data, dict) or 'title' in cluster_data:
            continue

        summary = cluster_data.get('summary', '')
        # Do not spend a request on (or store a title for) an empty summary or
        # the placeholder written when summarization of the cluster failed.
        if (
            not isinstance(summary, str)
            or not summary.strip()
            or summary.startswith("Error: Could not summarize")
        ):
            print(f"Skipping title for cluster {cluster_id} in {file}: no usable summary")
            continue

        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": title_prompt.format(summary_text=summary)}],
                temperature=0.0
            )
            cluster_data['title'] = response.choices[0].message.content.strip()
            titles_added += 1
        except Exception as e:
            # One failed request must not discard the titles already generated
            # for this file; the cluster is picked up again on the next run.
            print(f"Could not generate title for cluster {cluster_id} in {file}: {e}")

    # Rewrite the file only when something actually changed.
    if titles_added:
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"Processed {file}")

Processed summary_Indonesia_Floods and volcanic activity in Indonesia-Week 20 2024.json
Processed summary_Jamaica_Hurricane Beryl-Week 28 2024.json
Processed summary_Ukraine_Ukraine-Week 23 2024.json
Processed summary_United Kingdom_UK riots-Week 32 2024.json
Processed summary_Israel_Israel-Hamas war-Week 19 2024.json
Processed summary_India_LandslideFloods-Week 31 2024.json
Processed summary_Israel_Israel_Palestine_confilct-Week 40 2024.json
Processed summary_Afghanistan_Afghanistan Floods-Week 21 2024.json
Processed summary_Sudan_Sudan conflict-Week 34 2024.json
Processed summary_Nigeria_Flooding in Nigeria-Week 37 2024.json
Processed summary_Haiti_Gang violence and humanitarian crisis in Haiti-Week 40 2024.json
Processed summary_Pakistan_Monsoon floods and rains in Pakistan-Week 31 2024.json
Processed summary_Bangladesh_Cyclone Remal-Week 21 2024.json
