# Final Report Generation

## Overview

This notebook compiles the **final report** by combining all the results produced across the various folders.  
In addition, **metadata** is added to each retrieved paragraph to ensure completeness and traceability.

Two report formats are generated:
1. A **Question–Answer (QA)** version.  
2. A **summary-based** version organized by **Sustainable Development Goals (SDGs)**.


In [71]:
import re

def preprocess_answers_data(answers_file):
    """
    Preprocesses the raw loaded JSON data to filter items that:
    - Contain brackets (indicating citations)
    - Do NOT contain 'no clear answer'

    Args:
        answers_file (list): The list of dictionaries loaded from the JSON file.

    Returns:
        list: A list of dictionaries that meet the criteria.
    """
    
    filtered_file = []
    for item in answers_file:
        answer = item.get('updated_answer_pt2', '')
        
        has_brackets = "[" in answer or "]" in answer
        has_no_clear = re.search(r"no clear answer", answer, re.IGNORECASE)

        if has_brackets and not has_no_clear:
            filtered_file.append(item)

    return filtered_file


In [72]:
import re

def preprocess_answers_data(answers_file):
    """
    Preprocesses the raw loaded JSON data to filter items that:
    - Contain brackets (indicating citations)
    - Do NOT contain 'no clear answer'

    Args:
        answers_file (dict): Dict of {sdg: list of QA dicts}.

    Returns:
        dict: A dict with the same keys, but filtered lists of QA dicts.
    """
    
    filtered_file = {}
    for sdg_key, qa_list in answers_file.items():
        filtered_list = []
        for item in qa_list:
            # Try both updated_answer_pt2 (if present) and retrieved_answer (fallback)
            answer = item.get('updated_answer_pt2', item.get('retrieved_answer', ''))

            has_brackets = "[" in answer or "]" in answer
            has_no_clear = re.search(r"no clear answer", answer, re.IGNORECASE)

            if has_brackets and not has_no_clear:
                filtered_list.append(item)

        if filtered_list:  # only keep non-empty SDGs
            filtered_file[sdg_key] = filtered_list

    return filtered_file


In [73]:
def preprocess_summary_data(summary_file):
    """
    Preprocesses the raw loaded JSON data to filter items with citations
    and associate citations with their corresponding contexts.

    Args:
        answers_file (list): The list of dictionaries loaded from the JSON file.

    Returns:
        list: A list of dictionaries, where each dictionary has 'citations'
              and 'used_contexts' keys added.
    """
    
    filtered_file = {}
    for key, answer in summary_file.items():
        # Only include items that potentially have citations
        if "[" in answer or "]" in answer or re.search(r"no clear answer[^\w]*", answer, re.IGNORECASE):
            filtered_file[key] = answer
    return filtered_file

In [74]:
def parse_file_name(s):
    """
    Parses a string in the format 'Country_event-period'
    and returns a dictionary with keys: country, event, period.
    """
    try:
        country_event, period = s.split("-", 1)  # split only on first '-'
        country, event = country_event.split("_", 1)  # split only on first '_'
        
        return country,event,period
    except ValueError:
        raise ValueError("String does not match expected format: 'Country_event-period'")

In [75]:
# def enrich_contexts_with_citation_numbers(context_list, metadata_data, answer_text):
#     """
#     context_list: list of paragraph texts
#     metadata_data: list of dicts [{paragraph, title, url}, ...]
#     answer_text: the answer text containing citation numbers like [2][4]
#     Returns a dict keyed by citation numbers.
#     """
#     citation_numbers = np.unique(re.findall(r'\[(\d+)\]', answer_text))
#     enriched_contexts = {}

#     for idx, citation in enumerate(citation_numbers):
#         if idx < len(context_list):
#             context_text = context_list[idx]
#             matched_metadata = next(
#                 (item for item in metadata_data if item.get('paragraph', '') == context_text),
#                 None
#             )
#             enriched_contexts[citation] = {
#                 "context": context_text,
#                 "title": matched_metadata.get("title", "") if matched_metadata else "",
#                 "url": matched_metadata.get("url", "") if matched_metadata else ""
#             }
#         else:
#             enriched_contexts[citation] = {"context": "", "title": "", "url": ""}

#     return enriched_contexts

In [76]:
# import numpy as np
# import re
# from difflib import SequenceMatcher

# def enrich_contexts_with_citation_numbers(context_list, metadata_data, answer_text, similarity_threshold=0.7):
#     """
#     context_list: list of paragraph texts
#     metadata_data: list of dicts [{paragraph, title, url}, ...]
#     answer_text: the answer text containing citation numbers like [2][4]
#     similarity_threshold: minimum ratio (0–1) to accept fuzzy match
#     Returns a dict keyed by citation numbers.
#     """
#     # Extract citation numbers from the answer text
#     citation_numbers = np.unique(re.findall(r'\[(\d+)\]', answer_text))
#     enriched_contexts = {}

#     # Helper: get best fuzzy match
#     def find_best_match(context_text):
#         best_match = None
#         best_ratio = 0.0
#         for item in metadata_data:
#             candidate_text = item.get('paragraph', '')
#             ratio = SequenceMatcher(None, context_text, candidate_text).ratio()
#             if ratio > best_ratio:
#                 best_ratio = ratio
#                 best_match = item
#         return best_match if best_ratio >= similarity_threshold else None

#     # Step 1: Assign contexts to citations
#     for idx, citation in enumerate(citation_numbers):
#         if idx < len(context_list):
#             context_text = context_list[idx]

#             # Try exact match first
#             matched_metadata = next(
#                 (item for item in metadata_data if item.get('paragraph', '') == context_text),
#                 None
#             )

#             # If no exact match, try fuzzy
#             if not matched_metadata:
#                 matched_metadata = find_best_match(context_text)

#             enriched_contexts[citation] = {
#                 "context": context_text,
#                 "title": matched_metadata.get("title", "") if matched_metadata else "",
#                 "url": matched_metadata.get("url", "") if matched_metadata else ""
#             }
#         else:
#             # Step 2: No context available → still keep citation with placeholders
#             enriched_contexts[citation] = {
#                 "context": "",
#                 "title": "",
#                 "url": ""
#             }

#     return enriched_contexts


In [77]:
import numpy as np
import re
from difflib import SequenceMatcher
import random

def enrich_contexts_with_citation_numbers(context_list, metadata_data, answer_text, similarity_threshold=0.7, min_substring_length=50, use_fuzzy=True):
    """
    context_list: list of paragraph texts
    metadata_data: list of dicts [{paragraph, title, url}, ...]
    answer_text: the answer text containing citation numbers like [2][4]
    similarity_threshold: minimum ratio (0–1) to accept fuzzy match
    min_substring_length: minimum length for substring matching (avoids false positives)
    use_fuzzy: whether to use fuzzy matching (can be slow on large datasets)
    Returns a dict keyed by citation numbers.
    """
    # Extract citation numbers from the answer text
    citation_numbers = np.unique(re.findall(r'\[(\d+)\]', answer_text))
    enriched_contexts = {}

    # Pre-process metadata for faster lookups
    # Create exact match lookup
    exact_lookup = {item.get('paragraph', ''): item for item in metadata_data}
    
    # Create normalized substring lookup (only for substring matching)
    substring_lookup = []
    for item in metadata_data:
        para = item.get('paragraph', '')
        if para:
            substring_lookup.append({
                'normalized': para.lower().strip(),
                'original': item
            })

    # Helper: get best fuzzy match (only called when needed)
    def find_best_match(context_text):
        best_match = None
        best_ratio = 0.0
        for item in metadata_data:
            candidate_text = item.get('paragraph', '')
            ratio = SequenceMatcher(None, context_text, candidate_text).ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best_match = item
        return best_match if best_ratio >= similarity_threshold else None

    # Helper: find substring matches (optimized)
    def find_substring_match(context_text):
        """
        Check if context_text is contained within any metadata paragraph.
        Returns first match found (or random if multiple).
        """
        if len(context_text) < min_substring_length:
            return None
            
        context_normalized = context_text.lower().strip()
        matches = []
        
        # Simple substring search - much faster than fuzzy matching
        for item in substring_lookup:
            if context_normalized in item['normalized']:
                matches.append(item['original'])
                # Optionally break early if you only need one match
                # break
        
        return random.choice(matches) if matches else None

    # Step 1: Assign contexts to citations
    for idx, citation in enumerate(citation_numbers):
        if idx < len(context_list):
            context_text = context_list[idx]
            matched_metadata = None

            # Try exact match first (O(1) lookup)
            matched_metadata = exact_lookup.get(context_text)

            # If no exact match, try substring matching (faster than fuzzy)
            if not matched_metadata:
                matched_metadata = find_substring_match(context_text)

            # If still no match and fuzzy is enabled, try fuzzy whole-document match (slowest)
            if not matched_metadata and use_fuzzy:
                matched_metadata = find_best_match(context_text)

            enriched_contexts[citation] = {
                "context": context_text,
                "title": matched_metadata.get("title", "") if matched_metadata else "",
                "url": matched_metadata.get("url", "") if matched_metadata else ""
            }
        else:
            # Step 2: No context available → still keep citation with placeholders
            enriched_contexts[citation] = {
                "context": "",
                "title": "",
                "url": ""
            }

    return enriched_contexts

In [78]:
# import numpy as np
# import re
# from difflib import SequenceMatcher

# def enrich_summary_with_citation_numbers(summary_text, cited_paragraphs, metadata_data, similarity_threshold=0.7):
#     """
#     summary_text: str, the summary string containing citations like [1][2]
#     cited_paragraphs: list of paragraph texts
#     metadata_data: list of dicts [{paragraph, title, url}, ...]
#     similarity_threshold: minimum similarity ratio (0–1) to accept fuzzy match
#     Returns a dict keyed by citation numbers.
#     """
#     # Extract all citation numbers from the summary text
#     citation_numbers = np.unique(re.findall(r'\[(\d+)\]', summary_text))
#     print("Summary citations found:", citation_numbers)

#     enriched_contexts = {}

#     # Helper: find best fuzzy match
#     def find_best_match(context_text):
#         best_match = None
#         best_ratio = 0.0
#         for item in metadata_data:
#             candidate_text = item.get("paragraph", "")
#             ratio = SequenceMatcher(None, context_text, candidate_text).ratio()
#             if ratio > best_ratio:
#                 best_ratio = ratio
#                 best_match = item
#         return best_match if best_ratio >= similarity_threshold else None

#     # Step 1: Assign contexts to citations
#     for idx, citation in enumerate(citation_numbers):
#         if idx < len(cited_paragraphs):
#             context_text = cited_paragraphs[idx]

#             # Try exact match first
#             matched_metadata = next(
#                 (item for item in metadata_data if item.get("paragraph", "") == context_text),
#                 None
#             )

#             # If no exact match, try fuzzy matching
#             if not matched_metadata:
#                 matched_metadata = find_best_match(context_text)

#             enriched_contexts[citation] = {
#                 "context": context_text,
#                 "title": matched_metadata.get("title", "") if matched_metadata else "",
#                 "url": matched_metadata.get("url", "") if matched_metadata else ""
#             }
#         else:
#             # No paragraph for this citation
#             enriched_contexts[citation] = {
#                 "context": "",
#                 "title": "",
#                 "url": ""
#             }

#     return enriched_contexts

In [79]:
sdg_names = {
    "SDG-1": "No Poverty",
    "SDG-2": "Zero Hunger",
    "SDG-3": "Good Health and Well-being",
    "SDG-4": "Quality Education",
    "SDG-5": "Gender Equality",
    "SDG-6": "Clean Water and Sanitation",
    "SDG-7": "Affordable and Clean Energy",
    "SDG-8": "Decent Work and Economic Growth",
    "SDG-9": "Industry, Innovation and Infrastructure",
    "SDG-10": "Reduced Inequalities",
    "SDG-11": "Sustainable Cities and Communities",
    "SDG-12": "Responsible Consumption and Production",
    "SDG-13": "Climate Action",
    "SDG-14": "Life Below Water",
    "SDG-15": "Life on Land",
    "SDG-16": "Peace, Justice and Strong Institutions",
    "SDG-17": "Partnerships for the Goals"
}

In [80]:
import re

def sdg_sort_key(key: str):
    """Extracts the number from keys like 'SDG-1: No Poverty' for correct sorting."""
    match = re.search(r"SDG-(\d+)", key)
    return int(match.group(1)) if match else 999  # push unknowns to the end

# Create reports QA 

In [81]:
import json
import os
import numpy as np
import re

# --- SDG Names Dictionary ---
sdg_names = {
    "SDG-1": "No Poverty",
    "SDG-2": "Zero Hunger",
    "SDG-3": "Good Health and Well-being",
    "SDG-4": "Quality Education",
    "SDG-5": "Gender Equality",
    "SDG-6": "Clean Water and Sanitation",
    "SDG-7": "Affordable and Clean Energy",
    "SDG-8": "Decent Work and Economic Growth",
    "SDG-9": "Industry, Innovation and Infrastructure",
    "SDG-10": "Reduced Inequalities",
    "SDG-11": "Sustainable Cities and Communities",
    "SDG-12": "Responsible Consumption and Production",
    "SDG-13": "Climate Action",
    "SDG-14": "Life Below Water",
    "SDG-15": "Life on Land",
    "SDG-16": "Peace, Justice and Strong Institutions",
    "SDG-17": "Partnerships for the Goals"
}
# --- End SDG Names Dictionary ---


devOrTest = "Dev"

# Define your source and cluster folders using f-strings
sources_folder = f"./Results/Sources/SourcesCountryEvent/{devOrTest} set/"
qa_folder = f"./Results/Answers/Answers-SDGs"
summary_folder = f"./Results/Executive Summaries/Dev set/Updated_citations"
metadata_folder = "./Results/paragraphs_metadata"

output_folder = f"./Results/Reports/JSON_Report_QA_SDGs/{devOrTest} set"
markdown_output_folder = f"./Results/Reports/Markdown_Report_QA_SDGs/{devOrTest} set"

os.makedirs(output_folder, exist_ok=True)
os.makedirs(markdown_output_folder, exist_ok=True)


# Get all JSON file names from the sources folder, sorted
json_files = np.sort([f for f in os.listdir(sources_folder) if f.endswith('.json')])

# Create an array of base names (without the .json extension)
base_names = [os.path.splitext(f)[0] for f in json_files]
print(f"Base Names: {base_names}\n")

for json_file in json_files:
    base_name = os.path.splitext(json_file)[0]

    # Load QA data
    qa_name = f"answers_new_cit-answes-{base_name}-prompt-1.json"
    data_qa = {}
    try:
        with open(os.path.join(qa_folder, qa_name), 'r') as file:
            data_qa = json.load(file)
            before = len(data_qa)

            data_qa = preprocess_answers_data(data_qa)
            after = len(data_qa)

            print(f"{qa_name} -> original: {before}, filtered: {after}, removed: {before - after}")
    except FileNotFoundError:
        print(f"QA file not found: {qa_name}")
        data_qa = {}
    except Exception as e:
        print(f"Error loading QA {qa_name}: {e}")
        data_qa = {}

    # Load metadata
    metadata_file_name = f"metadata-sources-metadata-{base_name}.json"
    metadata_data = []
    try:
        with open(os.path.join(metadata_folder, metadata_file_name), 'r', encoding='utf-8') as file:
            metadata_data = json.load(file)
    except FileNotFoundError:
        print(f"Metadata file not found for {base_name}: {metadata_file_name}")
    except Exception as e:
        print(f"Problems reading metadata file {metadata_file_name}: {e}")

    # Load summary
    summary_file_name = f"summary-{base_name}.json"
    summary_text = ""
    summary_contexts = {}
    try:
        with open(os.path.join(summary_folder, summary_file_name), 'r') as file:
            data_summary = json.load(file)
            summary_text = data_summary.get('new_summary', data_summary.get('summary', ''))
            cited_paragraphs = data_summary.get('new_cited_paragraphs', [])
            summary_contexts = enrich_summary_with_citation_numbers(
                summary_text, cited_paragraphs, metadata_data
            )
    except FileNotFoundError:
        print(f"Summary file not found: {summary_file_name}")
    except Exception as e:
        print(f"Error loading summary {summary_file_name}: {e}")

    # Group QA by formatted SDG name and enrich contexts
    grouped_qa = {}
    for qa_key, qa_list in data_qa.items():
        # qa_key is expected to be like 'sdg_1', 'SDG-2', etc.
        formatted_key_base = qa_key.upper().replace("_", "-")  # e.g., SDG-1
        sdg_name_long = sdg_names.get(formatted_key_base, None)
        
        if sdg_name_long:
            # Create the desired key format: "SDG 1 - No Poverty"
            # Extract the number part from 'SDG-X'
            sdg_number = formatted_key_base.split('-')[-1]
            sdg_name_key = f"SDG {sdg_number} - {sdg_name_long}"
        else:
            # Fallback if the key isn't in sdg_names (though it should be for SDGs)
            sdg_name_key = formatted_key_base

        if qa_list:
            enriched_list = []
            for item in qa_list:
                question = item.get('question', '')
                answer_text = item.get('retrieved_answer', '')
                original_contexts = item.get('new_used_contexts', [])
                enriched_contexts = enrich_contexts_with_citation_numbers(
                    original_contexts, metadata_data, answer_text
                )
                enriched_list.append({
                    'question': question,
                    'retrieved_answer': answer_text,
                    'used_contexts': enriched_contexts
                })
            grouped_qa[sdg_name_key] = enriched_list

    # --- Sorting Logic ---
    # Sort SDG names numerically by extracting the number part from the key
    def sort_key_func(sdg_key):
        match = re.search(r'SDG (\d+)', sdg_key)
        return int(match.group(1)) if match else float('inf')

    sorted_sdg_names = sorted(grouped_qa.keys(), key=sort_key_func)
    # --- End Sorting Logic ---

    # Build output JSON
    output_data = {
        'file_name': base_name,
        'summary': summary_text,
        'summary_contexts': summary_contexts
    }
    # Add SDGs in sorted order
    for sdg_name in sorted_sdg_names:
        output_data[sdg_name] = grouped_qa[sdg_name]

    # Save JSON
    output_file_path = os.path.join(output_folder, f"{base_name}_combined_data.json")
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        # Sort keys to ensure 'file_name', 'summary', 'summary_contexts' come first,
        # followed by the numerically sorted SDG keys.
        def json_sort_key(item):
            key = item[0]
            if key == 'file_name': return 0
            if key == 'summary': return 1
            if key == 'summary_contexts': return 2
            match = re.search(r'SDG (\d+)', key)
            return int(match.group(1)) + 2 if match else float('inf')

        # Use OrderedDict or a list of tuples to maintain order, but since
        # Python 3.7+ dicts maintain insertion order, we can rely on that.
        # We manually construct the dict in the desired order above.
        json.dump(output_data, outfile, indent=4, ensure_ascii=False)


    # --- Markdown generation ---
    markdown_content = f"# {base_name}\n\n"
    markdown_content += "## Summary\n"
    markdown_content += f"{summary_text}\n\n"

    # Use the same sorted list for Markdown
    for sdg_name in sorted_sdg_names:
        markdown_content += f"## {sdg_name}\n\n"
        for qa in grouped_qa[sdg_name]:
            markdown_content += f"**Question:** {qa['question']}\n"
            markdown_content += f"**Answer:** {qa['retrieved_answer']}\n\n"

    markdown_file_path = os.path.join(markdown_output_folder, f"{base_name}.md")
    with open(markdown_file_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_content)

print("\n--- Processing Complete ---")

Base Names: ['Afghanistan_Afghanistan Floods-Week 21 2024', 'Bangladesh_Cyclone Remal-Week 21 2024', 'Haiti_Gang violence and humanitarian crisis in Haiti-Week 40 2024', 'India_LandslideFloods-Week 31 2024', 'Indonesia_Floods and volcanic activity in Indonesia-Week 20 2024', 'Israel_Israel-Hamas war-Week 19 2024', 'Israel_Israel_Palestine_confilct-Week 40 2024', 'Jamaica_Hurricane Beryl-Week 28 2024', 'Nigeria_Flooding in Nigeria-Week 37 2024', 'Pakistan_Monsoon floods and rains in Pakistan-Week 31 2024', 'Sudan_Sudan conflict-Week 34 2024', 'Sudan_Sudan conflict-Week 39 2024', 'Ukraine_Ukraine-Week 23 2024', 'United Kingdom_UK riots-Week 32 2024']

answers_new_cit-answes-Afghanistan_Afghanistan Floods-Week 21 2024-prompt-1.json -> original: 14, filtered: 14, removed: 0
Summary citations found: ['17' '32' '39' '46' '64' '7']
answers_new_cit-answes-Bangladesh_Cyclone Remal-Week 21 2024-prompt-1.json -> original: 13, filtered: 12, removed: 1
Summary citations found: ['11' '34' '48' '63' 

# Create report just with the summary in each cluster

In [82]:
# def enrich_contexts(used_contexts, metadata_data):
#     """
#     used_contexts: dict {id: paragraph_text}
#     metadata_data: list of dicts [{paragraph, title, url}, ...]
#     """
#     enriched = {}
#     for cid, context_text in used_contexts.items():
#         matched_metadata = next(
#             (item for item in metadata_data if item.get('paragraph', '') == context_text),
#             None
#         )
#         if matched_metadata:
#             enriched[cid] = {
#                 "context": context_text,
#                 "title": matched_metadata.get("title", ""),
#                 "url": matched_metadata.get("url", "")
#             }
#         else:
#             enriched[cid] = {
#                 "context": context_text,
#                 "title": "",
#                 "url": ""
#             }
#     return enriched

In [83]:
import random

def enrich_contexts(used_contexts, metadata_data, min_substring_length=50):
    """
    used_contexts: dict {id: paragraph_text}
    metadata_data: list of dicts [{paragraph, title, url}, ...]
    min_substring_length: minimum length for substring matching
    """
    # Pre-process metadata for faster lookups
    # Create exact match lookup (O(1))
    exact_lookup = {item.get('paragraph', ''): item for item in metadata_data}
    
    # Create normalized substring lookup
    substring_lookup = []
    for item in metadata_data:
        para = item.get('paragraph', '')
        if para:
            substring_lookup.append({
                'normalized': para.lower().strip(),
                'original': item
            })
    
    # Helper: find substring match
    def find_substring_match(context_text):
        if len(context_text) < min_substring_length:
            return None
            
        context_normalized = context_text.lower().strip()
        matches = []
        
        for item in substring_lookup:
            if context_normalized in item['normalized']:
                matches.append(item['original'])
        
        return random.choice(matches) if matches else None
    
    enriched = {}
    for cid, context_text in used_contexts.items():
        matched_metadata = None
        
        # Try exact match first (O(1))
        matched_metadata = exact_lookup.get(context_text)
        
        # If no exact match, try substring matching
        if not matched_metadata:
            matched_metadata = find_substring_match(context_text)
        
        enriched[cid] = {
            "context": context_text,
            "title": matched_metadata.get("title", "") if matched_metadata else "",
            "url": matched_metadata.get("url", "") if matched_metadata else ""
        }
    
    return enriched

In [84]:
import json
import os
import numpy as np
import re

# Define your source and cluster folders
devOrTest = 'Dev'
sources_folder = f"./Results/Sources/SourcesCountryEvent/{devOrTest} set/"
clusters_folder = "./Results/Clusters+Headline "
qa_folder = f"./Results/Answers/Answers-SDG"
summary_folder = f"./Results/Executive Summaries/Dev set/Updated_citations"
cluster_summary_folder = f"./Results/Summaries/UniqueSummary-EachSDG/"
combined_output_folder = f"./Results/Reports/JSON_Report_Summaries_SDGs/{devOrTest} set" # Renamed output folder
markdown_output_folder = f"./Results/Reports/Markdown_Report_Summaries_Sdgs/{devOrTest} set" # Renamed Markdown output folder

metadata_folder = "./Results/paragraphs_metadata"

# Ensure the output folders exist
os.makedirs(combined_output_folder, exist_ok=True)
os.makedirs(markdown_output_folder, exist_ok=True)





# # Get all JSON file names from the sources folder, sorted
# json_files = np.sort([f for f in os.listdir(sources_folder) if f.endswith('.json')])

# # Create an array of base names (without the .json extension)
# base_names = [os.path.splitext(f)[0] for f in json_files]
# print(f"Base Names: {base_names}\n")

# # for json_file in json_files:
# #     base_name = os.path.splitext(json_file)[0]

# #     # --- Load overall summary ---
# #     summary_file_name = f"summary-{base_name}.json"
# #     try:
# #         with open(os.path.join(summary_folder, summary_file_name), 'r') as file:
# #             data_summary = json.load(file)
# #             overall_summary = data_summary.get('summary', '')
# #     except:
# #         overall_summary = ""

# #     # --- Load SDG cluster summaries ---
# #     cluster_summary_file_name = f"summary_{base_name}.json"
# #     try:
# #         with open(os.path.join(cluster_summary_folder, cluster_summary_file_name), 'r') as file:
# #             data_cluster_summaries = json.load(file)  # expected dict { "sdg-1": "...", "sdg-2": "..." }
# #     except:
# #         data_cluster_summaries = {}
# #     # --- Load metadata -- 
# #     metadata_file_name = f"metadata-sources-metadata-{base_name}.json"
# #     metadata_data = []
# #     try:
# #         with open(os.path.join(metadata_folder, metadata_file_name), 'r') as file:
# #             metadata_data = json.load(file)
# #     except FileNotFoundError:
# #         print(f"Metadata file not found for {base_name}: {os.path.join(metadata_folder, metadata_file_name)}")
# #     except Exception as e:
# #         print(f"Problems reading metadata file {os.path.join(metadata_folder, metadata_file_name)}: {e}")
        
# #     # --- Build JSON structure grouped by SDG ---
# #     output_data = {'file_name': base_name, 'summary': overall_summary}

# #     grouped_summaries = {}
# #     for sdg_key, text in data_cluster_summaries.items():
# #         formatted_key = sdg_key.upper().replace("_", "-")  # e.g., SDG-1
# #         sdg_name = sdg_names.get(formatted_key, formatted_key)
# #         full_key = f"{formatted_key}: {sdg_name}"  # e.g., "SDG-1: No Poverty"
# #         if text.strip():
# #             grouped_summaries[full_key] = text

# #     # Sort keys alphabetically
# #     sorted_keys = sorted(grouped_summaries.keys(), key=sdg_sort_key)
# #     for key in sorted_keys:
# #         output_data[key] = grouped_summaries[key]

# #     # --- Save JSON ---
# #     output_file_path = os.path.join(combined_output_folder, f"{base_name}_combined_data.json")
# #     with open(output_file_path, 'w', encoding='utf-8') as outfile:
# #         json.dump(output_data, outfile, indent=4, ensure_ascii=False)
# #     print(f"Created combined JSON file: {output_file_path}")
# for json_file in json_files:
#     base_name = os.path.splitext(json_file)[0]

#     # --- Load overall summary ---
#     summary_file_name = f"summary-{base_name}.json"
#     try:
#         with open(os.path.join(summary_folder, summary_file_name), 'r') as file:
#             data_summary = json.load(file)

#             # NEW: use enriched version
#             summary_text = data_summary.get('new_summary', data_summary.get('summary', ''))
#             cited_paragraphs = data_summary.get('new_cited_paragraphs', [])

#             output_data = {
#                 'file_name': base_name,
#                 'summary': summary_text,
#                 'summary_contexts': enrich_summary_with_citation_numbers(
#                     summary_text, cited_paragraphs, metadata_data
#                 )
#             }
#     except Exception as e:
#         print(f"Problem loading overall summary for {base_name}: {e}")
#         output_data = {'file_name': base_name, 'summary': "", 'summary_contexts': []}

#     # --- Load SDG cluster summaries ---
#     cluster_summary_file_name = f"summary_{base_name}.json"
#     try:
#         with open(os.path.join(cluster_summary_folder, cluster_summary_file_name), 'r') as file:
#             data_cluster_summaries = json.load(file)  
#             # expected dict { "sdg-1": {...}, "sdg-2": {...} }
#     except:
#         data_cluster_summaries = {}

#     # --- Load metadata -- 
#     metadata_file_name = f"metadata-sources-metadata-{base_name}.json"
#     metadata_data = []
#     try:
#         with open(os.path.join(metadata_folder, metadata_file_name), 'r') as file:
#             metadata_data = json.load(file)
#     except FileNotFoundError:
#         print(f"Metadata file not found for {base_name}: {os.path.join(metadata_folder, metadata_file_name)}")
#     except Exception as e:
#         print(f"Problems reading metadata file {os.path.join(metadata_folder, metadata_file_name)}: {e}")
        
#     # --- Build enriched clusters ---
#     output_data['clusters'] = []
#     if isinstance(data_cluster_summaries, dict):
#         for cluster_id, cluster_summary_entry in data_cluster_summaries.items():
#             cluster_summary_text = ""
#             used_contexts = {}

#             if isinstance(cluster_summary_entry, dict):
#                 cluster_summary_text = cluster_summary_entry.get("summary", "")
#                 raw_used_contexts = cluster_summary_entry.get("used_contexts", {})
#                 used_contexts = enrich_contexts(raw_used_contexts, metadata_data) if raw_used_contexts else {}

#             elif isinstance(cluster_summary_entry, str):
#                 # old format
#                 cluster_summary_text = cluster_summary_entry

#             if cluster_summary_text.strip():
#                 cluster_entry = {
#                     "cluster_id": cluster_id,
#                     "cluster_summary": cluster_summary_text,
#                     "used_contexts": used_contexts
#                 }
#                 output_data['clusters'].append(cluster_entry) 

#     # --- Save JSON ---
#     output_file_path = os.path.join(combined_output_folder, f"{base_name}_combined_data.json")
#     with open(output_file_path, 'w', encoding='utf-8') as outfile:
#         json.dump(output_data, outfile, indent=4, ensure_ascii=False)
#     print(f"Created combined JSON file: {output_file_path}")


#     # --- Markdown ---
#     markdown_content = f"# {base_name}\n\n"
#     markdown_content += "## Summary\n"
#     markdown_content += f"{overall_summary}\n\n"

#     markdown_content += "## SDG Cluster Summaries\n\n"
#     for key in sorted_keys:
#         markdown_content += f"### {key}\n\n"
#         markdown_content += f"{grouped_summaries[key]}\n\n"

#     markdown_file_path = os.path.join(markdown_output_folder, f"{base_name}.md")
#     with open(markdown_file_path, 'w', encoding='utf-8') as md_file:
#         md_file.write(markdown_content)
#     print(f"Created Markdown file: {markdown_file_path}")

# print("\n--- Processing Complete ---")

In [85]:
for json_file in json_files:
    base_name = os.path.splitext(json_file)[0]

    # --- Load metadata --- 
    metadata_file_name = f"metadata-sources-metadata-{base_name}.json"
    metadata_data = []
    try:
        with open(os.path.join(metadata_folder, metadata_file_name), 'r') as file:
            metadata_data = json.load(file)
    except FileNotFoundError:
        print(f"Metadata file not found for {base_name}: {os.path.join(metadata_folder, metadata_file_name)}")
    except Exception as e:
        print(f"Problems reading metadata file {os.path.join(metadata_folder, metadata_file_name)}: {e}")

    # --- Load overall summary ---
    summary_file_name = f"summary-{base_name}.json"
    try:
        with open(os.path.join(summary_folder, summary_file_name), 'r') as file:
            data_summary = json.load(file)

            # Prefer enriched fields if available
            summary_text = data_summary.get('new_summary', data_summary.get('summary', ''))
            cited_paragraphs = data_summary.get('new_cited_paragraphs', [])

            output_data = {
                'file_name': base_name,
                'summary': summary_text,
                'summary_contexts': enrich_summary_with_citation_numbers(
                    summary_text, cited_paragraphs, metadata_data
                )
            }
    except Exception as e:
        print(f"Problem loading overall summary for {base_name}: {e}")
        output_data = {'file_name': base_name, 'summary': "", 'summary_contexts': []}

    # --- Load SDG cluster summaries ---
    cluster_summary_file_name = f"summary_{base_name}.json"
    try:
        with open(os.path.join(cluster_summary_folder, cluster_summary_file_name), 'r') as file:
            data_cluster_summaries = json.load(file) 
            # expected dict { "sdg-1": {...}, "sdg-2": {...} }
    except:
        data_cluster_summaries = {}

    # --- Build enriched clusters ---
    temp_clusters = []
    if isinstance(data_cluster_summaries, dict):
        for cluster_id, cluster_summary_entry in data_cluster_summaries.items():
            cluster_summary_text = ""
            used_contexts = {}

            # Determine the display name for the SDG
            # Normalize key: 'sdg-1' -> 'SDG-1'
            normalized_key = cluster_id.upper().replace("_", "-")
            sdg_name_long = sdg_names.get(normalized_key)
            
            if sdg_name_long:
                # Extract the number part (e.g., '1' from 'SDG-1')
                sdg_number_match = re.search(r'\d+', normalized_key)
                sdg_number = sdg_number_match.group(0) if sdg_number_match else "Unknown"
                # Create the desired key format: "SDG 1 - No Poverty"
                display_cluster_id = f"SDG {sdg_number} - {sdg_name_long}"
            else:
                # Fallback to the original ID if not found in the dictionary
                display_cluster_id = cluster_id
            
            # Extract summary text and contexts
            if isinstance(cluster_summary_entry, dict):
                cluster_summary_text = cluster_summary_entry.get("summary", "")
                raw_used_contexts = cluster_summary_entry.get("used_contexts", {})
                used_contexts = enrich_contexts(raw_used_contexts, metadata_data) if raw_used_contexts else {}

            elif isinstance(cluster_summary_entry, str):
                # old format
                cluster_summary_text = cluster_summary_entry

            if cluster_summary_text.strip():
                cluster_entry = {
                    # Use the new formatted name
                    "cluster_id": display_cluster_id, 
                    "cluster_summary": cluster_summary_text,
                    "used_contexts": used_contexts
                }
                temp_clusters.append(cluster_entry)

    # --- Sort the clusters by SDG number ---
    def sort_cluster_key(cluster_entry):
        # Extracts the number from the 'cluster_id' (e.g., 'SDG 1 - ...' -> 1)
        match = re.search(r'SDG (\d+)', cluster_entry['cluster_id'])
        return int(match.group(1)) if match else float('inf')

    # Sort the list of cluster dictionaries
    output_data['clusters'] = sorted(temp_clusters, key=sort_cluster_key)


    # --- Save JSON ---
    output_file_path = os.path.join(combined_output_folder, f"{base_name}_combined_data.json")
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(output_data, outfile, indent=4, ensure_ascii=False)
    print(f"Created combined JSON file: {output_file_path}")

print("\n--- Processing Complete ---")


Summary citations found: ['17' '32' '39' '46' '64' '7']
Created combined JSON file: ./Results/Reports/JSON_Report_Summaries_SDGs/Dev set/Afghanistan_Afghanistan Floods-Week 21 2024_combined_data.json
Summary citations found: ['11' '34' '48' '63' '67' '70' '72' '73' '74' '77']
Created combined JSON file: ./Results/Reports/JSON_Report_Summaries_SDGs/Dev set/Bangladesh_Cyclone Remal-Week 21 2024_combined_data.json
Summary citations found: ['124' '129' '135' '136' '138' '139' '142' '3' '41' '63' '75']
Created combined JSON file: ./Results/Reports/JSON_Report_Summaries_SDGs/Dev set/Haiti_Gang violence and humanitarian crisis in Haiti-Week 40 2024_combined_data.json
Summary citations found: ['1' '34' '42' '49' '52' '57' '69' '92']
Created combined JSON file: ./Results/Reports/JSON_Report_Summaries_SDGs/Dev set/India_LandslideFloods-Week 31 2024_combined_data.json
Summary citations found: ['11' '12' '20' '21' '22' '23' '24' '7']
Created combined JSON file: ./Results/Reports/JSON_Report_Summar