# Final Report Generation

## Overview

This notebook compiles the **final report** by combining all the results produced across the various folders.  
In addition, **metadata** is added to each retrieved paragraph to ensure completeness and traceability.

Two report formats are generated:
1. A **Question–Answer (QA)** version.  
2. A **summary-based** version organized by **sub-topic**.



In [30]:
import re

def preprocess_answers_data(answers_file):
    """
    Keep only the answer records that carry a usable, cited answer.

    A record is kept when its 'updated_answer_pt2' text:
    - contains a square bracket ('[' or ']'), taken as a sign of a citation
    - does NOT contain the phrase 'no clear answer' (case-insensitive)

    Records whose 'updated_answer_pt2' is missing, or is not a string (for
    example a JSON null), are dropped instead of raising a TypeError.

    Args:
        answers_file (list): The list of dictionaries loaded from the JSON file.

    Returns:
        list: The dictionaries that meet the criteria, in their original order.
    """
    filtered_file = []
    for item in answers_file:
        answer = item.get('updated_answer_pt2', '')

        # A null / non-text answer can neither hold a citation nor be searched.
        if not isinstance(answer, str):
            continue

        has_brackets = "[" in answer or "]" in answer
        has_no_clear = re.search(r"no clear answer", answer, re.IGNORECASE)

        if has_brackets and not has_no_clear:
            filtered_file.append(item)

    return filtered_file


In [31]:
def preprocess_summary_data(summary_file):
    """
    Select the entries of a key -> summary-text mapping that are worth keeping.

    An entry is retained when its text contains a square bracket ('[' or ']')
    or the phrase 'no clear answer' (matched case-insensitively).

    Args:
        summary_file (dict): Mapping whose values are summary strings.

    Returns:
        dict: A new dictionary holding the retained key/text pairs, in the
              iteration order of the input mapping.
    """

    def is_relevant(text):
        # Brackets hint at citations; the phrase check is the second way in.
        if "[" in text or "]" in text:
            return True
        return re.search(r"no clear answer[^\w]*", text, re.IGNORECASE) is not None

    return {key: text for key, text in summary_file.items() if is_relevant(text)}

In [32]:
def parse_file_name(s):
    """
    Split a report base name of the form 'Country_event-period' into its parts.

    The country is everything before the first '_' and the period is
    everything after the LAST '-'. Splitting the period from the right keeps
    events that themselves contain a hyphen intact, e.g.
    'Israel_Israel-Hamas war-Week 19 2024' ->
    ('Israel', 'Israel-Hamas war', 'Week 19 2024').

    Args:
        s (str): The base name to parse.

    Returns:
        tuple: (country, event, period) as strings.

    Raises:
        ValueError: If the part before the last '-' has no '_', or if the
            string has no '-' at all.
    """
    try:
        country_event, period = s.rsplit("-", 1)  # split only on last '-'
        country, event = country_event.split("_", 1)  # split only on first '_'
    except ValueError as err:
        raise ValueError("String does not match expected format: 'Country_event-period'") from err

    return country, event, period

In [33]:
#v2

In [34]:
import numpy as np
import re
from difflib import SequenceMatcher
import random

def enrich_contexts_with_citation_numbers(context_list, metadata_data, answer_text, similarity_threshold=0.7, min_substring_length=50):
    """
    Attach source metadata (title, url) to the contexts cited in an answer.

    context_list: list of paragraph texts
    metadata_data: list of dicts [{paragraph, title, url}, ...]
    answer_text: the answer text containing citation numbers like [2][4]
    similarity_threshold: minimum ratio (0–1) to accept fuzzy match
    min_substring_length: minimum length for substring matching (avoids false positives)

    Each context is matched against the metadata paragraphs in three stages,
    stopping at the first hit: exact text, then case-insensitive substring,
    then best fuzzy match. When several paragraphs contain the context as a
    substring, the first one in metadata order is used, so repeated runs
    produce the same report.

    Returns a dict keyed by citation numbers (as strings); every value is
    {"context", "title", "url"}. Citations without a context, and contexts
    without a metadata match, get empty-string placeholders.
    """
    # Distinct citation numbers, sorted as strings (so '10' comes before '2').
    # NOTE(review): contexts are paired with citations purely by position in
    # this lexicographic order — confirm context_list is produced in the same
    # order upstream.
    citation_numbers = sorted(set(re.findall(r'\[(\d+)\]', answer_text)))
    enriched_contexts = {}

    # Exact-text lookup; for duplicate paragraphs the last metadata item wins.
    exact_lookup = {item.get('paragraph', ''): item for item in metadata_data}

    # (normalized paragraph, metadata item) pairs for substring matching;
    # items without paragraph text are left out.
    substring_lookup = []
    for item in metadata_data:
        para = item.get('paragraph', '')
        if para:
            substring_lookup.append((para.lower().strip(), item))

    # Helper: get best fuzzy match (only called when needed)
    def find_best_match(context_text):
        best_match = None
        best_ratio = 0.0
        for item in metadata_data:
            matcher = SequenceMatcher(None, context_text, item.get('paragraph', ''))
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(): a candidate that cannot beat the current best or reach
            # the threshold is skipped without running the expensive ratio().
            upper_bound = min(matcher.real_quick_ratio(), matcher.quick_ratio())
            if upper_bound <= best_ratio or upper_bound < similarity_threshold:
                continue
            ratio = matcher.ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best_match = item
        return best_match if best_ratio >= similarity_threshold else None

    # Helper: first metadata item whose paragraph contains the context
    def find_substring_match(context_text):
        # Short snippets would match too many paragraphs by accident.
        if len(context_text) < min_substring_length:
            return None

        context_normalized = context_text.lower().strip()
        for normalized, original in substring_lookup:
            if context_normalized in normalized:
                return original
        return None

    # Assign contexts to citations by position
    for idx, citation in enumerate(citation_numbers):
        if idx >= len(context_list):
            # No context available → still keep citation with placeholders
            enriched_contexts[citation] = {
                "context": "",
                "title": "",
                "url": ""
            }
            continue

        context_text = context_list[idx]

        # Cheapest strategy first: exact, then substring, then fuzzy.
        matched_metadata = (
            exact_lookup.get(context_text)
            or find_substring_match(context_text)
            or find_best_match(context_text)
        )

        enriched_contexts[citation] = {
            "context": context_text,
            "title": matched_metadata.get("title", "") if matched_metadata else "",
            "url": matched_metadata.get("url", "") if matched_metadata else ""
        }

    return enriched_contexts

In [35]:
import numpy as np
import re
from difflib import SequenceMatcher

def enrich_summary_with_citation_numbers(summary_text, cited_paragraphs, metadata_data, similarity_threshold=0.7):
    """
    Attach source metadata (title, url) to the paragraphs cited in a summary.

    summary_text: str, the summary string containing citations like [1][2]
    cited_paragraphs: list of paragraph texts
    metadata_data: list of dicts [{paragraph, title, url}, ...]
    similarity_threshold: minimum similarity ratio (0–1) to accept fuzzy match

    The distinct citation numbers found in the summary are printed, then
    paired with cited_paragraphs by position. Each paragraph is looked up in
    the metadata by exact text first and by best fuzzy match second.

    Returns a dict keyed by citation numbers; every value is
    {"context", "title", "url"}, with empty strings where nothing is known.
    """
    # Distinct citation numbers in the order np.unique yields for strings.
    citation_numbers = np.unique(re.findall(r'\[(\d+)\]', summary_text))
    print("Summary citations found:", citation_numbers)

    def lookup_metadata(paragraph):
        # Stage 1: the first metadata item whose paragraph is identical.
        exact = None
        for entry in metadata_data:
            if entry.get("paragraph", "") == paragraph:
                exact = entry
                break
        if exact:
            return exact

        # Stage 2: the most similar paragraph, if it clears the threshold.
        top_entry = None
        top_score = 0.0
        for entry in metadata_data:
            score = SequenceMatcher(None, paragraph, entry.get("paragraph", "")).ratio()
            if score > top_score:
                top_entry, top_score = entry, score
        if top_score >= similarity_threshold:
            return top_entry
        return None

    enriched_contexts = {}
    available = len(cited_paragraphs)
    for position, number in enumerate(citation_numbers):
        if position >= available:
            # More citations than paragraphs: keep the citation, leave it blank.
            enriched_contexts[number] = {"context": "", "title": "", "url": ""}
            continue

        paragraph = cited_paragraphs[position]
        source = lookup_metadata(paragraph)
        if source:
            title = source.get("title", "")
            url = source.get("url", "")
        else:
            title = ""
            url = ""

        enriched_contexts[number] = {
            "context": paragraph,
            "title": title,
            "url": url
        }

    return enriched_contexts


# Create reports with QA + Citations 

In [39]:
import json
import os
import numpy as np
import re

# Build, for every source file, one combined JSON report and one Markdown
# report that hold the executive summary plus the question/answer pairs
# grouped by cluster, with title/url metadata attached to cited paragraphs.
devOrTest = "Dev"
# Define your source and cluster folders using f-strings
sources_folder = f"./Results/Sources/SourcesCountryEvent/{devOrTest} set/"
# NOTE(review): the trailing space in this path looks accidental — confirm it
# matches the folder name on disk.
clusters_folder = "./Results/Cluster/Clusters+Headline "
# NOTE(review): the QA and executive-summary folders are pinned to "Dev set"
# instead of following devOrTest — confirm this is intended.
qa_folder = f"./Results/Answers/Answers-subtopics/Dev set/Updated_citations"
metadata_folder = "./Results/paragraphs_metadata"
summary_folder = f"./Results/Executive Summaries/Dev set/Updated_citations"
cluster_summaries_folder = f"./Results/Summaries/UniqueSummary-EachCluster/{devOrTest} set"
output_folder = f"./Results/Reports/JSON_Report_QA/{devOrTest} set"
markdown_output_folder = f"./Results/Reports/Markdown_Report_QA/{devOrTest} set"
os.makedirs(output_folder, exist_ok=True)

os.makedirs(markdown_output_folder, exist_ok=True)

# Get all JSON file names from the sources folder, sorted
json_files = np.sort([f for f in os.listdir(sources_folder) if f.endswith('.json')])
base_names = [os.path.splitext(f)[0] for f in json_files]
print(f"Base Names: {base_names}\n")

for json_file in json_files:
    base_name = os.path.splitext(json_file)[0]

    # Every input below is optional: a missing or unreadable file is reported
    # and replaced by an empty default so the report is still produced.
    # All JSON inputs are read as UTF-8, matching how the outputs are written,
    # so the result does not depend on the platform's default encoding.

    # Load cluster data
    cluster_name = f"clusters-{base_name}.json"
    data_clusters = {}
    try:
        with open(os.path.join(clusters_folder, cluster_name), 'r', encoding='utf-8') as file:
            data_clusters = json.load(file)
    except FileNotFoundError:
        print(f"Cluster file not found for {base_name}: {os.path.join(clusters_folder, cluster_name)}")
    except Exception as e:
        print(f"Problems reading cluster file {os.path.join(clusters_folder, cluster_name)}: {e}")

    # Load cluster summaries with titles
    cluster_summary_name = f"summary_{base_name}.json"
    cluster_summaries = {}
    try:
        with open(os.path.join(cluster_summaries_folder, cluster_summary_name), 'r', encoding='utf-8') as file:
            cluster_summaries = json.load(file)
    except FileNotFoundError:
        print(f"Cluster summaries file not found for {base_name}: {os.path.join(cluster_summaries_folder, cluster_summary_name)}")
    except Exception as e:
        print(f"Problems reading cluster summaries file {os.path.join(cluster_summaries_folder, cluster_summary_name)}: {e}")

    # Load QA data and drop answers without citations / without a clear answer
    qa_name = f"answers_new_cit-answes-{base_name}-prompt-1.json"
    data_qa = []
    try:
        with open(os.path.join(qa_folder, qa_name), 'r', encoding='utf-8') as file:
            data_qa = json.load(file)
            before = len(data_qa)

            data_qa = preprocess_answers_data(data_qa)
            after = len(data_qa)

            print(f"{qa_name} -> original: {before}, filtered: {after}, removed: {before - after}")

    except FileNotFoundError:
        print(f"QA file not found for {base_name}: {os.path.join(qa_folder, qa_name)}")
    except Exception as e:
        print(f"Problems reading QA file {os.path.join(qa_folder, qa_name)}: {e}")

    # Load Metadata Data
    metadata_file_name = f"metadata-sources-metadata-{base_name}.json"
    metadata_data = []
    try:
        with open(os.path.join(metadata_folder, metadata_file_name), 'r', encoding='utf-8') as file:
            metadata_data = json.load(file)
    except FileNotFoundError:
        print(f"Metadata file not found for {base_name}: {metadata_file_name}")
    except Exception as e:
        print(f"Problems reading metadata file {metadata_file_name}: {e}")

    # --- Create the combined JSON structure ---
    output_data = {'file_name': base_name}

    ## 1. Get summary
    summary_file_name = f"summary-{base_name}.json"
    try:
        with open(os.path.join(summary_folder, summary_file_name), 'r', encoding='utf-8') as file:
            data_summary = json.load(file)
            summary_text = data_summary.get('new_summary', '')
            cited_paragraphs = data_summary.get('new_cited_paragraphs', [])

            output_data['summary'] = summary_text
            output_data['summary_contexts'] = enrich_summary_with_citation_numbers(
                summary_text, cited_paragraphs, metadata_data
            )
    except FileNotFoundError:
        print(f"Summary file not found for {base_name}: {os.path.join(summary_folder, summary_file_name)}")
        output_data['summary'] = ""
        output_data['summary_contexts'] = {}
    except Exception as e:
        print(f"Problems reading summary file {os.path.join(summary_folder, summary_file_name)}: {e}")
        output_data['summary'] = ""
        output_data['summary_contexts'] = {}

    # 2. Organize QA by cluster_id
    qa_by_cluster = {}
    for qa_item in data_qa:
        cluster_id = qa_item.get('cluster_id')
        # NOTE(review): this truthiness test also skips a cluster_id of 0 or
        # "" — confirm such IDs never occur.
        if cluster_id:
            original_context = qa_item.get('new_used_contexts', [])
            answer_text = qa_item.get('updated_answer_pt2', '')
            enriched_context = enrich_contexts_with_citation_numbers(original_context, metadata_data, answer_text)

            qa_by_cluster.setdefault(cluster_id, []).append({
                'question': qa_item.get('question', ''),
                'updated_retrieved_answer': answer_text,
                'used_contexts': enriched_context
            })

    # 3. Process clusters
    output_data['clusters'] = []
    if isinstance(data_clusters, dict):
        for cluster_id, cluster_info in data_clusters.items():
            qa_items = qa_by_cluster.get(cluster_id, [])
            if qa_items:  # ✅ only keep clusters with at least one QA
                # Prefer the title from the cluster summaries; fall back to
                # the headline stored with the cluster itself.
                cluster_title = cluster_summaries.get(cluster_id, {}).get('title', cluster_info.get('cluster_headline', ''))

                cluster_entry = {
                    'cluster_id': cluster_id,
                    'cluster_headline': cluster_title,
                    'questions_and_answers': qa_items
                }
                output_data['clusters'].append(cluster_entry)
    else:
        print(f"Warning: data_clusters for {base_name} is not a dictionary. Skipping cluster processing.")

    # Save combined JSON
    output_file_path = os.path.join(output_folder, f"{base_name}_combined_data.json")
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(output_data, outfile, indent=4, ensure_ascii=False)
    print(f"Created combined data file: {output_file_path}")

    # --- Generate Markdown ---
    # parse_file_name raises ValueError for a base name that does not follow
    # 'Country_event-period'; that error is not caught here.
    country, event, period = parse_file_name(output_data['file_name'])
    markdown_parts = [
        f"# {event} {period}\n\n",
        "## Summary \n",
        f"{output_data['summary']}\n\n",
        "## Questions and Answers\n\n",
    ]

    for cluster in output_data['clusters']:
        if cluster['questions_and_answers']:
            markdown_parts.append(f"### {cluster['cluster_headline']}\n\n")
            for qa in cluster['questions_and_answers']:
                markdown_parts.append(f"**Question:** {qa['question']}\n")
                markdown_parts.append(f"**Answer:** {qa['updated_retrieved_answer']}\n\n")

    markdown_content = "".join(markdown_parts)

    markdown_file_path = os.path.join(markdown_output_folder, f"{base_name}.md")
    with open(markdown_file_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_content)
    print(f"Created Markdown file: {markdown_file_path}")

print("\n--- Processing Complete ---")

Base Names: ['Afghanistan_Afghanistan Floods-Week 21 2024', 'Bangladesh_Cyclone Remal-Week 21 2024', 'Haiti_Gang violence and humanitarian crisis in Haiti-Week 40 2024', 'India_LandslideFloods-Week 31 2024', 'Indonesia_Floods and volcanic activity in Indonesia-Week 20 2024', 'Israel_Israel-Hamas war-Week 19 2024', 'Israel_Israel_Palestine_confilct-Week 40 2024', 'Jamaica_Hurricane Beryl-Week 28 2024', 'Nigeria_Flooding in Nigeria-Week 37 2024', 'Pakistan_Monsoon floods and rains in Pakistan-Week 31 2024', 'Sudan_Sudan conflict-Week 34 2024', 'Sudan_Sudan conflict-Week 39 2024', 'Ukraine_Ukraine-Week 23 2024', 'United Kingdom_UK riots-Week 32 2024']

answers_new_cit-answes-Afghanistan_Afghanistan Floods-Week 21 2024-prompt-1.json -> original: 30, filtered: 28, removed: 2
Summary citations found: ['17' '32' '39' '46' '64' '7']
Created combined data file: ./Results/Reports/JSON_Report_QA/Dev set/Afghanistan_Afghanistan Floods-Week 21 2024_combined_data.json
Created Markdown file: ./Result

# Create a report containing only the summary for each cluster

In [37]:
import random

def enrich_contexts(used_contexts, metadata_data, min_substring_length=50):
    """
    Attach source metadata (title, url) to each used context paragraph.

    used_contexts: dict {id: paragraph_text}
    metadata_data: list of dicts [{paragraph, title, url}, ...]
    min_substring_length: minimum length for substring matching

    Each paragraph is matched against the metadata by exact text first and,
    failing that, by case-insensitive substring containment. When several
    metadata paragraphs contain the context, the first one in metadata order
    is used, so repeated runs produce the same report.

    Returns a dict {id: {"context", "title", "url"}}; title and url are
    empty strings when no metadata paragraph matches.
    """
    # Exact-text lookup; for duplicate paragraphs the last metadata item wins.
    exact_lookup = {item.get('paragraph', ''): item for item in metadata_data}

    # (normalized paragraph, metadata item) pairs for substring matching;
    # items without paragraph text are left out.
    substring_lookup = []
    for item in metadata_data:
        para = item.get('paragraph', '')
        if para:
            substring_lookup.append((para.lower().strip(), item))

    # Helper: first metadata item whose paragraph contains the context
    def find_substring_match(context_text):
        # Short snippets would match too many paragraphs by accident.
        if len(context_text) < min_substring_length:
            return None

        context_normalized = context_text.lower().strip()
        for normalized, original in substring_lookup:
            if context_normalized in normalized:
                return original
        return None

    enriched = {}
    for cid, context_text in used_contexts.items():
        # Try the O(1) exact lookup first, then the substring scan.
        matched_metadata = exact_lookup.get(context_text) or find_substring_match(context_text)

        enriched[cid] = {
            "context": context_text,
            "title": matched_metadata.get("title", "") if matched_metadata else "",
            "url": matched_metadata.get("url", "") if matched_metadata else ""
        }

    return enriched