# Final Report Postprocessing

## Overview

This notebook updates (renumbers) the citations in all the generated JSON reports.


In [None]:


import json
import re
from pathlib import Path
from typing import Dict

def normalize_text(s: str) -> str:
    """Collapse every run of whitespace in *s* into a single space.

    Leading and trailing whitespace is dropped; ``None`` yields ``""``.
    """
    return "" if s is None else " ".join(s.split())

def load_json(path: Path):
    """Read the UTF-8 file at *path* and return its parsed JSON content."""
    with open(path, encoding="utf-8") as handle:
        return json.loads(handle.read())

def save_json(path: Path, data):
    """Serialize *data* as UTF-8 JSON at *path* (4-space indent, non-ASCII kept).

    Missing parent directories are created first.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)




# Change citations QA Files

In [None]:
def reindex_file(file_path: Path, output_folder: Path):
    """Renumber the citations of one QA report and save the updated copy.

    Contexts are identified by their whitespace-normalized text, so the same
    context gets a single number across the whole file.  Numbers are assigned
    in order of first citation (cluster answers first, then the summary);
    contexts that are never cited are numbered afterwards.  A citation whose
    old number has no entry in its local context map still receives a number
    of its own (keyed by the old number).

    Args:
        file_path: JSON report to process.
        output_folder: Folder receiving the updated file (same file name).

    Returns:
        None. Files that are not valid JSON, or that contain neither
        ``clusters`` nor ``summary_contexts``, are skipped with a message.
    """
    try:
        data = load_json(file_path)
    except Exception as e:
        print(f"[skip] {file_path} non è un JSON valido ({e})")
        return

    if "clusters" not in data and "summary_contexts" not in data:
        print(f"[skip] {file_path} non ha 'clusters' né 'summary_contexts'")
        return

    cit_pattern = re.compile(r"\[(\d+)\]")

    def iter_qas():
        """Yield every QA entry of every cluster, in file order."""
        for cluster in data.get("clusters", []):
            yield from cluster.get("questions_and_answers", [])

    def context_key(ctx: dict) -> str:
        """Normalized text used to identify a context."""
        return normalize_text(ctx.get("context", ""))

    def citation_key(oldnum: str, uc_dict: Dict[str, dict]) -> str:
        """Key of the context an old citation number points to.

        Uses ``.get`` (via ``context_key``) so a cited context without a
        "context" field is handled like everywhere else instead of raising
        ``KeyError``.
        """
        if oldnum in uc_dict:
            return context_key(uc_dict[oldnum])
        return f"__MISSING_CONTEXT__{oldnum}"

    # 1) Collect every context (from clusters and from summary_contexts).
    context_to_data: Dict[str, dict] = {}
    for qa in iter_qas():
        for ctx in qa.get("used_contexts", {}).values():
            context_to_data.setdefault(context_key(ctx), ctx)
    for ctx in data.get("summary_contexts", {}).values():
        context_to_data.setdefault(context_key(ctx), ctx)

    # 2) Scan the texts to assign new numbers in order of appearance.
    context_to_newnum: Dict[str, int] = {}

    def assign_number(key: str) -> None:
        # Numbers are consecutive from 1, so the next one is len + 1.
        if key not in context_to_newnum:
            context_to_newnum[key] = len(context_to_newnum) + 1

    def register_from_text(text: str, uc_dict: Dict[str, dict]) -> None:
        for m in cit_pattern.finditer(text or ""):
            assign_number(citation_key(m.group(1), uc_dict))

    for qa in iter_qas():
        register_from_text(qa.get("updated_retrieved_answer", ""), qa.get("used_contexts", {}))
    if "summary" in data:
        register_from_text(data["summary"], data.get("summary_contexts", {}))

    # 3) Contexts that are never cited still get a unique number.
    for key in context_to_data:
        assign_number(key)

    # 4) Rewrite citations and re-key the context maps.
    def renumber_text(text: str, uc_dict: Dict[str, dict]) -> str:
        return cit_pattern.sub(
            lambda m: f"[{context_to_newnum[citation_key(m.group(1), uc_dict)]}]",
            text or "",
        )

    def rekey_contexts(uc_dict: Dict[str, dict]) -> Dict[str, dict]:
        # When two old keys share a context, the later entry wins.
        return {str(context_to_newnum[context_key(ctx)]): ctx for ctx in uc_dict.values()}

    for qa in iter_qas():
        used = qa.get("used_contexts", {})
        if "updated_retrieved_answer" in qa:
            # The text must be rewritten with the OLD keys, before re-keying.
            qa["updated_retrieved_answer"] = renumber_text(qa["updated_retrieved_answer"], used)
        qa["used_contexts"] = rekey_contexts(used)

    if "summary" in data:
        data["summary"] = renumber_text(data["summary"], data.get("summary_contexts", {}))

    if "summary_contexts" in data:
        data["summary_contexts"] = rekey_contexts(data["summary_contexts"])

    # 5) Save once (the original wrote and logged the same file twice).
    output_path = output_folder / file_path.name
    save_json(output_path, data)
    print(f"[ok] salvato aggiornato: {output_path}")


In [None]:
def reindex_all(folder_path: Path):
    """Run ``reindex_file`` on every ``*.json`` file directly inside *folder_path*.

    Results are written to the ``updated_citations_V2`` subfolder, which is
    created when missing.
    """
    destination = folder_path / "updated_citations_V2"
    destination.mkdir(exist_ok=True)
    json_files = folder_path.glob("*.json")
    for json_file in json_files:
        reindex_file(json_file, destination)

# Renumber the citations of every QA report JSON in the Dev set folder.
folder_path = Path("./Results/Reports/JSON_Report_QA/Dev set")
reindex_all(folder_path)

## SDGs

In [None]:
from pathlib import Path
import json
import re
from typing import Dict, Set

# Matches a numeric citation marker such as "[12]"; group 1 captures the digits.
CIT_PATTERN = re.compile(r"\[(\d+)\]")

def normalize_text(s: str) -> str:
    """Return a canonical, case-insensitive key for a context string.

    Curly quotes become straight quotes, surrounding whitespace is removed,
    inner whitespace runs collapse to one space and the result is
    case-folded.  Falsy input (``None`` or ``""``) gives ``""``.
    """
    if not s:
        return ""
    straight_quotes = {
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
    }
    for curly, straight in straight_quotes.items():
        s = s.replace(curly, straight)
    collapsed = re.sub(r"\s+", " ", s.strip())
    return collapsed.casefold()

def load_json(p: Path):
    """Parse and return the JSON document stored (as UTF-8) at *p*."""
    raw_text = p.read_text(encoding="utf-8")
    return json.loads(raw_text)

def save_json(p: Path, data):
    """Write *data* to *p* as UTF-8 JSON, 2-space indented, keeping non-ASCII characters."""
    with p.open(mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)

def reindex_file(file_path: Path, output_folder: Path):
    """Renumber the citations of one (SDG-style) QA report and save it.

    The file may hold QAs under ``clusters`` and/or under arbitrary top-level
    "category" keys whose value is a list of QA dicts.  Contexts are
    identified by their normalized text; new numbers are assigned in order of
    first citation while walking the top-level keys in file order, and
    contexts never cited are numbered afterwards.  Citations that cannot be
    resolved to a non-empty context are removed from the text, and contexts
    with empty text are dropped from the context maps.

    Args:
        file_path: JSON report to process.
        output_folder: Folder receiving the updated file (created if needed).

    Returns:
        None. A file that is not valid JSON is skipped with a message.
    """
    try:
        data = load_json(file_path)
    except Exception as e:
        print(f"[skip] {file_path} non è un JSON valido ({e})")
        return

    # Known non-category fields; every other top-level list of QA-like dicts
    # (or empty list) is treated as a category.
    metadata_keys = {"file_name", "summary", "summary_contexts", "clusters"}

    def looks_like_qa_list(value) -> bool:
        if not isinstance(value, list):
            return False
        if not value:
            return True
        first = value[0]
        return isinstance(first, dict) and any(
            field in first for field in ("question", "retrieved_answer", "used_contexts")
        )

    category_keys = [
        k for k, v in data.items() if k not in metadata_keys and looks_like_qa_list(v)
    ]

    # Guard once: a missing/null/non-list "clusters" is treated as empty in
    # every pass (the numbering pass previously iterated it unguarded).
    clusters = data.get("clusters")
    if not isinstance(clusters, list):
        clusters = []
    # Old summary context map; captured before it is re-keyed at the end.
    summary_contexts = data.get("summary_contexts") or {}

    def cluster_qas():
        """Yield every QA entry of every cluster, in file order."""
        for cluster in clusters:
            yield from cluster.get("questions_and_answers", [])

    # --- Gather contexts globally -------------------------------------
    context_to_data: Dict[str, dict] = {}              # norm text -> first context seen
    global_old_to_norms: Dict[str, Set[str]] = {}      # old number -> norm texts it referred to

    def register_context(old_key, ctx: dict) -> None:
        norm = normalize_text(ctx.get("context", ""))
        if not norm:
            return  # empty contexts are ignored everywhere
        context_to_data.setdefault(norm, ctx)
        if old_key is not None:
            global_old_to_norms.setdefault(str(old_key), set()).add(norm)

    for qa in cluster_qas():
        for oldk, ctx in qa.get("used_contexts", {}).items():
            register_context(oldk, ctx)
    for oldk, ctx in summary_contexts.items():
        register_context(oldk, ctx)
    for cat in category_keys:
        for qa in data.get(cat, []):
            for oldk, ctx in qa.get("used_contexts", {}).items():
                register_context(oldk, ctx)

    def resolve_old_to_norm(oldnum: str, local_uc: Dict[str, dict]):
        """Best-effort mapping of an old citation number to a context key.

        Returns None when no non-empty context can be found.
        """
        # 1) The QA's own used_contexts (most reliable).
        if local_uc and oldnum in local_uc:
            norm = normalize_text(local_uc[oldnum].get("context", ""))
            if norm:
                return norm
        # 2) summary_contexts (useful when scanning the summary).
        if oldnum in summary_contexts:
            norm = normalize_text((summary_contexts[oldnum] or {}).get("context", ""))
            if norm:
                return norm
        # 3) The number maps to exactly one context across the whole file.
        candidates = global_old_to_norms.get(oldnum)
        if candidates is not None and len(candidates) == 1:
            return next(iter(candidates))
        return None

    # --- Assign new numbers in order of first appearance ---------------
    context_to_newnum: Dict[str, int] = {}

    def assign_number(norm: str) -> None:
        # Numbers are consecutive from 1, so the next one is len + 1.
        if norm not in context_to_newnum:
            context_to_newnum[norm] = len(context_to_newnum) + 1

    def register_from_text(text: str, local_uc: Dict[str, dict]) -> None:
        if not text:
            return
        for m in CIT_PATTERN.finditer(text):
            norm = resolve_old_to_norm(m.group(1), local_uc or {})
            if norm:
                assign_number(norm)

    def answer_text(qa: dict) -> str:
        return qa.get("updated_retrieved_answer") or qa.get("retrieved_answer") or ""

    # Walk top-level items in insertion order so numbering follows the file.
    for key, val in data.items():
        if key == "clusters":
            for qa in cluster_qas():
                register_from_text(answer_text(qa), qa.get("used_contexts", {}))
        elif key == "summary":
            register_from_text(val or "", summary_contexts)
        elif key in category_keys:
            for qa in val:
                register_from_text(answer_text(qa), qa.get("used_contexts", {}))
        # anything else is metadata and does not affect numbering

    # Contexts that were never cited still get a unique number.
    for norm in context_to_data:
        assign_number(norm)

    # --- Rewrite texts and context maps --------------------------------
    def renumber(text, local_uc: Dict[str, dict]) -> str:
        """Replace each citation with its new number; drop unresolvable ones."""
        def new_marker(match) -> str:
            norm = resolve_old_to_norm(match.group(1), local_uc or {})
            newnum = None if norm is None else context_to_newnum.get(norm)
            return "" if newnum is None else f"[{newnum}]"

        return CIT_PATTERN.sub(new_marker, text or "")

    def rekey(contexts: Dict[str, dict]) -> Dict[str, dict]:
        """Re-key a context map by new number, keeping the first entry per number."""
        rekeyed: Dict[str, dict] = {}
        for ctx in contexts.values():
            norm = normalize_text(ctx.get("context", ""))
            if norm:
                rekeyed.setdefault(str(context_to_newnum[norm]), ctx)
        return rekeyed

    def update_qa(qa: dict) -> None:
        used = qa.get("used_contexts", {})
        # Only the first answer field present is rewritten, as before:
        # "updated_retrieved_answer" takes precedence over "retrieved_answer".
        # A QA with neither field is left without one (previously category
        # QAs gained a spurious empty "retrieved_answer").
        for field in ("updated_retrieved_answer", "retrieved_answer"):
            if field in qa:
                qa[field] = renumber(qa.get(field), used)
                break
        qa["used_contexts"] = rekey(used)

    for qa in cluster_qas():
        update_qa(qa)
    for cat in category_keys:
        for qa in data.get(cat, []):
            update_qa(qa)

    if "summary" in data:
        data["summary"] = renumber(data.get("summary"), summary_contexts)
    if "summary_contexts" in data:
        data["summary_contexts"] = rekey(summary_contexts)

    # --- Save -----------------------------------------------------------
    output_folder.mkdir(parents=True, exist_ok=True)
    output_path = output_folder / file_path.name
    save_json(output_path, data)
    print(f"[ok] salvato aggiornato: {output_path}")

def reindex_all(folder_path: Path, recursive: bool = False):
    """Reindex the JSON files of *folder_path* into ``updated_citations_V2``.

    The output subfolder is created inside *folder_path*.  With
    ``recursive=True`` subfolders are traversed too; files already located
    under the output folder are never processed.  A failure on one file is
    reported and does not stop the remaining files.
    """
    destination = folder_path / "updated_citations_V2"
    destination.mkdir(exist_ok=True)

    glob_pattern = "*.json"
    if recursive:
        glob_pattern = "**/" + glob_pattern

    for json_file in folder_path.glob(glob_pattern):
        already_output = destination in json_file.parents
        if not already_output:
            try:
                reindex_file(json_file, destination)
            except Exception as err:
                print(f"[error] {json_file} non è stato processato ({err})")
            
# Renumber the citations of every SDGs QA report JSON in the Dev set folder
# (non-recursive: `recursive` keeps its default of False).
folder_path = Path("./Results/Reports/JSON_Report_QA_SDGs/Dev set")
reindex_all(folder_path)

# Change citations summary clusters 

In [None]:
#!/usr/bin/env python3
import json
import re
from pathlib import Path
from typing import Dict

def remove_duplicate_citations(text: str) -> str:
    """Drop repeated numbers inside each run of adjacent citations.

    Within a run such as ``[1][2][3][2][1]`` only the first occurrence of
    each number is kept (``[1][2][3]``).  Runs separated by any other text
    are handled independently.  ``None`` is treated as an empty string.
    """
    def collapse_run(match):
        numbers = re.findall(r"\d+", match.group(1))
        # dict.fromkeys keeps first occurrences in their original order.
        return "".join(f"[{number}]" for number in dict.fromkeys(numbers))

    return re.sub(r'((?:\[\d+\])+)', collapse_run, text or "")


def normalize_text(s: str) -> str:
    """Return *s* with whitespace runs squeezed to one space and ends trimmed.

    ``None`` is mapped to the empty string.
    """
    if s is not None:
        words = s.split()
        return " ".join(words)
    return ""

def load_json(path: Path):
    """Return the parsed content of the UTF-8 JSON file at *path*."""
    with open(path, encoding="utf-8") as source:
        content = source.read()
    return json.loads(content)

def save_json(path: Path, data):
    """Dump *data* to *path* as 4-space-indented UTF-8 JSON, keeping non-ASCII text.

    The parent directory chain is created when it does not exist yet.
    """
    path.parent.mkdir(exist_ok=True, parents=True)
    with path.open("w", encoding="utf-8") as sink:
        json.dump(data, sink, indent=4, ensure_ascii=False)

def reindex_file(file_path: Path, output_folder: Path):
    """Renumber the citations of one cluster-summary report and save it.

    Contexts are identified by their whitespace-normalized text.  New numbers
    are assigned in order of first citation (each ``cluster_summary`` in file
    order, then the global ``summary``); contexts never cited are numbered
    afterwards.  After renumbering, repeated numbers inside a run of adjacent
    citations are removed with ``remove_duplicate_citations``.

    Args:
        file_path: JSON report to process.
        output_folder: Folder receiving the updated file (same file name).

    Returns:
        None. Files that are not valid JSON, or that contain neither
        ``clusters`` nor ``summary_contexts``, are skipped with a message.
    """
    try:
        data = load_json(file_path)
    except Exception as e:
        print(f"[skip] {file_path} non è un JSON valido ({e})")
        return

    if "clusters" not in data and "summary_contexts" not in data:
        print(f"[skip] {file_path} non ha né 'clusters' né 'summary_contexts'")
        return

    cit_pattern = re.compile(r"\[(\d+)\]")

    def context_key(ctx: dict) -> str:
        """Normalized text used to identify a context."""
        return normalize_text(ctx.get("context", ""))

    def citation_key(oldnum: str, uc_dict: Dict[str, dict]) -> str:
        """Key of the context an old citation number points to.

        Uses ``.get`` (via ``context_key``) so a cited context without a
        "context" field is handled like in the collection step instead of
        raising ``KeyError``.
        """
        if oldnum in uc_dict:
            return context_key(uc_dict[oldnum])
        return f"__MISSING_CONTEXT__{oldnum}"

    # 1) Collect every normalized context (clusters + summary_contexts).
    context_to_data: Dict[str, dict] = {}
    for cluster in data.get("clusters", []):
        for ctx in cluster.get("used_contexts", {}).values():
            context_to_data.setdefault(context_key(ctx), ctx)
    for ctx in data.get("summary_contexts", {}).values():
        context_to_data.setdefault(context_key(ctx), ctx)

    # 2) Scan the texts to assign new numbers in order of appearance.
    context_to_newnum: Dict[str, int] = {}

    def assign_number(key: str) -> None:
        # Numbers are consecutive from 1, so the next one is len + 1.
        if key not in context_to_newnum:
            context_to_newnum[key] = len(context_to_newnum) + 1

    def register_from_text(text: str, uc_dict: Dict[str, dict]) -> None:
        for m in cit_pattern.finditer(text or ""):
            assign_number(citation_key(m.group(1), uc_dict))

    for cluster in data.get("clusters", []):
        register_from_text(cluster.get("cluster_summary", ""), cluster.get("used_contexts", {}))
    if "summary" in data:
        register_from_text(data["summary"], data.get("summary_contexts", {}))

    # 3) Contexts that are never cited still get a unique number.
    for key in context_to_data:
        assign_number(key)

    # 4) Rewrite citations and re-key the context maps.
    def renumber_text(text: str, uc_dict: Dict[str, dict]) -> str:
        renumbered = cit_pattern.sub(
            lambda m: f"[{context_to_newnum[citation_key(m.group(1), uc_dict)]}]",
            text or "",
        )
        # Distinct old numbers may collapse onto the same new number.
        return remove_duplicate_citations(renumbered)

    def rekey_contexts(uc_dict: Dict[str, dict]) -> Dict[str, dict]:
        # When two old keys share a context, the later entry wins.
        return {str(context_to_newnum[context_key(ctx)]): ctx for ctx in uc_dict.values()}

    for cluster in data.get("clusters", []):
        used = cluster.get("used_contexts", {})
        if "cluster_summary" in cluster:
            # The text must be rewritten with the OLD keys, before re-keying.
            cluster["cluster_summary"] = renumber_text(cluster["cluster_summary"], used)
        cluster["used_contexts"] = rekey_contexts(used)

    if "summary" in data:
        data["summary"] = renumber_text(data["summary"], data.get("summary_contexts", {}))

    if "summary_contexts" in data:
        data["summary_contexts"] = rekey_contexts(data["summary_contexts"])

    # 5) Save.
    output_path = output_folder / file_path.name
    save_json(output_path, data)
    print(f"[ok] salvato aggiornato: {output_path}")


def reindex_all(folder_path: Path):
    """Apply ``reindex_file`` to each ``*.json`` file directly inside *folder_path*.

    Output goes to the ``updated_citations_V2`` subfolder (created if absent).
    """
    target = folder_path / "updated_citations_V2"
    target.mkdir(exist_ok=True)
    for report in folder_path.glob("*.json"):
        reindex_file(report, target)


In [None]:
# Renumber the citations of every cluster-summary report JSON in the Dev set folder.
summary_cluster_path = Path("./Results/Reports/JSON_Report_Summaries/Dev set")
reindex_all(summary_cluster_path)

## SDGs

In [None]:
import json
import re
from pathlib import Path
from typing import Dict, Any

def normalize_text(s: str) -> str:
    """Trim *s* and squeeze each internal whitespace run to one space.

    ``None`` is returned as the empty string.
    """
    pieces = [] if s is None else s.split()
    return " ".join(pieces)

def load_json(path: Path) -> Any:
    """Open *path* as UTF-8 text and return the decoded JSON value."""
    with open(path, encoding="utf-8") as stream:
        text = stream.read()
    return json.loads(text)

def save_json(path: Path, data: Any):
    """Store *data* at *path* as UTF-8 JSON (indent 4, non-ASCII preserved).

    Any missing parent directory is created beforehand.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as stream:
        json.dump(data, stream, indent=4, ensure_ascii=False)

def get_context_text(ctx: Any) -> str:
    """Return the context text carried by *ctx*.

    A string is returned unchanged, a dict yields its ``"context"`` entry
    (``""`` when the key is absent) and any other value yields ``""``.
    """
    if isinstance(ctx, str):
        return ctx
    if isinstance(ctx, dict):
        return ctx.get("context", "")
    return ""

def reindex_file(file_path: Path, output_folder: Path):
    """Renumber the citations of one SDGs cluster-summary report and save it.

    Context entries may be dicts (text under ``"context"``) or plain strings;
    they are identified by their whitespace-normalized text.  New numbers are
    assigned in order of first citation (each ``cluster_summary``, then the
    global ``summary``), then to the remaining non-empty contexts.  Context
    entries whose normalized text received no number are left out of the
    rewritten context maps.

    Args:
        file_path: JSON report to process (its path is printed first).
        output_folder: Folder receiving the updated file (same file name).

    Returns:
        None. Files that are not valid JSON, or that contain neither
        ``clusters`` nor ``summary_contexts``, are skipped with a message.
    """
    print(file_path)
    try:
        data = load_json(file_path)
    except Exception as e:
        print(f"[skip] {file_path} non è un JSON valido ({e})")
        return

    has_known_section = "clusters" in data or "summary_contexts" in data
    if not has_known_section:
        print(f"[skip] {file_path} non ha né 'clusters' né 'summary_contexts'")
        return

    cit_pattern = re.compile(r"\[(\d+)\]")

    def norm_of(ctx: Any) -> str:
        """Normalized text of a context entry (dict or string)."""
        return normalize_text(get_context_text(ctx))

    def key_for_citation(oldnum: str, uc_dict: Dict[str, Any]) -> str:
        """Key of the context cited by *oldnum*, or a placeholder when unknown."""
        if oldnum in uc_dict:
            return norm_of(uc_dict[oldnum])
        return f"__MISSING_CONTEXT__{oldnum}"

    # 1) Collect every non-empty normalized context (clusters, then summary_contexts).
    known_contexts: Dict[str, Any] = {}
    context_groups = [cluster.get("used_contexts", {}) for cluster in data.get("clusters", [])]
    context_groups.append(data.get("summary_contexts", {}))
    for group in context_groups:
        for ctx in group.values():
            key = norm_of(ctx)
            if key and key not in known_contexts:
                known_contexts[key] = ctx

    # 2) Scan the texts (cluster_summary + summary) to number contexts in
    #    order of appearance.
    new_numbers: Dict[str, int] = {}

    def assign_number(key: str) -> None:
        # Numbers are consecutive from 1, so the next one is len + 1.
        if key not in new_numbers:
            new_numbers[key] = len(new_numbers) + 1

    def register_from_text(text: str, uc_dict: Dict[str, Any]) -> None:
        for found in cit_pattern.finditer(text or ""):
            assign_number(key_for_citation(found.group(1), uc_dict))

    for cluster in data.get("clusters", []):
        register_from_text(cluster.get("cluster_summary", ""), cluster.get("used_contexts", {}))
    if "summary" in data:
        register_from_text(data["summary"], data.get("summary_contexts", {}))

    # 3) Number the contexts that were never cited.
    for key in known_contexts:
        assign_number(key)

    # 4) Citation rewriting and context re-keying helpers.
    def renumber(text: str, uc_dict: Dict[str, Any]) -> str:
        def swap(match) -> str:
            key = key_for_citation(match.group(1), uc_dict)
            if key in new_numbers:
                return f"[{new_numbers[key]}]"
            # Guard: a key that never received a number is made visible.
            return f"[MISSING_NEW_INDEX:{key}]"

        return cit_pattern.sub(swap, text or "")

    def rekey(contexts: Dict[str, Any]) -> Dict[str, Any]:
        rekeyed = {}
        for ctx in contexts.values():
            key = norm_of(ctx)
            if key in new_numbers:
                # Later entries overwrite earlier ones sharing a number.
                rekeyed[str(new_numbers[key])] = ctx
        return rekeyed

    # Update each cluster's summary text and context map.
    for cluster in data.get("clusters", []):
        used = cluster.get("used_contexts", {})
        if "cluster_summary" in cluster:
            renumbered = renumber(cluster["cluster_summary"], used)
            cluster["cluster_summary"] = remove_duplicate_citations(renumbered)
        cluster["used_contexts"] = rekey(used)

    # Update the global summary.
    if "summary" in data:
        renumbered = renumber(data["summary"], data.get("summary_contexts", {}))
        data["summary"] = remove_duplicate_citations(renumbered)

    # Update the global summary context map.
    if "summary_contexts" in data:
        data["summary_contexts"] = rekey(data["summary_contexts"])

    # 5) Save.
    output_path = output_folder / file_path.name
    save_json(output_path, data)
    print(f"[ok] salvato aggiornato: {output_path}")


def reindex_all(folder_path: Path):
    """Apply ``reindex_file`` to each ``*.json`` file directly inside *folder_path*.

    Output goes to the ``updated_citations_V3`` subfolder (created if absent).
    """
    target = folder_path / "updated_citations_V3"
    target.mkdir(exist_ok=True)
    reports = folder_path.glob("*.json")
    for report in reports:
        reindex_file(report, target)
 
 
# Renumber the citations of every SDGs cluster-summary report JSON in the Dev set folder.
summary_cluster_path = Path("./Results/Reports/JSON_Report_Summaries_SDGs/Dev set")
reindex_all(summary_cluster_path) 
 

In [None]:
import os
# List the entries of the SDGs summaries Dev set folder (shown as the cell output).
os.listdir("./Results/Reports/JSON_Report_Summaries_SDGs/Dev set")