In [None]:
#Folder based page update
import pandas as pd
import json
from pathlib import Path

def _parse_json_entries(raw_text):
    """
    Parse every top-level JSON value found in ``raw_text`` into one flat list.

    Accepts a single JSON array, a single object, or several values that are
    concatenated with arbitrary whitespace and/or commas between them.
    Top-level arrays are flattened into the result so that an input which is
    already a list is not nested inside a second list.

    Parameters:
    raw_text (str): Text containing zero or more JSON values.

    Returns:
    list: The decoded entries, in the order they appear in the text.

    Raises:
    json.JSONDecodeError: If any part of the text is not valid JSON.
    """
    decoder = json.JSONDecoder()
    entries = []
    position = 0
    end = len(raw_text)

    while position < end:
        # raw_decode() does not skip leading whitespace, and values may be
        # separated by commas, so step over both before decoding.
        if raw_text[position].isspace() or raw_text[position] == ",":
            position += 1
            continue

        value, position = decoder.raw_decode(raw_text, position)
        if isinstance(value, list):
            entries.extend(value)
        else:
            entries.append(value)

    return entries


def update_json_page_numbers(csv_file, json_folder, output_folder):
    """
    Updates the page numbers in all JSON files within a folder based on the offset from a CSV metadata file.

    For every ``*.json`` file in ``json_folder`` the leading "processed_" is
    stripped from the file stem, ".pdf" is appended, and the result is looked
    up in the CSV column "Extracted Q&A File Name". Each integer in an entry's
    "Page" list is then shifted by ``<Q&A Section Page> - 2``.

    Parameters:
    csv_file (str): Path to the CSV file containing Q&A metadata. Must have the
        columns "Extracted Q&A File Name" and "Q&A Section Page".
    json_folder (str): Path to the folder containing JSON files to be processed.
    output_folder (str): Path to the folder where updated JSON files will be saved
        (created if missing).

    Output:
    Saves new JSON files with "_correct_pages" appended to the filenames in the output folder.
    Files that have no CSV row, an unusable page value, or unparsable JSON are
    reported on stdout and skipped; the remaining files are still processed.

    Returns:
    None
    """
    # NOTE(review): the original code subtracts 2 from the CSV page number;
    # presumably this converts between the extracted section's page numbering
    # and the source PDF's numbering -- TODO confirm and document the reason.
    page_base_adjustment = 2
    filename_prefix = "processed_"

    # Load CSV
    metadata_df = pd.read_csv(csv_file)

    json_folder_path = Path(json_folder)
    if not json_folder_path.exists():
        print(f"❌ Folder not found: {json_folder}")
        return

    # Only create the output folder once we know there is something to read.
    output_folder_path = Path(output_folder)
    output_folder_path.mkdir(parents=True, exist_ok=True)

    # sorted() makes the processing (and log) order deterministic across platforms.
    for json_file in sorted(json_folder_path.glob("*.json")):
        # Recover the original file name: drop only the *leading* "processed_"
        # so an occurrence later in the name is left intact.
        json_filename = json_file.stem
        if json_filename.startswith(filename_prefix):
            json_filename = json_filename[len(filename_prefix):]

        # Append .pdf to match filenames in CSV
        json_filename_with_ext = json_filename + ".pdf"

        # Find the corresponding row in CSV
        row = metadata_df[metadata_df["Extracted Q&A File Name"] == json_filename_with_ext]

        if row.empty:
            print(f"❌ No matching row found in CSV for file: {json_filename_with_ext}")
            continue

        raw_page_value = row.iloc[0]["Q&A Section Page"]
        try:
            correct_page_number = int(raw_page_value)
        except (TypeError, ValueError):
            # Covers empty cells (NaN) and non-numeric text in the CSV.
            print(
                f"❌ Invalid 'Q&A Section Page' value {raw_page_value!r} "
                f"in CSV for file: {json_filename_with_ext}"
            )
            continue

        # Read the JSON file
        with open(json_file, "r", encoding="utf-8") as f:
            file_content = f.read().strip()

        # Preprocess JSON: remove a surrounding Markdown code fence
        # (```json ... ``` or a bare ``` ... ```).
        if file_content.startswith("```") and file_content.endswith("```"):
            file_content = file_content[3:-3]
            if file_content.startswith("json"):
                file_content = file_content[4:]
            file_content = file_content.strip()

        try:
            json_data = _parse_json_entries(file_content)
        except json.JSONDecodeError as e:
            print(f"❌ Failed to parse JSON after cleaning ({json_file.name}): {e}")
            continue

        # Update Page numbers
        page_shift = correct_page_number - page_base_adjustment
        for entry in json_data:
            # Skip anything that is not an object with a list-valued "Page".
            if isinstance(entry, dict) and isinstance(entry.get("Page"), list):
                entry["Page"] = [p + page_shift for p in entry["Page"]]

        # Save new version with updated pages in the output folder
        new_json_file = output_folder_path / f"{json_file.stem}_correct_pages.json"
        with open(new_json_file, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=4)

        print(f"✅ Updated JSON saved as: {new_json_file}")


In [115]:
## Needed only when running locally --- update with the path to your project folder
import os
import sys
from pathlib import Path

# Update the path HERE - don't forget the r at the beginning
# (the raw-string prefix keeps the Windows backslashes literal).
project_path = Path(r"C:\Users\genna\Documents\Cam-Course\BoE_RAGS\Bank_of_England_NLP")

# Make relative paths used in later cells resolve against the project root.
os.chdir(project_path)

# Put the project root first on the import path, but only once: re-running
# this cell must not keep stacking duplicate entries onto sys.path.
if str(project_path) not in sys.path:
    sys.path.insert(0, str(project_path))

print(f"Current directory: {os.getcwd()}")

Current directory: C:\Users\genna\Documents\Cam-Course\BoE_RAGS\Bank_of_England_NLP


In [117]:
# Folder-based run: shift the page numbers of every JSON file in `json_folder`
# using the metadata CSV, writing the results to `output_folder`.
# All paths are relative to the current working directory.

output_folder = "1_data_and_preprocess/1.0_raw/processed_QnA_correct_pages"
json_folder = "1_data_and_preprocess/1.0_raw/processed_QnA"
csv_file = "1_data_and_preprocess/1.0_raw/extracted_QnA/qna_sections_metadata.csv"

update_json_page_numbers(
    csv_file=csv_file,
    json_folder=json_folder,
    output_folder=output_folder,
)

✅ Updated JSON saved as: 1_data_and_preprocess\1.0_raw\processed_QnA_correct_pages\processed_QnA_jpm-1q24-earnings-call-transcript_correct_pages.json
✅ Updated JSON saved as: 1_data_and_preprocess\1.0_raw\processed_QnA_correct_pages\processed_QnA_jpm-2q24-earnings-call-transcript_correct_pages.json
✅ Updated JSON saved as: 1_data_and_preprocess\1.0_raw\processed_QnA_correct_pages\processed_QnA_jpm-3q24-earnings-call-transcript_correct_pages.json
✅ Updated JSON saved as: 1_data_and_preprocess\1.0_raw\processed_QnA_correct_pages\processed_QnA_jpm-4q24-earnings-call-transcript_correct_pages.json
✅ Updated JSON saved as: 1_data_and_preprocess\1.0_raw\processed_QnA_correct_pages\processed_QnA_Q1_Transcript-Analyst-Call-25-April-2024_correct_pages.json
✅ Updated JSON saved as: 1_data_and_preprocess\1.0_raw\processed_QnA_correct_pages\processed_QnA_Q2_Transcript-Analyst-Call-24-July-2024-For-Upload_correct_pages.json
✅ Updated JSON saved as: 1_data_and_preprocess\1.0_raw\processed_QnA_correct