In [None]:
# Stage 1 - PDF -> JSON through the Docling VLM pipeline, with the model served via LM Studio.
from Functions.utils_logging import setup_basic_logging
setup_basic_logging()

from Functions.pdf_to_json import convert_pdf_with_lmstudio

# Input document and the output directory handed to the converter.
PDF_SOURCE = r"Sample Papers/1-s2.0-S1385110124000054-main.pdf"  # Change to your PDF path if needed
OUTPUT_DIR = "output_lmstudio_conversion"

# Keyword options forwarded unchanged to the converter; edit them here rather than in the call.
conversion_options = {
    "lm_studio_url": "http://localhost:1234/v1/chat/completions",
    "model_identifier": "google/gemma-3-12b-it-gguf",
    "prompt": "Parse the document.",
    "max_tokens": 16384,
    "generate_page_images": True,
}

# Convert and report the value returned by the converter (printed as the saved JSON location).
json_path = convert_pdf_with_lmstudio(PDF_SOURCE, OUTPUT_DIR, **conversion_options)
print("JSON saved to:", json_path)


In [8]:
# Stage 2: Process Docling JSON -> Enhanced JSON -> NLP-ready JSON (with inline verification)
# This cell re-imports libraries and re-reads files to keep the two stages fully separated,
# so it does not rely on any variable defined by the Stage 1 cell.
from Functions.utils_logging import setup_basic_logging
setup_basic_logging()

from Functions.pipeline_steps import (
    step1_add_ai_descriptions_with_chart_extraction,
    step2_remove_all_images,
)
from Functions.verification import verify_final_json

# Set the source JSON produced in Stage 1 (update this path to match your Stage 1 output).
# This is a raw string, so each backslash is written exactly once; doubling them would put
# two literal backslashes per separator into the path.
DOC_JSON_SOURCE = r"C:\Users\Hongyu\OneDrive - Northwestern University\NU\Capstone\PDF_Analyzer_App\output\applsci-14-07088-v2.json"

# Step 1: Add AI descriptions + optional chart data extraction and web search.
# The call returns a (success flag, enhanced JSON path) pair.
success1, enhanced_path = step1_add_ai_descriptions_with_chart_extraction(
    DOC_JSON_SOURCE,
    lm_studio_url="http://localhost:1234/v1/chat/completions",
    model_name="google/gemma-3-12b-it-gguf",
    enable_chart_extraction=True,
    enable_web_search_for_conceptual=True,
    sleep_between_images_s=1.0,
)
print("Step 1 success:", success1)
print("Enhanced JSON:", enhanced_path)

# Step 2: Remove all images -> NLP-ready JSON and verify inline.
# Each step runs only if the previous one reported success AND returned a usable path.
if success1 and enhanced_path:
    success2, nlp_ready_path = step2_remove_all_images(enhanced_path)
    print("Step 2 success:", success2)
    print("NLP-ready JSON:", nlp_ready_path)

    if success2 and nlp_ready_path:
        summary = verify_final_json(nlp_ready_path, require_no_images=True, min_description_length=20)
        print("Verification OK:", summary.get("ok"))
        # Use .get() so a key missing from the summary prints as None instead of raising
        # KeyError, consistent with how "ok" and "per_picture_issues" are read.
        print({k: summary.get(k) for k in [
            "total_pictures","no_images_ok","with_ai_analysis","with_nonempty_description",
            "with_web_context","with_chart_data","images_keys_found"
        ]})
        if summary.get("per_picture_issues"):
            print("Per-picture issues:")
            for issue in summary["per_picture_issues"]:
                print(issue)
    else:
        # Make a Step 2 failure explicit instead of silently skipping verification.
        print("Stage 2 aborted: Step 2 failed or returned no NLP-ready JSON path.")
else:
    # This branch is reached when Step 1 of this stage fails (not Stage 1 of the notebook).
    print("Stage 2 aborted: Step 1 failed or returned no enhanced JSON path.")


2025-08-11 19:13:37,845 - INFO - Functions.pipeline_steps - Processing picture 1/8...
2025-08-11 19:13:38,222 - INFO - Functions.pipeline_steps - Marking non-informative image #1 for removal
2025-08-11 19:13:38,223 - INFO - Functions.pipeline_steps - Processing picture 2/8...
2025-08-11 19:13:38,427 - INFO - Functions.pipeline_steps - Marking non-informative image #2 for removal
2025-08-11 19:13:38,427 - INFO - Functions.pipeline_steps - Processing picture 3/8...
2025-08-11 19:13:38,650 - INFO - Functions.pipeline_steps - Marking non-informative image #3 for removal
2025-08-11 19:13:38,650 - INFO - Functions.pipeline_steps - Processing picture 4/8...
  ddgs = DDGS()
2025-08-11 19:13:42,831 - INFO - primp - response: https://www.bing.com/search?q=Arabic+NLP+Text+Summarization 200
2025-08-11 19:13:46,949 - INFO - Functions.pipeline_steps - Processing picture 5/8...
  ddgs = DDGS()
2025-08-11 19:13:50,388 - INFO - primp - response: https://www.bing.com/search?q=machine+translation+natural

Step 1 success: True
Enhanced JSON: C:\Users\Hongyu\OneDrive - Northwestern University\NU\Capstone\PDF_Analyzer_App\output\applsci-14-07088-v2_with_descriptions_and_chart_data.json
Step 2 success: True
NLP-ready JSON: C:\Users\Hongyu\OneDrive - Northwestern University\NU\Capstone\PDF_Analyzer_App\output\applsci-14-07088-v2_nlp_ready.json
Verification OK: True
{'total_pictures': 5, 'no_images_ok': True, 'with_ai_analysis': 5, 'with_nonempty_description': 5, 'with_web_context': 2, 'with_chart_data': 0, 'images_keys_found': 0}
