In [None]:
# ENVIRONMENT

from aisurveywriter.core.llm_handler import LLMHandler
import aisurveywriter.core.file_handler as fh
from aisurveywriter.utils import get_all_files_from_paths
from aisurveywriter.core.pipeline import PaperPipeline
from aisurveywriter.core.paper import PaperData
import aisurveywriter.tasks as tks

import os
# Export the Google API key read from the local credentials file.
credentials = fh.read_credentials("../credentials.yaml")
os.environ["GOOGLE_API_KEY"] = credentials["google_key"]

# Optional local model, kept for reference:
# llm = LLMHandler(model="qwen2.5:14b", model_type="ollama", temperature=0.5)

# Read the prompt and review YAML configs.
prompts = fh.read_yaml("../templates/prompt_config.yaml")
review = fh.read_yaml("../templates/review_config.yaml")

In [None]:
# Serialize default prompt store

from aisurveywriter.store.prompt_store import PromptStore, default_prompt_store
import json

# Snapshot the default prompt store into a dated JSON file.
old = default_prompt_store()

with open("prompts-24022025.json", "w", encoding="utf-8") as out_file:
    snapshot = old.model_dump()
    json.dump(snapshot, out_file, indent=2)

In [None]:
# Manual RAG retrieval

from aisurveywriter.core.agent_rags import AgentRAG, RAGType
from aisurveywriter.core.text_embedding import EmbeddingsHandler

# Build the embedding handler and the RAG agent, pointing it at the three .faiss paths.
embed = EmbeddingsHandler("Snowflake/snowflake-arctic-embed-l-v2.0", "huggingface")
rag = AgentRAG(
    embed,
    bib_faiss_path="../out/refextract-bibdb.faiss",
    figures_faiss_path="../out/figures-rag.faiss",
    content_faiss_path="../out/content-rag.faiss",
    request_cooldown_sec=6,
)

# Figure caption used as the retrieval query (whitespace kept exactly as written).
query = r"Schematic representation of a Langmuir trough that contains the Wilhelmy plate for measuring surface pressure using an electrobalance and a surface potential probe.   Also shown is the dipper employed for transferring Langmuir/Blodgett onto a solid substrate"

# Last expression of the cell, so the retrieval result is rendered as the cell output.
rag.retrieve(RAGType.ImageData, query)

In [None]:
# image caption extraction test
from aisurveywriter.core.pdf_processor import PDFProcessor
from aisurveywriter.utils.helpers import get_all_files_from_paths

# Gather the files under the reference folder, with ".txt" passed as a skipped extension.
# Single-file variant, kept for reference:
# pdf = PDFProcessor(["../refexamples/filter-artigos-rafael/103.pdf"])
reference_paths = get_all_files_from_paths(
    "../refexamples/filter-artigos-rafael",
    skip_ext=[".txt"],
)
pdf = PDFProcessor(reference_paths)

# Image extraction is currently disabled; cells that read `images` need it re-enabled.
# images = pdf.extract_images("test")

content = pdf.extract_content()


In [None]:
# Report which extracted documents contain the phrase (case-insensitive match).
needle = "polarization dependence of the"
for idx, extracted in enumerate(content):
    if needle not in extracted.lower():
        continue
    print(f"found in {idx}: {pdf.pdf_paths[idx]}")

In [None]:
# NOTE(review): `images` is only assigned by the commented-out
# `pdf.extract_images("test")` call in the extraction cell, so this cell raises
# NameError on a fresh kernel unless that call is re-enabled.
caption_lines = (f"{i}: {image.caption}" for i, image in enumerate(images))
print("\n".join(caption_lines))

In [None]:
# Print the text of element 9 of the first loaded document (presumably its 10th page).
first_document = pdf.pdf_documents[0]
text = first_document[9].page_content
# Optional trim to the section starting at "9.":
# text = text[text.find("9."):]
print(text)

In [None]:
# LayoutParser for pdf info extraction
# Installs detectron2 directly from its GitHub repository (no version or commit pin).
%pip install 'git+https://github.com/facebookresearch/detectron2.git'

In [None]:
# Installs torchvision into the kernel's environment (no version pin).
%pip install torchvision


In [None]:
# Installs layoutparser (including its "ocr" extra) and pdf2image (no version pins).
%pip install layoutparser pdf2image "layoutparser[ocr]"

In [None]:
import layoutparser as lp
from layoutparser.models.detectron2 import catalog
import copy
import os
import requests

def load_model(
        config_path: str = 'lp://<dataset_name>/<model_name>/config',
        extra_config=None,
):
    """Build a Detectron2 layout model from files cached in a local ``model/`` folder.

    The dataset and model names are taken from the third-to-last and
    second-to-last ``/``-separated components of ``config_path``. Their weight
    and config URLs are looked up in layoutparser's catalog, and each file is
    downloaded into ``model/`` (relative to the current working directory)
    unless a file with the same name is already there.

    Args:
        config_path: Catalog path of the form
            ``lp://<dataset_name>/<model_name>/config``.
        extra_config: Forwarded unchanged to ``lp.models.Detectron2LayoutModel``.

    Returns:
        The ``lp.models.Detectron2LayoutModel`` built from the cached config,
        weights and the dataset's label map.

    Raises:
        KeyError: If the dataset or model name is not present in the catalog.
        requests.HTTPError: If a download answers with an error status.
    """
    config_path_split = config_path.split('/')
    dataset_name = config_path_split[-3]
    model_name = config_path_split[-2]

    # get the URLs from the MODEL_CATALOG and the CONFIG_CATALOG
    # (global variables .../layoutparser/models/detectron2/catalog.py)
    model_url = catalog.MODEL_CATALOG[dataset_name][model_name]
    config_url = catalog.CONFIG_CATALOG[dataset_name][model_name]

    # override folder destination:
    os.makedirs('model', exist_ok=True)

    config_file_path, model_file_path = None, None

    for url in [model_url, config_url]:
        # Last URL path component, with any query string dropped.
        filename = url.split('/')[-1].split('?')[0]
        save_to_path = "model/" + filename
        if 'config' in filename:
            config_file_path = save_to_path
        if 'model_final' in filename:
            model_file_path = save_to_path

        # skip if file exist in path
        if os.path.exists(save_to_path):
            continue

        # Download to a temporary name and rename only on success, so that an
        # interrupted or failed download is never mistaken for a cached file.
        partial_path = save_to_path + ".part"
        with requests.get(
            url,
            stream=True,
            headers={'user-agent': 'Wget/1.16 (linux-gnu)'},
            timeout=60,
        ) as response:
            # Fail loudly instead of saving an HTTP error page as the model/config.
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=4096):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, save_to_path)

    # load the label map
    label_map = catalog.LABEL_MAP_CATALOG[dataset_name]

    return lp.models.Detectron2LayoutModel(
        config_path=config_file_path,
        model_path=model_file_path,
        label_map=label_map,
        extra_config=extra_config,
    )

In [None]:
import layoutparser as lp
import pdf2image
import cv2
import numpy as np
import re
from PIL import Image

# Rasterize the sample PDF; later cells index and iterate `pdf_image` page by page.
sample_pdf_path = "../refexamples/filter-artigos-rafael/18.pdf"
pdf_image = pdf2image.convert_from_path(sample_pdf_path)

# PubLayNet Mask R-CNN layout model with MODEL.ROI_HEADS.SCORE_THRESH_TEST overridden to 0.75.
layout_config = "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config"
score_threshold_override = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.75]
lp_model = load_model(layout_config, extra_config=score_threshold_override)

# OCR agent bound to a local Tesseract binary (machine-specific absolute path).
tesseract_executable = "/home/juliocesar/bin/tesseract"
lp_ocr = lp.TesseractAgent.with_tesseract_executable(tesseract_executable)

In [None]:
# Show the first rasterized page as the cell output.
pdf_image[0]

In [None]:
# Detect layout blocks on the first page and draw all of them on top of it.
first_page = pdf_image[0]
layout = lp_model.detect(first_page)
lp.draw_box(first_page, layout, box_width=3)

In [None]:
# Draw only the blocks that the layout model labelled as "Title".
title_blocks = [block for block in layout if block.type == "Title"]
lp.draw_box(pdf_image[0], title_blocks, box_width=3)

In [None]:
# Extract OCR text, figures and figure captions from every page of `pdf_image`.
#
# Outputs, all written under `output_dir`:
#   * figure_<page>_<n>.png          - one cropped image per detected figure
#   * figure_<page>_<n>_caption.txt  - the caption, when one is found
#   * extracted_text.txt             - OCR text of all pages, joined by a page-break marker

output_dir = "test"
# A text block starting further than this many pixels below a figure is not its caption.
MAX_CAPTION_DISTANCE_PX = 100
# Padding added on every side of a text block before it is cropped for OCR.
OCR_PADDING_PX = 5


def sort_blocks_by_scientific_layout(blocks, page_width, page_height):
    """Order layout blocks quadrant by quadrant, left column first.

    The page is split at its horizontal and vertical midpoints. Each block goes
    to the quadrant containing its centre, every quadrant is sorted top to
    bottom by the block's upper y coordinate, and the quadrants are concatenated
    as top-left, bottom-left, top-right, bottom-right.

    Args:
        blocks: Iterable of blocks exposing ``coordinates`` as (x1, y1, x2, y2).
        page_width: Page width, in the same units as the block coordinates.
        page_height: Page height, in the same units as the block coordinates.

    Returns:
        A new list with the blocks in the order described above.
    """
    mid_x = page_width / 2
    mid_y = page_height / 2

    top_left, top_right, bottom_left, bottom_right = [], [], [], []
    for block in blocks:
        x1, y1, x2, y2 = block.coordinates
        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2

        if center_x < mid_x:
            target = top_left if center_y < mid_y else bottom_left
        else:
            target = top_right if center_y < mid_y else bottom_right
        target.append(block)

    for quadrant in (top_left, top_right, bottom_left, bottom_right):
        quadrant.sort(key=lambda block: block.coordinates[1])

    return top_left + bottom_left + top_right + bottom_right


def _find_figure_caption(figure_box, text_blocks, max_distance=MAX_CAPTION_DISTANCE_PX):
    """Return the cleaned text of the closest text block below a figure.

    A text block is a candidate when its top edge lies below the figure's bottom
    edge and its horizontal centre is within half the figure's width of the
    figure's horizontal centre. The nearest candidate wins (the first one on
    ties). If it starts more than ``max_distance`` below the figure, no caption
    is returned.

    Args:
        figure_box: Figure coordinates as (x1, y1, x2, y2).
        text_blocks: Blocks exposing ``coordinates`` and ``text``.
        max_distance: Largest accepted vertical gap between figure and caption.

    Returns:
        The caption, stripped and with newline runs collapsed to single spaces,
        or a falsy value when no suitable block exists.
    """
    fig_x1, _, fig_x2, fig_y2 = figure_box
    fig_x_center = (fig_x1 + fig_x2) / 2
    half_width = (fig_x2 - fig_x1) / 2

    closest_text = ""
    min_distance = float('inf')
    for text_block in text_blocks:
        text_x1, text_y1, text_x2, _ = text_block.coordinates
        text_x_center = (text_x1 + text_x2) / 2

        is_below = text_y1 > fig_y2
        is_aligned = abs(text_x_center - fig_x_center) < half_width
        if is_below and is_aligned:
            distance = text_y1 - fig_y2
            if distance < min_distance:
                min_distance = distance
                closest_text = text_block.text

    # Caption threshold - if it's too far, probably not a caption
    if min_distance > max_distance:
        return ""

    caption = closest_text
    if caption:
        caption = caption.strip()
        # Remove extra newlines
        caption = re.sub(r'\n+', ' ', caption)
    return caption


all_text = []
figure_count = 0
os.makedirs(output_dir, exist_ok=True)

for page_num, img in enumerate(pdf_image):
    print(f"Processing page {page_num + 1}/{len(pdf_image)}...")

    # PIL pages are RGB; swap the channel order to get OpenCV's BGR layout.
    # (COLOR_RGB2BGR performs the same channel swap as the COLOR_BGR2RGB used before.)
    img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

    detected_blocks = lp_model.detect(img_cv)
    layout = sort_blocks_by_scientific_layout(detected_blocks, img.width, img.height)

    page_text = []
    figure_blocks = []
    text_blocks = []

    for block in layout:
        if block.type == "Figure":
            figure_blocks.append(block)
        elif block.type in ["Text", "Title", "List"]:
            # Use LayoutParser's Tesseract agent for OCR on a slightly padded crop.
            segment_image = (block
                             .pad(left=OCR_PADDING_PX, right=OCR_PADDING_PX,
                                  top=OCR_PADDING_PX, bottom=OCR_PADDING_PX)
                             .crop_image(img_cv))

            text = lp_ocr.detect(segment_image)
            block.set(text=text, inplace=True)
            text_blocks.append(block)

            # Only text-like blocks contribute to the page text.
            page_text.append(text)

    # Save each figure and, when found, its caption.
    for figure_block in figure_blocks:
        figure_count += 1
        fig_x1, fig_y1, fig_x2, fig_y2 = [int(coord) for coord in figure_block.coordinates]

        # Crop the figure and convert it back to RGB for PIL.
        figure_img = img_cv[fig_y1:fig_y2, fig_x1:fig_x2]
        figure_pil = Image.fromarray(cv2.cvtColor(figure_img, cv2.COLOR_BGR2RGB))

        caption = _find_figure_caption((fig_x1, fig_y1, fig_x2, fig_y2), text_blocks)

        # figure_count keeps growing across pages, so file names stay unique.
        figure_filename = f"figure_{page_num+1}_{figure_count}.png"
        figure_pil.save(os.path.join(output_dir, figure_filename))

        if caption:
            caption_filename = f"figure_{page_num+1}_{figure_count}_caption.txt"
            caption_path = os.path.join(output_dir, caption_filename)
            with open(caption_path, 'w', encoding='utf-8') as f:
                f.write(caption)

        print(f"Saved figure {figure_count} from page {page_num+1}")

    all_text.append("\n".join(page_text))

text_output_path = os.path.join(output_dir, "extracted_text.txt")
with open(text_output_path, 'w', encoding='utf-8') as f:
    f.write("\n\n----- PAGE BREAK -----\n\n".join(all_text))

print(f"Extraction complete. Extracted {figure_count} figures and saved text to {text_output_path}")

In [None]:
# Fetch the Tesseract 5.5.0 x86_64 AppImage release asset with wget.
!wget https://github.com/AlexanderP/tesseract-appimage/releases/download/v5.5.0/tesseract-5.5.0-x86_64.AppImage

In [None]:
from aisurveywriter.core.new_pdf_processor import PDFProcessor

# Run the new PDF processor on a single reference PDF with a local Tesseract binary.
# NOTE(review): the keyword is spelled `lp_tesseract_exectuable`; presumably this
# matches the constructor's parameter name - verify before renaming.
single_pdf_paths = ["../refexamples/filter-artigos-rafael/37.pdf"]
proc = PDFProcessor(
    single_pdf_paths,
    lp_tesseract_exectuable="/home/juliocesar/bin/tesseract",
)

In [None]:
# Print the first figure record of the first processed document.
first_figure = proc.documents[0].figures[0]
print(first_figure)

In [None]:
from aisurveywriter.core.text_embedding import EmbeddingsHandler

# Smoke-test the embedding backend on one short query.
embed = EmbeddingsHandler("Snowflake/snowflake-arctic-embed-l-v2.0", "huggingface")

# The original call passed no text. NOTE(review): this assumes `embed.model`
# follows the LangChain `Embeddings` interface, where `embed_query` requires the
# text to embed and returns a list of floats - confirm against EmbeddingsHandler.
sample_query = "Schematic representation of a Langmuir trough"
query_embedding = embed.model.embed_query(sample_query)

# Show only the vector length instead of dumping every component.
len(query_embedding)

In [None]:
from aisurveywriter.store.reference_store import ReferenceStore

# Load the persisted reference store and keep only figures with a non-empty caption.
# NOTE(review): the ".pkl" suffix suggests a pickle file - only load it from a trusted source.
refs = ReferenceStore.from_local("../out/refstore.pkl")

figs = []
for fig in refs.all_figures():
    if fig.caption:
        figs.append(fig)