In [None]:
# ENVIRONMENT

from aisurveywriter.core.llm_handler import LLMHandler
import aisurveywriter.core.file_handler as fh
from aisurveywriter.utils import get_all_files_from_paths
from aisurveywriter.core.pipeline import PaperPipeline
from aisurveywriter.core.paper import PaperData
import aisurveywriter.tasks as tks

import os
# Expose the `google_key` entry of the local credentials file as GOOGLE_API_KEY.
credentials = fh.read_credentials("../credentials.yaml")
os.environ["GOOGLE_API_KEY"] = credentials["google_key"]

# llm = LLMHandler(model="qwen2.5:14b", model_type="ollama", temperature=0.5)

# Prompt and review configuration, both read from the YAML templates directory.
prompts = fh.read_yaml("../templates/prompt_config.yaml")
review = fh.read_yaml("../templates/review_config.yaml")

In [None]:
# Serialize default prompt store

from aisurveywriter.store.prompt_store import PromptStore, default_prompt_store
import json

# Write the default prompt store to disk as indented JSON.
old = default_prompt_store()

with open("prompts-20250320.json", mode="w", encoding="utf-8") as out_file:
    snapshot = old.model_dump()
    json.dump(snapshot, fp=out_file, indent=2)

In [None]:
import os
# Restrict this kernel to CUDA device index 1.
# NOTE(review): only effective if it runs before CUDA is initialised in this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [None]:
# Manual RAG retrieval

from aisurveywriter.core.agent_rags import AgentRAG, RAGType
from aisurveywriter.core.text_embedding import EmbeddingsHandler

# Embedding handler passed to the AgentRAG created below.
embed = EmbeddingsHandler(
    "Snowflake/snowflake-arctic-embed-l-v2.0",
    "huggingface",
)

# FAISS index locations for the three stores AgentRAG is given here:
# bibliography, figures and content.
rag_index_paths = {
    "bib_faiss_path": "../out/refextract-bibdb.faiss",
    "figures_faiss_path": "../out/figures-rag.faiss",
    "content_faiss_path": "../out/content-rag.faiss",
}
rag = AgentRAG(embed, request_cooldown_sec=6, **rag_index_paths)



In [None]:
# Ad-hoc lookup against the ImageData RAG; the returned value is the cell output.
query = "meniscus effect"
rag.retrieve(RAGType.ImageData, query)

In [None]:
from aisurveywriter.core.text_embedding import EmbeddingsHandler
from langchain_community.vectorstores import FAISS

embed = EmbeddingsHandler("Snowflake/snowflake-arctic-embed-l-v2.0", "huggingface")

# NOTE(review): allow_dangerous_deserialization=True permits unpickling of the
# stored index data; only point this at index files you created yourself.
faiss = FAISS.load_local(
    "../out/refextract-bibdb.faiss",
    embeddings=embed.model,
    allow_dangerous_deserialization=True,
)

# Ten closest entries (with their scores) for a sample passage; the result is
# left as the last expression so the notebook displays it.
sample_passage = "This review presents a comprehensive overview of these techniques, crucial for producing high-quality LB films. Ultimately, a deeper understanding of Langmuir monolayer characterization empowers the development of advanced materials and devices across diverse fields, pushing the boundaries of nanoscience and nanotechnology"
faiss.similarity_search_with_score(sample_passage, k=10)

In [None]:
# image caption extraction test
from aisurveywriter.core.pdf_processor import PDFProcessor, LayoutParserSettings
from aisurveywriter.utils.helpers import get_all_files_from_paths

import os

# Location of the tesseract binary. Set the TESSERACT_EXECUTABLE environment
# variable to override it on another machine; the fallback is the path this
# notebook was originally written with.
tesseract_path = os.environ.get("TESSERACT_EXECUTABLE", "/home/juliocesar/bin/tesseract")

# Layout-parser settings: model config, OCR binary and score threshold.
lp_settings = LayoutParserSettings(
    config_path="lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
    tesseract_executable=tesseract_path,
    score_threshold=0.7,
)

# Processor over a single example PDF, used to try out caption extraction.
pdf = PDFProcessor(["../refexamples/all21/OliveiraO2022_PastAndFuture.pdf"], lp_settings)

In [None]:
import os

from aisurveywriter.core.agent_rags import AgentRAG, RAGType
from aisurveywriter.core.text_embedding import EmbeddingsHandler
from aisurveywriter.core.paper import PaperData
from aisurveywriter.store.reference_store import ReferenceStore

# Survey subject and the results directory holding structure.json / refstore.pkl.
subject = "Langmuir and **Langmuir-Blodgett Films**"
result_path = "../results/137refs-sempatrycja/"
paper = PaperData.from_structure_json(subject, result_path + "structure.json")
# NOTE(review): the .pkl suffix suggests a pickle file — only load stores you trust.
refstore = ReferenceStore.from_local(result_path + "refstore.pkl")

embed = EmbeddingsHandler("Snowflake/snowflake-arctic-embed-l-v2.0", "huggingface")
rags = AgentRAG(
    embed,
    content_faiss_path="137/content-rag.faiss"
)
#rags.create_rags(RAGType.GeneralText, refstore)

# Number of chunks requested from the GeneralText RAG for every section.
CHUNKS_PER_SECTION = 35

# The subject is spliced into a template that is later passed to str.format();
# doubling any literal braces keeps them from being read as replacement fields.
escaped_subject = subject.replace("{", "{{").replace("}", "}}")
query_fmt = "Retrieve contextual, technical, and analytical information on the subject " + escaped_subject + " for a section titled \"{section_title}\", description:\n{section_description}"

# keep track of N of blocks retrieved for each source
retrieval_counts = {os.path.basename(p): 0 for p in refstore.paths}
for i, section in enumerate(paper.sections):
    section_query = query_fmt.format(
        section_title=section.title,
        section_description=section.description,
    )
    results = rags.retrieve(RAGType.GeneralText, section_query, k=CHUNKS_PER_SECTION)
    print(f"Retrieved {len(results)} chunks for section {i+1}: {section.title}")
    for result in results:
        source = os.path.basename(result.source_pdf)
        if source not in retrieval_counts:
            # The chunk's file name is not among the basenames of refstore.paths.
            print("untracked source:", source)
            retrieval_counts[source] = 0
        retrieval_counts[source] += 1

# List of (source file name, count) pairs, highest count first. sorted() is
# stable, so ties keep the order in which the names were first recorded.
source_retrievals = sorted(retrieval_counts.items(), key=lambda x: x[1], reverse=True)
print(source_retrievals)

In [None]:
import numpy
import matplotlib.pyplot as plt

# Fraction of source_retrievals (taken from the front of the list) to plot;
# 1 keeps every entry.
TOP_FRACTION = 1

files = [s[0] for s in source_retrievals]
non_zero = [r for r in source_retrievals if r[1] > 0]
print("# of references retrieved 0 times:", len(source_retrievals) - len(non_zero))

top_k = int(TOP_FRACTION * len(source_retrievals))
top_sources = source_retrievals[:top_k]
# zip(*[]) produces nothing to unpack, so an empty selection is handled explicitly.
sources, amounts = zip(*top_sources) if top_sources else ((), ())

# Map each file name to its first position in `files`, which is what
# files.index() returns, without rescanning the list for every source.
first_position = {}
for position, name in enumerate(files):
    first_position.setdefault(name, position)
source_ids = [first_position[s] for s in sources]

fig, ax = plt.subplots(figsize=(15,10))
ax.bar(source_ids, amounts)

ax.set(
    title="Retrieved chunks per source",
    xlabel="source index (position in source_retrievals)",
    ylabel="# retrieved chunks",
)
plt.show()

print("Top", top_k, "sources")
for source in sources:
    print(source)