In [None]:
# ENVIRONMENT

from aisurveywriter.core.llm_handler import LLMHandler
import aisurveywriter.core.file_handler as fh
from aisurveywriter.utils import get_all_files_from_paths
from aisurveywriter.core.pipeline import PaperPipeline
from aisurveywriter.core.paper import PaperData
import aisurveywriter.tasks as tks

import os
# Export the Google API key stored under "google_key" in the local credentials
# file, so the key itself never appears in the notebook.
# NOTE(review): presumably read from the environment by the Google LLM client — confirm.
os.environ["GOOGLE_API_KEY"]=fh.read_credentials("../credentials.yaml")["google_key"]

# Disabled alternative: build an LLM handler for a local Ollama model.
# llm = LLMHandler(model="qwen2.5:14b", model_type="ollama", temperature=0.5)
# Load the prompt and review configuration files from the templates directory.
prompts = fh.read_yaml("../templates/prompt_config.yaml")
review = fh.read_yaml("../templates/review_config.yaml")

In [None]:
# Serialize default prompt store

from aisurveywriter.store.prompt_store import PromptStore, default_prompt_store
import json

# Snapshot of the built-in prompt store, written to disk as indented JSON.
old = default_prompt_store()

with open("prompts-20250320.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(old.model_dump(), indent=2))

In [None]:
import os
# Restrict this kernel to the GPU with index 1. CUDA reads this variable when it
# initialises, so run this cell before anything is loaded onto a GPU.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [None]:
# Manual RAG retrieval

from aisurveywriter.core.agent_rags import AgentRAG, RAGType
from aisurveywriter.core.text_embedding import EmbeddingsHandler

# Embedding model handed to the RAG wrapper below.
embed = EmbeddingsHandler(
    "Snowflake/snowflake-arctic-embed-l-v2.0",
    "huggingface",
)

# RAG wrapper over the three FAISS indexes stored under ../out
# (bibliography, figures and paper content).
rag = AgentRAG(
    embed,
    bib_faiss_path="../out/refextract-bibdb.faiss",
    figures_faiss_path="../out/figures-rag.faiss",
    content_faiss_path="../out/content-rag.faiss",
    request_cooldown_sec=6,
)



In [None]:
# Free-text query for the ImageData RAG; edit and re-run to probe other topics.
query = r"meniscus effect"
# Last expression of the cell, so the notebook displays whatever retrieve() returns.
rag.retrieve(RAGType.ImageData, query)

In [None]:
from aisurveywriter.core.text_embedding import EmbeddingsHandler
from langchain_community.vectorstores import FAISS

# Query the bibliography index directly through LangChain, bypassing AgentRAG.
embed = EmbeddingsHandler("Snowflake/snowflake-arctic-embed-l-v2.0", "huggingface")

# NOTE(review): allow_dangerous_deserialization makes load_local unpickle data
# from disk — only point this at index files produced locally.
faiss = FAISS.load_local(
    "../out/refextract-bibdb.faiss",
    embeddings=embed.model,
    allow_dangerous_deserialization=True,
)

# Paragraph used as the probe text; the ten closest documents are displayed with their scores.
abstract_query = (
    "This review presents a comprehensive overview of these techniques, "
    "crucial for producing high-quality LB films. "
    "Ultimately, a deeper understanding of Langmuir monolayer characterization "
    "empowers the development of advanced materials and devices across diverse fields, "
    "pushing the boundaries of nanoscience and nanotechnology"
)
faiss.similarity_search_with_score(abstract_query, k=10)

In [None]:
# image caption extraction test
from aisurveywriter.core.pdf_processor import PDFProcessor, LayoutParserSettings
from aisurveywriter.utils.helpers import get_all_files_from_paths

import os

# Location of the Tesseract binary. It used to be hard-coded to one developer's
# home directory; set TESSERACT_EXECUTABLE to run the notebook on another machine.
# The previous path stays as the default, so existing setups behave as before.
tesseract_path = os.environ.get("TESSERACT_EXECUTABLE", "/home/juliocesar/bin/tesseract")

# Layout-detection settings: PubLayNet Mask R-CNN config, detections scoring
# below 0.7 are passed as the score threshold.
lp_settings = LayoutParserSettings(
    config_path="lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
    tesseract_executable=tesseract_path,
    score_threshold=0.7,
)

# Build a processor for a single example PDF with those layout settings.
pdf = PDFProcessor(["../refexamples/all21/OliveiraO2022_PastAndFuture.pdf"], lp_settings)

In [None]:
from aisurveywriter.store.reference_store import ReferenceStore
import os
import re

# Print one line per stored reference, in the form
#   <n>. <author>; "<title>" | <file name> | <link>
refstore = ReferenceStore.from_local("../results/138refs-compatrycja/refstore.pkl")
for i, doc in enumerate(refstore.documents):
    title = doc.title.replace("\n", " ").strip() if doc.title else "unk title"
    author = doc.author.replace("\n", " ").strip() if doc.author else "unk author"

    # Names are separated either by "*" or by the BibTeX keyword "and". Split on
    # "and" only as a whole word; a plain substring split cuts names such as
    # "Alexander" or "Fernandez" in half and miscounts the authors.
    authors = author.split("*") if "*" in author else re.split(r"\s+and\s+", author)
    if len(authors) > 2:
        author = authors[0].strip() + ", et al."
    name = os.path.basename(doc.path)

    # A document may carry no BibTeX data at all; treat that as an empty entry
    # instead of failing on the membership tests below.
    bib = doc.bibtex_entry or {}
    if "doi" in bib:
        doi = "https://doi.org/" + bib["doi"]
    elif "link" in bib:
        doi = bib["link"]
    elif "url" in bib:
        # Read the key that was actually tested (the original read "doi" here).
        doi = bib["url"]
    else:
        # Explicit placeholder so a link from the previous document is never reused.
        doi = "unk doi"

    print(f"{i+1}. {author}; \"{title}\" | {name} | {doi}")

In [None]:

from aisurveywriter.utils.helpers import get_all_files_from_paths, get_bibtex_entry
import os
import re

# Captures the text after a leading "title" label (e.g. "Title: ..."). The
# pattern is anchored with "^" and compiled without MULTILINE, so it only
# matches when the label is at the very start of the file content.
title_pattern = re.compile(r"^(?:title)\s*[:\.-]*\s*(.+?)[\n]", re.IGNORECASE)

# Number given to the first reference printed below.
# NOTE(review): presumably continues an earlier list of 107 references — confirm.
first_ref_number = 108

# Collect (author, title, link, file name) for every non-PDF file whose title
# can be resolved to a BibTeX entry.
refs = []
paths = get_all_files_from_paths("../refexamples/rafael_and_lesscited_and_21/", skip_ext=[".pdf"], stem_sort=True)
for path in paths:
    print(path)
    with open(path, encoding="utf-8") as f:
        content = f.read()

    base = os.path.basename(path)
    print(base)

    title_match = title_pattern.search(content)
    if not title_match:
        # Files without a leading title line are skipped, as before.
        continue

    title = title_match.group(1).strip()
    print("title match:", title)
    bib = get_bibtex_entry(title, None)
    if not bib:
        print("no bib found\n")
        continue

    if "doi" in bib:
        doi = "https://doi.org/" + bib["doi"]
    elif "link" in bib:
        doi = bib["link"]
    elif "url" in bib:
        # Read the key that was actually tested (the original read "doi" here).
        doi = bib["url"]
    else:
        # Explicit placeholder so a link from the previous file is never reused.
        doi = "unk doi"

    refs.append((bib.get("author", None), title, doi, base))


for i, (author, title, doi, file) in enumerate(refs):
    author = author.replace("\n", " ").strip() if author else "unk author"
    if author != "unk author":
        # Split on the whole word "and" only, so names containing the letters
        # "and" (e.g. "Alexander") are not cut in half.
        authors = author.split("*") if "*" in author else re.split(r"\s+and\s+", author)
        if len(authors) > 2:
            author = authors[0].strip() + ", et al."

    print(f"{first_ref_number + i}. {author}; \"{title}\" | {file} | {doi}")

In [None]:
# Spot-check: look up one title with None as the author argument and display the result.
get_bibtex_entry("External Infrared Reflection Absorption Spectrometry of Monolayer Films at the Air-Water Interface", None)

In [None]:
import re
from aisurveywriter.utils.helpers import get_bibtex_entry

# One reference per line: <num>. <author>; "<title>" | <file> | <doi>
ref_pattern = re.compile(r"(\d+)\.\s(.+?);\s\"(.+?)\"\s\|\s(.+?)\s\|\s(.+?)\s*\n")

# Paste the listing to convert here; with an empty string the cell prints nothing.
ref_text = ""

# Parsed records as (num, author, title, file, doi), every text field stripped.
refs = [
    (int(m.group(1).strip()), *(m.group(k).strip() for k in range(2, 6)))
    for m in ref_pattern.finditer(ref_text)
]

# Skip the first record, shift the numbering down by one and emit the doi
# column before the file column.
for num, author, title, file, doi in refs[1:]:
    print(f"{num-1}. {author}; \"{title}\" | {doi} | {file}")

In [None]:
import re

with open("../results/review_references.txt", encoding="utf-8") as f:
    content = f.read()

# Blocks are separated by blank lines; empty ones are dropped.
blocks = [chunk.strip() for chunk in content.split("\n\n") if chunk.strip()]

# Reference line: <num>. <author>; "<title>" | <doi> | <file>.(pdf|txt)
ref_pattern = re.compile(r"(\d+)\.\s(.+?);\s\"(.+?)\"\s\|\s(.+?)\s\|\s(.+?)\.(pdf|txt)")
# Header expected at the very start of each block: Version: "<name>"
version_pattern = re.compile(r"Version:\s\"(.+?)\"")

# Maps version name -> list of (num, author, title, doi, file) tuples.
versions = {}
for block in blocks:
    # match() anchors at the block start; a block without the header raises here.
    version = version_pattern.match(block).group(1).strip()
    refs = [
        (
            int(m.group(1).strip()),
            m.group(2).strip(),
            m.group(3).strip(),
            m.group(4).strip(),
            # The extension is captured separately; join it back onto the stem.
            m.group(5).strip() + "." + m.group(6).strip(),
        )
        for m in ref_pattern.finditer(block)
    ]
    versions[version] = refs

def generate_markdown_references(versions_dict):
    """Render the references of every version as column-aligned Markdown text.

    Args:
        versions_dict: Mapping of version name to a list of
            ``(num, author, title, doi, file)`` records. The ``file`` field
            is ignored in the output.

    Returns:
        One ``### <version>`` heading per version, followed by one line per
        reference in the form ``<num>. <author> | <title> | <doi>``, with
        author, title and doi padded to the widest value found across all
        versions. An empty mapping yields an empty string.
    """
    # Column widths are computed over all versions so every section lines up.
    # default=0 keeps this working when there are no references at all
    # (max() of an empty sequence would raise ValueError).
    all_refs = [ref for refs in versions_dict.values() for ref in refs]
    max_author_len = max((len(ref[1]) for ref in all_refs), default=0)
    max_title_len = max((len(ref[2]) for ref in all_refs), default=0)
    max_doi_len = max((len(ref[3]) for ref in all_refs), default=0)

    def format_ref(ref):
        """Format one reference record as a padded, pipe-separated line."""
        num, author, title, doi, _ = ref
        return (
            f"{num}. "
            f"{author.ljust(max_author_len)} | "
            f"{title.ljust(max_title_len)} | "
            f"{doi.ljust(max_doi_len)}"
        )

    markdown_lines = []
    for version, refs in versions_dict.items():
        markdown_lines.append(f"### {version}\n")
        markdown_lines.extend(format_ref(ref) for ref in refs)
        markdown_lines.append("")  # Extra newline after each version

    return "\n".join(markdown_lines)


In [None]:
# Select the reference list parsed for the "138refs-compatrycja" version and display it.
all_refs = versions["138refs-compatrycja"]
all_refs

In [None]:
import bibtexparser
from aisurveywriter.utils.helpers import get_bibtex_entry

all_refs = versions["138refs-compatrycja"]
db = bibtexparser.bibdatabase.BibDatabase()

# Fetch one BibTeX entry per reference, in the same order as all_refs.
for num, author, title, doi, file in all_refs:
    # Query with the part of the name before the "et al." marker.
    marker_at = author.find("et al.")
    if marker_at != -1:
        author = author[:marker_at].strip()

    # Retry with the title alone when the author-constrained lookup finds nothing.
    bib = get_bibtex_entry(title, author) or get_bibtex_entry(title, None)
    print(bib)
    # The result is appended even when both lookups fail, so db.entries keeps
    # one slot per reference (later cells zip it with all_refs).
    db.entries.append(bib)

In [None]:
import copy
# Independent deep copy of the fetched entries.
# NOTE(review): presumably kept so later edits to db.entries can be undone without re-querying — confirm.
backup = copy.deepcopy(db.entries)

In [None]:
import os


# Compare the link recorded in each reference with the one derived from its
# BibTeX entry and report every disagreement.
for entry, (num, author, title, doi, file) in zip(db.entries, all_refs):
    entry = copy.deepcopy(entry)

    # Preference order: DOI (expanded to a full URL), then "link", then "url".
    for key, prefix in (("doi", "https://doi.org/"), ("link", None), ("url", None)):
        if key in entry:
            entry_doi = entry[key] if prefix is None else prefix + entry[key]
            break
    else:
        print("unable to find doi in entry:", entry, file)
        print()
        entry_doi = None

    if entry_doi and doi != entry_doi:
        print("doi doesnt match:", doi, entry_doi, file, num)
        print(entry["ID"])
        

In [None]:
# Hand-written BibTeX record for Palto et al. (1996), parsed into a single entry.
# The literal is a raw string so the LaTeX escape "\&" is not treated as an
# invalid Python escape sequence; the parsed text is unchanged by this.
# The dangling "and" that ended the author list has been removed, since it
# denotes an empty trailing author.
e = bibtexparser.loads(r"""@article{Palto1996,
author = {S. Palto and L. Blinov and A. Bune and E. Dubovik and V. Fridkin and N. Petukhova and K. Verkhovskaya and S. Yudin},
title = {Ferroelectric langmuir-blodgett films},
journal = {Ferroelectrics},
volume = {184},
number = {1},
pages = {127--129},
year = {1996},
publisher = {Taylor \& Francis},
doi = {10.1080/00150199608230252},
URL = {https://doi.org/10.1080/00150199608230252},
eprint = {https://doi.org/10.1080/00150199608230252},
abstract = {Ferroelectric Langmuir-Blodgett films are prepared and investigated for the first time. The films are prepared from the ferroelectric copolymer of vinylidene fluoride with trifluorethylene. Films with a thickness of 150Å show a pyroelectric effect, remnant polarization switching and a ferroelectric phase transition of the first order, characterized by temperature hysteresis. }
}""").entries[0]

In [None]:
print(db.entries[1])

# Turn every reference into a list so that its link field can be overwritten.
all_refs = list(map(list, all_refs))
for entry, ref in zip(db.entries, all_refs):
    # First key present wins: DOI (as a full URL), then "link", then "url".
    # References whose entry has none of them are left untouched.
    for key in ("doi", "link", "url"):
        if key in entry:
            doi = "https://doi.org/" + entry[key] if key == "doi" else entry[key]
            # Records are (num, author, title, doi, file): index -2 is the doi field.
            ref[-2] = doi
            break

In [None]:
# Display a single reference (zero-based index 136) for manual inspection.
all_refs[136]

In [None]:
# Write the collected BibTeX database to refs.bib in the working directory.
with open("refs.bib", "w", encoding="utf-8") as f:
    bibtexparser.dump(db, f)

In [None]:
# Reload the database from refs.bib, replacing the in-memory `db`.
with open("refs.bib", "r", encoding="utf-8") as f:
    db = bibtexparser.load(f)

In [None]:
def _find_bib_entry(entries, doi, title):
    """Return the BibTeX entry that belongs to a reference, or None.

    A DOI, "link" or "url" match wins immediately; the first entry whose title
    matches (case-insensitively, ignoring surrounding whitespace) is used only
    when no link matches.
    """
    wanted_title = title.lower().strip()
    title_match = None
    for entry in entries:
        if not entry:
            continue
        # The entry stores the bare DOI while the reference stores a full link,
        # hence the substring test. Empty DOIs are ignored because "" is a
        # substring of everything.
        entry_doi = entry.get("doi")
        if entry_doi and entry_doi in doi:
            return entry
        if doi in (entry.get("link"), entry.get("url")):
            return entry
        if title_match is None and entry.get("title", "").lower().strip() == wanted_title:
            title_match = entry
    return title_match


# Build one LaTeX section per version, each holding an enumerate list of
# "<author>, <year> \cite{<key>}" items, and print the concatenated result.
all_tex = ""
for version in versions:
    version_refs = versions[version]

    # Replace every reference with its refreshed record from all_refs (matched
    # on the file name) while keeping this version's own numbering. A record
    # without a counterpart is kept as it is.
    for i, (num, author, title, doi, file) in enumerate(version_refs):
        updated_ref = next((ref for ref in all_refs if ref[-1] == file), None)
        if updated_ref is None:
            refreshed = [num, author, title, doi, file]
        else:
            # list() works whether the records are still tuples or already lists.
            refreshed = list(updated_ref)
        refreshed[0] = num
        version_refs[i] = refreshed

    ref_text = ""
    for num, author, title, doi, file in version_refs:
        ref_entry = _find_bib_entry(db.entries, doi, title)
        if ref_entry is None:
            # Keep the item visible in the output instead of crashing on None.
            print(f"no BibTeX entry found for reference {num}: {title}")
            ref_text += f"        \\item {author} % missing BibTeX entry\n"
            continue

        # Values are pulled out first so the f-string needs no nested quotes
        # (nesting the same quote type is a syntax error before Python 3.12).
        year = ref_entry.get("year", None)
        cite_key = ref_entry["ID"]
        ref_text += f"        \\item {author}, {year} \\cite{{{cite_key}}}\n"

    tex = f"""\\section{{{version}}}
    \\begin{{enumerate}}
"""
    tex += ref_text + "   \\end{enumerate}"
    all_tex += tex + "\n\n"

print(all_tex)