In [2]:
# ====> ENVIRONMENT SETUP
import os
import sys
import yaml

def read_yaml(fpath: str) -> dict:
    """Parse the UTF-8 encoded YAML file at ``fpath`` and return its content.

    The file is parsed with ``yaml.safe_load``; whatever object that call
    produces is returned unchanged.
    """
    with open(fpath, mode="r", encoding="utf-8") as yaml_file:
        return yaml.safe_load(yaml_file)

# Load the secrets from the credentials file that lives one directory above
# the notebook and expose them through environment variables, where the rest
# of this notebook reads them from.
CREDENTIALS = read_yaml("../credentials.yaml")
os.environ["GOOGLE_API_KEY"] = CREDENTIALS["google_key"]
os.environ["OPENAI_API_KEY"] = CREDENTIALS["openai_key"]
os.environ["NBLM_EMAIL"] = CREDENTIALS["nblm_email"]
os.environ["NBLM_PASSWORD"] = CREDENTIALS["nblm_password"]

# Make the parent of the working directory importable.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guarded so that re-running this cell does not keep stacking duplicate
# entries at the front of sys.path.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

## LangChain Test

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

# Smoke test for the Gemini chat model. `llm` is reused further down when the
# LLMChain for the pipeline test is built.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
# Left as the last expression of the cell so the notebook displays the reply.
llm.invoke([HumanMessage(content="Write a poem about the moon")])

# Pipeline testing

In [7]:
# Get pdf data from references
from langchain.document_loaders import PyPDFLoader

# Reference PDFs whose documents are fed to the model as context.
pdf_files = [
    "../refexamples/ArigaK2023_Langmuir.pdf",
    "../refexamples/FangC_ApplicationsLangmuir.pdf",
]

# Flat list holding the documents loaded from every file, in file order.
doc_data = []
for pdf_path in pdf_files:
    doc_data += PyPDFLoader(pdf_path).load()

In [8]:
import yaml
from langchain.prompts import PromptTemplate

# Load the paper configuration (prompt format, instructions, sections).
# The encoding is pinned to UTF-8 to match read_yaml.
with open("../paper_instructions.yaml", "r", encoding="utf-8") as f:
    paper = yaml.safe_load(f)

# Text of every loaded PDF document, joined into one string.
pdf_data = "\n".join([doc.page_content for doc in doc_data])
# Base prompt extended with a placeholder for the PDF text.
prompt_fmt = paper["base_prompt_format"] + "\n\nThe accompanying PDF data for the references is:\n{pdf_data}"
prep_instructions = paper["preparation_instructions"]
# The template must be the extended format and must declare `pdf_data`;
# otherwise the PDF text supplied when the chain runs is never rendered into
# the prompt.
prompt = PromptTemplate(
    input_variables=["preparation_instructions", "title", "subject", "description", "pdf_data"],
    template=prompt_fmt,
)

In [10]:
from langchain.chains import LLMChain

# One LLM call per configured section; results are collected in order.
chain = LLMChain(llm=llm, prompt=prompt)
gen_sections = []

for section in paper["sections"]:
    chain_inputs = {
        "preparation_instructions": prep_instructions,
        "subject": paper["subject"],
        "pdf_data": pdf_data,
        "title": section["title"],
        "description": section["description"],
    }
    response = chain.run(chain_inputs)
    gen_sections.append({"title": section["title"], "content": response})

In [None]:
# Inspect the generated sections: a list of dicts with "title" and "content".
print(gen_sections)

In [13]:
# Persist the generated sections as YAML under a single "sections" key.
dump = dict(sections=gen_sections)
with open("generated20241213", mode="w") as dump_file:
    yaml.dump(dump, stream=dump_file)

# Prototyping

In [None]:
from typing import List,Union
import undetected_chromedriver as uc
from fake_useragent import UserAgent
from langchain.document_loaders import PyPDFLoader
from langchain_core.messages import SystemMessage, AIMessage
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from chatbots import NotebookLMBot

import re
import os
from time import sleep

def init_driver(browser_path: Union[str,None] = None, driver_path: Union[str,None] = None) -> uc.Chrome:
    """Create an undetected-chromedriver Chrome instance.

    Args:
        browser_path: Path to the browser executable, forwarded as
            ``browser_executable_path``; ``None`` is passed through unchanged.
        driver_path: Path to the chromedriver executable, forwarded as
            ``driver_executable_path``; ``None`` is passed through unchanged.

    Returns:
        The ``uc.Chrome`` driver that was created.
    """
    op = uc.ChromeOptions()
    # A user-agent string has to be read from a UserAgent *instance*; reading
    # `random` off the class itself does not produce one.
    op.add_argument(f"user-agent={UserAgent().random}")
    op.add_argument("user-data-dir=./")
    op.add_experimental_option("detach", True)
    op.add_experimental_option("excludeSwitches", ["enable-logging"])
    # NOTE(review): the options are handed over via the `chrome_options`
    # keyword; confirm that the installed undetected_chromedriver version
    # honours this name (newer releases document `options`).
    driver = uc.Chrome(
            chrome_options=op,
            browser_executable_path=browser_path,
            driver_executable_path=driver_path
    )
    return driver

def get_pdf_contents(pdf_paths: List[str]):
    """Load every PDF in ``pdf_paths`` and return all loaded documents.

    Each path is opened with ``PyPDFLoader``; the lists returned by the
    individual ``load()`` calls are concatenated into one flat list that
    follows the order of ``pdf_paths``.
    """
    return [
        document
        for pdf_path in pdf_paths
        for document in PyPDFLoader(pdf_path).load()
    ]

def generate_paper_structure(prompt: str, subject: str, pdf_paths: List[str], outfile: str):
    """Ask NotebookLM for the paper outline and store it as YAML.

    Any ``{subject}`` placeholder in ``prompt`` is replaced by ``subject``.
    The prompt is then sent through a ``NotebookLMBot`` session created with
    ``pdf_paths`` as its sources. The last response is indented by two spaces
    under a top-level ``sections:`` key and written to ``outfile``.

    Returns:
        The content of ``outfile`` as parsed by ``read_yaml``, or ``None``
        (after printing a message) when the login does not succeed.
    """
    placeholder = "{subject}"
    if placeholder in prompt:
        prompt = prompt.replace(placeholder, subject)

    # Browser-driven NotebookLM session
    browser = init_driver(None, "../drivers/chromedriver")
    bot = NotebookLMBot(
        user=os.environ["NBLM_EMAIL"],
        password=os.environ["NBLM_PASSWORD"],
        driver=browser,
        src_paths=pdf_paths
    )
    logged_in = bot.login()
    if not logged_in:
        print("Unable to login to NotebookLM")
        return None

    bot.send_prompt(prompt, sleep_for=30)
    answer = bot.get_last_response()

    # Nest every response line under the "sections" key and save the result
    indented_answer = "".join(f"  {row}\n" for row in answer.split("\n"))
    with open(outfile, "w", encoding="utf-8") as out_file:
        out_file.write("sections:\n" + indented_answer)

    return read_yaml(outfile)

def setup_context_msg(response_fmt_prompt: str, pdf_paths: List[str]):
    """Build the context SystemMessage: writing instructions followed by the PDF text.

    The message content is ``response_fmt_prompt``, a fixed header line, and
    the ``page_content`` of every document returned by ``get_pdf_contents``,
    joined with newlines.
    """
    documents = get_pdf_contents(pdf_paths)
    pdf_text = "\n".join(document.page_content for document in documents)
    header = "\n\nThe PDF content of the given references are:\n"
    return SystemMessage(content=response_fmt_prompt + header + pdf_text)

def init_chain(llm, ctx_msg: SystemMessage, write_prompt: str):
    """Combine the context message and the writing request into a runnable chain.

    ``write_prompt`` becomes a human-message template placed after ``ctx_msg``
    in a chat prompt; that prompt is then piped into ``llm``.

    Returns:
        A ``(chat_prompt, chain)`` tuple.
    """
    human_template = HumanMessagePromptTemplate.from_template(write_prompt)
    chat_prompt = ChatPromptTemplate.from_messages([ctx_msg, human_template])
    return chat_prompt, chat_prompt | llm

def write_section(chain, subject: str, title: str, description: str) -> AIMessage:
    """Invoke ``chain`` for a single section and return its response.

    The chain receives a dict with the keys ``subject``, ``title`` and
    ``description``.
    """
    section_inputs = dict(subject=subject, title=title, description=description)
    return chain.invoke(section_inputs)

def dump_generated_sections(sections: dict, outpath: str):
    """Serialize ``sections`` with ``yaml.safe_dump`` into the UTF-8 file ``outpath``."""
    with open(outpath, mode="w", encoding="utf-8") as dump_file:
        yaml.safe_dump(sections, stream=dump_file)


def save_latex_sections(tex_template_path: str, sections: List[dict], outpath: str):
    """
    Join the contents of every section to the output LaTeX file
    'sections' must be a list of dictionaries with two keys: 'title' and 'content'

    For each section, the first ``filecontents*`` environment found in its
    content supplies the bibliography entries, and every ``filecontents*``
    environment is stripped from the section text. A section without such an
    environment is still written to the paper; only a warning is printed and
    it contributes nothing to the bibliography.

    Two files are written:
      * ``outpath``: the template with ``{content}`` replaced by the joined
        section texts and ``{bibresourcefile}`` replaced by the bibliography
        file's base name;
      * ``outpath + "bib.bib"``: the collected bibliography entries.
    """
    with open(tex_template_path, "r", encoding="utf-8") as f:
        tex_template = f.read()

    bib_pattern = r"\\begin{filecontents\*}(.*?)\\end{filecontents\*}"
    text_parts = []
    bib_parts = []

    for section in sections:
        content = section["content"]

        # Extract biblatex file content
        match = re.search(bib_pattern, content, re.DOTALL)
        if match is None:
            # Keep the section text anyway: dropping the whole section here
            # would silently remove it from the generated paper.
            print("FAILED TO MATCH BIBLATEX CONTENT IN SECTION:", section["title"])
        else:
            bib_parts.append(match.group(1).strip())

        text_parts.append(re.sub(bib_pattern, "", content, flags=re.DOTALL))

    paper_content = "".join(text_parts)
    # The captured text starts with the environment's file-name argument;
    # remove it so that only the entries remain.
    bib_content = "".join(bib_parts).replace("{mybib.bib}", "")
    bib_file = outpath+"bib.bib"

    # Replace paper content in latex template and save it
    tex_content = tex_template.replace("{content}", paper_content).replace("{bibresourcefile}", os.path.basename(bib_file))
    with open(outpath, "w", encoding="utf-8") as f:
        f.write(tex_content)

    # also save the biblatex file
    with open(bib_file, "w", encoding="utf-8") as f:
        f.write(bib_content)


def main():
    """Run the whole pipeline: outline generation, section writing, export.

    Steps:
      1. Read the prompt configuration.
      2. Generate the paper structure through NotebookLM (saved to
         ``genstruct.yaml``).
      3. Write each section with Gemini, using the reference PDFs as context.
      4. Dump the raw generations and the assembled LaTeX file into ``out/``.

    Raises:
        RuntimeError: if no paper structure was produced (for example when
            the NotebookLM login fails), since nothing can be written then.
    """
    paper_cfg = read_yaml("../templates/prompt_config.yaml")
    pdf_paths = [
        "../refexamples/ArigaK2023_Langmuir.pdf",
        "../refexamples/FangC_ApplicationsLangmuir.pdf",
        "../refexamples/ArigaK2022_PastAndFutureLangmuir.pdf",
        # "../refexamples/LuC2024_AIScientist.pdf"
    ]
    paper_subject = paper_cfg["subject"]
    paper_structure = generate_paper_structure(
        prompt=paper_cfg["gen_struct_prompt"],
        subject=paper_subject,
        pdf_paths=pdf_paths,
        outfile="genstruct.yaml"
    )
    # Alternative: reuse a previously saved structure instead of generating one.
    # paper_structure = read_yaml("../templates/paper_structure.yaml")

    # Fail early with a clear message instead of an opaque TypeError later,
    # and before any PDF loading or model setup is done for nothing.
    if paper_structure is None:
        raise RuntimeError("No paper structure was generated; cannot write the sections")

    ctx = setup_context_msg(
        response_fmt_prompt=paper_cfg["response_format"],
        pdf_paths=pdf_paths,
    )
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-8b")
    prompt, chain = init_chain(llm, ctx, paper_cfg["write_prompt"])

    sections = paper_structure["sections"]
    paper_content = []
    for idx, section in enumerate(sections):
        airesponse = write_section(chain, paper_subject, section["title"], section["description"])
        paper_content.append(
            {
                "title": section["title"],
                "content": airesponse.content,
            }
        )
        print("====> FINISHED WRITING SECTION:", section["title"])
        print("====> REPONSE METADATA:", airesponse.usage_metadata)
        # Throttle between requests to stay within the API quota. The delay
        # was chosen for the gemini-1.5-pro limits (2 RPM, 32000 TPM).
        # No wait is needed once the last section has been written.
        if idx < len(sections) - 1:
            sleep(60*1)

    os.makedirs("out", exist_ok=True)
    dump_generated_sections({"sections": paper_content}, "out/lastgeneration.dump")
    save_latex_sections(
        tex_template_path="../templates/paper_template.tex",
        sections=paper_content,
        outpath="out/lastgenerated.tex",
    )

# Run the full pipeline defined above.
main()

In [3]:
# Show the titles of the sections stored in the generated structure file.
print([
    entry["title"]
    for entry in read_yaml("genstruct.yaml")["sections"]
])

{'sections': [{'title': 'Introduction', 'description': '- Overview of Langmuir Monolayers and Langmuir-Blodgett (LB) Films\n  - Introduce Langmuir monolayers as a two-dimensional system formed at the air/water interface [1, 2].\n  - Define LB films as the transferred monolayers onto solid substrates [1, 2].\n  - Briefly mention the historical context and significance of these techniques in materials science and nanotechnology [1, 3, 4].\n- Scope and Significance\n  - Explain the importance of LB films as a method for fabricating ultrathin films with controlled thickness and organization [1].\n  - Highlight their applications in diverse fields including sensing, electronics, and biomedicine [1, 5-7].\n  - State the goal of the survey paper, focusing on recent developments and applications of LB techniques within the context of nanoarchitectonics [1, 3].\n'}, {'title': 'Basic Concepts and Characterization Methods for Langmuir Monolayers', 'description': "- Formation of Langmuir Monolayer