In [2]:
# ====> ENVIRONMENT SETUP
import os
import yaml

def read_yaml(fpath: str) -> dict:
    """Parse the UTF-8 YAML file at ``fpath`` with ``yaml.safe_load`` and return the result."""
    with open(fpath, mode="r", encoding="utf-8") as yaml_file:
        return yaml.safe_load(yaml_file)

# Read the provider keys from the YAML file and export each one under the
# environment variable name paired with it below.
APIKEYS = read_yaml("../apikeys.yaml")
for env_var, provider in (("GOOGLE_API_KEY", "google"), ("OPENAI_API_KEY", "openai")):
    os.environ[env_var] = APIKEYS[provider]

## LangChain Test

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

# Smoke test: build the chat model and send it a single human message.
# The call is the last expression so the notebook displays the reply.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
moon_request = HumanMessage(content="Write a poem about the moon")
llm.invoke([moon_request])

# Pipeline testing

In [7]:
# Get pdf data from references
from langchain.document_loaders import PyPDFLoader

pdf_files = [
    "../refexamples/ArigaK2023_Langmuir.pdf",
    "../refexamples/FangC_ApplicationsLangmuir.pdf",
]

# Collect whatever each loader returns into one flat list, file by file.
doc_data = []
for pdf_path in pdf_files:
    doc_data += PyPDFLoader(pdf_path).load()

In [8]:
import yaml
from langchain.prompts import PromptTemplate

# Load the paper instructions; the keys read from it in this notebook are
# "base_prompt_format", "preparation_instructions", "subject" and "sections".
with open("../paper_instructions.yaml", "r", encoding="utf-8") as f:
    paper = yaml.safe_load(f)

# Flatten every loaded PDF page into a single text blob.
pdf_data = "\n".join(doc.page_content for doc in doc_data)

# Extend the base format with a slot for the reference text. The template
# handed to PromptTemplate must be this extended format (and "pdf_data" must be
# declared as an input variable), otherwise the PDF text supplied when the
# chain runs never reaches the model.
prompt_fmt = paper["base_prompt_format"] + "\n\nThe accompanying PDF data for the references is:\n{pdf_data}"
prep_instructions = paper["preparation_instructions"]
prompt = PromptTemplate(
    input_variables=["preparation_instructions", "title", "subject", "description", "pdf_data"],
    template=prompt_fmt,
)

In [10]:
from langchain.chains import LLMChain

chain = LLMChain(llm=llm, prompt=prompt)

# Generate the sections one at a time; results are appended as they arrive so
# whatever was produced before a failing request remains in gen_sections.
gen_sections = []
for section in paper["sections"]:
    section_title = section["title"]
    request = dict(
        preparation_instructions=prep_instructions,
        subject=paper["subject"],
        pdf_data=pdf_data,
        title=section_title,
        description=section["description"],
    )
    gen_sections.append({"title": section_title, "content": chain.run(request)})

In [None]:
# Inspect the generated sections (a list of {"title", "content"} dicts).
print(gen_sections)

In [13]:
# Persist the generated sections as YAML under a dated file name.
dump = dict(sections=gen_sections)
with open("generated20241213", "w") as out_file:
    yaml.dump(dump, stream=out_file)

# Prototyping

In [5]:
from typing import List
from langchain.document_loaders import PyPDFLoader
from langchain import PromptTemplate, ConversationChain, LLMChain
from langchain_core.messages import SystemMessage
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain_google_genai import ChatGoogleGenerativeAI
import re
import os
from time import sleep

def get_pdf_contents(pdf_paths: List[str]):
    """
    Load every PDF in ``pdf_paths`` with ``PyPDFLoader`` and return all loaded
    documents as one flat list, in the order of the given paths.
    """
    return [page for path in pdf_paths for page in PyPDFLoader(path).load()]

def generate_paper_structure(prompt: str, subject: str, pdf_paths: List[str]):
    """
    Placeholder for generating the paper structure with NotebookLM.

    Only the ``{subject}`` substitution is implemented so far: the filled-in
    prompt is not sent anywhere, ``pdf_paths`` is unused, and the function
    returns None.
    """
    if "{subject}" in prompt:
        prompt = prompt.replace("{subject}", subject)

    # TODO: use a NotebookLM bot to send the prompt
    # TODO: save the structure to "templates/paper_structure.yaml"
    return None

def setup_context_msg(preparation_yaml_path: str, pdf_paths: List[str]):
    """
    Build the context SystemMessage: writing instructions followed by PDF text.

    The message starts with a fixed header, then one line per value of the
    mapping loaded from ``preparation_yaml_path``, then the ``page_content`` of
    every document loaded from ``pdf_paths``, joined by newlines.
    """
    preparation = read_yaml(preparation_yaml_path)
    instruction_lines = [f"\n{preparation[key]}" for key in preparation]

    pages = get_pdf_contents(pdf_paths)
    pdf_text = "\n".join(page.page_content for page in pages)

    pieces = [
        "For this context, be aware:",
        *instruction_lines,
        "\n\nThe PDF content of the given references are:\n",
        pdf_text,
    ]
    return SystemMessage(content="".join(pieces))

def init_chain(llm, ctx_msg: SystemMessage, write_prompt_yaml: str) -> tuple:
    """
    Build the chat prompt and the prompt-to-model pipeline used to write sections.

    Args:
        llm: chat model placed at the end of the pipeline.
        ctx_msg: system message placed first in the chat prompt.
        write_prompt_yaml: path to a YAML file whose "base_prompt_format" entry
            is used as the human-message template.

    Returns:
        A ``(full_prompt, chain)`` tuple, where ``full_prompt`` is the
        ChatPromptTemplate and ``chain`` is ``full_prompt | llm``.
    """
    prompt_cfg = read_yaml(write_prompt_yaml)
    request_template = HumanMessagePromptTemplate.from_template(prompt_cfg["base_prompt_format"])

    full_prompt = ChatPromptTemplate.from_messages([ctx_msg, request_template])
    chain = full_prompt | llm
    return full_prompt, chain

def write_section(chain, subject: str, title: str, description: str) -> str:
    """
    Ask the chain to write one section and return the generated text.

    Args:
        chain: any object exposing ``invoke(dict)``; in this notebook it is the
            ``prompt | llm`` pipeline returned by ``init_chain``.
        subject: overall subject of the paper.
        title: title of the section to write.
        description: outline of what the section must cover.

    Returns:
        The generated section as a plain string.
    """
    response = chain.invoke({
        "subject": subject,
        "title": title,
        "description": description,
    })
    # A prompt | chat-model pipeline answers with a message object rather than
    # a string; the text lives in its ``content`` attribute. Callers run regex
    # substitutions on and YAML-dump the result, so hand back the text itself.
    content = getattr(response, "content", response)
    return content if isinstance(content, str) else str(content)

def dump_generated_sections(sections: dict, outpath: str):
    """Write ``sections`` as YAML (``yaml.safe_dump``) to the UTF-8 file ``outpath``."""
    with open(outpath, mode="w", encoding="utf-8") as dump_file:
        yaml.safe_dump(sections, stream=dump_file)


def save_latex_sections(tex_template_path: str, sections: List[dict], outpath: str):
    """
    Join the contents of every section into the output LaTeX file and write the
    collected bibliography entries next to it.

    Every ``filecontents*`` environment found in a section is removed from the
    section text and its body (minus the ``{mybib.bib}`` argument) is collected
    as bibliography content. A section without such an environment is reported
    on stdout but its text is still included in the paper.

    Args:
        tex_template_path: LaTeX template containing the ``{content}`` and
            ``{bibresourcefile}`` placeholders.
        sections: list of dictionaries with two keys: 'title' and 'content'.
        outpath: destination of the LaTeX file; the bibliography is written to
            ``outpath + "bib.bib"``.
    """
    bib_pattern = re.compile(r"\\begin{filecontents\*}(.*?)\\end{filecontents\*}", re.DOTALL)

    with open(tex_template_path, "r", encoding="utf-8") as f:
        tex_template = f.read()

    body_parts = []
    bib_parts = []
    for section in sections:
        content = section["content"]

        # Collect every embedded bibliography block, not just the first one:
        # all of them are stripped from the text below, so none may be lost.
        bib_blocks = bib_pattern.findall(content)
        if not bib_blocks:
            print("FAILED TO MATCH BIBLATEX CONTENT IN SECTION:", section["title"])
        for block in bib_blocks:
            entries = block.replace("{mybib.bib}", "").strip()
            if entries:
                bib_parts.append(entries)

        # The section text is kept even when no bibliography was found, so a
        # missing bibliography never silently removes a section from the paper.
        body_parts.append(bib_pattern.sub("", content))

    paper_content = "".join(body_parts)
    # Separate the per-section entries so consecutive records cannot fuse.
    bib_content = "\n".join(bib_parts)
    bib_file = outpath + "bib.bib"

    # Fill the bibliography placeholder first so that generated text containing
    # the literal placeholder is not rewritten afterwards.
    tex_content = (
        tex_template
        .replace("{bibresourcefile}", os.path.basename(bib_file))
        .replace("{content}", paper_content)
    )
    with open(outpath, "w", encoding="utf-8") as f:
        f.write(tex_content)

    # Also save the biblatex file.
    with open(bib_file, "w", encoding="utf-8") as f:
        f.write(bib_content)


def main(max_sections=1):
    """
    Run the prototype pipeline: build the context, write sections with Gemini,
    then dump the raw generations and the assembled LaTeX under ``out/``.

    Args:
        max_sections: maximum number of sections to generate. Defaults to 1,
            which matches the previous behaviour of stopping after the first
            section; pass None to write every section in the structure file.
    """
    # Pause between requests because of the gemini-1.5-pro quota (2 RPM, 32000 TPM).
    request_pause_s = 60 * 1.4

    pdf_paths = [
        "../refexamples/ArigaK2023_Langmuir.pdf",
        "../refexamples/FangC_ApplicationsLangmuir.pdf",
        "../refexamples/ArigaK2022_PastAndFutureLangmuir.pdf",
        # "../refexamples/LuC2024_AIScientist.pdf"
    ]
    struct_generation = read_yaml("../templates/gen_paper_structure.yaml")
    paper_subject = struct_generation["subject"]
    # TODO: generate the structure with generate_paper_structure(...) once it is
    # implemented; until then the structure is read from paper_structure.yaml.

    ctx = setup_context_msg(
        preparation_yaml_path="../templates/preparation.yaml",
        pdf_paths=pdf_paths,
    )
    llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro")
    prompt, chain = init_chain(llm, ctx, "../templates/write_prompt_fmt.yaml")

    paper_structure = read_yaml("../templates/paper_structure.yaml")
    sections = paper_structure["sections"]
    if max_sections is not None:
        sections = sections[:max_sections]

    paper_content = []
    for index, section in enumerate(sections):
        # Wait only between requests: never before the first one, and therefore
        # never after the last one either.
        if index:
            sleep(request_pause_s)
        paper_content.append(
            {
                "title": section["title"],
                "content": write_section(chain, paper_subject, section["title"], section["description"]),
            }
        )
        print(paper_content)
        print("====> FINISHED WRITING SECTION:", section["title"])

    os.makedirs("out", exist_ok=True)
    dump_generated_sections({"sections": paper_content}, "out/lastgeneration.dump")
    save_latex_sections(
        tex_template_path="../templates/paper_template.tex",
        sections=paper_content,
        outpath="out/lastgenerated.tex",
    )

# Run the prototype pipeline when this cell is executed.
main()

KeyError: "Input to ChatPromptTemplate is missing variables {'title', 'description', 'subject'}.  Expected: ['description', 'subject', 'title'] Received: ['Langmuir Monolayers and Langmuir-Blodgett Films', 'Introduction', '-Definition and Importance\\n  -Definition of Langmuir monolayers and LB films.\\n  -Highlight the importance of these films in nanoscience and nanotechnology.\\n  -Brief mention of applications in areas such as electronics, sensors, biomedicine, etc.\\n-Comparison with Similar Films\\n  -Compare and contrast LB films with other thin-film techniques, such as self-assembled monolayers (SAMs) and layer-by-layer (LbL) films.\\n  -Discussion of the advantages and disadvantages of each technique in relation to molecular precision, scalability, and versatility.\\n  -llustration (Figure): Schematic comparison of LB, SAM, and LbL techniques. (Original artwork)\\n']\nNote: if you intended {title} to be part of the string and not a variable, please escape it with double curly braces like: '{{title}}'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT "