<a href="https://colab.research.google.com/github/hooked-on-mas/AutoGenBook/blob/main/AutoGenBook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Specification Development

After filling in the fields below, run all cells ([Runtime] > [Run all], Ctrl + F9).

In [None]:
# @markdown ## Required fields
# @markdown ### Textbook content
book_content = "Linear Algebra for Machine Learning" # @param {type:"string"}
# @markdown ### Approximate number of pages
n_pages = 40 # @param {"type":"integer","placeholder":"40"}

# @markdown ## Optional fields
# @markdown ### Target audience
target_readers = "Second-year undergraduate students in information science" # @param {type:"string"}
# @markdown ### Frequency of equations
equation_frequency_level = 4 # @param {type:"slider", min:1, max:5, step:1}
# @markdown ### Additional requirements regarding content
additional_requirements = "Assuming students have already completed general linear algebra, the course will also serve as a review, focusing in detail on topics commonly used in machine learning." # @param {type:"string"}

# Sanity check of the two required fields. A missing value only prints a
# warning (wrapped in ANSI colour escape codes so it stands out in the cell
# output); no exception is raised, so the remaining cells still run.
if book_content == "":
    print('\033[31m'+'Please specify the textbook content.'+'\033[0m')
if n_pages == 0:
    print('\033[31m'+'Please specify the number of pages.'+'\033[0m')

## Definition of Prompts

In [None]:
# Common Prompt
# prompt_common is prepended to every prompt below, and those prompts are
# compiled with PromptTemplate.from_template. Any brace typed by the user would
# therefore be read as a template variable, so braces in user-supplied text are
# doubled, which the template renders back as a single literal brace.
def _escape_template_braces(value):
    """Return *value* as a string with every '{' and '}' doubled."""
    return str(value).replace("{", "{{").replace("}", "}}")


prompt_common = f"""
We will write a book based on the following content:
{_escape_template_braces(book_content)}
The total number of pages for the entire book is expected to be {n_pages}, with 40 lines per page.
"""
# Each optional paragraph ends with a newline so that it does not run into the
# paragraph appended after it.
if target_readers != "":
    prompt_common += f"We are considering the following as the intended audience.\n {_escape_template_braces(target_readers)}\n"
if additional_requirements != "":
    prompt_common += f"Also, please take the following into consideration.\n {_escape_template_braces(additional_requirements)}\n"

# Prompt for generating book/chapter titles and summaries
# This string is later compiled with PromptTemplate.from_template, so the
# literal braces of the JSON example are written doubled ({{ and }}).
# The keys requested here ("title", "summary", "childs", "n_pages",
# "needsSubdivision") are the ones the notebook reads back when it stores the
# answer in the graph and decides whether to subdivide a chapter.
prompt_book_title = prompt_common + """
Based on the above, please provide the book and chapter titles and summaries in the following JSON format.
For the book's summary, not only include a synopsis but also touch on the book's main objectives, the scope of its contents, and the depth it covers. Write 5 to 10 sentences in detail.
Also, consider how many pages should be allocated to each chapter. Write the page number in 0.1 increments, such as 0.8 pages.
Additionally, consider whether each chapter needs subdivision based on semantic cohesion (needsSubdivision). Answer with true or false.
Do not include the chapter number in the title.
The number of sections should be adjusted as necessary.
```json
{{"title": "",
"summary": "",
"childs":
    [{{"title": "",
    "summary": "",
    "n_pages": ,
    "needsSubdivision":
    }},
    {{"title": "",
    "summary": "",
    "n_pages": ,
    "needsSubdivision":
    }},
    {{"title": "",
    "summary": "",
    "n_pages": ,
    "needsSubdivision":
    }}]
}}
```
"""

# Prompt for creating section list
# Template variables filled in at invoke time: {book_title}, {book_summary},
# {target}, {n_pages} and {section_summary}. The doubled braces in the JSON
# example are escapes for literal braces in the prompt template.
# The line "The summary of this section is as follows." must be followed by
# {section_summary}; without it the model never sees the summary of the
# section it is asked to subdivide.
prompt_section_list_creation = prompt_common + """
Based on the information above, I am planning to create a book titled {book_title}. The summary of the book is shown below.
{book_summary}
I would like to create the section on {target} in {n_pages} pages. It is assumed that there are 40 lines per page.
The summary of this section is as follows.
{section_summary}
I would like to subdivide this section into multiple parts.
Please output the titles and summaries of each part in the following JSON format. Also, consider how many pages should be allocated to each part. Write the page number in 0.1 increments, such as 0.8 pages.
Additionally, consider whether each section needs subdivision based on semantic cohesion (needsSubdivision). Answer with true or false.
Do not include the section number in the title.
```json
[{{"title": "",
"summary": "",
"n_pages": ,
"needsSubdivision":
}},
{{"title": "",
"summary": "",
"n_pages": ,
"needsSubdivision":
}}]
```
"""

# Prompt for generating the content of the text
# Template variables filled in at invoke time: {book_title}, {book_summary},
# {target}, {n_pages}, {section_summary} and {equation_frequency}.
# The model is asked to wrap its answer in a ```tex fence; that fence is what
# extract_section_content searches for when the answer is parsed.
prompt_content_creation = prompt_common + """
Based on the information above, I am planning to create a book titled {book_title}. The summary of the book is shown below.
{book_summary}
I would like to create the section on {target} in {n_pages} pages. It is assumed that there are 40 lines per page.
The summary of this section is as follows.
{section_summary}
Please output the content of this section in LaTeX format for {n_pages} pages, which equates to {n_pages} × 40 lines. All necessary libraries have already been imported in the preamble.
Do not include any assumptions or unverified information. Do not include headers, only the body text.
{equation_frequency}
Please use the following format for the output:
```tex
Body text
```
"""

## Parameter Settings

In [None]:
# Parameters
max_depth = 3  # Number of levels generated below the book node: 1 stops at the top-level chapters, 2 adds one level of subdivision, and so on
max_output_pages = 1.5 # A node with at least this many pages is subdivided instead of being written in one LLM call (unless max_depth is reached)

book_node_name = "book" # Name of the root node of the book graph

openai_api_secret_key_name = 'openai_api' # Name under which the OpenAI API key is registered as a Colab secret (read with userdata.get)
model_name = "gpt-4o" # Name of the chat model passed to ChatOpenAI

## Library Installation and Import

In [None]:
!apt-get update
!apt-get install -y python3-dev graphviz libgraphviz-dev pkg-config
!apt-get install -y latexmk
!apt-get install -y texlive-lang-japanese
!apt-get install -y texlive-latex-extra
%pip install -qU langchain-openai
%pip install pygraphviz
%pip install pylatex

import os
import re
import json
import networkx as nx
from google.colab import userdata
from IPython.display import Markdown
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from typing import List, Optional

import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout

from pylatex import Command, Document, Section, Subsection, Package
from pylatex.section import Chapter
from pylatex.utils import NoEscape

from google.colab import files

## Graph Creation

In [None]:
# The book-level settings are stored as graph attributes, so later cells can
# read them back through book_graph.graph.
graph_attributes = {
    "book_content": book_content,
    "target_readers": target_readers,
    "equation_frequency_level": equation_frequency_level,
    "additional_requirements": additional_requirements,
}
book_graph = nx.DiGraph(**graph_attributes)

## Creation of Title and Chapters

### Function Definition

In [None]:
def extract_book_and_chapter_contents(markdown_text):
    """
    Extract the first JSON object found in a Markdown-formatted text and convert it into a Python dictionary.

    The object is decoded with the JSON parser itself, starting at the first
    '{'. Braces that appear inside JSON string values (for example LaTeX such
    as "\\mathbb{R}" or a stray '}' in a summary) therefore do not cut the
    object short, and any text after the closing brace is ignored.

    Args:
        markdown_text (str): A string in Markdown format, expected to contain JSON data.

    Returns:
        dict or None: The decoded object, or None when the text contains no '{'
        or when the text starting at the first '{' is not valid JSON (in that
        case the parsing error is printed).
    """

    # Find the starting point of the JSON in the Markdown
    start_index = markdown_text.find('{')
    if start_index == -1:
        return None

    try:
        # raw_decode parses exactly one JSON value beginning at start_index and
        # tolerates trailing text such as the closing ``` fence.
        json_data, _end_index = json.JSONDecoder().raw_decode(markdown_text, start_index)
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return None

    return json_data

### Output from the LLM

In [None]:
# Use the key already present in the environment if there is one; otherwise
# read it from the Colab secret named by openai_api_secret_key_name.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = userdata.get(openai_api_secret_key_name)

llm = ChatOpenAI(model=model_name)

prompt = PromptTemplate.from_template(prompt_book_title)

chain = prompt | llm
result = chain.invoke(
    {
        "book_content": book_content,
        "target_readers": target_readers,
        "n_pages": n_pages,
        "additional_requirements": additional_requirements
    }
)

book_json = extract_book_and_chapter_contents(result.content)

# Stop here with a readable message when the answer could not be parsed or
# lacks the keys the following cells read. Otherwise the failure would only
# surface later as a TypeError/KeyError far from its cause.
required_keys = {"title", "summary", "childs"}
if not isinstance(book_json, dict) or not required_keys <= book_json.keys():
    raise ValueError(
        "The LLM response did not contain a JSON object with the keys "
        f"{sorted(required_keys)}. Please re-run this cell."
    )

### Storing the results in the graph

In [None]:
# Root node: the book itself. It always counts as needing subdivision and
# carries the page count requested by the user.
root_attributes = {
    "title": book_json["title"],
    "summary": book_json["summary"],
    "n_pages": n_pages,
    "needsSubdivision": True,
}
book_graph.add_nodes_from([(book_node_name, root_attributes)])

# Chapters: named "1", "2", ... in the order returned by the LLM, each one
# attached to the root node.
chapters = book_json["childs"]
chapter_names = [str(number) for number in range(1, len(chapters) + 1)]
book_graph.add_nodes_from(list(zip(chapter_names, chapters)))
book_graph.add_edges_from([(book_node_name, name) for name in chapter_names])

## Title and Content Structure Review

### Title and Structure Display

In [None]:
book_node = book_graph.nodes[book_node_name]

# Collect the Markdown fragments first and concatenate them once at the end.
md_parts = [
    "\n ## Title: ", book_node["title"], " (Pages: ", str(book_node["n_pages"]), ")",
    "\n ", book_node["summary"],
]
for chapter_no, child_node_name in enumerate(book_graph.successors(book_node_name), start=1):
    child_node = book_graph.nodes[child_node_name]
    md_parts += [
        "\n ### Chapter ", str(chapter_no), ": ", child_node["title"], " (Pages: ", str(child_node["n_pages"]), ")",
        "\n", child_node["summary"],
    ]
content_md = "".join(md_parts)

# Last expression of the cell: rendered as Markdown in the notebook output.
Markdown(content_md)

## Creation of the book graph

### Function Definition

In [None]:
def extract_section_list(markdown_text):
    """
    Parse the first ```json fenced block of a Markdown text.

    Args:
        markdown_text (str): Text expected to contain a ```json ... ``` block.

    Returns:
        The decoded JSON value of the first fenced block, or None (after
        printing a message) when the text has no such block. A block whose
        content is not valid JSON raises json.JSONDecodeError.
    """
    fenced_block = re.search(r'```json\s*(.*?)\s*```', markdown_text, re.DOTALL)

    # Guard clause: nothing to parse without a fenced block.
    if fenced_block is None:
        print("No JSON data found.")
        return None

    return json.loads(fenced_block.group(1))

def extract_section_content(markdown_text):
    """
    Return the body of the first ```tex fenced block of a Markdown text.

    Args:
        markdown_text (str): Text expected to contain a ```tex ... ``` block.

    Returns:
        str or None: The text between the fences with surrounding whitespace
        removed, or None (after printing a message) when no block is found.
    """
    fenced_block = re.search(r'```tex\s*(.*?)\s*```', markdown_text, re.DOTALL)

    # Guard clause: report the missing block and give up.
    if fenced_block is None:
        print("No TeX data found.")
        return None

    return fenced_block.group(1)

def get_equation_frequency(equation_frequency_level):
    """
    Translate the equation-frequency level (1-5) into a prompt instruction.

    Args:
        equation_frequency_level: Level chosen by the user, from 1 (hardly
            any equations) to 5 (as many equations as possible).

    Returns:
        str or None: The instruction sentence for the level, or None when the
        level is not one of 1, 2, 3, 4 or 5.
    """
    # Instructions ordered by level; index 0 belongs to level 1.
    instructions = (
        "Use hardly any equations. Explain all concepts in simple words, and use equations only if absolutely necessary, keeping them to a minimum.",
        "Use equations sparingly, explaining primarily in prose. Use simple equations only when needed.",
        "Strike a balance between equations and prose. Use equations to express important concepts, with prose providing supplementary explanation.",
        "Use equations actively to accurately express concepts and relationships, but still provide key explanations in prose as well.",
        "Make full use of equations. Express as many concepts and relationships as possible using equations.",
    )
    for level, instruction in enumerate(instructions, start=1):
        if equation_frequency_level == level:
            return instruction
    return None

### Output from LLM and storing results in the graph

In [None]:
book_node = book_graph.nodes[book_node_name]
next_parent_list = [book_node_name]

# The two chains and the equation-frequency instruction are identical for
# every node, so they are built once instead of once per child.
section_list_chain = PromptTemplate.from_template(prompt_section_list_creation) | llm
content_chain = PromptTemplate.from_template(prompt_content_creation) | llm
equation_frequency = get_equation_frequency(book_graph.graph["equation_frequency_level"])

# Breadth-first expansion: at each depth, every child of the current parents
# is either subdivided (and becomes a parent for the next depth) or written
# out as LaTeX body text.
for depth in range(max_depth):
    parent_list = next_parent_list
    next_parent_list = []
    for parent_node_name in parent_list:
        for child_node_name in book_graph.successors(parent_node_name):
            child_node = book_graph.nodes[child_node_name]

            # Both prompts are filled from the same set of values.
            llm_inputs = {
                "book_title": book_node["title"],
                "book_summary": book_node["summary"],
                "equation_frequency": equation_frequency,
                "target": child_node["title"],
                "n_pages": child_node["n_pages"],
                "section_summary": child_node["summary"]
            }

            if (child_node["needsSubdivision"] or child_node["n_pages"] >= max_output_pages) and depth < max_depth-1:

                # Output from the LLM
                result = section_list_chain.invoke(llm_inputs)

                # Convert the output to a list of dictionaries
                section_json = extract_section_list(result.content)
                if section_json is None:
                    # Without this check the failure would show up as a
                    # TypeError when the missing list is enumerated.
                    raise ValueError(
                        f"No section list could be extracted from the LLM response for node {child_node_name!r}."
                    )

                # Create graph nodes and store the result
                grandchild_names = [child_node_name + "-" + str(idx+1) for idx in range(len(section_json))]
                book_graph.add_nodes_from(list(zip(grandchild_names, section_json)))
                book_graph.add_edges_from([(child_node_name, name) for name in grandchild_names])

                # Add to next parent list only if subdivided
                next_parent_list.append(child_node_name)

            elif not child_node["needsSubdivision"] or depth == max_depth-1:

                # Output from the LLM
                result = content_chain.invoke(llm_inputs)

                # Check the extracted text before opening the file, so that a
                # failed extraction does not leave an empty .tex file behind.
                contents_tex = extract_section_content(result.content)
                if contents_tex is None:
                    raise ValueError(
                        f"No TeX content could be extracted from the LLM response for node {child_node_name!r}."
                    )

                # Save outputs to a file
                content_file_path = child_node_name + "-p.tex"
                with open(content_file_path, mode='w', encoding='UTF-8') as f:
                    f.write(contents_tex)

                # Create graph nodes and store the result
                book_graph.add_nodes_from([(child_node_name + "-p", {"content_file_path": content_file_path})])
                book_graph.add_edges_from([(child_node_name, child_node_name + "-p")])

            else:
                print("Error: needsSubdivision attribute is not set")

## Book Graph Display

In [None]:
# Node positions computed by the Graphviz "dot" program
pos = graphviz_layout(book_graph, prog="dot")

# matplotlib settings
fig = plt.figure(figsize=(20, 10), dpi=300)
ax = fig.add_subplot(1, 1, 1)

# draw the network; the appearance options are gathered in one place
draw_options = {
    "with_labels": True,
    "node_size": 100,
    "node_color": 'w',
    "alpha": 0.4,
    "node_shape": 's',
    "width": 0.5,
}
nx.draw(book_graph, pos=pos, ax=ax, **draw_options)

## Book Creation

### Function Definition

In [None]:
def extract_content_list(string_list):
    """
    Return the names from string_list that denote content nodes.

    A content node name consists of one or more numbers joined by hyphens and
    ends with '-p' (for example '3-p' or '1-2-1-p'). The whole string has to
    match: names that merely start with such a prefix (e.g. '1-px' or
    '1-p-2') are rejected, because they cannot be turned into a sort key.

    Args:
        string_list: Iterable of node-name strings.

    Returns:
        list: The matching names, in their original order.
    """
    pattern = r'(?:\d+-)*\d+-p'
    # fullmatch anchors the pattern at both ends; re.match would only anchor
    # the beginning of the string.
    return [s for s in string_list if re.fullmatch(pattern, s)]

def custom_sort_key(s):
    """
    Build a numeric sort key from a node name such as '1-2-p'.

    The name is cut at every '-' and every 'p'; the non-empty pieces are
    converted to integers, so '1-10-p' becomes [1, 10] and names order
    numerically rather than alphabetically.
    """
    pieces = re.split(r'[-p]', s)
    # filter(None, ...) drops the empty strings produced by adjacent separators.
    return list(map(int, filter(None, pieces)))

def sort_strings(string_list):
    """
    Return a new list with the items of string_list ordered numerically.

    The ordering key is produced by custom_sort_key; the input is not modified.
    """
    ordered = list(string_list)
    ordered.sort(key=custom_sort_key)
    return ordered

### LaTeX document Creation

In [None]:
# Create a LaTeX document with pylatex ("report" class, 3cm top and left margins)
geometry_options = {"tmargin": "3cm", "lmargin": "3cm"}
doc = Document(documentclass="report", geometry_options=geometry_options)

# Math-related packages, added to the preamble in this order
for package_name in ("amsmath", "amssymb", "amsfonts", "mathtools", "bm"):
    doc.packages.append(Package(package_name))

# Title and date go into the preamble; \maketitle is the first item of the body
book_title = book_graph.nodes[book_node_name]["title"]
doc.preamble.append(Command("title", book_title))
doc.preamble.append(Command("date", NoEscape(r"\today")))
doc.append(NoEscape(r"\maketitle"))

In [None]:
# Sort nodes containing main content in order
content_str_list = extract_content_list(list(book_graph.nodes))
sorted_content_str_list = sort_strings(content_str_list)

# Heading class used for each level of the node numbering
heading_levels = ((1, Chapter), (2, Section), (3, Subsection))

# Add main content
for heading_number_str in sorted_content_str_list:
    heading_number = custom_sort_key(heading_number_str)

    # A heading of a given level is emitted in front of the first content
    # node below it, i.e. when every number after that level equals 1 (or
    # there are none). Levels 2 and 3 also need the numbering to be that deep.
    for level, heading_cls in heading_levels:
        deep_enough = level == 1 or len(heading_number) >= level
        if not (deep_enough and all(x == 1 for x in heading_number[level:])):
            continue
        node_name = "-".join(str(number) for number in heading_number[:level])
        heading_node = book_graph.nodes[node_name]
        with doc.create(heading_cls(heading_node["title"])):
            doc.append(NoEscape(heading_node["summary"].replace("\\\\","\\")))

    # Add main text content
    tex_file_path = book_graph.nodes[heading_number_str]["content_file_path"]
    with open(tex_file_path, "r", encoding='UTF-8') as file:
        tex_content = file.read()
    doc.append(NoEscape(tex_content))

### PDF Output

In [None]:
# Compile the document to PDF; the book title is used as the output file name
# (without extension) and the generated .tex file is kept (clean_tex=False).
pdf_basename = book_node["title"]
doc.generate_pdf(pdf_basename, clean_tex=False)

In [None]:
# Download the generated PDF from the /content directory to the user's machine
pdf_path = "".join(("/content/", book_node["title"], ".pdf"))
files.download(pdf_path)