# In [23]:
import openai
import string
import re
from fpdf import FPDF
import PyPDF2
import re

# Set up the OpenAI API client. The key is taken from the OPENAI_API_KEY
# environment variable so it does not have to be written into this file; when
# the variable is unset the key stays an empty string, as before.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def answer_question(ques):
    """Send ``ques`` to the GPT-4 chat endpoint and return the reply text.

    The prompt is submitted as a single message with the ``system`` role, and
    the content of the first returned choice is handed back to the caller.
    """
    conversation = [{"role": "system", "content": ques}]
    response = openai.ChatCompletion.create(model="gpt-4", messages=conversation)
    first_choice = response['choices'][0]
    return first_choice['message']['content']

# Working title of the generated book; change it to produce a different book.
book_title = (
    'A preparation guide for Quantitative Researcher: '
    'Machine Learning, Execution technical interview at Two Sigma'
)

# In [32]:
# Ask the model for a table of contents, then request one expanded write-up per
# subtopic line, grouped under the chapter heading the line appeared beneath.
# Result: chapterdict maps chapter heading -> {subtopic line -> generated text}.
ques = f"Give me the table of contents for a book called {book_title}"
book_chapters = answer_question(ques)
# One entry per non-blank line of the reply, surrounding whitespace removed.
book_chapters = [line.strip() for line in book_chapters.split('\n') if line.strip()]

chapterdict = {}
chaptitle = ''
for x in book_chapters:
    print(x)
    if x.startswith('Chapter'):
        chaptitle = x
        chapterdict[chaptitle] = {}
        continue
    if not chaptitle:
        # Lines ahead of the first "Chapter ..." heading (for example a
        # "Table of Contents" banner) belong to no chapter. Skip them instead
        # of blindly dropping the first line or failing on a missing key.
        continue
    subtopic = x
    chapter = chaptitle
    # Adjacent literals keep the source indentation out of the prompt text.
    ques = (
        f"You are writing a book titled {book_title}. "
        f"Expand on the topic of {subtopic} in chapter {chapter}, "
        "providing examples, technical details, case studies and code where appropriate"
    )
    chapterdict[chaptitle][x] = answer_question(ques)

        
# Assemble the generated chapters into a PDF and prepend a table of contents
def process_subsections(chapter_content):
    """Split generated text into ``{heading: body}`` pairs at numbered headings.

    A heading is a line that begins (optionally after indentation or Markdown
    ``#``/``*`` markers) with a section number such as ``1.3`` or ``1.3.1``,
    followed by a space and the rest of the line as its title. Requiring the
    number at the start of a line keeps decimals inside running text
    (``"reaches 0.95 accuracy"``) from being mistaken for headings.

    Args:
        chapter_content: The text to split.

    Returns:
        A dict mapping each heading (number and title, without any leading
        markers) to the stripped text that follows it up to the next heading.
        Text before the first heading is not included, and a repeated heading
        keeps only its last body. The dict is empty when no heading is found.
    """
    heading = re.compile(r'^[ \t#*]*(\d+\.\d+(?:\.\d+)? [^\n]+)', re.MULTILINE)
    # With one capture group, split() yields
    # [preamble, heading1, body1, heading2, body2, ...].
    parts = heading.split(chapter_content)
    titles = parts[1::2]
    bodies = parts[2::2]
    return {title: body.strip() for title, body in zip(titles, bodies)}

def safe_encode(text):
    """Return ``text`` restricted to characters representable in Latin-1.

    Characters that Latin-1 cannot encode come back as ``?``; text that is
    already representable is returned with the same value.
    """
    # A replace-mode round trip leaves encodable text untouched, so no separate
    # "is it already safe?" probe is needed.
    return text.encode('latin-1', errors='replace').decode('latin-1')

class PDF(FPDF):
    """FPDF document that renders generated book content and a table of contents.

    Every heading drawn through ``chapter_title``, ``section_title`` or
    ``subsection_title`` is also recorded in ``toc_entries`` as a
    ``(toc text, link handle, y position after the heading)`` tuple;
    ``create_toc`` lists the recorded texts.
    """

    def __init__(self, toc_entries=None):
        """Create the document.

        Args:
            toc_entries: List of TOC tuples to start from. A fresh empty list
                is used when this is ``None`` or empty.
        """
        super().__init__()
        self.toc_entries = toc_entries if toc_entries else []
        # Arguments are (left, top); the right margin is not passed, and FPDF
        # documents it as defaulting to the left value.
        self.set_margins(20, 10)

    def header(self):
        """Page-header hook: draws nothing, only selects bold 12pt Arial."""
        self.set_font("Arial", "B", 12)

    def _add_heading(self, text, toc_text, size, space_before):
        """Draw one bold heading line and record it in ``toc_entries``."""
        link_index = self.add_link()
        # Keyword arguments keep the page from landing in another positional
        # slot on FPDF variants whose set_link takes extra parameters.
        self.set_link(link_index, y=0, page=self.page_no())
        self.set_font("Arial", "B", size)
        if space_before:
            self.ln(space_before)
        self.cell(0, 8, safe_encode(text), 0, 1, "L")
        self.ln(2)
        self.toc_entries.append((safe_encode(toc_text), link_index, self.get_y()))

    @staticmethod
    def _after_colon(text):
        """Return the stripped text after the first ':' in ``text``, or ``text`` if there is none."""
        _, colon, tail = text.partition(":")
        return tail.strip() if colon else text

    def chapter_title(self, num, label):
        """Start a new page and draw ``Chapter {num}: {label}`` in bold 16pt."""
        self.add_page()  # New page for each chapter
        heading = f"Chapter {num}: {label}"
        self._add_heading(heading, heading, 16, 0)

    def section_title(self, num, label):
        """Draw ``label`` in bold 14pt; the TOC entry reads ``{num}: {label}``."""
        self._add_heading(f"{label}", f"{num}: {label}", 14, 10)

    def subsection_title(self, num, label):
        """Draw ``label`` in bold 12pt; the TOC entry reads ``{num}: {label}``."""
        self._add_heading(f"{label}", f"{num}: {label}", 12, 5)

    def bold_title(self, num, label):
        """Draw ``label`` in italic 12pt; ``num`` is accepted but not used."""
        self.set_font("Arial", "I", 12)
        self.cell(0, 8, safe_encode(f"{label}"), 0, 1, "L")
        self.ln(1)

    def content(self, text):
        """Write body text line by line, bolding whatever precedes the first ':' of a line."""
        self.set_font("Arial", "", 12)
        # Pull the text that follows a colon-terminated line up onto that line.
        # The order of these replacements is kept exactly as it was.
        lines = text.replace(":\n", ":").replace(': \n\n', ':').replace(":\n\n", ":").split("\n")
        for line in lines:
            if ":" in line:
                lead, _, remainder = line.partition(":")
                self.set_font("Arial", "B", 12)
                self.multi_cell(0, 8, safe_encode(lead + ":"))
                # Back to the regular face for the text after the colon.
                self.set_font("Arial", "", 12)
                self.multi_cell(0, 8, safe_encode(remainder))
            else:
                self.multi_cell(0, 8, safe_encode(line))
            self.ln(1)

    def create_content(self, data):
        """Render every chapter in ``data`` and write the result to ``content.pdf``.

        Args:
            data: Mapping of chapter heading to a mapping of section heading
                to section text.

        Returns:
            The name of the file written (``"content.pdf"``).
        """
        content_filename = "content.pdf"

        for chapter_number, (chapter, sections) in enumerate(data.items(), start=1):
            # Split at the first colon only, so a title that itself contains
            # a colon is kept whole.
            self.chapter_title(chapter_number, self._after_colon(chapter))

            for section, content in sections.items():
                subsections = process_subsections(content)

                if subsections:
                    for heading, body in subsections.items():
                        # Headings look like "1.3 Title": the number goes to
                        # the TOC prefix and the title is drawn in the body
                        # (previously an empty label was drawn instead).
                        number, _, heading_text = heading.partition(" ")
                        self.subsection_title(number, heading_text)
                        self.content(body)
                else:
                    words = section.split()
                    section_num = words[0] if words else section
                    self.section_title(section_num, self._after_colon(section))
                    self.content(content)

        print(content_filename)
        self.output(content_filename)
        return content_filename

    def create_toc(self):
        """Draw a "Table of Contents" page from ``toc_entries`` and write ``toc.pdf``.

        Chapter entries are set in bold 14pt. Other entries are 12pt with the
        part before the first ':' in bold. The link handles stored in the
        entries are left untouched: the lines drawn here carry no link.

        Returns:
            The name of the file written (``"toc.pdf"``).

        Raises:
            ValueError: If ``toc_entries`` is empty.
        """
        if not self.toc_entries:
            raise ValueError("TOC entries are empty. Can't create TOC.")

        toc_filename = "toc.pdf"
        self.add_page()
        self.set_font("Arial", "B", 16)
        self.cell(0, 10, "Table of Contents", 0, 1, "C")
        self.ln(5)

        for title, _link, _y in self.toc_entries:
            is_chapter = title.startswith("Chapter")
            size = 14 if is_chapter else 12
            tail_style = "B" if is_chapter else ""

            if ":" in title:
                prefix, remainder = title.split(":", 1)
                self.set_font("Arial", "B", size)
                self.cell(self.get_string_width(prefix) + 6, 10, safe_encode(prefix + ":"))
                self.set_font("Arial", tail_style, size)
                self.cell(0, 10, " " + safe_encode(remainder))
            else:
                self.set_font("Arial", tail_style, size)
                self.cell(0, 10, safe_encode(title))
            self.ln(7)
        self.output(toc_filename)
        return toc_filename



def merge_pdfs(toc_filename, content_filename, output_filename="twosig.pdf"):
    """Concatenate the TOC PDF and the content PDF into a single file.

    Args:
        toc_filename: Path of the PDF placed first in the output.
        content_filename: Path of the PDF appended after it.
        output_filename: Path the merged PDF is written to.
    """
    merger = PyPDF2.PdfMerger()
    try:
        merger.append(toc_filename)
        merger.append(content_filename)
        merger.write(output_filename)
    finally:
        # Close the merger even when an input cannot be read or the write
        # fails, so it is not left open on the error path.
        merger.close()

def create_pdf(data):
    """Build the finished book PDF from ``data``.

    The chapter content is rendered first, then a separate table-of-contents
    document is produced from the headings recorded during that pass, and
    finally the two files are merged with the TOC in front.
    """
    body_doc = PDF()
    body_path = body_doc.create_content(data)

    # The TOC is a document of its own, seeded with the body's recorded headings.
    toc_path = PDF(body_doc.toc_entries).create_toc()

    merge_pdfs(toc_path, body_path)


# Render the generated chapters: writes content.pdf and toc.pdf, then merges
# them into the final PDF.
create_pdf(chapterdict)


# Output: content.pdf
