In [299]:
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
from dataclasses import dataclass

In [300]:
# Author, Work, Book, Line, Commentary Name, URL
# "Homer", "Iliad", 1, 45, "firstthreebooks03homegoog", "https://archive.org/details/firstthreebooks03homegoog/page/n210/mode/2up"

In [301]:
@dataclass
class SingleLineComment:
    book_number: int
    line_number: int
    archive_page_number: int
    archive_link: str
    line_commentary: str
    secotion_commentary: str

In [302]:
@dataclass
class Commentary:
    modern_author: str
    ancient_author: str
    ancient_work: str
    modern_title: str
    archive_id: str
    archive_url: str
    single_line_comment: list[SingleLineComment] = None

In [303]:
book = epub.read_epub("example-texts/firstthreebooks03homegoog_firstthreebooks03homegoog.epub")

In [304]:
type(book)

ebooklib.epub.EpubBook

In [305]:
for item in book.get_items():
    if item.get_type() == ebooklib.ITEM_DOCUMENT:
        print('==================================')
        print('NAME : ', item.get_name())
        print('----------------------------------')
        print(item.get_content())
        print('==================================')

NAME :  cover.xhtml
----------------------------------
b'<?xml version=\'1.0\' encoding=\'utf-8\'?>\n<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/#" lang="en" xml:lang="en">\n  <head/>\n  <body><img src="images/cover.png" alt="Cover"/>\n </body>\n</html>\n'
NAME :  chap_0001.xhtml
----------------------------------
NAME :  nav.xhtml
----------------------------------
b'<?xml version=\'1.0\' encoding=\'utf-8\'?>\n<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/#" lang="en" xml:lang="en">\n  <head/>\n  <body><nav epub:type="toc" id="id" role="doc-toc">\n      <h2>The first three books of Homer\'s Iliad, with introduction, commentary, and vocabulary, for the use of schools</h2>\n      <ol>\n        <li>\n          <a href="chap_0001.xhtml">The fi

In [306]:
# Get ITEM_DOCUMENT
chapters = []
for item in book.get_items():
    if item.get_type() == ebooklib.ITEM_DOCUMENT:
        chapters.append(item.get_content())
chapters

[b'<?xml version=\'1.0\' encoding=\'utf-8\'?>\n<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/#" lang="en" xml:lang="en">\n  <head/>\n  <body><img src="images/cover.png" alt="Cover"/>\n </body>\n</html>\n',
 b'<?xml version=\'1.0\' encoding=\'utf-8\'?>\n<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/#" lang="en" xml:lang="en">\n  <head/>\n  <body><nav epub:type="toc" id="id" role="doc-toc">\n      <h2>The first three books of Homer\'s Iliad, with introduction, commentary, and vocabulary, for the use of schools</h2>\n      <ol>\n        <li>\n          <a href="chap_0001.xhtml">The first three books of Homer\'s Iliad, with introduction, commentary, and vocabulary, for the use of schools</a>\n        </li>\n      </ol>\n    </nav>\n    <nav epub:

In [307]:
# Get commentaries and web page links
import re
soup1 = BeautifulSoup(chapters[1], 'html.parser')
find = [para for para in soup1.find_all(['p','span'])]
# print(find)
# print(find[0].get_text())
# print(find[20].attrs['id'])
# print(find[2000])
# print(find[20])

In [308]:
# Initiate Commentary OBJ
seymour_comm = Commentary(modern_author=book.get_metadata('DC', 'creator')[1][0],
                         ancient_author=book.get_metadata('DC', 'creator')[0][0],
                         ancient_work="Iliad",
                         modern_title=book.get_metadata('DC', 'title')[0][0],
                         archive_id=book.get_metadata('DC', 'identifier')[0][0],
                         archive_url=book.get_metadata('DC', 'identifier')[1][0].split(': ')[1])
seymour_comm.modern_author
seymour_comm.archive_url

'http://archive.org/details/firstthreebooks03homegoog'

In [309]:
book_dict = {'FIRST': 1, 'SECOND': 2, 'THIRD': 3}

In [310]:
seymour_comm.single_line_comment = []
archive_url = seymour_comm.archive_url

curr_book = 0
curr_page = 0
curr_sec_comm = ''
start = False

for para in find:
    if para.name == 'p':
        text = para.get_text()
        if text == 'COMMENTARY. ' and not start:
            start = True
            
        # Commentary begins
        if start:
            
            # Book Number
            if re.search('BOOK\sOF\sTHE\sILIAD.\s$', text):
                curr_book = book_dict[text.split()[0]]
                
            # Section Comm (1-7. ) and Edge Case (28-32 = 11-15)
            elif re.search('^\d{1,3}-\d{1,3}\s=\s\d{1,3}-\d{1,3}', text) or re.search('^\d{1,3}-\d{1,3}.\s', text):
                curr_sec_comm = text
            
            # Line Comm Edge Case 2 (324 = 137, 451 f. = 37 f . )
            elif (re.search('^\d{1,3}\s=\s\d{1,3}', text) or (re.search('^\d{1,3}\sf.\s=', text))):
                line_num = int(re.search('^\d+. ', text).group()[:-1])
                seymour_comm.single_line_comment.append(SingleLineComment(book_number = curr_book, 
                                                                          line_number = line_num,
                                                                          archive_page_number = curr_page,
                                                                          archive_link = curr_link,
                                                                          line_commentary = text,
                                                                          secotion_commentary = curr_sec_comm))
            # Line Comm (23. )
            elif re.search('^\d{1,3}.\s', text):
                line_num = int(re.search('^\d{1,3}.\s', text).group()[:-2])
                seymour_comm.single_line_comment.append(SingleLineComment(book_number = curr_book, 
                                                                          line_number = line_num,
                                                                          archive_page_number = curr_page,
                                                                          archive_link = curr_link,
                                                                          line_commentary = text,
                                                                          secotion_commentary = curr_sec_comm))
    elif para.name == 'span' and para.attrs['epub:type'] == "pagebreak":
        curr_page = int(para.attrs['id'])+1
        curr_link = archive_url + '/page/n'+str(curr_page)+'/mode/2up'
        
