#### Imports

In [4]:
import pandas as pd
import os
import json
import re
from IPython.display import display, Image
from pathlib import Path


# PDF processing
import PyPDF2
from pdf2image import convert_from_path
import pytesseract

# Environment
from dotenv import load_dotenv


# Load environment variables
load_dotenv()


True

#### Paths

In [5]:
# Project paths
PROJECT_ROOT = Path.cwd().parent  # Go to project directory
DATA_DIR = PROJECT_ROOT / "data" / "raw"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

# PDF file - UPDATE THIS WITH YOUR ACTUAL FILENAME
PDF_PATH = DATA_DIR /  "Man_myth_magic_encyclopedia.pdf"  # Change to your actual filename


### Structure the data

##### Idetify the Table of content

In [6]:
# Open the PDF
with open(PDF_PATH, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    
    # Basic info
    num_pages = len(pdf_reader.pages)
    metadata = pdf_reader.metadata
    
    print("=" * 50) # aesthetic
    print("PDF BASIC INFORMATION")
    print("=" * 50)

    print(f"Total pages: {num_pages}")
    print(f"\nMetadata:")
    if metadata:
        for key, value in metadata.items():
            print(f"  {key}: {value}")
    else:
        print("  No metadata found")

PDF BASIC INFORMATION
Total pages: 3144

Metadata:
  /Title: Man, myth & magic  the illustrated encyclopedia of mythology 21 volumes in 1
  /Producer: macOS Version 11.7.10 (Build 20G1427) Quartz PDFContext
  /Author: Autumn Walker
  /Creator: Preview
  /CreationDate: D:20251220004857Z00'00'
  /ModDate: D:20251220004857Z00'00'


In [7]:

def image_to_text(pages_numbers:list, file_path:str):
# OCR a single page
    images = convert_from_path(
        str(file_path),
        first_page=pages_numbers[0],
        last_page = pages_numbers[-1],
        dpi=300
    )

    toc_text = pytesseract.image_to_string(images[0])
    print(f"Extracted {len(toc_text)} characters")
    
    return toc_text

In [8]:

toc_pages = [17, 21]  #  Table Of Content page numbers
path = str(PDF_PATH)

toc_text = image_to_text(toc_pages, path)
toc_text

Extracted 3554 characters


'CONTENTS OF VOLUMES\n\nVOLUME 1\n\nVOLUME 2\n\nAbbots Bromley 46 Apollo 114 Atavism 151 Berbers 215\nAberdeen Witches 46 Apollonius 115 Ate 152 Bermuda Triangle 217\nAbominable Snowman 48 Apotheosis 116 Aten 152 Annie Besant 218\nAbracadabra 48 Apparition 116 Athene 153 Bhagavad Gita 220\nAchilles 49 Apple 116 Atlantis 156 Bigfoot 225\nAcorn 50 Aquarius 118 Atlas 160 Bilocation 226\nAcupuncture 51 Aries 119 Atman 160 Birds 227\nAdonis 53 Aromatherapy 119 Atonement 160 Black Madonnas 232\nAfrica 54 Arthur 120 Attila 161 Black Magic 233\nAfro-American Lore 62 Ash 126 Attis 161 Black Mass 235\nAgrippa 64 Ashanti 126 Augury 162 Black Muslims 236\nAbriman 66 Ashes 129 Aura 163 William Blake 239\nAbura Mazda 68 Asmodeus 131 Australia 164 Madame Blavatsky 244\nAir 68 Ass 131 Automatic Art 171 Blood 248\nAix-en-Provence Nuns 69 Assyria 131 Auxonne Nuns 175 Blue 249\nAlchemy 71 Astarte 132 Avalon 176 Bluebeard 249\nAlexander the Great 78 Astral Body 135 Axe 176 Boar 250\nAlgonquin Indians 82s 

##### Parse and organize the data

In [9]:
# lines = toc_text.split("\n")

# lines

In [13]:
for line in toc_text.split("\n"):  
    if "volume" not in line.lower():
        pattern = r'([A-Za-z][A-Za-z\s\-]+?)\s+(\d{1,4})(?=\s+[A-Z]|\s*$)'
        matches = re.findall(pattern, line)
        print(matches)

[]
[]
[]
[('Abbots Bromley', '46'), ('Apollo', '114'), ('Atavism', '151'), ('Berbers', '215')]
[('Aberdeen Witches', '46'), ('Apollonius', '115'), ('Ate', '152'), ('Bermuda Triangle', '217')]
[('Abominable Snowman', '48'), ('Apotheosis', '116'), ('Aten', '152'), ('Annie Besant', '218')]
[('Abracadabra', '48'), ('Apparition', '116'), ('Athene', '153'), ('Bhagavad Gita', '220')]
[('Achilles', '49'), ('Apple', '116'), ('Atlantis', '156'), ('Bigfoot', '225')]
[('Acorn', '50'), ('Aquarius', '118'), ('Atlas', '160'), ('Bilocation', '226')]
[('Acupuncture', '51'), ('Aries', '119'), ('Atman', '160'), ('Birds', '227')]
[('Adonis', '53'), ('Aromatherapy', '119'), ('Atonement', '160'), ('Black Madonnas', '232')]
[('Africa', '54'), ('Arthur', '120'), ('Attila', '161'), ('Black Magic', '233')]
[('Afro-American Lore', '62'), ('Ash', '126'), ('Attis', '161'), ('Black Mass', '235')]
[('Agrippa', '64'), ('Ashanti', '126'), ('Augury', '162'), ('Black Muslims', '236')]
[('Abriman', '66'), ('Ashes', '129'

In [11]:

def create_dataframe(toc_text:list):

    toc_lst = []

    for line in toc_text.split("\n"):  
        if "volume" not in line.lower() :
            pattern = r'([A-Za-z][A-Za-z\s\-]+?)\s+(\d{1,4})(?=\s+[A-Z]|\s*$)'
            matches = re.findall(pattern, line)
            
            if not matches:
                continue

            for match in matches:
                entry = match[0].strip()
                page = int(match[1])

                toc_lst.append({
                "entry_name" : entry,
                "page_number" : page
            })

    toc_df = pd.DataFrame(toc_lst)
    toc_df = toc_df.sort_values('entry_name')
    return toc_df

In [12]:
toc_df = create_dataframe(toc_text)
toc_df

Unnamed: 0,entry_name,page_number
0,Abbots Bromley,46
4,Aberdeen Witches,46
8,Abominable Snowman,48
12,Abracadabra,48
44,Abriman,66
...,...,...
87,Tibetan,262
47,William Blake,239
208,William Crookes,485
68,s Astrology,138
