In [None]:
"""
This code gets file paths from a directory, reads PDF and EPUB documents, extracts file metadata and fill Mongodb databse for querying in RAG.
"""

from pymongo import MongoClient
from collections import defaultdict
from nltk.corpus import stopwords

import os
from markdown import markdown
from bs4 import BeautifulSoup
import fitz  # pip install pymupdf
import ebooklib
from ebooklib import epub
import warnings 
import logging

logging.basicConfig(
    level=logging.INFO, 
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

logger = logging.getLogger(__name__)

warnings.filterwarnings('ignore') 

from multiprocessing import Process, Value, Lock,  Semaphore, Queue, JoinableQueue
from threading import Thread
from dotenv import load_dotenv

load_dotenv()

stop_words_en = set(stopwords.words('english'))


# ---------------------------------------------------------------------------------
def collect_files(directory, extensions):
    """
    First get the file names and labels from file, then traverse the given directory and collect all file paths and put labels.
    """

    # Traverse the directory and collect file paths    
    file_counter=0  #it is required because some files may have the same name
    file_dict = defaultdict(list)
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(extensions):                       
                file_path = os.path.join(root, file)                                 #Obtain filepath and remove path length control with \\?\\
                file_name, file_ext = os.path.splitext(file)                         #Obtain filename without extension                                   
                parent_path = os.path.basename(os.path.dirname(file_path))           #Obtain parent directory
                file_size = os.path.getsize(file_path)                               #Obtain file size
                file_dict[file_path].append([file_name, parent_path, file_size, file_ext])     
                
                file_counter += 1                
    
    logger.info(f"Number of files under all folders (files may have same name): {file_counter}")

    return file_dict, file_counter

# ---------------------------------------------------------------------------------
def get_doc_from_pdf_epub(file_path, logger, file_no, file_path_print, task_id):
    """Read pdf and epub documents, obtain text content and put them in document with metadata"""

    #Initialize output variables
    error_flag = False  
    full_text = ""
    page_starts = []

    if file_path.lower().endswith(".pdf"):
        try:
            doc = fitz.open(file_path)
            parts = []
            page_starts = []
            current = 0

            for page in doc:
                page_starts.append(current)           # this page starts here
                text = page.get_text("text")        # type: ignore[attr-defined]
                parts.append(text)
                current += len(text)

            full_text = "".join(parts)

        except Exception as e:
            # Capture the exception and traceback
            error_flag = True      
            logger.exception(f"Error in task {task_id}: {e} - file_counter: {file_no} -- {file_path_print}")
            
    elif file_path.lower().endswith(".epub"):
        try:
            book = epub.read_epub(file_path, options={'ignore_ncx': True})

            parts: list[str] = []
            page_starts: list[int] = []
            current = 0

            # spine = real reading order
            spine_ids = [item_id for (item_id, _) in book.spine]

            DOC_TYPE = getattr(ebooklib, "ITEM_DOCUMENT", 9)

            for item_id in spine_ids:
                item = book.get_item_with_id(item_id)
                if item is None:
                    continue
                if item.get_type() != DOC_TYPE:
                    continue

                html = item.get_content().decode("utf-8")
                soup = BeautifulSoup(html, "html.parser")
                text = soup.get_text(separator="\n", strip=True)

                # record start of this section
                page_starts.append(current)
                parts.append(text)

                # for strict parity with your PDF code, we join with "" (no extra \n)
                current += len(text)

            full_text = "".join(parts)
      
        except Exception as e:
            # Capture the exception and traceback
            error_flag = True      
            logger.exception(f"Error in task {task_id}: {e} - file_counter: {file_no} -- {file_path_print}") 
    
    return full_text, page_starts, error_flag

In [2]:
files_folder = '\\\\?\\D:\\Bilgi\\Kitaplar'


extensions=("html", "htm", "pdf", "epub")  # Specify the file extensions to look for
logger.info("Data reading from folders started...")
input_files, number_of_files = collect_files(files_folder, extensions)

counter = 1
books = []

# Path, filename, author, date, keywords --> -1 if None, "" if empty or value
files_status_list = []

for file_path, info in input_files.items():

    #Obtain file path + name and file extension
    file_path_print = file_path[4:]  #From file path "\\\\?\\" part is removed
    file_extension = info[0][3]

    #Inintialize inputs
    error_flag = False  

    if file_path.lower().endswith(".pdf"):
        try:
            doc = fitz.open(file_path)

            #print(f"doc-{counter}: {doc} -- {doc.metadata}")

            metadata = doc.metadata if doc.metadata else {}
            files_status_list.append([file_path_print, file_extension, info[0][0], metadata.get("author", "-1"), metadata.get("creationDate", "-1"), metadata.get("keywords", "-1")])

            name = info[0][0]
            author = metadata.get("author", "-1")
            year = int(metadata.get("creationDate", "-1")[2:6])
            keywords = name
            path = file_path_print

            books.append({
                "name": name,
                "path": path,
                "year": year,
                "author": author,
                "keywords": keywords
            })

            
            logger.info(f"Book-{counter}/{number_of_files}: {name}.pdf -- Author: {author} -- Year: {year} -- Keywords: {keywords}")
                        

        except Exception as e:
            # Capture the exception and traceback
            error_flag = True      
            logger.error(f"Error in opening pdf file")
            
    elif file_path.lower().endswith(".epub"):
        
        try:
            book = epub.read_epub(file_path, options={'ignore_ncx': True})

            # 1. Define the Dublin Core namespace
            DC_NAMESPACE = 'DC'

            def get_metadata_value(book, key):
                """Helper to safely extract the first value of a metadata key."""
                # get_metadata returns a list of tuples: [('Value', {attributes}), ...]
                data = book.get_metadata(DC_NAMESPACE, key)
                if data:
                    return data[0][0]  # Return the text content of the first match
                return "-1"

            # 2. Extract specific fields
            author = get_metadata_value(book, 'creator') # 'creator' is used for Author
            year = int(get_metadata_value(book, 'date')[2:6])      # 'date' is usually the publishing date


            # 3. Extract Keywords (stored as 'subject')
            # Since there can be multiple keywords, we might want all of them, not just the first.2
            name = info[0][0]
            subjects = book.get_metadata(DC_NAMESPACE, 'subject')      
            keywords = name
            path = file_path_print

            books.append({
                "name": name,
                "path": path,
                "year": year,
                "author": author,
                "keywords": keywords
            })

            files_status_list.append([file_path_print, file_extension, info[0][0], author, year, keywords])

            logger.info(f"Book-{counter}/{number_of_files}: {name}.epub -- Author: {author} -- Year: {year} -- Keywords: {keywords}")

        except Exception as e:
            # Capture the exception and traceback
            error_flag = True      
            logger.error(f"Error in opening epub file")
    counter += 1

logger.info(f"Total number of documents: {counter-1}")



num_total_files = 0
num_pdf_files = 0
num_epub_files = 0
num_metadata_exist_pdf = 0
num_metadata_exist_epub = 0
for files_status in files_status_list:
    num_total_files += 1

    if files_status[1].lower() == ".pdf":
        num_pdf_files += 1
        if files_status[4] != "" and files_status[4] != "-1" :  # filename
            num_metadata_exist_pdf += 1
    if files_status[1].lower() == ".epub":
        num_epub_files += 1
        if files_status[3] != "" and files_status[3] != "-1" or files_status[4] != "" and files_status[4] != "-1":  # filename
            num_metadata_exist_epub += 1

logger.info(f"PDF -  metadata exist for all fields (author, date, keywords): {num_metadata_exist_pdf} / {num_pdf_files} / {num_total_files}")
logger.info(f"EPUB - metadata exist for all fields (author, date, keywords): {num_metadata_exist_epub} / {num_epub_files} / {num_total_files}")


2025-11-30 11:45:33,909 - __main__ - INFO - Data reading from folders started...
2025-11-30 11:45:33,956 - __main__ - INFO - Number of files under all folders (files may have same name): 1957
2025-11-30 11:45:33,958 - __main__ - INFO - Book-1/1957: Sporda Öğrenme-2_Bölüm1_makale.pdf -- Author: ergun -- Year: 2016 -- Keywords: Sporda Öğrenme-2_Bölüm1_makale
2025-11-30 11:45:33,961 - __main__ - INFO - Book-2/1957: (Ebook For Dummies) - Complete Idiot's Guide To Finance And Accounting.pdf -- Author: Karl Arnar gisson -- Year: 2007 -- Keywords: (Ebook For Dummies) - Complete Idiot's Guide To Finance And Accounting
2025-11-30 11:45:33,963 - __main__ - INFO - Book-3/1957: ++++++Accounting.An.Introduction,.Eddie.McLaney,.Peter.Atrill,.5ed,.PH,.2010.pdf -- Author: Lin Yan -- Year: 2011 -- Keywords: ++++++Accounting.An.Introduction,.Eddie.McLaney,.Peter.Atrill,.5ed,.PH,.2010
2025-11-30 11:45:33,965 - __main__ - INFO - Book-4/1957: 010711_English_Finance_Management_Accounting_The_agile_manager's

In [3]:
mongodb_uri = os.getenv("MONGODB_URI", "mongodb://localhost:27017")
database_name = os.getenv("DATABASE_NAME", "bookdb")
collection_name = os.getenv("COLLECTION_NAME", "books")

logger.info(f"Connecting to MongoDB: {mongodb_uri}")
client = MongoClient(mongodb_uri)
db = client[database_name]
collection = db[collection_name]

try:
    # Clear existing data
    logger.info("Clearing existing books data...")
    collection.delete_many({})

    # Insert sample books
    logger.info(f"Inserting {len(books)} sample books...")
    result = collection.insert_many(books)
    logger.info(f"✅ Inserted {len(result.inserted_ids)} books successfully")

    # Create indexes
    logger.info("Creating indexes...")
    collection.create_index("name")
    collection.create_index("path")    
    collection.create_index("author")
    collection.create_index("year")
    collection.create_index("keywords")
    logger.info("✅ Indexes created successfully")

    # Verify data
    total_count = collection.count_documents({})
    logger.info(f"📚 Total books in database: {total_count}")

    # Sample query tests
    logger.info("\n--- Sample Query Tests ---")
    john_smith_books = collection.count_documents({"author": {"$regex": "Dennis Lewis", "$options": "i"}})
    logger.info(f"Books by John Smith: {john_smith_books}")

    ml_books = collection.count_documents({"keywords": {"$in": ["machine learning"]}})
    logger.info(f"Books with 'machine learning' keyword: {ml_books}")

    books_2021_2023 = collection.count_documents({"year": {"$gte": 2021, "$lte": 2023}})
    logger.info(f"Books from 2021-2023: {books_2021_2023}")

    logger.info("\n✅ Database seeding completed successfully!")

except Exception as e:
    logger.error(f"❌ Error during database seeding: {e}")
    raise
finally:
    client.close()
    logger.info("MongoDB connection closed")


2025-11-30 11:45:47,939 - __main__ - INFO - Connecting to MongoDB: mongodb://localhost:27017
2025-11-30 11:45:47,955 - __main__ - INFO - Clearing existing books data...
2025-11-30 11:45:47,999 - __main__ - INFO - Inserting 1783 sample books...
2025-11-30 11:45:48,045 - __main__ - INFO - ✅ Inserted 1783 books successfully
2025-11-30 11:45:48,045 - __main__ - INFO - Creating indexes...
2025-11-30 11:45:48,253 - __main__ - INFO - ✅ Indexes created successfully
2025-11-30 11:45:48,275 - __main__ - INFO - 📚 Total books in database: 1783
2025-11-30 11:45:48,275 - __main__ - INFO - 
--- Sample Query Tests ---
2025-11-30 11:45:48,288 - __main__ - INFO - Books by John Smith: 1
2025-11-30 11:45:48,295 - __main__ - INFO - Books with 'machine learning' keyword: 0
2025-11-30 11:45:48,303 - __main__ - INFO - Books from 2021-2023: 89
2025-11-30 11:45:48,303 - __main__ - INFO - 
✅ Database seeding completed successfully!
2025-11-30 11:45:48,305 - __main__ - INFO - MongoDB connection closed
