<a href="https://colab.research.google.com/github/iliavrtn/final-project/blob/main/PDF_to_TXT_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### We test three popular libraries for PDF-to-text conversion: PyMuPDF (also known as fitz), pdfminer.six, and PyPDF2. These libraries vary in ease of use, speed, and the fidelity of the conversion.


1. Speed of Conversion - How fast each library processes the PDF documents.
2. Accuracy and Fidelity - How accurately the text is extracted (including
handling of special formatting or characters).
3. Differences in Results - Any noticeable differences in the output text.



In [None]:
# Mount Google Drive under /content/drive; the PDF folders used below are addressed through this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Pinned to the versions this comparison was originally run with, so the timings and outputs stay reproducible.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q PyMuPDF==1.24.7 pdfminer.six==20240706 PyPDF2==3.0.1

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyPDF2, PyMuPDFb, PyMuPDF, pdfminer.six
Successfully installed PyMuPDF-1.24.7 Py

In [None]:
import os
import time
import fitz  # PyMuPDF
from pdfminer.high_level import extract_text as pdfminer_extract_text
from PyPDF2 import PdfReader

In [None]:
def convert_with_pymupdf(file_path):
    """
    Extract the text of every page of a PDF with PyMuPDF and time the extraction.

    :param file_path: Path to the PDF file.
    :return: Tuple ``(text, duration)``: the concatenated page texts and the
             elapsed time in seconds (opening the document included).
    """
    # perf_counter is monotonic and high-resolution, which makes it a better
    # benchmark clock than the wall-clock time.time().
    start = time.perf_counter()
    # The context manager closes the document even when a page fails to parse.
    with fitz.open(file_path) as doc:
        text = ''.join(page.get_text() for page in doc)
    duration = time.perf_counter() - start
    return text, duration

def convert_with_pdfminer(file_path):
    """
    Extract the text of a PDF with pdfminer.six and time the extraction.

    :param file_path: Path to the PDF file.
    :return: Tuple ``(text, duration)``: the extracted text and the elapsed
             time in seconds.
    """
    # perf_counter is monotonic and high-resolution, which makes it a better
    # benchmark clock than the wall-clock time.time().
    start = time.perf_counter()
    text = pdfminer_extract_text(file_path)
    duration = time.perf_counter() - start
    return text, duration

def convert_with_pypdf2(file_path):
    """
    Extract the text of every page of a PDF with PyPDF2 and time the extraction.

    :param file_path: Path to the PDF file.
    :return: Tuple ``(text, duration)``: the concatenated page texts and the
             elapsed time in seconds (creating the reader included).
    """
    # perf_counter is monotonic and high-resolution, which makes it a better
    # benchmark clock than the wall-clock time.time().
    start = time.perf_counter()
    reader = PdfReader(file_path)
    # `or ''` keeps the join safe when a page yields a falsy result such as None.
    text = ''.join(page.extract_text() or '' for page in reader.pages)
    duration = time.perf_counter() - start
    return text, duration

def save_text(text, output_folder, filename):
    """
    Write ``text`` to ``<output_folder>/<filename>`` as UTF-8.

    The folder (including missing parents) is created when necessary and an
    existing file of the same name is overwritten.

    :param text: Text to store.
    :param output_folder: Destination folder.
    :param filename: Name of the file inside ``output_folder``.
    """
    # exist_ok=True avoids the race of a separate exists()-then-create check.
    os.makedirs(output_folder, exist_ok=True)
    with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as file:
        file.write(text)

def process_files(folder_path):
    """
    Convert every PDF directly inside ``folder_path`` with all three libraries.

    For ``<name>.pdf`` the output of each library is written to
    ``<folder_path>/<name>/<Library>/<name>.txt`` and the time each
    conversion took is printed.

    :param folder_path: Folder containing the PDF files (sub-folders are not searched).
    """
    # (output sub-folder and printed label, converter); resolved at call time.
    converters = (
        ('PyMuPDF', convert_with_pymupdf),
        ('PDFMiner', convert_with_pdfminer),
        ('PyPDF2', convert_with_pypdf2),
    )

    # Sorted so the processing order does not depend on the file system.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(folder_path, filename)
        # splitext strips the extension regardless of its length or case.
        book_name = os.path.splitext(filename)[0]
        print(f"Processing: {filename}")

        for library, convert in converters:
            text, duration = convert(file_path)
            save_text(text, os.path.join(folder_path, book_name, library), book_name + '.txt')
            print(f"{library} Time: {duration:.2f} seconds")

# Specify the path to your folder
# (the PDFs are read from this folder, and process_files also writes its per-book output folders into it)
folder_path = '/content/drive/MyDrive/🅰 Aleph - Capstone Project 2024/PDF to TXT books TEST'
process_files(folder_path)

Processing: COPY_axiomatic-set-theory.pdf
PyMuPDF Time: 2.95 seconds
PDFMiner Time: 32.02 seconds
PyPDF2 Time: 5.68 seconds
Processing: COPY_abstract_set_theory.pdf
PyMuPDF Time: 1.84 seconds
PDFMiner Time: 11.25 seconds
PyPDF2 Time: 4.02 seconds
Processing: COPY_The_Continuum_Huntington_edited.pdf
PyMuPDF Time: 1.64 seconds
PDFMiner Time: 0.14 seconds
PyPDF2 Time: 0.07 seconds
Processing: COPY_SetTheoryPart1ofPart1.pdf
PyMuPDF Time: 1.39 seconds
PDFMiner Time: 0.22 seconds
PyPDF2 Time: 0.03 seconds
Processing: PDF_Set_Theory.pdf
PyMuPDF Time: 1.47 seconds
PDFMiner Time: 8.88 seconds
PyPDF2 Time: 2.55 seconds
Processing: PDF_Sets, Relations, Functions.pdf
PyMuPDF Time: 0.24 seconds
PDFMiner Time: 4.93 seconds
PyPDF2 Time: 1.44 seconds
Processing: PDF_SetTheoreticApproach.pdf
PyMuPDF Time: 1.31 seconds
PDFMiner Time: 10.09 seconds
PyPDF2 Time: 3.81 seconds
Processing: PDF_SetTheoryUnited.pdf
PyMuPDF Time: 1.31 seconds
PDFMiner Time: 20.59 seconds
PyPDF2 Time: 7.33 seconds


In [None]:
import os
import difflib

def compare_texts(text1, text2, max_length=1000):
    """
    Calculate the similarity score between shortened versions of two texts.

    Only the first ``max_length`` characters of each text are compared.

    :param text1: First text.
    :param text2: Second text.
    :param max_length: Number of leading characters of each text to compare.
    :return: ``difflib.SequenceMatcher`` ratio in ``[0, 1]``; 1.0 means the
             compared prefixes are identical.
    """
    # Shorten texts to the first max_length characters
    text1 = text1[:max_length]
    text2 = text2[:max_length]
    # autojunk=False: for sequences of 200+ items the default heuristic ignores
    # every item that makes up more than ~1% of the second sequence. For
    # character-level comparison of prose that is most letters and the space,
    # which can push the ratio of near-identical texts down to 0.
    return difflib.SequenceMatcher(None, text1, text2, autojunk=False).ratio()


def read_text(file_path):
    """Return the complete content of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        content = source.read()
    return content

def process_book_folder(book_folder):
    """
    Compare the text outputs the different libraries produced for one book.

    ``book_folder`` is expected to contain one sub-folder per library, each
    holding ``.txt`` files. Files whose content is empty or whitespace-only are
    reported, and a similarity score is printed for every pair of libraries
    whose texts both contain something other than whitespace.

    If a library folder holds several ``.txt`` files, the last one read is the
    one used for the comparison.

    :param book_folder: Path to the folder of a single book.
    """
    book_name = os.path.basename(book_folder)
    library_texts = {}
    empty_files = []

    # Sorted so the report and the pair order do not depend on the file system.
    for library_folder in sorted(os.listdir(book_folder)):
        full_library_path = os.path.join(book_folder, library_folder)
        if not os.path.isdir(full_library_path):
            continue
        for file in sorted(os.listdir(full_library_path)):
            if not file.endswith('.txt'):
                continue
            text_content = read_text(os.path.join(full_library_path, file))
            if not text_content.strip():  # Empty or whitespace-only
                empty_files.append((library_folder, file))
            library_texts[library_folder] = text_content

    # Report empty files
    if empty_files:
        print(f"Empty text files found in {book_name}:")
        for lib, file in empty_files:
            print(f"  Library: {lib}, File: {file}")

    # Compare every unordered pair of libraries once
    library_names = list(library_texts)
    for index, lib1 in enumerate(library_names):
        for lib2 in library_names[index + 1:]:
            # Same emptiness test as the report above, so a whitespace-only
            # output that was flagged as empty is not compared anyway.
            if library_texts[lib1].strip() and library_texts[lib2].strip():
                score = compare_texts(library_texts[lib1], library_texts[lib2])
                print(f"Similarity score between {lib1} and {lib2} for {book_name}: {score:.2f}")

def main(base_path='/content/drive/MyDrive/🅰 Aleph - Capstone Project 2024/PDF to TXT books TEST'):
    """
    Run the library comparison for every book folder directly inside ``base_path``.

    :param base_path: Folder that contains one sub-folder per book. Defaults to
                      the project's test folder on Google Drive; entries that
                      are not directories are ignored.
    """
    # Sorted so the books are reported in a stable order.
    for book_folder_name in sorted(os.listdir(base_path)):
        book_folder_path = os.path.join(base_path, book_folder_name)
        if os.path.isdir(book_folder_path):
            print(f"Processing book: {book_folder_name}")
            process_book_folder(book_folder_path)

# Run the comparison when this cell is executed directly (it is skipped if the code is imported as a module).
if __name__ == "__main__":
    main()


Processing book: COPY_axiomatic-set-theory
Similarity score between PyMuPDF and PDFMiner for COPY_axiomatic-set-theory: 0.88
Similarity score between PyMuPDF and PyPDF2 for COPY_axiomatic-set-theory: 0.99
Similarity score between PDFMiner and PyPDF2 for COPY_axiomatic-set-theory: 0.86
Processing book: COPY_abstract_set_theory
Similarity score between PyMuPDF and PDFMiner for COPY_abstract_set_theory: 0.66
Similarity score between PyMuPDF and PyPDF2 for COPY_abstract_set_theory: 1.00
Similarity score between PDFMiner and PyPDF2 for COPY_abstract_set_theory: 0.62
Processing book: COPY_The_Continuum_Huntington_edited
Empty text files found in COPY_The_Continuum_Huntington_edited:
  Library: PyMuPDF, File: COPY_The_Continuum_Huntington_edited.txt
  Library: PDFMiner, File: COPY_The_Continuum_Huntington_edited.txt
  Library: PyPDF2, File: COPY_The_Continuum_Huntington_edited.txt
Processing book: COPY_SetTheoryPart1ofPart1
Empty text files found in COPY_SetTheoryPart1ofPart1:
  Library: PyMu

Recommendation

Given your needs for both speed and reliable text extraction:

- PyMuPDF appears to be the best overall choice due to its consistently fast processing times and generally high similarity scores compared to the other libraries. This makes it a strong candidate for handling a large number of documents quickly while still providing reliable output.

- PyPDF2 could be considered as a secondary option, especially in cases where PyMuPDF might not provide satisfactory results. Its performance and accuracy are generally acceptable, though not as fast as PyMuPDF.

- PDFMiner might be reserved for specific cases where you suspect that PyMuPDF and PyPDF2 are missing some text or when you need to extract text that relies heavily on the exact formatting preserved by PDFMiner. Despite its slower speed, it can sometimes handle complex PDF structures better.

### Pipeline for converting PDF books to TXT format with PyMuPDF and Tesseract

In [None]:
# Install Tesseract
# apt-get with -y: `apt` itself warns that its CLI is not meant for scripts, and
# without -y the install can stop at the confirmation prompt.
!sudo apt-get update -qq
!sudo apt-get install -y -qq tesseract-ocr libtesseract-dev

# Install Python libraries
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q pytesseract pdf2image

[33m0% [Working][0m            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Connecting to security.ubuntu.com] [Waiting for headers] [Connecting to ppa[0m                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.82)] [[0m                                                                               Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
[33m0% [3 InRelease 12.7 kB/128 kB 10%] [Connecting to security.ubuntu.com (185.125[0m                                                                               Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [

In [None]:
# poppler-utils is needed by pdf2image on Linux
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 46 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.4 [186 kB]
Fetched 186 kB in 0s (1,024 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 122105 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.4_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.4) ...
Setting up poppler-utils (22.02.0-2ubuntu0.4) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
import pytesseract
from pdf2image import convert_from_path
import os

def pdf_to_text_tesseract(
    pdf_path,
    output_root='/content/drive/My Drive/🅰 Aleph - Capstone Project 2024/PDF to TXT books TEST',
    dpi=300,
    lang='eng',
):
    """
    Convert a PDF file to text using Tesseract OCR, writing one text file per page.

    The pages are saved as ``page_<n>.txt`` (1-based) inside
    ``<output_root>/TESSERACT_<book name>``, where the book name is the PDF's
    file name without its extension.

    :param pdf_path: Path to the PDF file.
    :param output_root: Folder in which the ``TESSERACT_<book name>`` folder is created.
    :param dpi: Resolution used when rendering the PDF pages to images.
    :param lang: Language passed to Tesseract.
    """
    # Extract book name and create output folder
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_folder = os.path.join(output_root, f'TESSERACT_{book_name}')
    os.makedirs(output_folder, exist_ok=True)

    # Convert PDF to list of images.
    # NOTE: every page is rendered before OCR starts, so all page images are
    # held in memory at the same time.
    pages = convert_from_path(pdf_path, dpi=dpi)

    # Process each page with Tesseract OCR
    for page_number, page in enumerate(pages, start=1):
        text = pytesseract.image_to_string(page, lang=lang)

        # Save text to a file
        output_file_path = os.path.join(output_folder, f'page_{page_number}.txt')
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"Processed and saved page {page_number}")

# Example usage for two books
# NOTE(review): both books were reported with empty text files in the library comparison
# above, so they presumably have no text layer and need OCR — confirm.
book_paths = [
    '/content/drive/MyDrive/🅰 Aleph - Capstone Project 2024/PDF to TXT books TEST/COPY_SetTheoryPart1ofPart1.pdf',
    '/content/drive/MyDrive/🅰 Aleph - Capstone Project 2024/PDF to TXT books TEST/COPY_The_Continuum_Huntington_edited.pdf'
]

# One OCR run per book; each run writes its pages into that book's own TESSERACT_<name> folder.
for book_path in book_paths:
    pdf_to_text_tesseract(book_path)


Processed and saved page 1
Processed and saved page 2
Processed and saved page 3
Processed and saved page 4
Processed and saved page 5
Processed and saved page 6
Processed and saved page 7
Processed and saved page 8
Processed and saved page 9
Processed and saved page 10
Processed and saved page 11
Processed and saved page 12
Processed and saved page 13
Processed and saved page 14
Processed and saved page 15
Processed and saved page 16
Processed and saved page 17
Processed and saved page 18
Processed and saved page 19
Processed and saved page 20
Processed and saved page 21
Processed and saved page 22
Processed and saved page 23
Processed and saved page 24
Processed and saved page 25
Processed and saved page 26
Processed and saved page 27
Processed and saved page 28
Processed and saved page 29
Processed and saved page 30
Processed and saved page 31
Processed and saved page 32
Processed and saved page 33
Processed and saved page 34
Processed and saved page 35
Processed and saved page 36
P

In [None]:
# Mount Google Drive
# NOTE(review): repeats the mount from the first cell, presumably so the pipeline below
# can be started in a fresh session without running the comparison cells — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install necessary libraries
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q PyMuPDF pdf2image pytesseract Pillow psutil tqdm
# -y so the system installs never wait for a confirmation prompt
!apt-get install -y -qq poppler-utils  # For pdf2image on Linux
!apt-get install -y -qq tesseract-ocr  # For Tesseract on Linux

# Import libraries
import os
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import shutil
import logging
import gc
import psutil
from tqdm import tqdm  # For progress bar

# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# Define the base directory where 'CS' and 'MATH' folders are located
base_dir = '/content/drive/MyDrive/Capstone Project 2024-2025'

cs_dir = os.path.join(base_dir, 'CS')
math_dir = os.path.join(base_dir, 'MATH')

# Output directory for the text files
output_dir = os.path.join(base_dir, 'TextFiles')
os.makedirs(output_dir, exist_ok=True)

# Define the path for the log file
log_file_path = os.path.join(base_dir, 'processing_logs.log')

# Configure logging to write to a file and to the console.
# force=True: basicConfig() silently does nothing when the root logger already
# has handlers, which is usually the case inside a notebook kernel. Without it
# the INFO messages below are dropped and the log file is never written.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit encoding: logged paths may contain non-ASCII characters.
        logging.FileHandler(log_file_path, encoding='utf-8'),  # Log to file
        logging.StreamHandler()                                # Also log to console
    ],
    force=True,
)

def memory_usage():
    """
    Log the resident set size (RSS) of the current process in megabytes.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    rss_megabytes = rss_bytes / (1024 * 1024)  # bytes -> MB
    logging.info(f"Current memory usage: {rss_megabytes:.2f} MB")

def count_pdfs(directory):
    """
    Return how many files below ``directory`` (sub-directories included) have a
    name ending in ``.pdf``, ignoring case.
    """
    return sum(
        1
        for _root, _dirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith('.pdf')
    )

def extract_text_pymupdf(pdf_path):
    """
    Extract text from PDF using PyMuPDF (fitz).

    :param pdf_path: Path to the PDF file.
    :return: The concatenated text of all pages, or ``""`` when the document
             cannot be opened or read (the error is logged, not raised).
    """
    try:
        # The context manager closes the document on every path, including
        # when a page raises while its text is being extracted.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        logging.error(f"Error extracting text with PyMuPDF from {pdf_path}: {e}")
        return ""

def process_pdf(pdf_path, txt_output_path, tesseract_pdfs):
    """
    Handle one PDF with the PyMuPDF-first strategy.

    When PyMuPDF yields text that is not just whitespace, the text is written to
    ``txt_output_path`` (parent folders are created). Otherwise ``pdf_path`` is
    appended to ``tesseract_pdfs`` so it can be processed with OCR later.
    """
    logging.info(f"Processing PDF: {pdf_path}")
    # PyMuPDF is always attempted first
    text = extract_text_pymupdf(pdf_path)
    has_text = bool(text.strip())

    if not has_text:
        logging.info(f"No text extracted using PyMuPDF from {pdf_path}. Skipping Tesseract OCR for now.")
        # Remember the PDF for the later Tesseract pass
        tesseract_pdfs.append(pdf_path)
        logging.info(f"Added {pdf_path} to Tesseract processing list.")
    else:
        logging.info(f"Text extracted using PyMuPDF from {pdf_path}")
        # Store the extracted text
        target_dir = os.path.dirname(txt_output_path)
        os.makedirs(target_dir, exist_ok=True)
        with open(txt_output_path, 'w', encoding='utf-8') as out_file:
            out_file.write(text)
        logging.info(f"Extracted text saved to {txt_output_path}")

    # Drop the text and collect garbage right away
    del text
    gc.collect()

def process_all_pdfs(base_input_dir, base_output_dir, tesseract_pdfs):
    """
    Convert every PDF below ``base_input_dir`` to a text file below ``base_output_dir``.

    The directory structure of the input tree is mirrored in the output tree.
    PDFs whose text file already exists are skipped; PDFs from which PyMuPDF
    extracts no text are appended to ``tesseract_pdfs`` (see ``process_pdf``).

    :param base_input_dir: Root folder that is searched recursively for PDFs.
    :param base_output_dir: Root folder for the generated ``.txt`` files.
    :param tesseract_pdfs: List that collects the PDFs needing OCR.
    """
    total_pdfs = count_pdfs(base_input_dir)
    logging.info(f"Total PDFs to process in {base_input_dir}: {total_pdfs}")

    processed_count = 0

    for root, dirs, files in os.walk(base_input_dir):
        pdf_files = [file for file in files if file.lower().endswith('.pdf')]
        if not pdf_files:
            continue

        # Mirror the input structure below base_output_dir. Using the parameter
        # (not the global output_dir) keeps the two top-level trees apart and
        # matches the location process_pdf_with_tesseract writes to.
        relative_path = os.path.relpath(root, base_input_dir)
        txt_output_dir = os.path.join(base_output_dir, relative_path)

        # One tqdm progress bar per folder
        for file in tqdm(pdf_files, desc=f"Processing PDFs in {root}", unit='file'):
            pdf_path = os.path.join(root, file)
            txt_output_path = os.path.join(txt_output_dir, f"{os.path.splitext(file)[0]}.txt")
            # Check if the text file already exists to avoid reprocessing
            if os.path.exists(txt_output_path):
                logging.info(f"Text file already exists for {pdf_path}. Skipping.")
                processed_count += 1
                continue
            # Process the PDF
            process_pdf(pdf_path, txt_output_path, tesseract_pdfs)
            processed_count += 1
            # Monitor memory usage
            memory_usage()
            logging.info(f"Processed {processed_count}/{total_pdfs} PDFs.")

# List to keep track of PDFs that need Tesseract OCR
# (shared by both runs below, so it ends up holding the CS and the MATH candidates)
tesseract_pdfs = []

# Process PDFs in the 'CS' folder
logging.info("Starting processing of CS PDFs...")
process_all_pdfs(cs_dir, os.path.join(output_dir, 'CS'), tesseract_pdfs)

# Process PDFs in the 'MATH' folder
logging.info("Starting processing of MATH PDFs...")
process_all_pdfs(math_dir, os.path.join(output_dir, 'MATH'), tesseract_pdfs)

# Save the list of PDFs that need Tesseract OCR for later processing
# (one path per line; opened in 'w' mode, so the file is rewritten on every run)
tesseract_list_path = os.path.join(base_dir, 'tesseract_pdfs.txt')
with open(tesseract_list_path, 'w', encoding='utf-8') as f:
    for pdf in tesseract_pdfs:
        f.write(f"{pdf}\n")
logging.info(f"List of PDFs that need Tesseract OCR saved to {tesseract_list_path}")

# Processing PDFs with Tesseract OCR (to be run later when ready)
def extract_text_tesseract(pdf_path, dpi=200, tesseract_config=r'--oem 1 --psm 3'):
    """
    Extract text from PDF using Tesseract OCR, processing one page at a time.

    Each page is rendered to an image on its own, so only one page image has to
    be kept in memory at any time.

    :param pdf_path: Path to the PDF file.
    :param dpi: Resolution used when rendering a page to an image.
    :param tesseract_config: Tesseract command-line options, forwarded through
                             pytesseract's ``config`` argument.
    :return: The concatenated text of all pages, or ``""`` when anything fails
             (the error is logged, not raised).
    """
    try:
        # PyMuPDF is only used to learn the page count; the context manager
        # closes the document again straight away.
        with fitz.open(pdf_path) as doc:
            num_pages = len(doc)

        page_texts = []
        # pdf2image page numbers are 1-based
        for page_num in range(1, num_pages + 1):
            images = convert_from_path(pdf_path, dpi=dpi, first_page=page_num, last_page=page_num)
            for img in images:
                # Use Tesseract to do OCR on the image
                page_texts.append(pytesseract.image_to_string(img, config=tesseract_config))
                img.close()  # Close image to free memory
                del img
            del images
            gc.collect()
        return "".join(page_texts)
    except Exception as e:
        logging.error(f"Error extracting text with Tesseract from {pdf_path}: {e}")
        return ""

def process_pdf_with_tesseract(pdf_path):
    """
    OCR one PDF with Tesseract and save the text in the mirrored output tree.

    A PDF below ``cs_dir`` is written to ``<output_dir>/CS/<relative folder>/<name>.txt``,
    a PDF below ``math_dir`` to ``<output_dir>/MATH/...``. A PDF outside both
    folders is reported as an error and skipped. Nothing is written when OCR
    yields no text.

    :param pdf_path: Path to the PDF file.
    """
    logging.info(f"Processing PDF with Tesseract: {pdf_path}")

    # Work out the destination BEFORE running OCR, so the expensive OCR pass is
    # not wasted on a PDF whose result could not be stored anyway.
    pdf_dir = os.path.dirname(pdf_path)
    # os.path.join(x, '') appends a trailing separator: a sibling folder such as
    # "CS_old" must not be mistaken for a sub-path of "CS".
    if pdf_path.startswith(os.path.join(cs_dir, '')):
        txt_output_dir = os.path.join(output_dir, 'CS', os.path.relpath(pdf_dir, cs_dir))
    elif pdf_path.startswith(os.path.join(math_dir, '')):
        txt_output_dir = os.path.join(output_dir, 'MATH', os.path.relpath(pdf_dir, math_dir))
    else:
        logging.error(f"PDF path {pdf_path} does not match CS or MATH directories.")
        return

    text = extract_text_tesseract(pdf_path)
    if text.strip():
        logging.info(f"Text extracted using Tesseract OCR from {pdf_path}")
        txt_output_path = os.path.join(txt_output_dir, f"{os.path.splitext(os.path.basename(pdf_path))[0]}.txt")
        # Save the extracted text
        os.makedirs(txt_output_dir, exist_ok=True)
        with open(txt_output_path, 'w', encoding='utf-8') as f:
            f.write(text)
        logging.info(f"Extracted text saved to {txt_output_path}")
    else:
        logging.warning(f"No text extracted from {pdf_path} using Tesseract OCR.")

    # Clean up
    del text
    gc.collect()

# Load the list of PDFs that need Tesseract OCR
# (read back from the file rather than reusing the in-memory list; one path per line)
tesseract_list_path = os.path.join(base_dir, 'tesseract_pdfs.txt')
with open(tesseract_list_path, 'r', encoding='utf-8') as f:
    tesseract_pdfs = [line.strip() for line in f.readlines()]
logging.info(f"Total PDFs to process with Tesseract OCR: {len(tesseract_pdfs)}")

# Process PDFs with Tesseract OCR
logging.info("Starting Tesseract OCR processing...")
for pdf_path in tqdm(tesseract_pdfs, desc="Processing PDFs with Tesseract OCR", unit='file'):
    process_pdf_with_tesseract(pdf_path)
    # Monitor memory usage
    memory_usage()


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


Processing PDFs in /content/drive/MyDrive/Capstone Project 2024-2025/CS/Functional programming: 100%|██████████| 10/10 [00:00<00:00, 2610.67file/s]
Processing PDFs in /content/drive/MyDrive/Capstone Project 2024-2025/CS/Automata: 100%|██████████| 10/10 [00:00<00:00, 4252.56file/s]
Processing PDFs in /content/drive/MyDrive/Capstone Project 2024-2025/CS/Data Struct and Algs: 100%|██████████| 12/12 [00:00<00:00, 2533.05file/s]
Processing PDFs in /content/drive/MyDrive/Capstone Project 2024-2025/CS/Compiler: 100%|██████████| 11/11 [00:00<00:00, 33.07file/s]
Processing PDFs in /content/drive/MyDrive/Capstone Project 2024-2025/CS/Imperative programming: 100%|██████████| 11/11 [00:00<00:00, 2257.21file/s]
Processing PDFs in /content/drive/MyDrive/Capstone Project 2024-2025/CS/OOP: 100%|██████████| 10/10 [00:00<00:00, 2807.81file/s]
Processing PDFs in /content/drive/MyDrive/Capstone Project 2024-2025/CS/OS: 100%|██████████| 10/10 [00:00<00:00, 3835.67file/s]
Processing PDFs in /content/drive/M