In [None]:

# Cell: Install All Dependencies
!pip install PyMuPDF pytesseract pillow
# For Colab or Linux systems only
!apt-get update
!apt-get install -y tesseract-ocr tesseract-ocr-ara


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,724 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,979 kB]
Hit:10 https://ppa.launchpadcontent.net/graphic

In [None]:
# Cell 1: Import Libraries (run this first)
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import os
import re
import logging
from pathlib import Path

# Configure root logging at INFO level with timestamped messages; the module-level
# `logger` is what the functions below use for their info/error reports.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Cell 2: Updated Name Extraction Function
def extract_student_name_optimized(page, x, y, width, height):
    """
    OCR a rectangular region of a PDF page and return the student name found in it.

    The region (x, y, x + width, y + height) is rendered at 3x zoom, passed to
    Tesseract with the Arabic language model in single-line mode (--psm 7), and
    the recognized text is stripped of a leading "اسم الطالب:" / "الطالب:"
    label and of surrounding colons, dashes and whitespace.

    Args:
        page: PyMuPDF page object to render from.
        x, y: top-left corner of the clip rectangle, in page coordinates.
        width, height: size of the clip rectangle.

    Returns:
        The cleaned name when its length is between 3 and 99 characters
        inclusive; otherwise None. Any exception is logged and also yields None.
    """
    try:
        clip_area = fitz.Rect(x, y, x + width, y + height)

        # Render only the clip area, scaled 3x in both directions for sharper glyphs.
        zoom = fitz.Matrix(3.0, 3.0)
        png_bytes = page.get_pixmap(matrix=zoom, clip=clip_area).tobytes("png")
        region_image = Image.open(io.BytesIO(png_bytes))

        raw_text = pytesseract.image_to_string(
            region_image, config=r'--oem 3 --psm 7 -l ara'
        ).strip()
        if not raw_text:
            return None

        # Keep whatever follows the label; the longer label is tried first.
        candidate = raw_text
        for label in ("اسم الطالب:", "الطالب:"):
            if label in raw_text:
                candidate = raw_text.split(label)[-1]
                break
        candidate = candidate.strip()

        # Trim colons, dashes and whitespace left at either end by the OCR.
        candidate = re.sub(r'^[:\-\s]+', '', candidate)
        candidate = re.sub(r'[:\-\s]+$', '', candidate)

        # Reject results too short or too long to be a plausible name.
        return candidate if 2 < len(candidate) < 100 else None

    except Exception as e:
        logger.error(f"Error extracting name: {e}")
        return None

def clean_filename(filename):
    """
    Turn an arbitrary string (e.g. an OCR'd name) into a filesystem-safe file stem.

    Steps:
      * reserved characters (< > : " / \\ | ? *) are replaced with "_";
      * whitespace runs (including newlines/tabs) collapse to a single space;
      * remaining control characters are removed;
      * leading/trailing spaces and dots are stripped together, so inputs such
        as "name ." do not keep a dangling space;
      * the result is capped at 80 characters and re-trimmed, because cutting
        can expose a new trailing space or dot.

    Args:
        filename: the raw name; None or an empty string is accepted.

    Returns:
        The cleaned name, or "unknown_name" when nothing usable remains.
    """
    if not filename:
        return "unknown_name"

    # Replace characters that are invalid in filenames on common filesystems.
    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)

    # Collapse whitespace first so newlines/tabs become spaces rather than vanish.
    filename = re.sub(r'\s+', ' ', filename)

    # Drop any control characters that were not whitespace (e.g. NUL).
    filename = re.sub(r'[\x00-\x1f\x7f]', '', filename)

    # Strip spaces and dots in one pass; two separate strips leave residue
    # for mixed endings such as " ." or ". ".
    filename = filename.strip(' .')

    # Limit filename length, then trim again since the cut may end on a space/dot.
    if len(filename) > 80:
        filename = filename[:80].rstrip(' .')

    return filename if filename else "unknown_name"

# Cell 3: Main Processing Function
def process_certificates_final(input_pdf_path, output_directory="final_certificates",
                               x=440, y=155, width=250, height=20):
    """
    Split a multi-page certificate PDF into one PDF per page, named after the
    student name OCR'd from a fixed region of each page.

    Args:
        input_pdf_path: path of the source PDF.
        output_directory: directory for the per-page PDFs; created (including
            missing parents) if it does not exist.
        x, y, width, height: clip rectangle passed to
            extract_student_name_optimized for every page. The defaults are the
            coordinates previously hard-coded in this function.

    Returns:
        A list with one dict per page, or None if the PDF could not be
        opened/processed at all. Each dict has the keys 'page' (1-based),
        'extracted_name', 'filename' and 'success'; pages that raised also
        carry 'error'. 'success' is True only when a name was extracted; a page
        whose name could not be read is still saved as "student_page_<n>.pdf"
        with 'success' False.

    Notes:
        Existing files are never overwritten: on a name collision (duplicate
        names, or a re-run into a populated directory) "_1", "_2", ... is
        appended to the file stem.
    """
    try:
        # parents=True so a nested output path does not fail on a missing parent.
        Path(output_directory).mkdir(parents=True, exist_ok=True)

        results = []

        # Context manager guarantees the source document is closed even if
        # something below raises.
        with fitz.open(input_pdf_path) as pdf_document:
            logger.info(f"Processing PDF with {len(pdf_document)} pages")
            print(f"Using coordinates: x={x}, y={y}, w={width}, h={height}")

            for page_num in range(len(pdf_document)):
                try:
                    print(f"\nProcessing page {page_num + 1}...")
                    page = pdf_document.load_page(page_num)

                    # Extract name using optimal method
                    student_name = extract_student_name_optimized(page, x, y, width, height)

                    if student_name:
                        print(f" Extracted name: '{student_name}'")
                        filename = clean_filename(student_name)
                    else:
                        print(f" Could not extract name from page {page_num + 1}")
                        filename = f"student_page_{page_num + 1}"

                    # Pick a path that does not exist yet so nothing is overwritten.
                    output_path = os.path.join(output_directory, f"{filename}.pdf")
                    counter = 1
                    while os.path.exists(output_path):
                        output_path = os.path.join(output_directory, f"{filename}_{counter}.pdf")
                        counter += 1

                    # Copy just this page into a fresh document and save it;
                    # the per-page document is closed even if save() fails.
                    with fitz.open() as individual_pdf:
                        individual_pdf.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
                        individual_pdf.save(output_path)

                    results.append({
                        'page': page_num + 1,
                        'extracted_name': student_name,
                        'filename': os.path.basename(output_path),
                        'success': student_name is not None
                    })

                    print(f"Saved: {os.path.basename(output_path)}")

                except Exception as e:
                    # A failing page is recorded and skipped; the batch continues.
                    logger.error(f"Error processing page {page_num + 1}: {e}")
                    results.append({
                        'page': page_num + 1,
                        'extracted_name': None,
                        'filename': None,
                        'success': False,
                        'error': str(e)
                    })

        return results

    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        return None

# Cell 4: Test Single Page First
def test_single_page(pdf_path, page_num=0, x=200, y=200, width=300, height=50):
    """
    Run the name extraction on a single page and print what was found.

    Useful for tuning the clip rectangle before processing a whole PDF.

    Args:
        pdf_path: path of the PDF to open.
        page_num: 0-based index of the page to test.
        x, y, width, height: clip rectangle handed to
            extract_student_name_optimized.

    Returns:
        The extracted name, or None if nothing was extracted or an error
        occurred (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even when load_page raises
        # (e.g. for an out-of-range page index).
        with fitz.open(pdf_path) as pdf_document:
            page = pdf_document.load_page(page_num)
            name = extract_student_name_optimized(page, x, y, width, height)

        print(f"Test extraction from page {page_num + 1}:")
        print(f"Coordinates: x={x}, y={y}, w={width}, h={height}")
        print(f"Extracted name: '{name}'")
        print(f"Clean filename: '{clean_filename(name)}'")

        return name

    except Exception as e:
        print(f"Error: {e}")
        return None

# Cell 5: Smoke-test the extraction on one page before the batch run
# Clip rectangle around the student-name line, in page coordinates
# (the same values the batch function uses).
TEST_X, TEST_Y = 440, 155
TEST_WIDTH, TEST_HEIGHT = 250, 20

# Try the first page (index 0) only
print("Testing extraction with optimal settings...")
test_single_page(
    "اولى 3.pdf",
    page_num=0,
    x=TEST_X,
    y=TEST_Y,
    width=TEST_WIDTH,
    height=TEST_HEIGHT,
)

# Cell 6: Process All Pages
def run_final_processing(input_pdf="اولى 3.pdf", output_dir="final_extracted_certificates"):
    """
    Process every page of a certificate PDF and print a per-page report.

    Args:
        input_pdf: path of the source PDF (defaults to the file this notebook
            was written for).
        output_dir: directory that receives the per-page PDFs.

    Returns:
        None. All results are printed.

    Each page falls into one of three groups:
      * name extracted and file saved;
      * name not recognized, but the page was still saved under its fallback
        file name (previously mis-reported as "Failed (Unknown error)");
      * an error occurred and no file was written.
    """
    print("Starting final processing...")
    print("=" * 50)

    # The clip coordinates come from process_certificates_final; adjust them
    # there if the name field is located elsewhere on the page.
    results = process_certificates_final(input_pdf, output_dir)

    if not results:
        print(" Failed to process PDF")
        return

    print("\n" + "=" * 50)
    print("FINAL PROCESSING RESULTS")
    print("=" * 50)

    successful = 0
    unnamed = 0
    failed = 0

    for result in results:
        if result['success']:
            successful += 1
            print(f" Page {result['page']}: '{result['extracted_name']}' -> {result['filename']}")
        elif result.get('filename'):
            # A file exists for this page; only the OCR'd name is missing.
            unnamed += 1
            print(f" Page {result['page']}: name not recognized -> {result['filename']}")
        else:
            failed += 1
            error_msg = result.get('error', 'Unknown error')
            print(f" Page {result['page']}: Failed ({error_msg})")

    print(f"\nSUMMARY:")
    print(f" Successful: {successful}")
    print(f" Saved without a recognized name: {unnamed}")
    print(f" Failed: {failed}")
    print(f" Output directory: {output_dir}/")

    if successful > 0:
        print(f"\n Successfully extracted {successful} certificates with Arabic names!")


# Run the batch job with its built-in input PDF and output directory.
run_final_processing()

Testing extraction with optimal settings...
Test extraction from page 1:
Coordinates: x=440, y=155, w=250, h=20
Extracted name: 'أسامه يوسف خميس الحضرمي'
Clean filename: 'أسامه يوسف خميس الحضرمي'
Starting final processing...
Using coordinates: x=440, y=155, w=250, h=20

Processing page 1...
 Extracted name: 'أسامه يوسف خميس الحضرمي'
Saved: أسامه يوسف خميس الحضرمي_1.pdf

Processing page 2...
 Extracted name: 'ابراهيم محمد ابن علي الشيخي'
Saved: ابراهيم محمد ابن علي الشيخي_1.pdf

Processing page 3...
 Extracted name: 'احمد ابراهيم احمد العمري'
Saved: احمد ابراهيم احمد العمري_1.pdf

Processing page 4...
 Extracted name: 'امير عمر ثابت الجهني'
Saved: امير عمر ثابت الجهني_1.pdf

Processing page 5...
 Extracted name: 'اياد راكان سليمان الطنينى'
Saved: اياد راكان سليمان الطنينى_1.pdf

Processing page 6...
 Extracted name: 'بتال عبدالله جابر السلمي'
Saved: بتال عبدالله جابر السلمي_1.pdf

Processing page 7...
 Extracted name: 'تركي صالح عثمان المالكي'
Saved: تركي صالح عثمان المالكي_1.pdf

Proce

In [None]:
# Recursively pack the output directory into folder.zip so it can be downloaded as one file.
!zip -r folder.zip final_extracted_certificates

  adding: final_extracted_certificates/ (stored 0%)
  adding: final_extracted_certificates/خالد راجح عثمان الزهراني.pdf (deflated 1%)
  adding: final_extracted_certificates/فيصل أحمد حامد القرني.pdf (deflated 1%)
  adding: final_extracted_certificates/عبدالله منير عالي البقمي.pdf (deflated 1%)
  adding: final_extracted_certificates/فهد مهدي أحمد الصمداني.pdf (deflated 1%)
  adding: final_extracted_certificates/بتال عبدالله جابر السلمي.pdf (deflated 1%)
  adding: final_extracted_certificates/يوسف دخيل الله نجم السلمي.pdf (deflated 1%)
  adding: final_extracted_certificates/سلمان عبدالعزيز محسن الحارثي.pdf (deflated 1%)
  adding: final_extracted_certificates/تركي صالح عثمان المالكي.pdf (deflated 1%)
  adding: final_extracted_certificates/امير عمر ثابت الجهني.pdf (deflated 1%)
  adding: final_extracted_certificates/سند عبدالله احمد الزهراني.pdf (deflated 1%)
  adding: final_extracted_certificates/مبارك علي محسن الحارثي.pdf (deflated 1%)
  adding: final_extracted_certificates/رامي عماد سال

In [None]:
# Colab-only: google.colab is available only inside a Colab runtime.
# Hands folder.zip (created by the zip cell) to the browser for download.
from google.colab import files
files.download('folder.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>