In [None]:
%pip install pdf2image pillow

# Converting PDFs into Images

## Description
- **Single-page PDFs**: Images of single-page PDFs will be saved in the folder **`Images_of_pdf_with_single_page`**.
- **Multi-page PDFs**: Images of multi-page PDFs will be saved in the folder **`Images_of_pdf_with_multi_page`**.


In [2]:
# Cell 1: Import required libraries
import os
from pdf2image import convert_from_path
from pathlib import Path
from PIL import Image
import psutil
import logging
import time

# Configure logging: show INFO and above, each record formatted as "<timestamp> - <LEVEL> - <message>"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [3]:
# Cell 2: PDF to Image Conversion Functions
def create_output_directories(base_path):
    """Make sure both image output folders exist under ``base_path``.

    Args:
        base_path (str): Directory in which the two output folders live.

    Returns:
        tuple: ``(single_page_dir, multi_page_dir)`` as full paths, in that order.
    """
    folder_names = ("Images_of_pdf_with_single_page", "Images_of_pdf_with_multi_page")
    output_dirs = tuple(os.path.join(base_path, name) for name in folder_names)
    for output_dir in output_dirs:
        # exist_ok=True: an already-existing folder is not an error
        os.makedirs(output_dir, exist_ok=True)
    return output_dirs

def get_all_pdf_files(directory):
    """Collect the path of every PDF below ``directory``, including subdirectories.

    A file counts as a PDF when its name ends in ``.pdf``, compared
    case-insensitively. Paths are returned in the order ``os.walk`` yields them.

    Args:
        directory (str): Root folder to search.

    Returns:
        list: Paths built by joining each walked folder with the file name.
    """
    return [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(directory)
        for file in files
        if file.lower().endswith('.pdf')
    ]

def convert_and_save_pdf(pdf_path, single_page_dir, multi_page_dir, dpi=200,
                         poppler_path=r"C:\poppler\poppler-24.02.0\Library\bin"):
    """Render every page of one PDF to JPEG and save it in the matching output folder.

    Output files are named ``<parent folder>_<pdf name>_page_<n>.jpg``; the parent
    folder name is included so equally named PDFs from different folders do not
    overwrite each other.

    Args:
        pdf_path (str): Path of the PDF to convert.
        single_page_dir (str): Destination when the PDF renders to exactly one page.
        multi_page_dir (str): Destination for every other page count.
        dpi (int): Rendering resolution handed to ``convert_from_path``.
        poppler_path (str | None): Location of the Poppler binaries handed to
            ``convert_from_path``. The default is the install location this
            notebook was written against; override it on other machines.
            NOTE(review): pdf2image documents ``None`` as "look Poppler up on
            PATH" - confirm for the installed version.

    Returns:
        bool: True when every page was saved, False when any error occurred
        (the error is logged, not raised).
    """
    try:
        images = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)

        # Prefix with the parent folder name to keep output names unique
        parent_folder = os.path.basename(os.path.dirname(pdf_path))
        filename = os.path.splitext(os.path.basename(pdf_path))[0]
        unique_filename = f"{parent_folder}_{filename}"

        # Exactly one page -> single-page folder; anything else -> multi-page folder
        output_dir = single_page_dir if len(images) == 1 else multi_page_dir

        for page_number, image in enumerate(images, start=1):
            output_path = os.path.join(output_dir, f"{unique_filename}_page_{page_number}.jpg")
            image.save(output_path, "JPEG", quality=95)

        logging.info(f"Successfully converted {unique_filename} ({len(images)} pages)")
        return True
    except Exception as e:
        # Best effort: one bad PDF must not abort the whole batch
        logging.error(f"Error converting {pdf_path}: {str(e)}")
        return False

def process_pdfs(base_path):
    """Convert every PDF under ``<base_path>/VTM_Invoice_Data_PDFs`` to images.

    Args:
        base_path (str): Folder holding the PDF input folder; the image output
            folders are created inside it as well.

    Returns:
        bool: False when the PDF folder is missing or contains no PDFs.
        True once every PDF has been attempted, regardless of how many
        conversions succeeded (the success count is only logged).
    """
    source_dir = os.path.join(base_path, "VTM_Invoice_Data_PDFs")

    # The output folders are created first, before the input folder is validated
    single_page_dir, multi_page_dir = create_output_directories(base_path)

    if not os.path.exists(source_dir):
        logging.error(f"Error: PDF directory not found at {source_dir}")
        return False

    pdf_files = get_all_pdf_files(source_dir)
    if not pdf_files:
        logging.error(f"No PDF files found in {source_dir} or its subdirectories")
        return False

    logging.info(f"Found {len(pdf_files)} PDF files to process")
    logging.info("Processing files from the following directories:")

    # Log each folder that contributes at least one PDF, once
    for dir_path in {os.path.dirname(pdf) for pdf in pdf_files}:
        logging.info(f"- {dir_path}")

    # Convert in list order and count the conversions that report success
    successful_conversions = sum(
        1
        for pdf_path in pdf_files
        if convert_and_save_pdf(pdf_path, single_page_dir, multi_page_dir)
    )

    logging.info("\nConversion completed!")
    logging.info(f"Successfully converted {successful_conversions} out of {len(pdf_files)} PDFs")
    return True

In [4]:
# Cell 3: Image Processing Functions
def is_file_locked(filepath):
    """Report whether ``filepath`` cannot currently be opened for appending.

    The probe opens the path in append mode and closes it again right away;
    any OS-level error while doing so is treated as "locked".

    Returns:
        bool: True when opening failed, False when it succeeded.
    """
    try:
        handle = open(filepath, "a")
        handle.close()
    except OSError:  # IOError is an alias of OSError in Python 3
        return True
    return False

def process_images(folder_path, target_width=848, target_height=1667):
    """Delete every ``.jpg`` in ``folder_path`` whose size equals the target dimensions.

    Only regular files directly inside ``folder_path`` whose name ends in
    ``.jpg`` are examined. A file that looks locked is checked once more after
    a one-second pause; if it is still locked it is skipped with a warning.

    Args:
        folder_path (str): Folder containing the images.
        target_width (int): Width, in pixels, of the images to delete.
        target_height (int): Height, in pixels, of the images to delete.

    Returns:
        bool: False when ``folder_path`` does not exist, True otherwise.
        Per-file errors are logged and do not change the return value.
    """
    if not os.path.exists(folder_path):
        logging.error(f"The folder '{folder_path}' does not exist.")
        return False

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Only regular files with a .jpg suffix are candidates
        if not (os.path.isfile(file_path) and file_name.endswith('.jpg')):
            continue

        try:
            if is_file_locked(file_path):
                logging.warning(f"File is locked, retrying: {file_name}")
                time.sleep(1)  # Small delay to allow the lock to be released
                # Actually retry: look at the file again instead of moving on,
                # so a briefly locked file is still processed in this run.
                if is_file_locked(file_path):
                    logging.warning(f"File is still locked, skipping: {file_name}")
                    continue

            # Read the size inside the with-block so the handle is closed
            # before the file is (possibly) removed.
            with Image.open(file_path) as img:
                width, height = img.size

            if width == target_width and height == target_height:
                os.remove(file_path)
                logging.info(f"Deleted file with target dimensions: {file_name}")
            else:
                logging.info(f"Skipped file with dimensions {width}x{height}: {file_name}")

        except Exception as e:
            # Best effort: one unreadable image must not stop the whole pass
            logging.error(f"Error processing file {file_name}: {e}")

    logging.info("Image processing completed.")
    return True

In [5]:
# Cell 4: Main Function
def main():
    """Run the pipeline: convert the PDFs, then filter the multi-page images.

    The image step only runs when the PDF step reports success. Nothing is
    returned; progress and failures are reported through logging.
    """
    base_path = r"D:\VTM_Orginal_dataset"

    # Step 1: PDFs -> images
    logging.info("Starting PDF processing...")
    if process_pdfs(base_path):
        # Step 2: filter the images produced from multi-page PDFs
        logging.info("Starting image processing...")
        multi_page_dir = os.path.join(base_path, "Images_of_pdf_with_multi_page")
        if process_images(multi_page_dir):
            logging.info("Complete pipeline executed successfully!")
        else:
            logging.error("Image processing failed.")
    else:
        logging.error("PDF processing failed. Stopping execution.")

In [6]:
# Cell 5: Execute the main function
# Runs the pipeline defined by main() in Cell 4 (PDF conversion, then image filtering).
if __name__ == "__main__":
    main()

2025-01-17 17:17:51,748 - INFO - Starting PDF processing...
2025-01-17 17:17:51,786 - INFO - Found 8897 PDF files to process
2025-01-17 17:17:51,786 - INFO - Processing files from the following directories:
2025-01-17 17:17:51,814 - INFO - - D:\VTM_Orginal_dataset\VTM_Invoice_Data_PDFs\Export Inv waybill 127
2025-01-17 17:17:51,815 - INFO - - D:\VTM_Orginal_dataset\VTM_Invoice_Data_PDFs\Export Inv waybill 147
2025-01-17 17:17:51,815 - INFO - - D:\VTM_Orginal_dataset\VTM_Invoice_Data_PDFs\Export Inv waybill 114
2025-01-17 17:17:51,816 - INFO - - D:\VTM_Orginal_dataset\VTM_Invoice_Data_PDFs\Export Inv waybill 95
2025-01-17 17:17:51,816 - INFO - - D:\VTM_Orginal_dataset\VTM_Invoice_Data_PDFs\Export Inv waybill 74
2025-01-17 17:17:51,817 - INFO - - D:\VTM_Orginal_dataset\VTM_Invoice_Data_PDFs\Export Inv waybill 146
2025-01-17 17:17:51,817 - INFO - - D:\VTM_Orginal_dataset\VTM_Invoice_Data_PDFs\Export Inv waybill 143
2025-01-17 17:17:51,817 - INFO - - D:\VTM_Orginal_dataset\VTM_Invoice_Data

## Deleting all images generated from page 2 of the original PDFs

In [7]:
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define the folder path
folder_path = r"D:\VTM_Orginal_dataset\Images_of_pdf_with_multi_page"


def delete_page_images(folder_path, page_number=2):
    """Delete the JPGs in ``folder_path`` that were rendered from one given page.

    A file is selected only when its name ends in ``_page_<page_number>.jpg``.
    Matching the full suffix (rather than the substring ``page_2``) keeps
    pages 20-29, and source files whose own name contains "page_2", from
    being deleted by mistake.

    Args:
        folder_path (str): Folder containing the page images.
        page_number (int): Page whose images are removed.

    Returns:
        list: Names of the files that were deleted. Empty when the folder does
        not exist (an error is logged in that case).
    """
    # Return early instead of calling exit(), which would end the whole session
    if not os.path.exists(folder_path):
        logging.error(f"The folder '{folder_path}' does not exist.")
        return []

    page_suffix = f"_page_{page_number}.jpg"
    deleted = []

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        if os.path.isfile(file_path) and file_name.endswith(page_suffix):
            try:
                os.remove(file_path)
                deleted.append(file_name)
                logging.info(f"Deleted file: {file_name}")
            except Exception as e:
                logging.error(f"Error deleting file {file_name}: {e}")

    logging.info("Process completed.")
    return deleted


# Assigned to a name so the cell does not echo the (possibly long) list
deleted_files = delete_page_images(folder_path)

2025-01-17 18:04:32,908 - INFO - Deleted file: Export Inv waybill 100_2762244-496011883332216092571131917570282 (1)_page_2.jpg
2025-01-17 18:04:32,910 - INFO - Deleted file: Export Inv waybill 100_2763706-49611393599146842769427309719191 (1)_page_2.jpg
2025-01-17 18:04:32,911 - INFO - Deleted file: Export Inv waybill 100_2763706-49611393599146842769427309719191_page_2.jpg
2025-01-17 18:04:32,912 - INFO - Deleted file: Export Inv waybill 100_2769052-49614933526187978886828774644671 (1)_page_2.jpg
2025-01-17 18:04:32,914 - INFO - Deleted file: Export Inv waybill 100_2769052-49614933526187978886828774644671_page_2.jpg
2025-01-17 18:04:32,915 - INFO - Deleted file: Export Inv waybill 100_2779023-4964697637034192502524591059115 (1)_page_2.jpg
2025-01-17 18:04:32,916 - INFO - Deleted file: Export Inv waybill 100_2779023-4964697637034192502524591059115 (2)_page_2.jpg
2025-01-17 18:04:32,918 - INFO - Deleted file: Export Inv waybill 100_2779023-4964697637034192502524591059115_page_2.jpg
2025-0

## Renaming all files in both the newly created folders

In [8]:
import os
import logging
from pathlib import Path

# Configure logging: INFO and above, formatted as "<timestamp> - <LEVEL> - <message>"
# NOTE(review): logging.basicConfig is documented to do nothing once the root logger
# already has handlers, so this call is likely a no-op when an earlier cell ran - confirm.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def rename_files_in_directory(directory_path):
    """
    Rename all ``.jpg`` files in a directory to ``1.jpg``, ``2.jpg``, ... in sorted-name order.

    The work is done in two passes: every file is first moved to a temporary
    name, and only then to its final numbered name. This way a target such as
    ``2.jpg`` can never overwrite a file that is still waiting to be renamed
    (for example when the function is run a second time on a folder that
    already contains ``1.jpg``, ``10.jpg``, ``2.jpg``). An existing file is
    never overwritten; such a rename is logged as an error instead.

    Args:
        directory_path (str): Path to the directory containing files to rename

    Returns:
        bool: False when the directory does not exist or an unexpected error
        occurs; True otherwise, even if individual renames failed (each
        failure is logged and reflected in the logged success count).
    """
    try:
        # Ensure directory exists
        if not os.path.exists(directory_path):
            logging.error(f"Directory not found: {directory_path}")
            return False

        # Sorted so the numbering is the same on every run
        files = sorted(f for f in os.listdir(directory_path) if f.endswith('.jpg'))

        # Pass 1: park each file under a temporary name that cannot be a final name
        staged = []
        for index, old_name in enumerate(files, start=1):
            old_path = os.path.join(directory_path, old_name)
            temp_path = os.path.join(directory_path, f"__renaming_{index}.tmp")
            try:
                if os.path.exists(temp_path):
                    raise FileExistsError(f"temporary name already in use: {temp_path}")
                os.rename(old_path, temp_path)
                staged.append((old_name, old_path, temp_path, f"{index}.jpg"))
            except Exception as e:
                logging.error(f"Error renaming {old_name}: {str(e)}")

        # Pass 2: move each parked file to its final numbered name
        successful_renames = 0
        for old_name, old_path, temp_path, new_name in staged:
            new_path = os.path.join(directory_path, new_name)
            try:
                # Can only happen when the file owning this name could not be parked
                if os.path.exists(new_path):
                    raise FileExistsError(f"target already exists: {new_path}")
                os.rename(temp_path, new_path)
                successful_renames += 1
                logging.info(f"Renamed: {old_name} → {new_name}")
            except Exception as e:
                logging.error(f"Error renaming {old_name}: {str(e)}")
                # Put the file back under its original name so nothing is left parked
                try:
                    if not os.path.exists(old_path):
                        os.rename(temp_path, old_path)
                except Exception as restore_error:
                    logging.error(f"Could not restore {old_name} from {temp_path}: {str(restore_error)}")

        logging.info(f"Successfully renamed {successful_renames} out of {len(files)} files in {directory_path}")
        return True

    except Exception as e:
        logging.error(f"Error processing directory {directory_path}: {str(e)}")
        return False

def main():
    """Renumber the images in the single-page folder, then in the multi-page folder.

    Each folder is handled independently: a failure in the first one is
    logged and the second one is still processed. Nothing is returned.
    """
    base_path = r"D:\VTM_Orginal_dataset"

    # (label used in the log messages, folder to process), in processing order
    targets = (
        ("single", os.path.join(base_path, "Images_of_pdf_with_single_page")),
        ("multi", os.path.join(base_path, "Images_of_pdf_with_multi_page")),
    )

    for label, target_dir in targets:
        logging.info(f"\nProcessing {label} page directory...")
        renamed_ok = rename_files_in_directory(target_dir)
        if not renamed_ok:
            logging.error(f"Failed to process {label} page directory")
            continue
        logging.info(f"{label.capitalize()} page directory processing completed successfully")

    logging.info("\nFile renaming process completed!")

# Run the renaming step for both image folders.
# NOTE: the main() defined in this cell replaces the main() from Cell 4 in the same kernel.
if __name__ == "__main__":
    main()

2025-01-17 18:35:41,274 - INFO - 
Processing single page directory...
2025-01-17 18:35:41,279 - INFO - Renamed: Export Inv waybill 100_2762246-496011909546615922566659109151570_page_1.jpg → 1.jpg
2025-01-17 18:35:41,281 - INFO - Renamed: Export Inv waybill 100_2763947-49612100077225509127672840765148_page_1.jpg → 2.jpg
2025-01-17 18:35:41,282 - INFO - Renamed: Export Inv waybill 100_2766267-496127836177012631299485601549353_page_1.jpg → 3.jpg
2025-01-17 18:35:41,283 - INFO - Renamed: Export Inv waybill 100_2767384-496134029329016848766176628557215_page_1.jpg → 4.jpg
2025-01-17 18:35:41,284 - INFO - Renamed: Export Inv waybill 100_2767554-496134268535416017063786702777656_page_1.jpg → 5.jpg
2025-01-17 18:35:41,285 - INFO - Renamed: Export Inv waybill 100_2768048-49613275137709260793781460494126_page_1.jpg → 6.jpg
2025-01-17 18:35:41,286 - INFO - Renamed: Export Inv waybill 100_2768068-49613270222502567446936158165075_page_1.jpg → 7.jpg
2025-01-17 18:35:41,287 - INFO - Renamed: Export In