### Check whether each PDF is valid

In [1]:
import os
from PyPDF2 import PdfReader
import logging

def is_pdf_valid(file_path):
    """
    Check whether a PDF file can be opened and its page list read.

    The file is opened in binary mode, wrapped in a ``PdfReader`` and its
    page count is requested. Any exception raised along the way is logged
    with ``logging.error`` and the file is reported as invalid.

    Parameters:
        file_path (str): The path to the PDF file.

    Returns:
        bool: True if the page count could be read, False otherwise.
    """
    try:
        with open(file_path, 'rb') as f:
            reader = PdfReader(f)
            # Attempt to read the number of pages; only the attempt matters,
            # so the count itself is discarded.
            len(reader.pages)
        return True
    except Exception as e:
        # Deliberately broad: any failure while opening or reading the file
        # marks it as invalid. Lazy %-style arguments let logging build the
        # message only when the record is actually emitted.
        logging.error("Error reading %s: %s", file_path, e)
        return False

def check_pdfs_in_folder(folder_path):
    """
    Sort every ``.pdf`` entry of a folder into a valid or an invalid list.

    Entries are selected by a case-insensitive ``.pdf`` name ending and each
    one is passed to ``is_pdf_valid``. A message is printed for every entry
    that fails the check.

    Parameters:
        folder_path (str): The path to the folder containing PDF files.

    Returns:
        tuple: ``(valid_paths, invalid_paths)``, two lists of full file paths.
    """
    good, bad = [], []

    pdf_names = (entry for entry in os.listdir(folder_path)
                 if entry.lower().endswith('.pdf'))

    for entry in pdf_names:
        full_path = os.path.join(folder_path, entry)
        if not is_pdf_valid(full_path):
            print(f"{entry} is corrupted or invalid.")
            bad.append(full_path)
            continue
        # Valid files are only collected here; nothing is printed for them.
        good.append(full_path)

    return good, bad

if __name__ == "__main__":
    # Errors logged while reading PDFs go to this file instead of the console.
    logging.basicConfig(level=logging.ERROR, filename='pdf_errors.log')

    # Folder whose PDFs are checked; change this to point at your own folder.
    folder_path = r"D:\all\cp_CT_Procur\CT_Procur_merge_doc_3735 - Copy"
    valid_pdfs, invalid_pdfs = check_pdfs_in_folder(folder_path)

    # Valid files are reported as a count only; invalid ones are listed in full.
    print(f"\nValid PDFs ({len(valid_pdfs)}):")
    print(f"\nInvalid PDFs ({len(invalid_pdfs)}):")
    if invalid_pdfs:
        print("\n".join(invalid_pdfs))


Cam kết năm sản xuất XP2i - Vector Infotech.pdf is corrupted or invalid.
PO2407053-01-full signed.pdf is corrupted or invalid.
sailor-sart-ii-type-examination-certificate-module-b_tuv.pdf is corrupted or invalid.
sign-F58_Vendor Registration Form_Rev1.pdf is corrupted or invalid.

Valid PDFs (3727):

Invalid PDFs (4):
D:\all\cp_CT_Procur\CT_Procur_merge_doc_3735 - Copy\Cam kết năm sản xuất XP2i - Vector Infotech.pdf
D:\all\cp_CT_Procur\CT_Procur_merge_doc_3735 - Copy\PO2407053-01-full signed.pdf
D:\all\cp_CT_Procur\CT_Procur_merge_doc_3735 - Copy\sailor-sart-ii-type-examination-certificate-module-b_tuv.pdf
D:\all\cp_CT_Procur\CT_Procur_merge_doc_3735 - Copy\sign-F58_Vendor Registration Form_Rev1.pdf


### Compare filenames (ignoring extensions) between the DOC and PDF folders

In [1]:
import os
from pathlib import Path

def get_filename_without_extension(filename: str) -> str:
    """
    Return a filename with its final extension removed.

    Only the last suffix is dropped, e.g. 'document.pdf' -> 'document' and
    'archive.tar.gz' -> 'archive.tar'.

    Args:
        filename (str): Full filename with extension

    Returns:
        str: Filename without extension
    """
    as_path = Path(filename)
    return as_path.stem

def compare_folders(folder_a_path: str, folder_b_path: str) -> tuple[list[tuple[str, str]], int]:
    """
    Find files that exist in folder A but have no counterpart in folder B,
    comparing names with the final extension removed.

    Only regular files directly inside each folder are considered;
    sub-directories are ignored and not descended into. The comparison is
    case-sensitive.

    Args:
        folder_a_path (str): Path to folder A
        folder_b_path (str): Path to folder B

    Returns:
        tuple: ``(unique_files, count)`` where ``unique_files`` is a list of
        ``(filename with extension, filename without extension)`` tuples for
        the files unique to folder A, sorted by name without extension and
        then by full name, and ``count`` is the length of that list.

    Raises:
        FileNotFoundError: If either folder does not exist.
    """
    folder_a = Path(folder_a_path)
    folder_b = Path(folder_b_path)

    # Folder A is listed first so that a missing folder A is the first error
    # reported when both paths are wrong.
    files_a = [(entry.name, entry.stem)
               for entry in folder_a.iterdir() if entry.is_file()]

    # A set gives constant-time membership tests; with thousands of files on
    # each side a list lookup here would be quadratic.
    stems_b = {entry.stem for entry in folder_b.iterdir() if entry.is_file()}

    unique_to_a = [(full_name, stem)
                   for full_name, stem in files_a
                   if stem not in stems_b]

    # Sort by stem, then by full name, so that files sharing a stem
    # (e.g. 'a.doc' and 'a.docx') always come out in the same order.
    unique_to_a.sort(key=lambda pair: (pair[1], pair[0]))

    return unique_to_a, len(unique_to_a)

def main():
    """
    Report the files of the DOC folder that have no same-named file in the
    PDF folder, grouped under their base name.

    Errors are printed rather than raised: a missing folder produces a hint
    to check the paths, any other exception a generic message.
    """
    # Replace these paths with your actual folder paths
    doc_dir = r"F:\orig\CT_doc_files"
    pdf_dir = r"F:\orig\CT_in_Procur_merge_11184_orig - Copy"

    try:
        only_in_a, how_many = compare_folders(doc_dir, pdf_dir)

        print(f"\nFound {how_many} files in folder A that are not in folder B (ignoring extensions):\n")

        # The list arrives sorted by base name, so a heading is printed
        # whenever the base name changes from one entry to the next.
        last_heading = None
        for file_name, stem in only_in_a:
            if stem != last_heading:
                last_heading = stem
                print(f"\nBase name: {stem}")
            print(f"  - {file_name}")

    except FileNotFoundError as err:
        print(f"Error: {err}")
        print("Please check if both folder paths exist and are accessible.")
    except Exception as err:
        print(f"An error occurred: {err}")

# Entry point: run the folder comparison when this code is the main module.
if __name__ == "__main__":
    main()


Found 1868 files in folder A that are not in folder B (ignoring extensions):


Base name: ! Vilas 60 HƯỚNG DẪN CHI TIẾT ĐO KIỂM
  - ! Vilas 60 HƯỚNG DẪN CHI TIẾT ĐO KIỂM.docx

Base name: !!! Hop chuan, hop quy, do kiem
  - !!! Hop chuan, hop quy, do kiem.docx

Base name: !!! Note
  - !!! Note.docx

Base name: #10_HD VMPC-VECTOR 20231003 (FINAL)
  - #10_HD VMPC-VECTOR 20231003 (FINAL).doc

Base name: (RV1)HỢP ĐỒNG AMIVN- VECTOR cmt
  - (RV1)HỢP ĐỒNG AMIVN- VECTOR cmt.docx

Base name: 0. Bang ke chung tu thuoc Phu luc III
  - 0. Bang ke chung tu thuoc Phu luc III.doc

Base name: 0. Phụ lục HD 03_V2_Vector cmt
  - 0. Phụ lục HD 03_V2_Vector cmt.doc

Base name: 01.PLDC
  - 01.PLDC.docx

Base name: 0110-HĐMB-KTV-VIVN-2024 - cmt
  - 0110-HĐMB-KTV-VIVN-2024 - cmt.doc

Base name: 01_Purchase Contract_Terms  Conditions_G2
  - 01_Purchase Contract_Terms  Conditions_G2.docx

Base name: 02_Exhibit I_Schedule of Rates  Prices_G2
  - 02_Exhibit I_Schedule of Rates  Prices_G2.docx

Base name: 03. Ba

### Split a big DOC folder into smaller folders

In [1]:
import os
import shutil

# Source folder containing the DOC/DOCX files to distribute
source_folder = r"D:\fd_doc_Prop_1885_split3"

# Target folders that receive the copies
target_folder_1 = r"D:\fd_doc_Prop_1885_split31"
target_folder_2 = r"D:\fd_doc_Prop_1885_split32"
target_folder_3 = r"D:\fd_doc_Prop_1885_split33"
target_folders = [target_folder_1, target_folder_2, target_folder_3]

# File-name endings handled by this cell (matched case-insensitively)
doc_extensions = ('.doc', '.docx')


def _count_documents(folder):
    """Count the entries of ``folder`` whose name ends in a DOC/DOCX extension."""
    return len([name for name in os.listdir(folder)
                if name.lower().endswith(doc_extensions)])


# Create target folders if they don't exist
for folder in target_folders:
    os.makedirs(folder, exist_ok=True)

# Sorted list of all DOC/DOCX files in the source folder. The historical name
# `pdf_files` is kept so the cell's variable names are unchanged, but the list
# holds Word documents, not PDFs.
pdf_files = sorted(file for file in os.listdir(source_folder)
                   if file.lower().endswith(doc_extensions))

# Total number of files
total_files = len(pdf_files)

# Number of folders is derived from the target list so the two cannot disagree
num_folders = len(target_folders)

# Base number of files per folder, plus how many folders get one extra file
files_per_folder, remainder_files = divmod(total_files, num_folders)

# The first `remainder_files` folders each receive one additional file
folder_distribution = [files_per_folder + (1 if i < remainder_files else 0)
                       for i in range(num_folders)]

# Copy consecutive runs of the sorted list into the target folders
current_index = 0

for folder_index, file_count in enumerate(folder_distribution):
    target_folder = target_folders[folder_index]
    for _ in range(file_count):
        source_file = os.path.join(source_folder, pdf_files[current_index])
        destination_file = os.path.join(target_folder, pdf_files[current_index])
        shutil.copy(source_file, destination_file)
        current_index += 1

# Verify by counting only DOC/DOCX files in each target folder, so unrelated
# entries there do not distort the totals. Documents left over from an earlier
# run are still counted.
files_copied_1 = _count_documents(target_folder_1)
files_copied_2 = _count_documents(target_folder_2)
files_copied_3 = _count_documents(target_folder_3)

total_copied = files_copied_1 + files_copied_2 + files_copied_3

print("Files have been successfully copied into three folders with distribution:")
print(f"Folder 1: {files_copied_1} files")
print(f"Folder 2: {files_copied_2} files")
print(f"Folder 3: {files_copied_3} files")
print(f"Total files copied: {total_copied}")

# Check if the total copied files equal the total files in source
if total_copied == total_files:
    print("All files have been successfully copied.")
else:
    print("There was an error in copying files.")

Files have been successfully copied into three folders with distribution:
Folder 1: 210 files
Folder 2: 210 files
Folder 3: 210 files
Total files copied: 630
All files have been successfully copied.


### Split a big PDF folder into smaller folders

In [None]:
import os
import shutil

# Folder whose PDFs are spread across several sibling folders
source_folder = r"F:\Proposal_merge_20352"

# How many sibling folders to create
num_folders = 8

# Siblings are named "<source_folder>_split1" ... "<source_folder>_split<N>"
target_folders = [f"{source_folder}_split{number}" for number in range(1, num_folders + 1)]

# Make sure every target exists before copying
for folder in target_folders:
    os.makedirs(folder, exist_ok=True)

# Names of all PDFs in the source folder, sorted for a repeatable split
pdf_files = sorted(name for name in os.listdir(source_folder) if name.lower().endswith('.pdf'))

total_files = len(pdf_files)

# Even share per folder, and how many folders must take one file more
files_per_folder, remainder_files = divmod(total_files, num_folders)

# The extra files go to the first `remainder_files` folders
folder_distribution = [
    files_per_folder + (1 if position < remainder_files else 0)
    for position in range(num_folders)
]

# Hand each folder its consecutive slice of the sorted list
slice_start = 0
for target_folder, file_count in zip(target_folders, folder_distribution):
    for name in pdf_files[slice_start:slice_start + file_count]:
        shutil.copy(os.path.join(source_folder, name), os.path.join(target_folder, name))
    slice_start += file_count

# Count the PDFs that are now present in each target folder
files_copied = [
    len([name for name in os.listdir(folder) if name.lower().endswith('.pdf')])
    for folder in target_folders
]
total_copied = sum(files_copied)

print(f"Files have been successfully copied into {num_folders} folders with distribution:")
for folder_number, count in enumerate(files_copied, 1):
    print(f"Folder {folder_number}: {count} files")
print(f"Total files copied: {total_copied}")

# The split is considered complete when the targets hold as many PDFs as the source
print("All files have been successfully copied."
      if total_copied == total_files
      else "There was an error in copying files.")


Files have been successfully copied into 8 folders with distribution:
Folder 1: 2544 files
Folder 2: 2544 files
Folder 3: 2544 files
Folder 4: 2544 files
Folder 5: 2544 files
Folder 6: 2544 files
Folder 7: 2544 files
Folder 8: 2544 files
Total files copied: 20352
All files have been successfully copied.


### Count the files in the split folders to check that the total is correct

In [8]:
import os
from pathlib import Path

def count_files_in_folder(folder_path):
    """
    Return how many regular files live under a folder, sub-folders included.

    Directories themselves are not counted. If the path does not exist a
    warning is printed and 0 is returned.

    Args:
        folder_path (str or Path): Path to the folder

    Returns:
        int: Number of files found
    """
    root = Path(folder_path)
    if root.exists():
        return len([entry for entry in root.rglob('*') if entry.is_file()])

    print(f"Warning: Folder not found: {root}")
    return 0

def count_files_in_split_folders(base_folder, base_name, start_num=1):
    """
    Count the files in a run of consecutively numbered folders.

    Folders are looked up as ``<base_folder>/<base_name><number>``, starting
    at ``start_num`` and stopping at the first number whose folder does not
    exist.

    Args:
        base_folder (str): Base directory containing the split folders
        base_name (str): Base name of the folders (e.g., "Proposal_merge_20352_split")
        start_num (int): Starting number for the sequence

    Returns:
        dict: Dictionary with folder numbers as keys and file counts as values
        int: Total number of files across all folders
    """
    per_folder = {}
    number = start_num

    candidate = os.path.join(base_folder, f"{base_name}{number}")
    while os.path.exists(candidate):
        per_folder[number] = count_files_in_folder(candidate)
        number += 1
        candidate = os.path.join(base_folder, f"{base_name}{number}")

    return per_folder, sum(per_folder.values())

if __name__ == "__main__":
    # Root of the drive that holds the split folders; adjust to your drive/path.
    # The trailing separator is required: joining a bare drive letter such as
    # "F:" with a name yields the drive-relative path "F:name" on Windows,
    # which resolves against the current directory of that drive, not its root.
    base_folder = "F:\\"
    base_name = "Proposal_merge_20352_split"

    print("Counting files in folders...\n")
    counts, total = count_files_in_split_folders(base_folder, base_name)

    if counts:
        # Print individual folder counts
        print("Files per folder:")
        print("-" * 40)
        for folder_num, count in counts.items():
            print(f"Split {folder_num}: {count:,} files")

        # Print total
        print("\nSummary:")
        print("-" * 40)
        print(f"Total folders found: {len(counts)}")
        print(f"Total files across all folders: {total:,}")
    else:
        print("No matching folders found!")

Counting files in folders...

Files per folder:
----------------------------------------
Split 1: 2,541 files
Split 2: 2,542 files
Split 3: 2,543 files
Split 4: 2,542 files
Split 5: 2,543 files
Split 6: 2,543 files
Split 7: 2,542 files
Split 8: 2,539 files

Summary:
----------------------------------------
Total folders found: 8
Total files across all folders: 20,335


### Check for files missing from the split folders

In [None]:
import os
from pathlib import Path
import hashlib

def get_files_info(folder_path):
    """
    Map every file name found under a folder (sub-folders included) to its
    size and full path.

    The mapping is keyed by bare file name, so when the same name occurs in
    more than one sub-folder only the entry visited last is kept.

    Args:
        folder_path (str or Path): Path to the folder

    Returns:
        dict: Dictionary with filenames as keys and tuples of (size, path) as values
    """
    return {
        entry.name: (entry.stat().st_size, str(entry))
        for entry in Path(folder_path).rglob('*')
        if entry.is_file()
    }

def compare_folders(original_folder, split_base_folder, split_base_name):
    """
    Compare the files of an original folder with the union of its split folders
    and print a report.

    Split folders are looked up as ``<split_base_folder>/<split_base_name><n>``
    for n = 1, 2, ... until the first number whose folder does not exist.
    Files are matched by bare file name. The report lists files missing from
    the splits, files present only in the splits, and, for every file present
    on both sides, any difference in size. The size check runs even when some
    files are missing or extra, and every list is printed in sorted order.

    Args:
        original_folder (str): Path to the original folder
        split_base_folder (str): Base directory containing the split folders
        split_base_name (str): Base name of the split folders

    Returns:
        None. The results are printed.
    """
    print("Gathering files information...")

    # Get files from original folder
    original_files = get_files_info(original_folder)
    print(f"Original folder contains {len(original_files):,} files")

    # Merge the file listings of all consecutively numbered split folders.
    # A name that occurs in several split folders is kept only once.
    split_files = {}
    folder_number = 1

    while True:
        folder_path = os.path.join(split_base_folder, f"{split_base_name}{folder_number}")
        if not os.path.exists(folder_path):
            break

        split_files.update(get_files_info(folder_path))
        folder_number += 1

    print(f"Split folders contain {len(split_files):,} files")

    # Sorted so that repeated runs print the same report
    missing_in_splits = sorted(original_files.keys() - split_files.keys())
    extra_in_splits = sorted(split_files.keys() - original_files.keys())

    # Compare sizes for every file present on both sides, whether or not
    # other files are missing or extra.
    size_mismatches = [
        (filename, original_files[filename][0], split_files[filename][0])
        for filename in sorted(original_files.keys() & split_files.keys())
        if original_files[filename][0] != split_files[filename][0]
    ]

    # Print results
    print("\nComparison Results:")
    print("-" * 50)

    if missing_in_splits:
        print("\nFiles present in original but missing from splits:")
        for filename in missing_in_splits:
            size, path = original_files[filename]
            print(f"- {filename}")
            print(f"  Location: {path}")
            print(f"  Size: {size:,} bytes")

    if extra_in_splits:
        print("\nFiles present in splits but missing from original:")
        for filename in extra_in_splits:
            size, path = split_files[filename]
            print(f"- {filename}")
            print(f"  Location: {path}")
            print(f"  Size: {size:,} bytes")

    if not missing_in_splits and not extra_in_splits:
        print("No file differences found!")

    if size_mismatches:
        print("\nFiles with different sizes:")
        for filename, orig_size, split_size in size_mismatches:
            print(f"- {filename}")
            print(f"  Original size: {orig_size:,} bytes")
            print(f"  Split size: {split_size:,} bytes")

if __name__ == "__main__":
    # Set your paths
    original_folder = r"F:\Proposal_merge_20344_orig"
    # The drive root needs its trailing separator: joining a bare "F:" with a
    # name yields the drive-relative path "F:name" on Windows, which resolves
    # against the current directory of that drive, not its root.
    split_base_folder = "F:\\"
    split_base_name = "Proposal_merge_20352_split"

    compare_folders(original_folder, split_base_folder, split_base_name)

Gathering files information...
Original folder contains 20,336 files
Split folders contain 20,335 files

Comparison Results:
--------------------------------------------------

Files present in original but missing from splits:
- Hirschmann_IT_USD_Q4-2021 (Regional).pdf
  Location: F:\Proposal_merge_20344_orig\Hirschmann_IT_USD_Q4-2021 (Regional).pdf
  Size: 192,826 bytes


### Split the DOC files into many smaller folders, then convert them to PDF

In [None]:
import os
import shutil
import subprocess
from pathlib import Path
import time
from datetime import datetime

class ConversionManager:
    """
    Split a folder of DOC/DOCX files into batches, convert each batch to PDF
    by invoking LibreOffice (``soffice``), and gather the PDFs in one folder.
    """

    # File-name endings treated as Word documents; compared case-insensitively.
    DOC_SUFFIXES = ('.doc', '.docx')

    def __init__(self, source_folder, temp_split_base, final_output_folder, num_splits=20):
        """
        Args:
            source_folder (str or Path): Folder containing the DOC/DOCX files.
            temp_split_base (str or Path): Folder under which the temporary
                ``split_<n>`` and ``split_<n>_pdf`` folders are created.
            final_output_folder (str or Path): Folder that receives all PDFs.
            num_splits (int): Number of batches to split the documents into.
        """
        self.source_folder = Path(source_folder)
        self.temp_split_base = Path(temp_split_base)
        self.final_output_folder = Path(final_output_folder)
        self.num_splits = num_splits
        self.start_time = None

    def _list_documents(self, folder):
        """
        Return the sorted DOC/DOCX files directly inside ``folder``.

        Each file is listed exactly once whatever the case of its extension.
        Globbing separate lower- and upper-case patterns would list every file
        twice on a case-insensitive filesystem and miss mixed-case extensions
        on a case-sensitive one. A missing folder yields an empty list.
        """
        folder = Path(folder)
        if not folder.is_dir():
            return []
        return sorted(
            entry for entry in folder.iterdir()
            if entry.is_file() and entry.name.lower().endswith(self.DOC_SUFFIXES)
        )

    def create_split_folders(self):
        """Create the temporary ``split_<n>`` folders and return their paths."""
        split_folders = []
        for i in range(self.num_splits):
            folder_path = self.temp_split_base / f"split_{i + 1}"
            folder_path.mkdir(parents=True, exist_ok=True)
            split_folders.append(folder_path)
        return split_folders

    def split_documents(self):
        """
        Copy the source documents into the split folders as evenly as possible.

        Documents are taken in sorted order; when the count does not divide
        evenly, the first folders each receive one extra document.

        Returns:
            list: Paths of the split folders.

        Raises:
            ValueError: If the source folder contains no DOC/DOCX files.
        """
        print("\n=== Splitting Documents ===")

        doc_files = self._list_documents(self.source_folder)
        total_files = len(doc_files)

        if total_files == 0:
            raise ValueError("No DOC/DOCX files found in source folder")

        print(f"Found {total_files} documents to process")

        # Create split folders
        split_folders = self.create_split_folders()

        # Calculate distribution
        base_files_per_folder, remainder = divmod(total_files, self.num_splits)

        # Distribute files: each folder gets the next consecutive slice
        current_index = 0
        for folder_index, folder in enumerate(split_folders):
            files_for_folder = base_files_per_folder + (1 if folder_index < remainder else 0)

            print(f"\nPopulating split folder {folder_index + 1}/{self.num_splits}")
            print(f"Copying {files_for_folder} files...")

            for source_file in doc_files[current_index:current_index + files_for_folder]:
                shutil.copy2(source_file, folder / source_file.name)
                print(f"Copied: {source_file.name}")
            current_index += files_for_folder

        return split_folders

    def convert_to_pdf(self, input_folder):
        """
        Convert every DOC/DOCX file of one split folder to PDF.

        The PDFs are written to a sibling folder named ``<input_folder>_pdf``.
        A conversion counts as successful when the ``soffice`` process exits
        with status 0; any exception marks the document as failed.

        Args:
            input_folder (Path): Split folder holding the documents.

        Returns:
            tuple: ``(output_folder, successful_count, failed_file_names)``.
        """
        # Create temporary output folder for this split
        output_folder = input_folder.parent / f"{input_folder.name}_pdf"
        output_folder.mkdir(exist_ok=True)

        doc_files = self._list_documents(input_folder)

        total_files = len(doc_files)
        successful = 0
        failed = []

        print(f"\nConverting {total_files} files in {input_folder.name}")

        # Use LibreOffice for conversion
        if os.name == 'nt':  # Windows
            soffice_path = r"C:\Program Files\LibreOffice\program\soffice.exe"
        else:  # Linux/Mac
            soffice_path = 'soffice'

        for idx, doc_file in enumerate(doc_files, 1):
            try:
                print(f"Converting {idx}/{total_files}: {doc_file.name}")

                # Arguments are passed as a list, so no shell quoting is involved.
                subprocess.check_call([
                    soffice_path,
                    '--headless',
                    '--convert-to', 'pdf',
                    '--outdir', str(output_folder),
                    str(doc_file)
                ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

                successful += 1
                print(f"✓ Success: {doc_file.name}")

            except Exception as e:
                failed.append(doc_file.name)
                print(f"✗ Failed: {doc_file.name} - {str(e)}")

        return output_folder, successful, failed

    def merge_pdfs(self, pdf_folders):
        """
        Copy all PDFs from the per-split output folders into the final folder.

        PDFs that share a file name overwrite one another in the final folder;
        the last one copied wins.

        Args:
            pdf_folders (list): Folders containing the converted PDFs.

        Returns:
            tuple: ``(copied_count, failed_file_names)``.
        """
        print("\n=== Merging PDFs ===")
        self.final_output_folder.mkdir(parents=True, exist_ok=True)

        total_copied = 0
        failed_copies = []

        for folder in pdf_folders:
            for pdf_file in folder.glob('*.pdf'):
                try:
                    destination = self.final_output_folder / pdf_file.name
                    shutil.copy2(pdf_file, destination)
                    total_copied += 1
                    print(f"Copied: {pdf_file.name}")
                except Exception as e:
                    failed_copies.append(pdf_file.name)
                    print(f"Failed to copy {pdf_file.name}: {str(e)}")

        return total_copied, failed_copies

    def cleanup_temp_folders(self, split_folders, pdf_folders):
        """Delete the temporary split and PDF folders, reporting any failure."""
        print("\n=== Cleaning up temporary folders ===")
        for folder in split_folders + pdf_folders:
            try:
                shutil.rmtree(folder)
                print(f"Removed: {folder}")
            except Exception as e:
                print(f"Failed to remove {folder}: {str(e)}")

    def process(self):
        """
        Run the whole pipeline: split, convert, merge, clean up, and report.

        Raises:
            Exception: Any error outside the per-file handlers is printed as a
                critical error and re-raised.
        """
        self.start_time = datetime.now()
        print(f"Starting process at: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")

        try:
            # Split documents
            split_folders = self.split_documents()

            # Convert each split folder
            pdf_folders = []
            total_successful = 0
            all_failed = []

            for idx, folder in enumerate(split_folders, 1):
                print(f"\n=== Processing Split {idx}/{self.num_splits} ===")
                pdf_folder, successful, failed = self.convert_to_pdf(folder)
                pdf_folders.append(pdf_folder)
                total_successful += successful
                all_failed.extend(failed)

            # Merge PDFs
            total_merged, failed_merges = self.merge_pdfs(pdf_folders)

            # Cleanup
            self.cleanup_temp_folders(split_folders, pdf_folders)

            # Final report
            end_time = datetime.now()
            duration = end_time - self.start_time

            print("\n" + "=" * 50)
            print("FINAL REPORT")
            print("=" * 50)
            print(f"Process completed at: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"Total duration: {duration}")
            print(f"\nConversion Results:")
            print(f"- Successfully converted: {total_successful}")
            print(f"- Failed conversions: {len(all_failed)}")
            print(f"- Total merged to final folder: {total_merged}")

            if all_failed:
                print("\nFailed conversions:")
                for file in all_failed:
                    print(f"- {file}")

            if failed_merges:
                print("\nFailed merges:")
                for file in failed_merges:
                    print(f"- {file}")

        except Exception as e:
            print(f"\nCritical error: {str(e)}")
            raise

if __name__ == "__main__":
    try:
        # Folder holding the DOC files to convert
        source = r"D:\fd_doc_Prop_1885_split3"
        # Working area for the temporary split folders
        temp_base = r"D:\temp_conversion"
        # Destination that collects every converted PDF
        final_output = r"D:\fd_doc_Prop_1885_split3_to_pdf"

        # Build the converter (20 splits) and run the whole pipeline
        converter = ConversionManager(source, temp_base, final_output, 20)
        converter.process()

    except Exception as e:
        print(f"Error: {e}")