### Split into smaller folders

In [1]:
import os
import shutil

# Source folder containing the PDFs
source_folder = r"F:\CT_in_Procur_merge_11184"

# Target folders that receive the copies
target_folder_1 = r"F:\CT_in_Procur_merge_11184_split1"
target_folder_2 = r"F:\CT_in_Procur_merge_11184_split2"
target_folder_3 = r"F:\CT_in_Procur_merge_11184_split3"
target_folders = [target_folder_1, target_folder_2, target_folder_3]

# Derive the folder count from the list so the two can never drift apart
num_folders = len(target_folders)

# Create target folders if they don't exist
for folder in target_folders:
    os.makedirs(folder, exist_ok=True)

# Collect the PDFs (extension matched case-insensitively). Sorting makes the
# split reproducible, because os.listdir() returns entries in arbitrary order.
pdf_files = sorted(
    name for name in os.listdir(source_folder) if name.lower().endswith('.pdf')
)

# Total number of files
total_files = len(pdf_files)

# Base share per folder, plus the leftover that cannot be divided evenly
files_per_folder, remainder_files = divmod(total_files, num_folders)

# The first `remainder_files` folders each take one extra file
folder_distribution = [
    files_per_folder + (1 if index < remainder_files else 0)
    for index in range(num_folders)
]

# Copy consecutive slices of the sorted list into each target folder
current_index = 0
for target_folder, file_count in zip(target_folders, folder_distribution):
    for name in pdf_files[current_index:current_index + file_count]:
        shutil.copy(os.path.join(source_folder, name), os.path.join(target_folder, name))
    current_index += file_count

# Verify by counting only the PDFs in each target folder. Counting every
# directory entry would let an unrelated file (e.g. Thumbs.db) or a sub-folder
# skew the totals and trigger a false error below.
# NOTE: PDFs left in a target folder by an earlier run are still counted.
files_copied = [
    sum(1 for name in os.listdir(folder) if name.lower().endswith('.pdf'))
    for folder in target_folders
]
files_copied_1, files_copied_2, files_copied_3 = files_copied

total_copied = sum(files_copied)

print("Files have been successfully copied into three folders with distribution:")
print(f"Folder 1: {files_copied_1} files")
print(f"Folder 2: {files_copied_2} files")
print(f"Folder 3: {files_copied_3} files")
print(f"Total files copied: {total_copied}")

# Check if the total copied files equal the total files in source
if total_copied == total_files:
    print("All files have been successfully copied.")
else:
    print("There was an error in copying files.")

Files have been successfully copied into three folders with distribution:
Folder 1: 3728 files
Folder 2: 3728 files
Folder 3: 3728 files
Total files copied: 11184
All files have been successfully copied.


In [1]:
import os
import shutil

# Folder whose PDFs are to be split
source_folder = r"F:\Proposal_merge_20352"

# How many sub-folders the files are spread over
num_folders = 8

# Sibling folders named "<source>_split1" ... "<source>_split<num_folders>"
target_folders = [f"{source_folder}_split{n}" for n in range(1, num_folders + 1)]

# Make sure every destination exists before copying
for destination in target_folders:
    os.makedirs(destination, exist_ok=True)

# PDFs only (extension compared case-insensitively), in sorted order so the
# same file always lands in the same folder
pdf_files = sorted(
    entry for entry in os.listdir(source_folder) if entry.lower().endswith('.pdf')
)

# Total number of files
total_files = len(pdf_files)

# Even share per folder and the leftover after integer division
files_per_folder, remainder_files = divmod(total_files, num_folders)

# Leading folders absorb the leftover, one extra file each
folder_distribution = (
    [files_per_folder + 1] * remainder_files
    + [files_per_folder] * (num_folders - remainder_files)
)

# Walk the sorted list once, handing each folder its consecutive slice
current_index = 0
for destination, quota in zip(target_folders, folder_distribution):
    for pdf_name in pdf_files[current_index:current_index + quota]:
        shutil.copy(
            os.path.join(source_folder, pdf_name),
            os.path.join(destination, pdf_name),
        )
    current_index += quota

# Count the PDFs now present in each destination folder
files_copied = [
    len([entry for entry in os.listdir(destination) if entry.lower().endswith('.pdf')])
    for destination in target_folders
]

total_copied = sum(files_copied)

print(f"Files have been successfully copied into {num_folders} folders with distribution:")
for position, copied in enumerate(files_copied, start=1):
    print(f"Folder {position}: {copied} files")
print(f"Total files copied: {total_copied}")

# The per-folder counts must add up to the number of source PDFs
if total_copied != total_files:
    print("There was an error in copying files.")
else:
    print("All files have been successfully copied.")


Files have been successfully copied into 8 folders with distribution:
Folder 1: 2544 files
Folder 2: 2544 files
Folder 3: 2544 files
Folder 4: 2544 files
Folder 5: 2544 files
Folder 6: 2544 files
Folder 7: 2544 files
Folder 8: 2544 files
Total files copied: 20352
All files have been successfully copied.


### Check that each PDF is valid

In [2]:
import os
from PyPDF2 import PdfReader
import logging

def is_pdf_valid(file_path):
    """
    Report whether a PDF can be opened and its pages counted.

    Parameters:
        file_path (str): Location of the PDF file on disk.

    Returns:
        bool: True when the file opens and its page count can be read;
        False when any exception is raised along the way. The exception
        is logged with ``logging.error`` and never propagates.
    """
    try:
        with open(file_path, 'rb') as pdf_stream:
            # Taking len() of the page list is the actual probe; the value
            # is discarded because only success or failure matters here.
            len(PdfReader(pdf_stream).pages)
    except Exception as exc:
        logging.error(f"Error reading {file_path}: {exc}")
        return False
    else:
        return True

def check_pdfs_in_folder(folder_path):
    """
    Sort the PDFs of one folder into readable and unreadable ones.

    Only direct entries of ``folder_path`` whose name ends in ``.pdf``
    (case-insensitive) are examined; each is passed to ``is_pdf_valid``.
    A line is printed for every file that fails the check.

    Parameters:
        folder_path (str): The folder to scan.

    Returns:
        tuple: ``(valid_paths, invalid_paths)``, two lists of full file
        paths in the order ``os.listdir`` produced the entries.
    """
    readable = []
    unreadable = []

    for entry in os.listdir(folder_path):
        # Skip anything that is not named like a PDF
        if not entry.lower().endswith('.pdf'):
            continue

        full_path = os.path.join(folder_path, entry)
        if not is_pdf_valid(full_path):
            print(f"{entry} is corrupted or invalid.")
            unreadable.append(full_path)
            continue

        readable.append(full_path)

    return readable, unreadable

if __name__ == "__main__":
    # Route ERROR-level log records (such as those emitted by is_pdf_valid)
    # to the file pdf_errors.log
    logging.basicConfig(level=logging.ERROR, filename='pdf_errors.log')

    # Folder whose PDFs are checked
    # NOTE(review): the first splitting cell reads from "..._11184"; confirm
    # that "..._11188" is the intended folder here.
    folder_path = r"F:\CT_in_Procur_merge_11188"
    valid_pdfs, invalid_pdfs = check_pdfs_in_folder(folder_path)

    # Only the number of valid files is reported; their paths are not listed
    valid_total = len(valid_pdfs)
    print(f"\nValid PDFs ({valid_total}):")

    # Invalid files are reported with their count, then one path per line
    invalid_total = len(invalid_pdfs)
    print(f"\nInvalid PDFs ({invalid_total}):")
    if invalid_pdfs:
        print("\n".join(invalid_pdfs))


11122 VECTOR INT PCELL COO R1.pdf is corrupted or invalid.
11122 VECTOR INT PCELL COQ R1.pdf is corrupted or invalid.
11122 VECTOR INT PCELL R1.pdf is corrupted or invalid.
2.1. COO Chamber - Autronica (Attachment).pdf is corrupted or invalid.
6. SIL 2 Certificate of flame detector.pdf is corrupted or invalid.
7. SIL 2 Certificate of flame detector.pdf is corrupted or invalid.
Aegex_USD_Q4-2021 (Regional) pass vit2011.pdf is corrupted or invalid.
AWB_Fibertek.pdf is corrupted or invalid.
Belden COO - Spider.pdf is corrupted or invalid.
Cam kết năm sản xuất XP2i - Vector Infotech.pdf is corrupted or invalid.
ce700a_kvm_extender_catalogue.pdf is corrupted or invalid.
Chamber of Commerce   Certificate of Origin.pdf is corrupted or invalid.
CI & PL_xoá giá.pdf is corrupted or invalid.
CI_Fibertek.pdf is corrupted or invalid.
CO PTM item 4 - tubing.pdf is corrupted or invalid.
COO Chamber - Autronica (Attachment).pdf is corrupted or invalid.
CQ ProSoft.pdf is corrupted or invalid.
CQ Suppli