### Filter PDF

In [9]:
import os
import shutil
from pathlib import Path

def count_pdf_files(directory):
    """Return how many entries directly inside *directory* end in ``.pdf``.

    The extension comparison is case-insensitive; sub-directories are not
    descended into.
    """
    total = 0
    for entry in os.listdir(directory):
        if entry.lower().endswith('.pdf'):
            total += 1
    return total

def move_excluded_pdfs(source_dir, destination_dir, excluded_substrings):
    """
    Move PDF files whose names contain an excluded substring out of a directory.

    Only the top level of ``source_dir`` is scanned. The ``.pdf`` extension is
    matched case-insensitively; the substring test is a plain, case-sensitive
    ``substr in filename``. When a file of the same name already exists in
    ``destination_dir`` the moved file receives a numeric suffix
    (``name_1.pdf``, ``name_2.pdf``, ...) so that nothing already in the
    destination is replaced. Progress and a summary are printed; nothing is
    returned.

    Args:
        source_dir (str): Source directory containing PDF files
        destination_dir (str): Destination directory for excluded PDF files
        excluded_substrings (list): List of substrings to check in filenames
    """
    # Create destination directory if it doesn't exist
    Path(destination_dir).mkdir(parents=True, exist_ok=True)

    # Count initial PDF files
    initial_count = count_pdf_files(source_dir)
    print(f"Initial PDF count in source directory: {initial_count}")

    moved_count = 0

    # os.listdir returns a snapshot list, so moving files while looping is safe.
    for filename in os.listdir(source_dir):
        if not filename.lower().endswith('.pdf'):
            continue
        if not any(substr in filename for substr in excluded_substrings):
            continue

        source_path = os.path.join(source_dir, filename)

        # Pick a name that is free in the destination. shutil.move would
        # otherwise replace an existing file on POSIX (and fail on Windows).
        base, ext = os.path.splitext(filename)
        dest_filename = filename
        counter = 1
        while os.path.exists(os.path.join(destination_dir, dest_filename)):
            dest_filename = f"{base}_{counter}{ext}"
            counter += 1
        dest_path = os.path.join(destination_dir, dest_filename)

        try:
            shutil.move(source_path, dest_path)
        except OSError as e:
            # shutil.Error is an OSError subclass, so this covers every
            # failure shutil.move reports; keep going with the next file.
            print(f"Error moving {filename}: {str(e)}")
        else:
            if dest_filename == filename:
                print(f"Moved: {filename}")
            else:
                print(f"Moved: {filename} -> {dest_filename}")
            moved_count += 1

    # Count remaining PDF files
    remaining_count = count_pdf_files(source_dir)
    print("\nOperation completed:")
    print(f"- Initially: {initial_count} PDF files")
    print(f"- Moved: {moved_count} files")
    print(f"- Remaining in source directory: {remaining_count} PDF files")

# Example usage: runs the filter once with the folders and name fragments
# hard-coded below.
if __name__ == "__main__":
    # Hard-coded Windows paths for this particular run; edit before reuse.
    source_directory = r"D:\all\cp_CT_Procur\to_pdf_fd_doc_Procur_2435"
    destination_directory = r"D:\all\cp_CT_Procur\rm_Procur_2435"
    
    # Filename fragments that mark a PDF to be moved out of the source folder.
    # The match is a plain, case-sensitive `substr in filename` test, which is
    # why some fragments are listed in several capitalizations.
    # A few entries occur more than once ('phieu bao hanh', 'phieu kiem hang',
    # 'De nghi thanh toan', 'FedEX AWB and', 'daily report Vector', 'CO by');
    # the repeats have no effect on the result.
    # Leading/trailing spaces inside an entry are part of the match.
    excluded_substrings = ['F32', 'F60', 'phieu bao hanh', 'livery note', 'DNTT', 'ĐNTT',
        'Shipping mark', 'Shipping Mark', 'shipping mark', 'SHIPPING MARK',
        'De nghi thanh toan', 'thong bao giao hang', 'Thông báo giao hàng',
        'phieu kiem hang', 'kiểm hàng', 'cu di cong tac', 'bao hanh',
        'thong bao hang den', 'Thông báo hàng đến', 'Taxi fee', 'Timesheet',
        'Yeu cau boi thuong', 'Yeu cau mua bao hiem', 'BBGH', 'Bien ban ban giao', 'Biên bản giao hàng',
        'phieu kiem hang', 'TBC', 'Certificate', 'CO_CQ', 'CO, CQ', 'CO-CQ', 'phieu bao hanh',
        'job completion', 'Scan_', 'TBHD', 'Vendor daily report', 'WaybillDoc', 'TVL_', 'BBBG',
        'Hóa đơn', 'Hoa don', 'Invoice', 'MA VACH', 'De nghi thanh toan',
        'Payment request', '01GTKT0_', 'Bang doi chieu cong no ', 'ToKhai',
        'Packing List', 'PO Receipt Confirm', 'Order Acknowledgement for',
        'FedEX AWB and', 'Cước ', 'AWB#', 'AWB #', 'FedEX AWB and', 'Letter of confirmation',
        'STFFD-P1-', 'PL_PO', 'Đề nghị thanh toán', 'CQ by', 'THONG BAO CUOC',
        'CO Chamber ', 'CO by Chamber ', 'Test Report ', 'TransportLabel_',
        'Vector InfoTech (VN) MC ', 'Phiếu bảo hành', 'Supplier confirmation for the use',
        'COO-VIVN-', 'Raw material Test Report_', 'Repair agreement No', 
        'Thong tin hang', 'F30 Supplier Monitoring & Review Report', 'thông tin hàng',
        'VECTOR INFOTECH-F1-', 'VECTOR INFOTECH-F2-', 'VECTOR- VOL', 'VIVN Drafter Claim ',
        'Vu Dinh Toan', 'Electronic ticket receipt', 'VECTOR-VOL', 'daily report Vector',
        'COO Chamber', 'COO by', 'COO 23', 'COO & ', 'COO-VIVN_MO', 'Raw material Test Report',
        'TNT AWB and CI&PL', 'To khai', 'Vector Infotech Vietnam 22A7021v-A',
        'DN VECTOR 21', 'DN VECTOR 22', 'DN VECTOR 23', 'DN VECTOR 24', 'daily report Vector',
        'Nguyen Van Hiep', 'Nguyen Tuan Anh', 'Nguyen Thuong Thuong', 'Nguyen Thanh Tung',
        'Nguyen Thai Duy', 'Ngo Nhan Tam', 'Ma Bao Nguyen', 'Le Xuan Hien',
        'Chamberized CO', 'Daily report_MAINTENANCE SERVICE', 'Tran Hao Hiep',
        'bang ke chi tiet', 'STFFD-Change Order-Variation Order Request-Rev1', 'SGNIR0023',
        'SGNIR0022', 'SGNIR0021', 'SGNIR0020', 'SGNIR0019', 'SGNIR0018', 'SGNIR0017', 'SGNIR0016',
        'SGNIR0015', 'SGNIR0013', 'Product test report_', 'order confirmation 20', 'COO_CQ', 'COO_COC',
        'Confirmation Letter', 'CO issued by ', 'Biên bản nghiệm thu', 'Biên bản Kiểm tra Hàng hóa',
        'AWB 77', '08-195353-06-PO-VXM-VECTOR-HQC- signed PO_Page', 'Work complet', ' HHNK 00',
        'HHNK + TBTP', ' VCND ', 'ihoadon.vn_031', 'Huynh Phuc Tho', 'EIR - CT190', ' VECTOR VIETNAM dated ',
        'DN190', 'DN - CT19', 'DHL AWB and CI', 'Delivery Ticket', 'Delivery Ninh Binh FertilizerToxic Gas  Detector',
        'CO by', 'CI for PO 44 ', 'Chung nhan xuat xu cap boi', 'Vo Tram Anh-', 'Vector Infotech Vietnam 22A7021v - CI Rev.3'
        ]
    
    move_excluded_pdfs(source_directory, destination_directory, excluded_substrings)

Initial PDF count in source directory: 2151
Moved: 2023 VCND PC00072.pdf
Moved: 24-0002 BBBG_Vector Infotech-Cisco Router 2911.pdf
Moved: 281022 - Payment request.pdf
Moved: 7. Exhibit V - Work Completion Certificate.pdf
Moved: Attachment No. 4 - Timesheet Form.pdf
Moved: BBBG   Hanwha Life OCt 24 VAT.pdf
Moved: BBBG   Hanwha Life Rev on Sep 25 2014.pdf
Moved: BBBG   Hanwha Life Rev on VISG.pdf
Moved: BBBG hàng gửi dán tem PCCC.pdf
Moved: BBBG NT  Hanwha Life Rev on Oct 14 2014.pdf
Moved: BELDEN Confirmation Letter.pdf
Moved: Biên bản Kiểm tra Hàng hóa Eltron for Khanh Gia.pdf
Moved: Biên bản Kiểm tra Hàng hóa F5 for Hanwha Life.pdf
Moved: Biên bản Kiểm tra Hàng hóa Hirschmann for ABB.pdf
Moved: Biên bản Kiểm tra Hàng hóa Hirschmann for Galaxy.pdf
Moved: Biên bản Kiểm tra Hàng hóa Motorola  for Bien Dong.pdf
Moved: Biên bản Kiểm tra Hàng hóa Nội bộ.pdf
Moved: Biên bản Kiểm tra Hàng hóa Televes for Bien Dong.pdf
Moved: Biên bản Kiểm tra Hàng hóa Vtech for Khanh Gia.pdf
Moved: Biên bản n

In [10]:
import os
import shutil
from pathlib import Path

def count_doc_files(directory):
    """Return how many entries directly inside *directory* end in ``.pdf``.

    Despite the function's name, this variant counts PDF files. The extension
    comparison is case-insensitive and sub-directories are not descended into.
    """
    pdf_names = filter(lambda name: name.lower().endswith('.pdf'), os.listdir(directory))
    return sum(1 for _ in pdf_names)

def should_copy_file(filename, excluded_substrings, *, case_sensitive=True):
    """
    Decide from its name whether a file should be copied.

    Args:
        filename (str): File name to test
        excluded_substrings (list): Substrings that disqualify a file
        case_sensitive (bool): Keyword-only. True (the default) keeps the
            exact, case-sensitive substring test. False compares case-folded
            text, so one entry such as 'Shipping mark' also catches
            'SHIPPING MARK' and 'shipping Mark'.

    Returns:
        bool: True if no excluded substring occurs in ``filename``
        (an empty ``excluded_substrings`` therefore always yields True).
    """
    if case_sensitive:
        return not any(substr in filename for substr in excluded_substrings)

    # Fold the name once; fold each pattern as it is tested.
    folded_name = filename.casefold()
    return not any(substr.casefold() in folded_name for substr in excluded_substrings)

def copy_included_docs(directory_a, source_dir, excluded_substrings):
    """
    Recursively copy PDF files from a directory tree into one flat folder.

    Every file under ``directory_a`` whose name ends in ``.pdf``
    (case-insensitive) is copied into ``source_dir`` unless
    ``should_copy_file`` rejects its name. If the destination already holds a
    file of that name, ``_1``, ``_2``, ... is appended to the stem. The
    destination folder itself is never scanned, even when it lies inside
    ``directory_a``, so copies are not picked up and copied again. Progress
    and a summary are printed; nothing is returned.

    Args:
        directory_a (str): Root directory to search for PDF files
        source_dir (str): Destination directory for the copied files
            (the parameter name is historical)
        excluded_substrings (list): List of substrings to check in filenames
    """
    # Create the destination directory if it doesn't exist
    Path(source_dir).mkdir(parents=True, exist_ok=True)

    # Normalised destination path, used to keep os.walk out of that folder.
    dest_key = os.path.normcase(os.path.abspath(source_dir))

    copied_count = 0
    skipped_count = 0

    print(f"Starting to search for PDF files in: {directory_a}")

    for root, dirs, files in os.walk(directory_a):
        # Prune in place: with the default top-down walk, os.walk honours
        # changes to `dirs` and will not descend into removed entries.
        dirs[:] = [
            d for d in dirs
            if os.path.normcase(os.path.abspath(os.path.join(root, d))) != dest_key
        ]
        if os.path.normcase(os.path.abspath(root)) == dest_key:
            # directory_a is the destination itself; skip its own files.
            continue

        for filename in files:
            if not filename.lower().endswith('.pdf'):
                continue
            if not should_copy_file(filename, excluded_substrings):
                print(f"Skipped (excluded): {filename}")
                skipped_count += 1
                continue

            source_path = os.path.join(root, filename)

            # Generate a unique name if the file already exists
            base, ext = os.path.splitext(filename)
            dest_filename = filename
            counter = 1
            while os.path.exists(os.path.join(source_dir, dest_filename)):
                dest_filename = f"{base}_{counter}{ext}"
                counter += 1
            dest_path = os.path.join(source_dir, dest_filename)

            try:
                shutil.copy2(source_path, dest_path)
            except OSError as e:
                # Report and carry on; one unreadable file should not stop the run.
                print(f"Error copying {filename}: {str(e)}")
            else:
                print(f"Copied: {filename} -> {dest_filename}")
                copied_count += 1

    # Count files in destination directory
    final_count = count_doc_files(source_dir)
    print("\nOperation completed:")
    # "processed" counts copied + skipped; files whose copy failed are not included.
    print(f"- Files processed: {copied_count + skipped_count}")
    print(f"- Files copied: {copied_count}")
    print(f"- Files skipped (excluded): {skipped_count}")
    print(f"- Total PDF files in destination: {final_count}")

# Example usage: copies the wanted PDFs of one contract year into a flat folder.
if __name__ == "__main__":
    # Hard-coded Windows paths for this particular run; edit before reuse.
    directory_a = r"D:\Vector Cloud\Procurement\Procurement\2. Contract\2018"  # Directory to search
    source_directory = r"D:\all\cp_CT_Procur\CT_in_Procur_2019_to_2024\CT_in_Procur_18"  # Where to copy files
    
    # Filename fragments that cause a file to be skipped instead of copied.
    # The match is a plain, case-sensitive `substr in filename` test, which is
    # why some fragments are listed in several capitalizations.
    # A few entries occur more than once ('phieu bao hanh', 'phieu kiem hang',
    # 'De nghi thanh toan', 'FedEX AWB and', 'daily report Vector', 'CO by');
    # the repeats have no effect on the result.
    # Leading/trailing spaces inside an entry are part of the match.
    excluded_substrings = ['F32', 'F60', 'phieu bao hanh', 'livery note', 'DNTT', 'ĐNTT',
        'Shipping mark', 'Shipping Mark', 'shipping mark', 'SHIPPING MARK',
        'De nghi thanh toan', 'thong bao giao hang', 'Thông báo giao hàng',
        'phieu kiem hang', 'kiểm hàng', 'cu di cong tac', 'bao hanh',
        'thong bao hang den', 'Thông báo hàng đến', 'Taxi fee', 'Timesheet',
        'Yeu cau boi thuong', 'Yeu cau mua bao hiem', 'BBGH', 'Bien ban ban giao', 'Biên bản giao hàng',
        'phieu kiem hang', 'TBC', 'Certificate', 'CO_CQ', 'CO, CQ', 'CO-CQ', 'phieu bao hanh',
        'job completion', 'Scan_', 'TBHD', 'Vendor daily report', 'WaybillDoc', 'TVL_', 'BBBG',
        'Hóa đơn', 'Hoa don', 'Invoice', 'MA VACH', 'De nghi thanh toan',
        'Payment request', '01GTKT0_', 'Bang doi chieu cong no ', 'ToKhai',
        'Packing List', 'PO Receipt Confirm', 'Order Acknowledgement for',
        'FedEX AWB and', 'Cước ', 'AWB#', 'AWB #', 'FedEX AWB and', 'Letter of confirmation',
        'STFFD-P1-', 'PL_PO', 'Đề nghị thanh toán', 'CQ by', 'THONG BAO CUOC',
        'CO Chamber ', 'CO by Chamber ', 'Test Report ', 'TransportLabel_',
        'Vector InfoTech (VN) MC ', 'Phiếu bảo hành', 'Supplier confirmation for the use',
        'COO-VIVN-', 'Raw material Test Report_', 'Repair agreement No', 
        'Thong tin hang', 'F30 Supplier Monitoring & Review Report', 'thông tin hàng',
        'VECTOR INFOTECH-F1-', 'VECTOR INFOTECH-F2-', 'VECTOR- VOL', 'VIVN Drafter Claim ',
        'Vu Dinh Toan', 'Electronic ticket receipt', 'VECTOR-VOL', 'daily report Vector',
        'COO Chamber', 'COO by', 'COO 23', 'COO & ', 'COO-VIVN_MO', 'Raw material Test Report',
        'TNT AWB and CI&PL', 'To khai', 'Vector Infotech Vietnam 22A7021v-A',
        'DN VECTOR 21', 'DN VECTOR 22', 'DN VECTOR 23', 'DN VECTOR 24', 'daily report Vector',
        'Nguyen Van Hiep', 'Nguyen Tuan Anh', 'Nguyen Thuong Thuong', 'Nguyen Thanh Tung',
        'Nguyen Thai Duy', 'Ngo Nhan Tam', 'Ma Bao Nguyen', 'Le Xuan Hien',
        'Chamberized CO', 'Daily report_MAINTENANCE SERVICE', 'Tran Hao Hiep',
        'bang ke chi tiet', 'STFFD-Change Order-Variation Order Request-Rev1', 'SGNIR0023',
        'SGNIR0022', 'SGNIR0021', 'SGNIR0020', 'SGNIR0019', 'SGNIR0018', 'SGNIR0017', 'SGNIR0016',
        'SGNIR0015', 'SGNIR0013', 'Product test report_', 'order confirmation 20', 'COO_CQ', 'COO_COC',
        'Confirmation Letter', 'CO issued by ', 'Biên bản nghiệm thu', 'Biên bản Kiểm tra Hàng hóa',
        'AWB 77', '08-195353-06-PO-VXM-VECTOR-HQC- signed PO_Page', 'Work complet', ' HHNK 00',
        'HHNK + TBTP', ' VCND ', 'ihoadon.vn_031', 'Huynh Phuc Tho', 'EIR - CT190', ' VECTOR VIETNAM dated ',
        'DN190', 'DN - CT19', 'DHL AWB and CI', 'Delivery Ticket', 'Delivery Ninh Binh FertilizerToxic Gas  Detector',
        'CO by', 'CI for PO 44 ', 'Chung nhan xuat xu cap boi', 'Vo Tram Anh-', 'Vector Infotech Vietnam 22A7021v - CI Rev.3'
        ]
    
    copy_included_docs(directory_a, source_directory, excluded_substrings)

Starting to search for DOC files in: D:\Vector Cloud\Procurement\Procurement\2. Contract\2018
Copied: inv 121 - Gas Vector.pdf -> inv 121 - Gas Vector.pdf
Copied: inv 127 - gas vector.pdf -> inv 127 - gas vector.pdf
Copied: Inv 184.pdf -> Inv 184.pdf
Copied: PO-0021-18-VI12-Signed PO-Revised.pdf -> PO-0021-18-VI12-Signed PO-Revised.pdf
Skipped (excluded): CO, CQ 2424.pdf
Copied: CV190305 - de xuat doi model item.pdf -> CV190305 - de xuat doi model item.pdf
Skipped (excluded): CW181042-01 - Phieu bao hanh.pdf
Skipped (excluded): CW181042-02 - Phieu bao hanh.pdf
Copied: Thu xac nhan.pdf -> Thu xac nhan.pdf
Copied: CPO - Vector.pdf -> CPO - Vector.pdf
Copied: DN1810042-01 - signed DN vs VAT179.pdf -> DN1810042-01 - signed DN vs VAT179.pdf
Copied: PO Bien Dong BD-OPS-2018-098C.pdf -> PO Bien Dong BD-OPS-2018-098C.pdf
Copied: CQ, CO.pdf -> CQ, CO.pdf
Copied: CT1810042 - Documents.pdf -> CT1810042 - Documents.pdf
Copied: smk.pdf -> smk.pdf
Copied: 40001675PFInv-R1.pdf -> 40001675PFInv-R1.pdf

### Filter and copy docs from Procurement

In [16]:
import os
import shutil
from pathlib import Path

def count_doc_files(directory):
    """Return how many entries directly inside *directory* end in ``.doc`` or ``.docx``.

    The extension comparison is case-insensitive; sub-directories are not
    descended into.
    """
    word_suffixes = ('.doc', '.docx')
    total = 0
    for entry in os.listdir(directory):
        if entry.lower().endswith(word_suffixes):
            total += 1
    return total

def should_copy_file(filename, excluded_substrings, *, case_sensitive=True):
    """
    Decide from its name whether a file should be copied.

    Args:
        filename (str): File name to test
        excluded_substrings (list): Substrings that disqualify a file
        case_sensitive (bool): Keyword-only. True (the default) keeps the
            exact, case-sensitive substring test. False compares case-folded
            text, so an entry such as 'phieu kiem hang' also catches
            'Phieu kiem hang'.

    Returns:
        bool: True if no excluded substring occurs in ``filename``
        (an empty ``excluded_substrings`` therefore always yields True).
    """
    if case_sensitive:
        return not any(substr in filename for substr in excluded_substrings)

    # Fold the name once; fold each pattern as it is tested.
    folded_name = filename.casefold()
    return not any(substr.casefold() in folded_name for substr in excluded_substrings)

def copy_included_docs(directory_a, source_dir, excluded_substrings):
    """
    Recursively copy DOC/DOCX files from a directory tree into one flat folder.

    Every file under ``directory_a`` whose name ends in ``.doc`` or ``.docx``
    (case-insensitive) is copied into ``source_dir`` unless
    ``should_copy_file`` rejects its name. If the destination already holds a
    file of that name, ``_1``, ``_2``, ... is appended to the stem. The
    destination folder itself is never scanned, even when it lies inside
    ``directory_a``, so copies are not picked up and copied again. Progress
    and a summary are printed; nothing is returned.

    Args:
        directory_a (str): Root directory to search for DOC files
        source_dir (str): Destination directory for the copied files
            (the parameter name is historical)
        excluded_substrings (list): List of substrings to check in filenames
    """
    # Create the destination directory if it doesn't exist
    Path(source_dir).mkdir(parents=True, exist_ok=True)

    # Normalised destination path, used to keep os.walk out of that folder.
    dest_key = os.path.normcase(os.path.abspath(source_dir))

    copied_count = 0
    skipped_count = 0

    print(f"Starting to search for DOC files in: {directory_a}")

    for root, dirs, files in os.walk(directory_a):
        # Prune in place: with the default top-down walk, os.walk honours
        # changes to `dirs` and will not descend into removed entries.
        dirs[:] = [
            d for d in dirs
            if os.path.normcase(os.path.abspath(os.path.join(root, d))) != dest_key
        ]
        if os.path.normcase(os.path.abspath(root)) == dest_key:
            # directory_a is the destination itself; skip its own files.
            continue

        for filename in files:
            if not filename.lower().endswith(('.doc', '.docx')):
                continue
            if not should_copy_file(filename, excluded_substrings):
                print(f"Skipped (excluded): {filename}")
                skipped_count += 1
                continue

            source_path = os.path.join(root, filename)

            # Generate a unique name if the file already exists
            base, ext = os.path.splitext(filename)
            dest_filename = filename
            counter = 1
            while os.path.exists(os.path.join(source_dir, dest_filename)):
                dest_filename = f"{base}_{counter}{ext}"
                counter += 1
            dest_path = os.path.join(source_dir, dest_filename)

            try:
                shutil.copy2(source_path, dest_path)
            except OSError as e:
                # Report and carry on; one unreadable file should not stop the run.
                print(f"Error copying {filename}: {str(e)}")
            else:
                print(f"Copied: {filename} -> {dest_filename}")
                copied_count += 1

    # Count files in destination directory
    final_count = count_doc_files(source_dir)
    print("\nOperation completed:")
    # "processed" counts copied + skipped; files whose copy failed are not included.
    print(f"- Files processed: {copied_count + skipped_count}")
    print(f"- Files copied: {copied_count}")
    print(f"- Files skipped (excluded): {skipped_count}")
    print(f"- Total DOC files in destination: {final_count}")

# Example usage: copies the wanted DOC/DOCX files of one contract year into a
# flat folder.
if __name__ == "__main__":
    # Hard-coded Windows paths for this particular run; edit before reuse.
    directory_a = r"D:\Vector Cloud\Procurement\Procurement\2. Contract\2019"  # Directory to search
    source_directory = r"D:\all\cp_CT_Procur\source_doc_Procur_2019"  # Where to copy files
    
    # Filename fragments that cause a file to be skipped instead of copied.
    # The match is a plain, case-sensitive `substr in filename` test, which is
    # why some fragments are listed in several capitalizations.
    # A few entries occur more than once ('phieu bao hanh', 'phieu kiem hang',
    # 'De nghi thanh toan', 'FedEX AWB and', 'daily report Vector', 'CO by');
    # the repeats have no effect on the result.
    # Leading/trailing spaces inside an entry are part of the match.
    excluded_substrings = ['F32', 'F60', 'phieu bao hanh', 'livery note', 'DNTT', 'ĐNTT',
        'Shipping mark', 'Shipping Mark', 'shipping mark', 'SHIPPING MARK',
        'De nghi thanh toan', 'thong bao giao hang', 'Thông báo giao hàng',
        'phieu kiem hang', 'kiểm hàng', 'cu di cong tac', 'bao hanh',
        'thong bao hang den', 'Thông báo hàng đến', 'Taxi fee', 'Timesheet',
        'Yeu cau boi thuong', 'Yeu cau mua bao hiem', 'BBGH', 'Bien ban ban giao', 'Biên bản giao hàng',
        'phieu kiem hang', 'TBC', 'Certificate', 'CO_CQ', 'CO, CQ', 'CO-CQ', 'phieu bao hanh',
        'job completion', 'Scan_', 'TBHD', 'Vendor daily report', 'WaybillDoc', 'TVL_', 'BBBG',
        'Hóa đơn', 'Hoa don', 'Invoice', 'MA VACH', 'De nghi thanh toan',
        'Payment request', '01GTKT0_', 'Bang doi chieu cong no ', 'ToKhai',
        'Packing List', 'PO Receipt Confirm', 'Order Acknowledgement for',
        'FedEX AWB and', 'Cước ', 'AWB#', 'AWB #', 'FedEX AWB and', 'Letter of confirmation',
        'STFFD-P1-', 'PL_PO', 'Đề nghị thanh toán', 'CQ by', 'THONG BAO CUOC',
        'CO Chamber ', 'CO by Chamber ', 'Test Report ', 'TransportLabel_',
        'Vector InfoTech (VN) MC ', 'Phiếu bảo hành', 'Supplier confirmation for the use',
        'COO-VIVN-', 'Raw material Test Report_', 'Repair agreement No', 
        'Thong tin hang', 'F30 Supplier Monitoring & Review Report', 'thông tin hàng',
        'VECTOR INFOTECH-F1-', 'VECTOR INFOTECH-F2-', 'VECTOR- VOL', 'VIVN Drafter Claim ',
        'Vu Dinh Toan', 'Electronic ticket receipt', 'VECTOR-VOL', 'daily report Vector',
        'COO Chamber', 'COO by', 'COO 23', 'COO & ', 'COO-VIVN_MO', 'Raw material Test Report',
        'TNT AWB and CI&PL', 'To khai', 'Vector Infotech Vietnam 22A7021v-A',
        'DN VECTOR 21', 'DN VECTOR 22', 'DN VECTOR 23', 'DN VECTOR 24', 'daily report Vector',
        'Nguyen Van Hiep', 'Nguyen Tuan Anh', 'Nguyen Thuong Thuong', 'Nguyen Thanh Tung',
        'Nguyen Thai Duy', 'Ngo Nhan Tam', 'Ma Bao Nguyen', 'Le Xuan Hien',
        'Chamberized CO', 'Daily report_MAINTENANCE SERVICE', 'Tran Hao Hiep',
        'bang ke chi tiet', 'STFFD-Change Order-Variation Order Request-Rev1', 'SGNIR0023',
        'SGNIR0022', 'SGNIR0021', 'SGNIR0020', 'SGNIR0019', 'SGNIR0018', 'SGNIR0017', 'SGNIR0016',
        'SGNIR0015', 'SGNIR0013', 'Product test report_', 'order confirmation 20', 'COO_CQ', 'COO_COC',
        'Confirmation Letter', 'CO issued by ', 'Biên bản nghiệm thu', 'Biên bản Kiểm tra Hàng hóa',
        'AWB 77', '08-195353-06-PO-VXM-VECTOR-HQC- signed PO_Page', 'Work complet', ' HHNK 00',
        'HHNK + TBTP', ' VCND ', 'ihoadon.vn_031', 'Huynh Phuc Tho', 'EIR - CT190', ' VECTOR VIETNAM dated ',
        'DN190', 'DN - CT19', 'DHL AWB and CI', 'Delivery Ticket', 'Delivery Ninh Binh FertilizerToxic Gas  Detector',
        'CO by', 'CI for PO 44 ', 'Chung nhan xuat xu cap boi', 'Vo Tram Anh-', 'Vector Infotech Vietnam 22A7021v - CI Rev.3'
        ]
    
    copy_included_docs(directory_a, source_directory, excluded_substrings)

Starting to search for DOC files in: D:\Vector Cloud\Procurement\Procurement\2. Contract\2019
Skipped (excluded): CW1901001 - phieu bao hanh.docx
Skipped (excluded): DN1901001 - for S5T.docx
Copied: EIR1901001 -  Phieu kiem hang.docx -> EIR1901001 -  Phieu kiem hang.docx
Copied: PO1901001 - FiberTek for S5T.docx -> PO1901001 - FiberTek for S5T.docx
Skipped (excluded): PO1901001 - Thong tin hang.docx
Skipped (excluded): PO1901001 - Thong tin hang.docx
Skipped (excluded): CW1901002 - phieu bao hanh.docx
Skipped (excluded): CW1901002 - phieu bao hanh.docx
Skipped (excluded): DN1901002 - Delivery note.docx
Skipped (excluded): CW1901004 - phieu bao hanh.docx
Skipped (excluded): DN1901004 - Delivery note.docx
Skipped (excluded): DN1901005 - Delivery note.docx
Copied: PO1901005 - FiberTek for TB Engineer.docx -> PO1901005 - FiberTek for TB Engineer.docx
Skipped (excluded): PO1901005 - Thong tin hang.docx
Skipped (excluded): DN1901005 - Delivery note.docx
Skipped (excluded): PO1901005 - Thong 

### Merge files from subfolders to 1 folder

In [13]:
import os
import shutil
import glob
from pathlib import Path

def copy_files_to_folder(source_folders, destination_folder, *, overwrite=True):
    """
    Copy all files from multiple source folders to a destination folder.

    Each source folder is walked recursively and its internal sub-folder
    layout is reproduced under ``destination_folder``. Source folders that do
    not exist are reported and skipped. Each copy is printed as it happens.

    Args:
        source_folders (list): List of source folder paths
        destination_folder (str): Path to destination folder
        overwrite (bool): Keyword-only. True (the default) keeps the original
            behaviour: a file arriving at a path that already exists replaces
            what is there, so same-named files from different source folders
            overwrite each other. False keeps both by appending ``_1``,
            ``_2``, ... to the stem of the newcomer.

    Returns:
        tuple: (success_count, error_count, error_files) where error_files is
        a list of (source_path, error_message) pairs
    """
    # Create destination folder if it doesn't exist
    os.makedirs(destination_folder, exist_ok=True)

    success_count = 0
    error_files = []

    for source_folder in source_folders:
        if not os.path.exists(source_folder):
            print(f"Warning: Source folder '{source_folder}' does not exist. Skipping...")
            continue

        for root, _, files in os.walk(source_folder):
            if not files:
                # Directories without files are not recreated in the destination.
                continue

            # Mirror this directory's position relative to the source folder.
            rel_path = os.path.relpath(root, source_folder)
            if rel_path == ".":
                dest_dir = destination_folder
            else:
                dest_dir = os.path.join(destination_folder, rel_path)
            # Once per directory rather than once per file.
            os.makedirs(dest_dir, exist_ok=True)

            for file in files:
                source_path = os.path.join(root, file)
                dest_path = os.path.join(dest_dir, file)

                if not overwrite:
                    # Find the first free "<stem>_<n><ext>" name.
                    stem, ext = os.path.splitext(file)
                    counter = 1
                    while os.path.exists(dest_path):
                        dest_path = os.path.join(dest_dir, f"{stem}_{counter}{ext}")
                        counter += 1

                try:
                    shutil.copy2(source_path, dest_path)
                except OSError as e:
                    # shutil.Error derives from OSError; record and continue.
                    error_files.append((source_path, str(e)))
                    print(f"Error copying {source_path}: {e}")
                else:
                    success_count += 1
                    print(f"Copied: {source_path} -> {dest_path}")

    return success_count, len(error_files), error_files

def main():
    """Merge the listed per-year folders into one folder and print a report."""
    # Folder that receives the merged copy
    destination_folder = r"D:\all\cp_CT_Procur\CT_all_14_18"

    # Folders whose contents are merged
    source_folders = [
        r"D:\all\cp_CT_Procur\CT_in_Procur_2019_to_2024\CT_in_Procur_14_332_178",
        r"D:\all\cp_CT_Procur\CT_in_Procur_2019_to_2024\CT_in_Procur_15_901_561",
        r"D:\all\cp_CT_Procur\CT_in_Procur_2019_to_2024\CT_in_Procur_16_549_198",
        r"D:\all\cp_CT_Procur\CT_in_Procur_2019_to_2024\CT_in_Procur_17_616_299",
        r"D:\all\cp_CT_Procur\CT_in_Procur_2019_to_2024\CT_in_Procur_18_600_249",
    ]

    print(f"Starting to copy files to {destination_folder}...")

    copied, failed, failures = copy_files_to_folder(source_folders, destination_folder)

    # Summary block
    summary_lines = (
        "\nCopy Operation Complete!",
        f"Successfully copied: {copied} files",
        f"Failed to copy: {failed} files",
    )
    for line in summary_lines:
        print(line)

    # Nothing more to report when every copy succeeded
    if not failures:
        return

    print("\nFiles that failed to copy:")
    for failed_path, reason in failures:
        print(f"- {failed_path}: {reason}")

# Run the merge only when this cell/file is executed directly, not on import.
if __name__ == "__main__":
    main()

Starting to copy files to D:\all\cp_CT_Procur\CT_all_14_18...
Copied: D:\all\cp_CT_Procur\CT_in_Procur_2019_to_2024\CT_in_Procur_14_332_178\055-Vector.pdf -> D:\all\cp_CT_Procur\CT_all_14_18\055-Vector.pdf
Copied: D:\all\cp_CT_Procur\CT_in_Procur_2019_to_2024\CT_in_Procur_14_332_178\141204-VIT Cabinet NSP Quote.pdf -> D:\all\cp_CT_Procur\CT_all_14_18\141204-VIT Cabinet NSP Quote.pdf
Copied: D:\all\cp_CT_Procur\CT_in_Procur_2019_to_2024\CT_in_Procur_14_332_178\17-OB140706-Vector-Aug (Vietnam) Customs only.pdf -> D:\all\cp_CT_Procur\CT_all_14_18\17-OB140706-Vector-Aug (Vietnam) Customs only.pdf
Copied: D:\all\cp_CT_Procur\CT_in_Procur_2019_to_2024\CT_in_Procur_14_332_178\17-OB140706-Vector-Aug (Vietnam) FiNAL.pdf -> D:\all\cp_CT_Procur\CT_all_14_18\17-OB140706-Vector-Aug (Vietnam) FiNAL.pdf
Copied: D:\all\cp_CT_Procur\CT_in_Procur_2019_to_2024\CT_in_Procur_14_332_178\20140727155941128.pdf -> D:\all\cp_CT_Procur\CT_all_14_18\20140727155941128.pdf
Copied: D:\all\cp_CT_Procur\CT_in_Procur_2

In [23]:
import os
import hashlib
from pathlib import Path
from typing import Dict, List, Tuple

def get_file_hash(file_path: str, block_size: int = 65536) -> str:
    """
    Return the hexadecimal SHA-256 digest of the file at *file_path*.

    The file is consumed in chunks of *block_size* bytes, so large files are
    never held in memory in full.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(block_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def get_pdf_files(folder: str) -> Dict[str, str]:
    """
    Hash every file matching ``*.pdf`` directly inside *folder*.

    Returns a dict of {path string: hash}, where the hash comes from
    ``get_file_hash``. A file that cannot be hashed is reported on stdout
    and left out of the result.
    """
    hashes_by_path = {}

    for pdf_path in Path(folder).glob('*.pdf'):
        path_text = str(pdf_path)
        try:
            digest = get_file_hash(path_text)
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
        else:
            hashes_by_path[path_text] = digest

    return hashes_by_path

def remove_duplicates(folder_a: str, folder_b: str) -> Tuple[List[str], List[str]]:
    """
    Delete PDF files from folder A whose content also exists in folder B.

    Files are matched by the hash that ``get_pdf_files`` reports, so the
    names do not have to agree. Matching files are deleted from folder A with
    ``os.remove``; folder B is only read.

    If both arguments resolve to the same folder, nothing is deleted and a
    single error message is returned: every file would otherwise match itself
    and the whole folder would be emptied.

    Returns:
        tuple: (removed_files, errors) - the paths that were deleted and the
        error messages collected along the way.
    """
    removed_files = []
    errors = []

    # Safety guard: comparing a folder against itself would delete everything.
    real_a = os.path.normcase(os.path.realpath(folder_a))
    real_b = os.path.normcase(os.path.realpath(folder_b))
    if real_a == real_b:
        message = (
            f"Error: {folder_a} and {folder_b} are the same folder; "
            "no files were removed"
        )
        errors.append(message)
        print(message)
        return removed_files, errors

    # Get all PDF files and their hashes
    files_a = get_pdf_files(folder_a)
    files_b = get_pdf_files(folder_b)

    # Set of folder B hashes for fast membership tests
    hashes_b = set(files_b.values())

    for file_path_a, hash_a in files_a.items():
        if hash_a not in hashes_b:
            continue
        try:
            # Same content exists in folder B, remove from folder A
            os.remove(file_path_a)
        except OSError as e:
            message = f"Error removing {file_path_a}: {e}"
            errors.append(message)
            print(message)
        else:
            removed_files.append(file_path_a)
            print(f"Removed duplicate file: {file_path_a}")

    return removed_files, errors

def main():
    """Remove from folder A every PDF that also exists in folder B, then report."""
    # Folders for this run
    folder_a = r"D:\all\cp_CT_Procur\CT_Procur_merge_doc_3735 - Copy"
    folder_b = r"F:\file_can_pass_or_loi"

    def pdf_count(folder):
        """Number of ``*.pdf`` matches directly inside *folder*."""
        return sum(1 for _ in Path(folder).glob('*.pdf'))

    print("\nStarting duplicate file removal process...")
    print(f"Checking files in {folder_a} against {folder_b}")

    # Both folders must exist before anything is touched
    if not (os.path.exists(folder_a) and os.path.exists(folder_b)):
        print("Error: One or both folders do not exist!")
        return

    # Counts before the clean-up
    initial_count_a = pdf_count(folder_a)
    count_b = pdf_count(folder_b)

    print("\nInitial file counts:")
    print(f"Folder A: {initial_count_a} PDF files")
    print(f"Folder B: {count_b} PDF files")

    deleted, problems = remove_duplicates(folder_a, folder_b)

    # Count after the clean-up
    final_count_a = pdf_count(folder_a)

    print("\nOperation Complete!")
    print(f"Files removed: {len(deleted)}")
    print(f"Final count in Folder A: {final_count_a} PDF files")

    if problems:
        print("\nErrors encountered:")
        for problem in problems:
            print(f"- {problem}")

    if deleted:
        print("\nRemoved files:")
        for deleted_path in deleted:
            print(f"- {Path(deleted_path).name}")

# Run the duplicate removal only when this cell/file is executed directly,
# not on import. Note that main() deletes files.
if __name__ == "__main__":
    main()


Starting duplicate file removal process...
Checking files in D:\all\cp_CT_Procur\CT_Procur_merge_doc_3735 - Copy against F:\file_can_pass_or_loi

Initial file counts:
Folder A: 3735 PDF files
Folder B: 31 PDF files
Removed duplicate file: D:\all\cp_CT_Procur\CT_Procur_merge_doc_3735 - Copy\Aegex_USD_Q4-2021 (Regional) pass vit2011.pdf
Removed duplicate file: D:\all\cp_CT_Procur\CT_Procur_merge_doc_3735 - Copy\FiberTek_USD-Q122 (Regional).pdf
Removed duplicate file: D:\all\cp_CT_Procur\CT_Procur_merge_doc_3735 - Copy\HealthCert_Nguyen Cao Ky Luan.pdf
Removed duplicate file: D:\all\cp_CT_Procur\CT_Procur_merge_doc_3735 - Copy\W&T_USD_Q1-2022(Regional).pdf

Operation Complete!
Files removed: 4
Final count in Folder A: 3731 PDF files

Removed files:
- Aegex_USD_Q4-2021 (Regional) pass vit2011.pdf
- FiberTek_USD-Q122 (Regional).pdf
- HealthCert_Nguyen Cao Ky Luan.pdf
- W&T_USD_Q1-2022(Regional).pdf
