### Only compare doc/pdf files

In [None]:
import os
import shutil
import re

def compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path):
    """
    Collect DOC/PDF files that exist under each set2 folder but not in the
    matching set1 folder, and copy them into a fresh output tree.

    For every (set1[i], set2[i]) pair an output folder
    ``<updated_fd>/<name_A>_vs_<name_B>`` is created with four subfolders:
    ``fd_doc`` (flat copy of new DOC files), ``fd_pdf`` (flat copy of new PDF
    files), ``tree_structure`` (same files, keeping their path relative to the
    set2 folder) and ``merge_pdf`` (copy of everything in ``fd_pdf``).
    Nothing is written into the set1 or set2 folders.

    A file from set2 is skipped when:
      * its name (without extension, ignoring ``-``, ``_`` and whitespace)
        consists only of digits,
      * its name contains one of the excluded substrings (case-insensitive),
      * its lowercased name without extension already exists in
        ``<set1[i]>/doc_files`` or ``<set1[i]>/pdf_files``,
      * its lowercased name is present directly inside ``exclusion_folder``.

    Files are keyed by lowercased filename, so when the same filename occurs
    in several subfolders of a set2 folder only the last one walked is kept.

    Args:
        set1 (list): Folder paths of the first (reference) set
        set2 (list): Folder paths of the second set, paired by position with set1
        updated_fd (str): Output root. WARNING: if it already exists it is
            deleted and recreated on every valid call.
        exclusion_folder (str): Folder whose DOC/PDF filenames are excluded
        exclusive_path (str or tuple): set2 folder path(s) whose top-level
            '2. Contract' subfolder must not be scanned

    Returns:
        dict: Pair folder name -> absolute path of that pair's merge_pdf folder

    Raises:
        ValueError: If set1 and set2 have different lengths. Raised before any
            filesystem change, so an invalid call leaves updated_fd untouched.
    """
    # Validate first: the next step deletes updated_fd, which must not happen
    # for a call that is going to be rejected anyway.
    if len(set1) != len(set2):
        raise ValueError("Both sets of folders must have the same number of folders.")

    # Start from an empty output folder
    if os.path.exists(updated_fd):
        shutil.rmtree(updated_fd)
    os.makedirs(updated_fd)

    # Extensions are always matched against lowercased names
    doc_extensions = ('.doc', '.docx')
    pdf_extensions = ('.pdf',)

    # Define substrings to exclude in filenames
    excluded_substrings = ['F32', 'F60', 'phieu bao hanh', 'livery note', 'DNTT', 'ĐNTT',
            'Shipping mark', 'Shipping Mark', 'shipping mark', 'SHIPPING MARK',
            'De nghi thanh toan', 'thong bao giao hang', 'Thông báo giao hàng',
            'phieu kiem hang', 'kiểm hàng', 'cu di cong tac', 'bao hanh',
            'thong bao hang den', 'Thông báo hàng đến', 'Taxi fee', 'Timesheet',
            'Yeu cau boi thuong', 'Yeu cau mua bao hiem', 'BBGH', 'Bien ban ban giao', 'Biên bản giao hàng',
            'phieu kiem hang', 'TBC', 'Certificate', 'CO_CQ', 'CO, CQ', 'CO-CQ', 'phieu bao hanh',
            'job completion', 'Scan_', 'TBHD', 'Vendor daily report', 'WaybillDoc', 'TVL_', 'BBBG',
            'Hóa đơn', 'Hoa don', 'Invoice', 'MA VACH', 'De nghi thanh toan',
            'Payment request', '01GTKT0_', 'Bang doi chieu cong no ', 'ToKhai',
            'Packing List', 'PO Receipt Confirm', 'Order Acknowledgement for',
            'FedEX AWB and', 'Cước ', 'AWB#', 'AWB #', 'FedEX AWB and', 'Letter of confirmation',
            'STFFD-P1-', 'PL_PO', 'Đề nghị thanh toán', 'CQ by', 'THONG BAO CUOC',
            'CO Chamber ', 'CO by Chamber ', 'Test Report ', 'TransportLabel_',
            'Vector InfoTech (VN) MC ', 'Phiếu bảo hành', 'Supplier confirmation for the use',
            'COO-VIVN-', 'Raw material Test Report_', 'Repair agreement No', 
            'Thong tin hang', 'F30 Supplier Monitoring & Review Report', 'thông tin hàng',
            'VECTOR INFOTECH-F1-', 'VECTOR INFOTECH-F2-', 'VECTOR- VOL', 'VIVN Drafter Claim ',
            'Vu Dinh Toan', 'Electronic ticket receipt', 'VECTOR-VOL', 'daily report Vector',
            'COO Chamber', 'COO by', 'COO 23', 'COO & ', 'COO-VIVN_MO', 'Raw material Test Report',
            'TNT AWB and CI&PL', 'To khai', 'Vector Infotech Vietnam 22A7021v-A',
            'DN VECTOR 21', 'DN VECTOR 22', 'DN VECTOR 23', 'DN VECTOR 24', 'daily report Vector',
            'Nguyen Van Hiep', 'Nguyen Tuan Anh', 'Nguyen Thuong Thuong', 'Nguyen Thanh Tung',
            'Nguyen Thai Duy', 'Ngo Nhan Tam', 'Ma Bao Nguyen', 'Le Xuan Hien',
            'Chamberized CO', 'Daily report_MAINTENANCE SERVICE', 'Tran Hao Hiep',
            'bang ke chi tiet'  
            ]
    # Lowercase (and de-duplicate) once instead of once per scanned file
    excluded_lower = tuple(dict.fromkeys(s.lower() for s in excluded_substrings))

    # Dictionary to store merge_pdf folder paths
    merge_pdf_paths = {}

    # Ensure exclusive_path is a tuple or list
    if isinstance(exclusive_path, str):
        exclusive_path = (exclusive_path,)

    def get_filename_without_extension(filename):
        """Return the lowercased filename with its extension removed."""
        return os.path.splitext(filename.lower())[0]

    def collect_known_stems(folder_path, extensions):
        """Return the lowercased, extension-less names of matching files under folder_path."""
        stems = set()
        if os.path.exists(folder_path):
            for _, _, filenames in os.walk(folder_path):
                for filename in filenames:
                    if filename.lower().endswith(extensions):
                        stems.add(get_filename_without_extension(filename))
        return stems

    def should_exclude_file(filename):
        """Return True for digits-only names or names containing an excluded substring."""
        name_without_ext = os.path.splitext(filename)[0]

        # Digits-only check ignores spaces and the common separators '-' and '_'
        if re.sub(r'[-_\s]', '', name_without_ext).isdigit():
            return True

        lowered = filename.lower()
        return any(substr in lowered for substr in excluded_lower)

    def copy_to_destinations(src, destinations, error_prefix):
        """Copy src to every destination; report (do not raise) filesystem errors."""
        try:
            for dst in destinations:
                os.makedirs(os.path.dirname(dst), exist_ok=True)
                shutil.copy2(src, dst)
        except OSError as e:
            # NOTE(review): a recorded run of this cell printed WinError 3 here for a
            # deeply nested source file; possibly the Windows path-length limit on
            # the tree_structure destination -- confirm.
            print(f"{error_prefix} {src}: {e}")

    # Get excluded files (only the top level of exclusion_folder is listed)
    excluded_files = set()
    if os.path.exists(exclusion_folder):
        excluded_files = {f.lower() for f in os.listdir(exclusion_folder)
                          if f.lower().endswith(doc_extensions + pdf_extensions)}

    # Process each pair of folders
    for idx, (folder_A, folder_B) in enumerate(zip(set1, set2), start=1):
        name_A = os.path.basename(os.path.normpath(folder_A))
        name_B = os.path.basename(os.path.normpath(folder_B))
        pair_folder_name = f"{name_A}_vs_{name_B}"
        output_folder = os.path.join(updated_fd, pair_folder_name)
        os.makedirs(output_folder, exist_ok=True)

        # Create output subfolders
        fd_doc = os.path.join(output_folder, 'fd_doc')
        fd_pdf = os.path.join(output_folder, 'fd_pdf')
        tree_structure = os.path.join(output_folder, 'tree_structure')
        merge_pdf_folder = os.path.join(output_folder, 'merge_pdf')

        for folder in [fd_doc, fd_pdf, tree_structure, merge_pdf_folder]:
            os.makedirs(folder, exist_ok=True)

        # Store absolute path of merge_pdf folder in the dictionary
        merge_pdf_paths[pair_folder_name] = os.path.abspath(merge_pdf_folder)

        # Names already present in folder_A, regardless of DOC/PDF flavour
        known_stems_A = (
            collect_known_stems(os.path.join(folder_A, 'doc_files'), doc_extensions)
            | collect_known_stems(os.path.join(folder_A, 'pdf_files'), pdf_extensions)
        )

        # lowercased filename -> (containing folder, original filename)
        doc_files_B = {}
        pdf_files_B = {}

        # Single walk over folder_B classifies DOC and PDF files together
        skip_contract = folder_B in exclusive_path
        for root, dirs, files in os.walk(folder_B):
            # Prune the top-level '2. Contract' subfolder for exclusive paths;
            # removing it from dirs in place stops os.walk from descending into it
            if skip_contract and root == folder_B and '2. Contract' in dirs:
                dirs.remove('2. Contract')

            for file in files:
                key = file.lower()
                if key.endswith(doc_extensions):
                    bucket = doc_files_B
                elif key.endswith(pdf_extensions):
                    bucket = pdf_files_B
                else:
                    continue

                # Skip excluded names and names already known in folder_A
                if should_exclude_file(file):
                    continue
                if get_filename_without_extension(file) in known_stems_A:
                    continue

                bucket[key] = (root, file)

        # Drop names listed in the exclusion folder; sorted for a stable copy order
        diff_docs = sorted(set(doc_files_B) - excluded_files)
        diff_pdfs = sorted(set(pdf_files_B) - excluded_files)

        # Copy DOC files to output folders only
        for doc in diff_docs:
            root, filename = doc_files_B[doc]
            rel_path = os.path.relpath(root, folder_B)
            copy_to_destinations(
                os.path.join(root, filename),
                [os.path.join(fd_doc, filename),
                 os.path.join(tree_structure, rel_path, filename)],
                "Error processing",
            )

        # Copy PDF files to output folders only (failures are reported, not hidden)
        for pdf in diff_pdfs:
            root, filename = pdf_files_B[pdf]
            rel_path = os.path.relpath(root, folder_B)
            copy_to_destinations(
                os.path.join(root, filename),
                [os.path.join(fd_pdf, filename),
                 os.path.join(tree_structure, rel_path, filename)],
                "Error copying",
            )

        # Copy PDFs to merge_pdf folder
        print(f"\nMerging PDFs for pair {idx} ({name_A} vs {name_B}):")

        # Copy PDFs from fd_pdf
        pdf_count = 0
        for filename in os.listdir(fd_pdf):
            if filename.lower().endswith('.pdf'):
                src = os.path.join(fd_pdf, filename)
                dst = os.path.join(merge_pdf_folder, filename)
                try:
                    shutil.copy2(src, dst)
                    pdf_count += 1
                except OSError as e:
                    print(f"Error copying {src} to merge_pdf: {e}")

        print(f"  - Total PDFs merged: {pdf_count}")
        print(f"  - Merge PDF location: {merge_pdf_paths[pair_folder_name]}")

        # Print results for this pair
        print(f"\nResults for pair {idx} ({name_A} vs {name_B}):")
        print(f"  - Processed {len(diff_docs)} unique DOC files")
        print(f"  - Processed {len(diff_pdfs)} unique PDF files")

    print("\nOverall results completed")
    return merge_pdf_paths

# Example usage
if __name__ == "__main__":
    # Folder whose DOC/PDF filenames are never copied
    exclusion_folder = "./file_can_pass_or_loi"
    # One-element tuple: the trailing comma matters, ("x") is just a string
    exclusive_path = ("./dont_care",)
    # Output root; it is deleted and recreated by the call below
    updated_fd = "./updated_check_not_convert"

    set1 = [
        # r"D:\RAG_models_files_backup\pdf_fd\Sale_3807_188",
        r"F:\orig\CT_filter1_fd",
        # r"D:\RAG_models_files_backup\pdf_fd\Prop_20344_1885_294",
    ]
    set2 = [
        # r"D:\Vector Cloud\Sales\3. SI Sales\1. SI", 
        r"D:\Vector Cloud\Procurement\Procurement",
        # r"D:\Vector Cloud\Proposal\1. Project", 
    ]

    # Kept inside the guard: the names above only exist when run as a script
    # or notebook cell, so an unguarded call would raise NameError on import.
    result_path_dict = compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path)

In [2]:
# Example usage
if __name__ == "__main__":
    # Folder whose DOC/PDF filenames are never copied
    exclusion_folder = "./file_can_pass_or_loi"
    # One-element tuple: the trailing comma matters, ("x") is just a string
    exclusive_path = ("./dont_care",)
    # Output root; it is deleted and recreated by the call below
    updated_fd = "./updated_check_not_convert"

    set1 = [
        # r"D:\RAG_models_files_backup\pdf_fd\Sale_3807_188",
        r"F:\orig\CT_filter1_fd",
        # r"D:\RAG_models_files_backup\pdf_fd\Prop_20344_1885_294",
    ]
    set2 = [
        # r"D:\Vector Cloud\Sales\3. SI Sales\1. SI", 
        r"D:\Vector Cloud\Procurement\Procurement",
        # r"D:\Vector Cloud\Proposal\1. Project", 
    ]

    # Kept inside the guard: the names above only exist when run as a script
    # or notebook cell, so an unguarded call would raise NameError on import.
    result_path_dict = compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path)

Error processing D:\Vector Cloud\Procurement\Procurement\3. Logistics & Customs\Customs regulation\1. Certification\GPKD MMDS\Bo sung nganh nghe _ GL\HDDV - TVDT - VECTOR VIETNAM. - Bo sung nganh.doc: [WinError 3] The system cannot find the path specified

Merging PDFs for pair 1 (CT_filter1_fd vs Procurement):
  - Total PDFs merged: 1296
  - Merge PDF location: f:\local_RAG_pdf\second_docu\part12_cronjob\notebook_fd\updated_check_not_convert\CT_filter1_fd_vs_Procurement\merge_pdf

Results for pair 1 (CT_filter1_fd vs Procurement):
  - Processed 8 unique DOC files
  - Processed 1296 unique PDF files

Overall results completed


### Compare doc/pdf files but not convert

In [1]:
import os
import shutil

def compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path):
    """
    Collect DOC/PDF files that exist under each set2 folder but not in the
    matching set1 folder, and copy them both into a fresh output tree and
    into the set1 folder itself. No DOC-to-PDF conversion is performed.

    For every (set1[i], set2[i]) pair an output folder
    ``<updated_fd>/<name_A>_vs_<name_B>`` is created with four subfolders:
    ``fd_doc``, ``fd_pdf`` (flat copies), ``tree_structure`` (path relative to
    the set2 folder preserved) and ``merge_pdf`` (copy of everything in
    ``fd_pdf``). In addition, ``<set1[i]>/doc_files`` and
    ``<set1[i]>/pdf_files`` are created if missing and receive the new files,
    so a later run treats them as already known.

    A file from set2 is skipped when:
      * it is a DOC file whose name contains one of the excluded substrings
        (case-insensitive; PDF files are not checked against them),
      * its lowercased name without extension already exists in
        ``<set1[i]>/doc_files`` or ``<set1[i]>/pdf_files``,
      * its lowercased name is present directly inside ``exclusion_folder``.

    Files are keyed by lowercased filename, so when the same filename occurs
    in several subfolders of a set2 folder only the last one walked is kept.

    Args:
        set1 (list): Folder paths of the first (reference) set
        set2 (list): Folder paths of the second set, paired by position with set1
        updated_fd (str): Output root. WARNING: if it already exists it is
            deleted and recreated on every valid call.
        exclusion_folder (str): Folder whose DOC/PDF filenames are excluded
        exclusive_path (str or tuple): set2 folder path(s) whose top-level
            '2. Contract' subfolder must not be scanned

    Returns:
        dict: Pair folder name -> absolute path of that pair's merge_pdf folder

    Raises:
        ValueError: If set1 and set2 have different lengths. Raised before any
            filesystem change, so an invalid call leaves updated_fd untouched.
    """
    # Validate first: the next step deletes updated_fd, which must not happen
    # for a call that is going to be rejected anyway.
    if len(set1) != len(set2):
        raise ValueError("Both sets of folders must have the same number of folders.")

    # Start from an empty output folder
    if os.path.exists(updated_fd):
        shutil.rmtree(updated_fd)
    os.makedirs(updated_fd)

    # Extensions are always matched against lowercased names
    doc_extensions = ('.doc', '.docx')
    pdf_extensions = ('.pdf',)

    # Define substrings to exclude in doc filenames
    excluded_substrings = ['F32', 'F60', 'phieu bao hanh', 'livery note', 'DNTT', 'ĐNTT',
                           'Shipping mark', 'Shipping Mark', 'shipping mark', 'SHIPPING MARK',
                           'De nghi thanh toan', 'thong bao giao hang', 'Thông báo giao hàng',
                           'phieu kiem hang', 'kiểm hàng', 'cu di cong tac', 'bao hanh',
                           'thong bao hang den', 'Thông báo hàng đến']
    # Lowercase (and de-duplicate) once instead of once per scanned file
    excluded_lower = tuple(dict.fromkeys(s.lower() for s in excluded_substrings))

    # Dictionary to store merge_pdf folder paths
    merge_pdf_paths = {}

    # Ensure exclusive_path is a tuple or list
    if isinstance(exclusive_path, str):
        exclusive_path = (exclusive_path,)

    def get_filename_without_extension(filename):
        """Return the lowercased filename with its extension removed."""
        return os.path.splitext(filename.lower())[0]

    def collect_known_stems(folder_path, extensions):
        """Return the lowercased, extension-less names of matching files under folder_path."""
        stems = set()
        if os.path.exists(folder_path):
            for _, _, filenames in os.walk(folder_path):
                for filename in filenames:
                    if filename.lower().endswith(extensions):
                        stems.add(get_filename_without_extension(filename))
        return stems

    def copy_to_destinations(src, destinations, error_prefix):
        """Copy src to every destination; report (do not raise) filesystem errors."""
        try:
            for dst in destinations:
                os.makedirs(os.path.dirname(dst), exist_ok=True)
                shutil.copy2(src, dst)
        except OSError as e:
            print(f"{error_prefix} {src}: {e}")

    # Get excluded files (only the top level of exclusion_folder is listed)
    excluded_files = set()
    if os.path.exists(exclusion_folder):
        excluded_files = {f.lower() for f in os.listdir(exclusion_folder)
                          if f.lower().endswith(doc_extensions + pdf_extensions)}

    # Process each pair of folders
    for idx, (folder_A, folder_B) in enumerate(zip(set1, set2), start=1):
        name_A = os.path.basename(os.path.normpath(folder_A))
        name_B = os.path.basename(os.path.normpath(folder_B))
        pair_folder_name = f"{name_A}_vs_{name_B}"
        output_folder = os.path.join(updated_fd, pair_folder_name)
        os.makedirs(output_folder, exist_ok=True)

        # Create all output subfolders
        fd_doc = os.path.join(output_folder, 'fd_doc')
        fd_pdf = os.path.join(output_folder, 'fd_pdf')
        tree_structure = os.path.join(output_folder, 'tree_structure')
        merge_pdf_folder = os.path.join(output_folder, 'merge_pdf')

        for folder in [fd_doc, fd_pdf, tree_structure, merge_pdf_folder]:
            os.makedirs(folder, exist_ok=True)

        # Store absolute path of merge_pdf folder in the dictionary
        merge_pdf_paths[pair_folder_name] = os.path.abspath(merge_pdf_folder)

        # Ensure doc_files and pdf_files exist in folder_A (they are copy targets below)
        folder_A_doc = os.path.join(folder_A, 'doc_files')
        folder_A_pdf = os.path.join(folder_A, 'pdf_files')
        os.makedirs(folder_A_doc, exist_ok=True)
        os.makedirs(folder_A_pdf, exist_ok=True)

        # Names already present in folder_A, regardless of DOC/PDF flavour
        known_stems_A = (
            collect_known_stems(folder_A_doc, doc_extensions)
            | collect_known_stems(folder_A_pdf, pdf_extensions)
        )

        # lowercased filename -> (containing folder, original filename)
        doc_files_B = {}
        pdf_files_B = {}

        # Single walk over folder_B classifies DOC and PDF files together
        skip_contract = folder_B in exclusive_path
        for root, dirs, files in os.walk(folder_B):
            # Prune the top-level '2. Contract' subfolder for exclusive paths;
            # removing it from dirs in place stops os.walk from descending into it
            if skip_contract and root == folder_B and '2. Contract' in dirs:
                dirs.remove('2. Contract')

            for file in files:
                key = file.lower()
                if key.endswith(doc_extensions):
                    # Substring exclusion applies to DOC files only
                    if any(substr in key for substr in excluded_lower):
                        continue
                    bucket = doc_files_B
                elif key.endswith(pdf_extensions):
                    bucket = pdf_files_B
                else:
                    continue

                # Skip if the filename exists in either doc_files or pdf_files of folder_A
                if get_filename_without_extension(file) in known_stems_A:
                    continue

                bucket[key] = (root, file)

        # Drop names listed in the exclusion folder; sorted for a stable copy order
        diff_docs = sorted(set(doc_files_B) - excluded_files)
        diff_pdfs = sorted(set(pdf_files_B) - excluded_files)

        # Copy DOC files (without conversion) to output, folder_A and tree locations
        for doc in diff_docs:
            root, filename = doc_files_B[doc]
            rel_path = os.path.relpath(root, folder_B)
            copy_to_destinations(
                os.path.join(root, filename),
                [os.path.join(fd_doc, filename),
                 os.path.join(folder_A_doc, filename),
                 os.path.join(tree_structure, rel_path, filename)],
                "Error processing",
            )

        # Copy PDF files to output, folder_A and tree locations
        for pdf in diff_pdfs:
            root, filename = pdf_files_B[pdf]
            rel_path = os.path.relpath(root, folder_B)
            copy_to_destinations(
                os.path.join(root, filename),
                [os.path.join(fd_pdf, filename),
                 os.path.join(folder_A_pdf, filename),
                 os.path.join(tree_structure, rel_path, filename)],
                "Error copying",
            )

        # Copy PDFs to merge_pdf folder
        print(f"\nMerging PDFs for pair {idx} ({name_A} vs {name_B}):")

        # Copy PDFs from fd_pdf
        pdf_count = 0
        for filename in os.listdir(fd_pdf):
            if filename.lower().endswith('.pdf'):
                src = os.path.join(fd_pdf, filename)
                dst = os.path.join(merge_pdf_folder, filename)
                try:
                    shutil.copy2(src, dst)
                    pdf_count += 1
                except OSError as e:
                    print(f"Error copying {src} to merge_pdf: {e}")

        print(f"  - Total PDFs merged: {pdf_count}")
        print(f"  - Merge PDF location: {merge_pdf_paths[pair_folder_name]}")

        # Print results for this pair
        print(f"\nResults for pair {idx} ({name_A} vs {name_B}):")
        print(f"  - Processed {len(diff_docs)} unique DOC files")
        print(f"  - Processed {len(diff_pdfs)} unique PDF files")

    print("\nOverall results completed")
    return merge_pdf_paths

In [3]:
# Example usage
if __name__ == "__main__":
    # Folder whose DOC/PDF filenames are never copied
    exclusion_folder = "./file_can_pass_or_loi"
    # One-element tuple: the trailing comma matters, ("x") is just a string
    exclusive_path = ("./dont_care",)
    # Output root; it is deleted and recreated by the call below
    updated_fd = "./updated_check_not_convert"

    set1 = [
        r"F:\justfor_compare\Sale_merge_3807_justfor_compare",
        r"F:\justfor_compare\CT_in_Procur_merge_11184_justfor_compare",
        r"F:\justfor_compare\Proposal_merge_20344_orig_justfor_compare"
    ]
    set2 = [
        r"D:\Vector Cloud\Sales\3. SI Sales\1. SI", 
        r"D:\Vector Cloud\Procurement\Procurement",
        r"D:\Vector Cloud\Proposal\1. Project", 
    ]

    # Kept inside the guard: the names above only exist when run as a script
    # or notebook cell, so an unguarded call would raise NameError on import.
    result_path_dict = compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path)


Merging PDFs for pair 1 (Sale_merge_3807_justfor_compare vs 1. SI):
  - Total PDFs merged: 0
  - Merge PDF location: d:\second_docu\part12_cronjob\updated_check_not_convert\Sale_merge_3807_justfor_compare_vs_1. SI\merge_pdf

Results for pair 1 (Sale_merge_3807_justfor_compare vs 1. SI):
  - Processed 0 unique DOC files
  - Processed 0 unique PDF files

Merging PDFs for pair 2 (CT_in_Procur_merge_11184_justfor_compare vs Procurement):
  - Total PDFs merged: 0
  - Merge PDF location: d:\second_docu\part12_cronjob\updated_check_not_convert\CT_in_Procur_merge_11184_justfor_compare_vs_Procurement\merge_pdf

Results for pair 2 (CT_in_Procur_merge_11184_justfor_compare vs Procurement):
  - Processed 0 unique DOC files
  - Processed 0 unique PDF files

Merging PDFs for pair 3 (Proposal_merge_20344_orig_justfor_compare vs 1. Project):
  - Total PDFs merged: 0
  - Merge PDF location: d:\second_docu\part12_cronjob\updated_check_not_convert\Proposal_merge_20344_orig_justfor_compare_vs_1. Project\

### Put in merge_pdf but not in doc_files/pdf_files

In [None]:
import os
import shutil
import sys
import subprocess

def find_soffice_path():
    """Find the LibreOffice executable path on different operating systems.

    Returns:
        str or None:
            * Windows: the first existing standard install location, else a
              ``soffice`` executable found on PATH, else None (after printing
              a "not found" message).
            * macOS: the standard application-bundle path. Its existence is
              not checked.
            * Anything else: the bare command name ``"soffice"``, left for the
              caller to resolve through PATH.
    """
    if sys.platform == "win32":
        standard_locations = (
            r"C:\Program Files\LibreOffice\program\soffice.exe",
            r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
        )
        for candidate in standard_locations:
            if os.path.exists(candidate):
                print(f"Found LibreOffice at: {candidate}")
                return candidate

        # Fallback for custom install directories that were added to PATH
        on_path = shutil.which("soffice")
        if on_path:
            print(f"Found LibreOffice at: {on_path}")
            return on_path

        print("LibreOffice not found in standard locations!")
        return None

    if sys.platform == "darwin":  # macOS
        return "/Applications/LibreOffice.app/Contents/MacOS/soffice"

    # Linux and other platforms
    return "soffice"

def doc_to_pdf(doc_path, output_dir):
    """Convert a Word document to PDF by running LibreOffice headlessly.

    Args:
        doc_path: Path of the document to convert (made absolute internally).
        output_dir: Directory passed to LibreOffice as ``--outdir`` (made
            absolute internally).

    Returns:
        bool: True when LibreOffice exits with code 0 and
        ``<output_dir>/<document name>.pdf`` exists afterwards; False when the
        process fails, the PDF is missing, or running it raises.

    Raises:
        RuntimeError: If find_soffice_path() returns no executable.
        FileNotFoundError: If doc_path does not exist.
    """
    office_bin = find_soffice_path()
    if not office_bin:
        raise RuntimeError("LibreOffice not found. Please verify it's installed correctly.")

    # LibreOffice is given absolute paths for both the input and the output folder
    output_dir = os.path.abspath(output_dir)
    doc_path = os.path.abspath(doc_path)

    if not os.path.exists(doc_path):
        raise FileNotFoundError(f"Input file not found: {doc_path}")

    print(f"Converting file: {doc_path}")
    print(f"Output directory: {output_dir}")

    command = [
        office_bin,
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', output_dir,
        doc_path,
    ]

    try:
        completed = subprocess.run(command, capture_output=True, text=True)

        if completed.returncode != 0:
            print(f"Conversion failed for {doc_path}")
            print(f"Error output: {completed.stderr}")
            return False

        # A zero exit code is not trusted on its own: check the PDF really exists
        pdf_name = os.path.splitext(os.path.basename(doc_path))[0] + '.pdf'
        expected_pdf = os.path.join(output_dir, pdf_name)
        if not os.path.exists(expected_pdf):
            print(f"Conversion seemed to succeed but PDF not found: {expected_pdf}")
            return False

        print(f"Successfully converted: {doc_path}")
        return True

    except Exception as e:
        print(f"Error converting {doc_path}: {str(e)}")
        return False

def compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path):
    """
    Collect DOC/PDF files that are new in each set2 folder, copy them into the
    paired set1 folder and into a fresh report tree, and convert the DOCs to PDF.

    For every pair (folder_A from set1, folder_B from set2), folder_B is walked
    recursively. A file is treated as new when its lower-cased name without
    extension is found in neither folder_A/doc_files nor folder_A/pdf_files
    (both searched recursively) and its lower-cased filename is not listed
    directly inside exclusion_folder. DOC/DOCX files whose name contains
    'F32', 'F60', 'phieu bao hanh' or 'livery note' (case-insensitive) are
    skipped. If several new files share the same lower-cased filename, only
    the last one visited by the walk is kept.

    Each new file is copied to three places: the flat fd_doc / fd_pdf folder of
    the pair's output folder, folder_A/doc_files or folder_A/pdf_files, and
    tree_structure (mirroring its location relative to folder_B). New DOC files
    are then converted with doc_to_pdf() into the pair's doc_to_pdf folder, and
    every PDF from fd_pdf and doc_to_pdf is copied into merge_pdf.

    Args:
        set1 (list): Folder paths of the first set; their doc_files/pdf_files
            subfolders are created if missing and receive the new files
        set2 (list): Folder paths of the second set, paired with set1 by position
        updated_fd (str): Output root; deleted and recreated on every valid call
        exclusion_folder (str): Folder whose DOC/PDF filenames are never copied
        exclusive_path (str or tuple): set2 folder path(s) whose top-level
            '2. Contract' subfolder is not scanned; paths are compared after
            normalisation, so './x' and './x/' are the same folder

    Returns:
        dict: Dictionary with pair folder names as keys and absolute path of merge_pdf folder as value

    Raises:
        ValueError: If set1 and set2 have different lengths (raised before any
            folder is deleted or created)
    """
    # Validate first so that a bad call cannot wipe the previous run's output.
    if len(set1) != len(set2):
        raise ValueError("Both sets of folders must have the same number of folders.")

    # Define the main output folder
    if os.path.exists(updated_fd):
        shutil.rmtree(updated_fd)
    os.makedirs(updated_fd)

    # Names are lower-cased before matching, so lower-case suffixes suffice.
    doc_extensions = ('.doc', '.docx')
    pdf_extensions = ('.pdf',)

    # Substrings (lower-cased) that exclude a DOC file when found in its name
    excluded_substrings = ('f32', 'f60', 'phieu bao hanh', 'livery note')

    def path_key(path):
        """Return a normalised form of path for comparing folder identities."""
        return os.path.normcase(os.path.abspath(path))

    def known_stems(folder, extensions):
        """Return lower-cased extension-less names of matching files under folder."""
        stems = set()
        for _, _, names in os.walk(folder):
            for name in names:
                lowered = name.lower()
                if lowered.endswith(extensions):
                    stems.add(os.path.splitext(lowered)[0])
        return stems

    def scan_new_files(folder_B, stems_in_A, skip_contract):
        """Walk folder_B once; return ({doc_key: (root, name)}, {pdf_key: (root, name)})."""
        docs, pdfs = {}, {}
        for root, dirs, files in os.walk(folder_B):
            # Pruning dirs in place stops os.walk from descending into it.
            if skip_contract and root == folder_B and '2. Contract' in dirs:
                dirs.remove('2. Contract')

            for file in files:
                lowered = file.lower()
                stem = os.path.splitext(lowered)[0]
                if lowered.endswith(doc_extensions):
                    if any(substr in lowered for substr in excluded_substrings):
                        continue
                    if stem not in stems_in_A:
                        docs[lowered] = (root, file)
                elif lowered.endswith(pdf_extensions):
                    if stem not in stems_in_A:
                        pdfs[lowered] = (root, file)
        return docs, pdfs

    def copy_everywhere(src, destinations):
        """Copy src to every destination path, creating parent folders as needed."""
        for dst in destinations:
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copy2(src, dst)

    def merge_pdfs(source_folder, merge_folder):
        """Copy every PDF directly inside source_folder into merge_folder; return the count."""
        copied = 0
        for filename in os.listdir(source_folder):
            if not filename.lower().endswith('.pdf'):
                continue
            src = os.path.join(source_folder, filename)
            try:
                shutil.copy2(src, os.path.join(merge_folder, filename))
                copied += 1
            except Exception as e:
                print(f"Error copying {src} to merge_pdf: {e}")
        return copied

    # Accept None, a single string, or any iterable of paths
    if exclusive_path is None:
        exclusive_path = ()
    elif isinstance(exclusive_path, str):
        exclusive_path = (exclusive_path,)
    exclusive_keys = {path_key(path) for path in exclusive_path}

    # Get excluded files (isdir: a path that is not a folder excludes nothing)
    excluded_files = set()
    if os.path.isdir(exclusion_folder):
        excluded_files = {f.lower() for f in os.listdir(exclusion_folder)
                          if f.lower().endswith(doc_extensions + pdf_extensions)}

    # Dictionary to store merge_pdf folder paths
    merge_pdf_paths = {}

    # Track overall conversion statistics
    total_successful_conversions = 0
    total_failed_conversions = 0

    # Process each pair of folders
    for idx, (folder_A, folder_B) in enumerate(zip(set1, set2), start=1):
        name_A = os.path.basename(os.path.normpath(folder_A))
        name_B = os.path.basename(os.path.normpath(folder_B))
        pair_folder_name = f"{name_A}_vs_{name_B}"
        output_folder = os.path.join(updated_fd, pair_folder_name)

        # Create all output subfolders
        fd_doc = os.path.join(output_folder, 'fd_doc')
        fd_pdf = os.path.join(output_folder, 'fd_pdf')
        tree_structure = os.path.join(output_folder, 'tree_structure')
        doc_to_pdf_folder = os.path.join(output_folder, 'doc_to_pdf')
        merge_pdf_folder = os.path.join(output_folder, 'merge_pdf')
        for folder in (fd_doc, fd_pdf, tree_structure, doc_to_pdf_folder, merge_pdf_folder):
            os.makedirs(folder, exist_ok=True)

        # Store absolute path of merge_pdf folder in the dictionary
        merge_pdf_paths[pair_folder_name] = os.path.abspath(merge_pdf_folder)

        # Ensure doc_files and pdf_files exist in folder_A
        folder_A_doc = os.path.join(folder_A, 'doc_files')
        folder_A_pdf = os.path.join(folder_A, 'pdf_files')
        os.makedirs(folder_A_doc, exist_ok=True)
        os.makedirs(folder_A_pdf, exist_ok=True)

        # Names already present in folder_A, regardless of DOC/PDF extension
        stems_in_A = (known_stems(folder_A_doc, doc_extensions)
                      | known_stems(folder_A_pdf, pdf_extensions))

        new_docs, new_pdfs = scan_new_files(
            folder_B, stems_in_A, path_key(folder_B) in exclusive_keys)

        # Sorted so the copy/convert order (and the log) is reproducible
        diff_docs = sorted(set(new_docs) - excluded_files)
        diff_pdfs = sorted(set(new_pdfs) - excluded_files)

        successful_conversions = 0
        failed_conversions = 0

        # Copy and convert DOC files
        for doc in diff_docs:
            root, filename = new_docs[doc]
            src = os.path.join(root, filename)
            dst_output = os.path.join(fd_doc, filename)
            dst_original = os.path.join(folder_A_doc, filename)
            dst_tree = os.path.join(tree_structure, os.path.relpath(root, folder_B), filename)

            try:
                copy_everywhere(src, (dst_output, dst_original, dst_tree))

                # Convert to PDF
                if doc_to_pdf(dst_output, doc_to_pdf_folder):
                    successful_conversions += 1
                else:
                    failed_conversions += 1

            except Exception as e:
                # Best effort: one bad file must not abort the whole batch.
                print(f"Error processing {src}: {e}")
                failed_conversions += 1

        # Copy PDF files
        for pdf in diff_pdfs:
            root, filename = new_pdfs[pdf]
            src = os.path.join(root, filename)
            dst_output = os.path.join(fd_pdf, filename)
            dst_original = os.path.join(folder_A_pdf, filename)
            dst_tree = os.path.join(tree_structure, os.path.relpath(root, folder_B), filename)

            try:
                copy_everywhere(src, (dst_output, dst_original, dst_tree))
            except Exception as e:
                print(f"Error copying {src}: {e}")

        # Update total statistics
        total_successful_conversions += successful_conversions
        total_failed_conversions += failed_conversions

        # Copy all PDFs to merge_pdf folder
        print(f"\nMerging PDFs for pair {idx} ({name_A} vs {name_B}):")
        pdf_count = merge_pdfs(fd_pdf, merge_pdf_folder)
        pdf_count += merge_pdfs(doc_to_pdf_folder, merge_pdf_folder)

        print(f"  - Total PDFs merged: {pdf_count}")
        print(f"  - Merge PDF location: {merge_pdf_paths[pair_folder_name]}")

        # Print results for this pair
        print(f"\nResults for pair {idx} ({name_A} vs {name_B}):")
        print(f"  - Processed {len(diff_docs)} unique DOC files")
        print(f"  - Processed {len(diff_pdfs)} unique PDF files")
        print(f"  - Successfully converted to PDF: {successful_conversions} files")
        print(f"  - Failed conversions: {failed_conversions} files")

    # Print overall results
    print("\nOverall results:")
    print(f"Total files successfully converted to PDF: {total_successful_conversions}")
    print(f"Total failed conversions: {total_failed_conversions}")

    return merge_pdf_paths

In [None]:
# Folder whose DOC/PDF filenames are never copied.
exclusion_folder = "./file_can_pass_or_loi"
# The trailing comma makes this a real one-element tuple; without it the
# parentheses are only grouping and the value is a plain string.
exclusive_path = ("./dont_care",)
# Output root: deleted and recreated by compare_fd_then_create_updatedfd().
updated_fd = "./updated_fd_all_all_test1_ok12"

# set1[i] is paired with set2[i]: new files found in set2[i] are copied into
# set1[i]/doc_files and set1[i]/pdf_files.
set1 = [
    "./examples/example_fd_to_compare/test_Sale",
    "./examples/example_fd_to_compare/test_CT",
    "./examples/example_fd_to_compare/test_Prop",
]
set2 = [
    "./examples/example_fd_to_compare/Sale_server",
    "./examples/example_fd_to_compare/CT_server",
    "./examples/example_fd_to_compare/Prop_server",
]

result_path_dict = compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path)

### Ready

In [15]:
import os
import shutil
import sys
import subprocess

def find_soffice_path():
    """Find the LibreOffice executable path on different operating systems.

    Returns:
        str or None:
            - Windows: the first existing standard install path, otherwise
              whatever ``shutil.which("soffice")`` finds on PATH, otherwise
              None.
            - macOS: the fixed application-bundle path (not checked for
              existence).
            - Anything else: the bare command name ``"soffice"``, left to be
              resolved through PATH when the process is launched.
    """
    if sys.platform == "darwin":  # macOS
        return "/Applications/LibreOffice.app/Contents/MacOS/soffice"
    if sys.platform != "win32":  # Linux and other platforms
        return "soffice"

    standard_locations = (
        r"C:\Program Files\LibreOffice\program\soffice.exe",
        r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
    )
    for candidate in standard_locations:
        if os.path.exists(candidate):
            print(f"Found LibreOffice at: {candidate}")
            return candidate

    # Custom install locations are still usable when soffice is on PATH.
    on_path = shutil.which("soffice")
    if on_path:
        print(f"Found LibreOffice at: {on_path}")
        return on_path

    print("LibreOffice not found in standard locations!")
    return None

def doc_to_pdf(doc_path, output_dir):
    """Convert a Word document to PDF by running LibreOffice in headless mode.

    Args:
        doc_path: Path to the input document; resolved to an absolute path.
        output_dir: Folder passed to LibreOffice as ``--outdir``; resolved to
            an absolute path.

    Returns:
        str or None: Absolute path of the generated PDF when LibreOffice exits
        with code 0 *and* the expected ``<input name>.pdf`` exists in
        ``output_dir``; None otherwise (including when launching the process
        raises).

    Raises:
        RuntimeError: If ``find_soffice_path()`` returns nothing.
        FileNotFoundError: If ``doc_path`` does not exist.
    """
    executable = find_soffice_path()
    if not executable:
        raise RuntimeError("LibreOffice not found. Please verify it's installed correctly.")

    doc_path, output_dir = os.path.abspath(doc_path), os.path.abspath(output_dir)
    if not os.path.exists(doc_path):
        raise FileNotFoundError(f"Input file not found: {doc_path}")

    print(f"Converting file: {doc_path}")
    print(f"Output directory: {output_dir}")

    command = [
        executable,
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', output_dir,
        doc_path,
    ]

    try:
        completed = subprocess.run(command, capture_output=True, text=True)

        if completed.returncode != 0:
            print(f"Conversion failed for {doc_path}")
            print(f"Error output: {completed.stderr}")
            return None

        # A zero exit code alone is not trusted: the output file must exist.
        stem = os.path.splitext(os.path.basename(doc_path))[0]
        expected_pdf = os.path.join(output_dir, stem + '.pdf')
        if not os.path.exists(expected_pdf):
            print(f"Conversion seemed to succeed but PDF not found: {expected_pdf}")
            return None

        print(f"Successfully converted: {doc_path}")
        return expected_pdf  # Callers use the path to copy the PDF elsewhere

    except Exception as e:
        # Best effort: any failure while running the converter counts as a
        # failed conversion instead of aborting the caller's batch.
        print(f"Error converting {doc_path}: {str(e)}")
        return None

def compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path):
    """
    Collect DOC/PDF files that are new in each set2 folder, copy them into the
    paired set1 folder and into a fresh report tree, and convert the DOCs to PDF.

    For every pair (folder_A from set1, folder_B from set2), folder_B is walked
    recursively. A file is treated as new when its lower-cased name without
    extension is found in neither folder_A/doc_files nor folder_A/pdf_files
    (both searched recursively) and its lower-cased filename is not listed
    directly inside exclusion_folder. DOC/DOCX files whose name contains
    'F32', 'F60', 'phieu bao hanh' or 'livery note' (case-insensitive) are
    skipped. If several new files share the same lower-cased filename, only
    the last one visited by the walk is kept.

    Each new file is copied to three places: the flat fd_doc / fd_pdf folder of
    the pair's output folder, folder_A/doc_files or folder_A/pdf_files, and
    tree_structure (mirroring its location relative to folder_B). New DOC files
    are then converted with doc_to_pdf() into the pair's doc_to_pdf folder; each
    converted PDF is also copied into folder_A/pdf_files. Finally every PDF from
    fd_pdf and doc_to_pdf is copied into merge_pdf.

    Args:
        set1 (list): Folder paths of the first set; their doc_files/pdf_files
            subfolders are created if missing and receive the new files
        set2 (list): Folder paths of the second set, paired with set1 by position
        updated_fd (str): Output root; deleted and recreated on every valid call
        exclusion_folder (str): Folder whose DOC/PDF filenames are never copied
        exclusive_path (str or tuple): set2 folder path(s) whose top-level
            '2. Contract' subfolder is not scanned; paths are compared after
            normalisation, so './x' and './x/' are the same folder

    Returns:
        dict: Dictionary with pair folder names as keys and absolute path of merge_pdf folder as value

    Raises:
        ValueError: If set1 and set2 have different lengths (raised before any
            folder is deleted or created)
    """
    # Validate first so that a bad call cannot wipe the previous run's output.
    if len(set1) != len(set2):
        raise ValueError("Both sets of folders must have the same number of folders.")

    # Define the main output folder
    if os.path.exists(updated_fd):
        shutil.rmtree(updated_fd)
    os.makedirs(updated_fd)

    # Names are lower-cased before matching, so lower-case suffixes suffice.
    doc_extensions = ('.doc', '.docx')
    pdf_extensions = ('.pdf',)

    # Substrings (lower-cased) that exclude a DOC file when found in its name
    excluded_substrings = ('f32', 'f60', 'phieu bao hanh', 'livery note')

    def path_key(path):
        """Return a normalised form of path for comparing folder identities."""
        return os.path.normcase(os.path.abspath(path))

    def known_stems(folder, extensions):
        """Return lower-cased extension-less names of matching files under folder."""
        stems = set()
        for _, _, names in os.walk(folder):
            for name in names:
                lowered = name.lower()
                if lowered.endswith(extensions):
                    stems.add(os.path.splitext(lowered)[0])
        return stems

    def scan_new_files(folder_B, stems_in_A, skip_contract):
        """Walk folder_B once; return ({doc_key: (root, name)}, {pdf_key: (root, name)})."""
        docs, pdfs = {}, {}
        for root, dirs, files in os.walk(folder_B):
            # Pruning dirs in place stops os.walk from descending into it.
            if skip_contract and root == folder_B and '2. Contract' in dirs:
                dirs.remove('2. Contract')

            for file in files:
                lowered = file.lower()
                stem = os.path.splitext(lowered)[0]
                if lowered.endswith(doc_extensions):
                    if any(substr in lowered for substr in excluded_substrings):
                        continue
                    # Trace every DOC candidate that survives the substring filter
                    print("file_name_without_ext", stem)
                    if stem not in stems_in_A:
                        docs[lowered] = (root, file)
                elif lowered.endswith(pdf_extensions):
                    if stem not in stems_in_A:
                        pdfs[lowered] = (root, file)
        return docs, pdfs

    def copy_everywhere(src, destinations):
        """Copy src to every destination path, creating parent folders as needed."""
        for dst in destinations:
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copy2(src, dst)

    def merge_pdfs(source_folder, merge_folder):
        """Copy every PDF directly inside source_folder into merge_folder; return the count."""
        copied = 0
        for filename in os.listdir(source_folder):
            if not filename.lower().endswith('.pdf'):
                continue
            src = os.path.join(source_folder, filename)
            try:
                shutil.copy2(src, os.path.join(merge_folder, filename))
                copied += 1
            except Exception as e:
                print(f"Error copying {src} to merge_pdf: {e}")
        return copied

    # Accept None, a single string, or any iterable of paths
    if exclusive_path is None:
        exclusive_path = ()
    elif isinstance(exclusive_path, str):
        exclusive_path = (exclusive_path,)
    exclusive_keys = {path_key(path) for path in exclusive_path}

    # Get excluded files (isdir: a path that is not a folder excludes nothing)
    excluded_files = set()
    if os.path.isdir(exclusion_folder):
        excluded_files = {f.lower() for f in os.listdir(exclusion_folder)
                          if f.lower().endswith(doc_extensions + pdf_extensions)}

    # Dictionary to store merge_pdf folder paths
    merge_pdf_paths = {}

    # Track overall conversion statistics
    total_successful_conversions = 0
    total_failed_conversions = 0

    # Process each pair of folders
    for idx, (folder_A, folder_B) in enumerate(zip(set1, set2), start=1):
        print("folder_A", folder_A)
        print("folder_B", folder_B)
        name_A = os.path.basename(os.path.normpath(folder_A))
        name_B = os.path.basename(os.path.normpath(folder_B))
        pair_folder_name = f"{name_A}_vs_{name_B}"
        output_folder = os.path.join(updated_fd, pair_folder_name)

        # Create all output subfolders
        fd_doc = os.path.join(output_folder, 'fd_doc')
        fd_pdf = os.path.join(output_folder, 'fd_pdf')
        tree_structure = os.path.join(output_folder, 'tree_structure')
        doc_to_pdf_folder = os.path.join(output_folder, 'doc_to_pdf')
        merge_pdf_folder = os.path.join(output_folder, 'merge_pdf')
        for folder in (fd_doc, fd_pdf, tree_structure, doc_to_pdf_folder, merge_pdf_folder):
            os.makedirs(folder, exist_ok=True)

        # Store absolute path of merge_pdf folder in the dictionary
        merge_pdf_paths[pair_folder_name] = os.path.abspath(merge_pdf_folder)

        # Ensure doc_files and pdf_files exist in folder_A
        folder_A_doc = os.path.join(folder_A, 'doc_files')
        folder_A_pdf = os.path.join(folder_A, 'pdf_files')
        os.makedirs(folder_A_doc, exist_ok=True)
        os.makedirs(folder_A_pdf, exist_ok=True)

        # Names already present in folder_A, regardless of DOC/PDF extension
        stems_in_A = (known_stems(folder_A_doc, doc_extensions)
                      | known_stems(folder_A_pdf, pdf_extensions))

        new_docs, new_pdfs = scan_new_files(
            folder_B, stems_in_A, path_key(folder_B) in exclusive_keys)

        # Sorted so the copy/convert order (and the log) is reproducible
        diff_docs = sorted(set(new_docs) - excluded_files)
        diff_pdfs = sorted(set(new_pdfs) - excluded_files)

        successful_conversions = 0
        failed_conversions = 0

        # Copy and convert DOC files
        for doc in diff_docs:
            root, filename = new_docs[doc]
            src = os.path.join(root, filename)
            dst_output = os.path.join(fd_doc, filename)
            dst_original = os.path.join(folder_A_doc, filename)
            dst_tree = os.path.join(tree_structure, os.path.relpath(root, folder_B), filename)

            try:
                copy_everywhere(src, (dst_output, dst_original, dst_tree))

                # Convert to PDF
                pdf_path = doc_to_pdf(dst_output, doc_to_pdf_folder)
                if pdf_path:
                    successful_conversions += 1
                    # Copy the converted PDF to folder_A's pdf_files
                    shutil.copy2(pdf_path, os.path.join(folder_A_pdf, os.path.basename(pdf_path)))
                else:
                    failed_conversions += 1

            except Exception as e:
                # Best effort: one bad file must not abort the whole batch.
                print(f"Error processing {src}: {e}")
                failed_conversions += 1

        # Copy PDF files
        for pdf in diff_pdfs:
            root, filename = new_pdfs[pdf]
            src = os.path.join(root, filename)
            dst_output = os.path.join(fd_pdf, filename)
            dst_original = os.path.join(folder_A_pdf, filename)
            dst_tree = os.path.join(tree_structure, os.path.relpath(root, folder_B), filename)

            try:
                copy_everywhere(src, (dst_output, dst_original, dst_tree))
            except Exception as e:
                print(f"Error copying {src}: {e}")

        # Update total statistics
        total_successful_conversions += successful_conversions
        total_failed_conversions += failed_conversions

        # Copy all PDFs to merge_pdf folder
        print(f"\nMerging PDFs for pair {idx} ({name_A} vs {name_B}):")
        pdf_count = merge_pdfs(fd_pdf, merge_pdf_folder)
        pdf_count += merge_pdfs(doc_to_pdf_folder, merge_pdf_folder)

        print(f"  - Total PDFs merged: {pdf_count}")
        print(f"  - Merge PDF location: {merge_pdf_paths[pair_folder_name]}")

        # Print results for this pair
        print(f"\nResults for pair {idx} ({name_A} vs {name_B}):")
        print(f"  - Processed {len(diff_docs)} unique DOC files")
        print(f"  - Processed {len(diff_pdfs)} unique PDF files")
        print(f"  - Successfully converted to PDF: {successful_conversions} files")
        print(f"  - Failed conversions: {failed_conversions} files")

    # Print overall results
    print("\nOverall results:")
    print(f"Total files successfully converted to PDF: {total_successful_conversions}")
    print(f"Total failed conversions: {total_failed_conversions}")

    return merge_pdf_paths

In [12]:
# # Example usage
# if __name__ == "__main__":
#     exclusion_folder = "./file_can_pass_or_loi"
#     exclusive_path = ("./dont_care")
#     updated_fd = "./updated_reandy"

#     set1 = [
#         r"F:\Sale_merge_3807",
#         # r"F:\CT_in_Procur_merge_11184_orig - Copy",
#         # r"F:\Proposal_merge_20344_orig - Copy"
#     ]
#     set2 = [
#         r"D:\Vector Cloud\Sales\3. SI Sales\1. SI", 
#         # r"D:\Vector Cloud\Procurement\Procurement",
#         # r"D:\Vector Cloud\Proposal\1. Project", 
#     ]
    
# result_path_dict = compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path)

In [16]:
# Folder whose DOC/PDF filenames are never copied.
exclusion_folder = "./file_can_pass_or_loi"
# The trailing comma makes this a real one-element tuple; without it the
# parentheses are only grouping and the value is a plain string.
exclusive_path = ("./dont_care",)
# Output root: deleted and recreated by compare_fd_then_create_updatedfd().
updated_fd = "./updated_ready"

# set1[i] is paired with set2[i]: new files found in set2[i] are copied into
# set1[i]/doc_files and set1[i]/pdf_files.
set1 = [
    "./examples/example_fd_to_compare/test_Sale",
    "./examples/example_fd_to_compare/test_CT",
    "./examples/example_fd_to_compare/test_Prop",
]
set2 = [
    "./examples/example_fd_to_compare/Sale_server",
    "./examples/example_fd_to_compare/CT_server",
    "./examples/example_fd_to_compare/Prop_server",
]

result_path_dict = compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path)
# NOTE(review): count_files_in_directories lives in a project module not shown
# here; presumably it reports file counts for each merge_pdf folder - confirm.
from copy_and_update_files_for_RAG import count_files_in_directories
count_files_in_directories(result_path_dict)

folder_A ./examples/example_fd_to_compare/test_Sale
folder_B ./examples/example_fd_to_compare/Sale_server
file_name_without_ext autronica ops training ncsp quotation rev 1
file_name_without_ext belden_loa_ntt_en

Merging PDFs for pair 1 (test_Sale vs Sale_server):
  - Total PDFs merged: 0
  - Merge PDF location: d:\second_docu\part12_cronjob\updated_ready\test_Sale_vs_Sale_server\merge_pdf

Results for pair 1 (test_Sale vs Sale_server):
  - Processed 0 unique DOC files
  - Processed 0 unique PDF files
  - Successfully converted to PDF: 0 files
  - Failed conversions: 0 files
folder_A ./examples/example_fd_to_compare/test_CT
folder_B ./examples/example_fd_to_compare/CT_server
file_name_without_ext 1. don dang ky kt cl - viv

Merging PDFs for pair 2 (test_CT vs CT_server):
  - Total PDFs merged: 0
  - Merge PDF location: d:\second_docu\part12_cronjob\updated_ready\test_CT_vs_CT_server\merge_pdf

Results for pair 2 (test_CT vs CT_server):
  - Processed 0 unique DOC files
  - Processed 0 u