In [24]:
import os
import shutil

def compare_fd_then_create_updatedfd(set1, set2, exclusion_folder, exclusive_path):
    """
    Compare doc_files and pdf_files subfolders between set1 and set2, copying different files
    to respective folders in both the output directory (fd_doc/fd_pdf) and back to set1 folders.
    Also maintains the original tree structure.
    
    Args:
        set1 (list): List of folder paths representing the first set
        set2 (list): List of folder paths representing the second set
        exclusion_folder (str): Path to the folder containing files to exclude
        exclusive_path (str or tuple): Path(s) where specific subfolders should be excluded
    
    Returns:
        str: Absolute path of the 'updated_fd' where the result is stored
    """
    # Define the main output folder
    updated_fd = "./updated_fd_all_all_test2"
    if os.path.exists(updated_fd):
        shutil.rmtree(updated_fd)
    os.makedirs(updated_fd)

    # Define the file extensions to consider
    doc_extensions = ('.doc', '.docx', '.DOC', '.DOCX')
    pdf_extensions = ('.pdf', '.PDF')

    # Define substrings to exclude in doc filenames
    excluded_substrings = ['F32', 'F60', 'phieu bao hanh', 'livery note']

    # Ensure exclusive_path is a tuple or list
    if isinstance(exclusive_path, str):
        exclusive_path = (exclusive_path,)

    # Check that both sets have the same number of folders
    if len(set1) != len(set2):
        raise ValueError("Both sets of folders must have the same number of folders.")

    def get_files_from_folder(folder_path, extensions):
        """Helper function to get all files with specific extensions from a folder"""
        files = {}
        if os.path.exists(folder_path):
            for root, _, filenames in os.walk(folder_path):
                for filename in filenames:
                    if filename.lower().endswith(extensions):
                        files[filename.lower()] = os.path.join(root, filename)
        return files

    # Get excluded files
    excluded_files = set()
    if os.path.exists(exclusion_folder):
        excluded_files = {f.lower() for f in os.listdir(exclusion_folder) 
                        if f.lower().endswith(doc_extensions + pdf_extensions)}

    # Process each pair of folders
    for idx, (folder_A, folder_B) in enumerate(zip(set1, set2), start=1):
        # Create output folder structure
        name_A = os.path.basename(os.path.normpath(folder_A))
        name_B = os.path.basename(os.path.normpath(folder_B))
        pair_folder_name = f"{name_A}_vs_{name_B}"
        output_folder = os.path.join(updated_fd, pair_folder_name)
        os.makedirs(output_folder, exist_ok=True)

        # Create output subfolders with new names
        fd_doc = os.path.join(output_folder, 'fd_doc')  # Changed from doc_files
        fd_pdf = os.path.join(output_folder, 'fd_pdf')  # Changed from pdf_files
        tree_structure = os.path.join(output_folder, 'tree_structure')
        os.makedirs(fd_doc, exist_ok=True)
        os.makedirs(fd_pdf, exist_ok=True)
        os.makedirs(tree_structure, exist_ok=True)

        # Ensure doc_files and pdf_files exist in folder_A (keeping original names)
        folder_A_doc = os.path.join(folder_A, 'doc_files')
        folder_A_pdf = os.path.join(folder_A, 'pdf_files')
        os.makedirs(folder_A_doc, exist_ok=True)
        os.makedirs(folder_A_pdf, exist_ok=True)

        # Process DOC files
        doc_files_A = get_files_from_folder(folder_A_doc, doc_extensions)
        doc_files_B = {}
        doc_files_B_full_paths = {}  # Store full paths for tree structure
        
        # Walk through folder B to find all doc files
        for root, dirs, files in os.walk(folder_B):
            # Skip '2. Contract' subfolder in exclusive_path
            if folder_B in exclusive_path and root == folder_B:
                if '2. Contract' in dirs:
                    dirs.remove('2. Contract')
            
            for file in files:
                if file.lower().endswith(doc_extensions):
                    # Check for excluded substrings
                    if not any(substr.lower() in file.lower() for substr in excluded_substrings):
                        doc_files_B[file.lower()] = os.path.join(root, file)
                        doc_files_B_full_paths[file.lower()] = (root, file)

        # Process PDF files
        pdf_files_A = get_files_from_folder(folder_A_pdf, pdf_extensions)
        pdf_files_B = {}
        pdf_files_B_full_paths = {}  # Store full paths for tree structure
        
        # Walk through folder B to find all pdf files
        for root, dirs, files in os.walk(folder_B):
            # Skip '2. Contract' subfolder in exclusive_path
            if folder_B in exclusive_path and root == folder_B:
                if '2. Contract' in dirs:
                    dirs.remove('2. Contract')
            
            for file in files:
                if file.lower().endswith(pdf_extensions):
                    pdf_files_B[file.lower()] = os.path.join(root, file)
                    pdf_files_B_full_paths[file.lower()] = (root, file)

        # Find different files
        diff_docs = set(doc_files_B.keys()) - set(doc_files_A.keys()) - excluded_files
        diff_pdfs = set(pdf_files_B.keys()) - set(pdf_files_A.keys()) - excluded_files

        # Copy different DOC files
        for doc in diff_docs:
            src = doc_files_B[doc]
            # Copy to output folder
            dst_output = os.path.join(fd_doc, os.path.basename(src))  # Using fd_doc
            # Copy to original folder A
            dst_original = os.path.join(folder_A_doc, os.path.basename(src))
            # Copy to tree structure maintaining original hierarchy
            root, filename = doc_files_B_full_paths[doc]
            rel_path = os.path.relpath(root, folder_B)
            dst_tree = os.path.join(tree_structure, rel_path, filename)
            
            try:
                # Copy to output folder
                shutil.copy2(src, dst_output)
                # Copy to original folder
                shutil.copy2(src, dst_original)
                # Copy to tree structure
                os.makedirs(os.path.dirname(dst_tree), exist_ok=True)
                shutil.copy2(src, dst_tree)
            except Exception as e:
                pass
                print(f"Error copying {src}: {e}")

        # Copy different PDF files
        for pdf in diff_pdfs:
            src = pdf_files_B[pdf]
            # Copy to output folder
            dst_output = os.path.join(fd_pdf, os.path.basename(src))  # Using fd_pdf
            # Copy to original folder A
            dst_original = os.path.join(folder_A_pdf, os.path.basename(src))
            # Copy to tree structure maintaining original hierarchy
            root, filename = pdf_files_B_full_paths[pdf]
            rel_path = os.path.relpath(root, folder_B)
            dst_tree = os.path.join(tree_structure, rel_path, filename)
            
            try:
                # Copy to output folder
                shutil.copy2(src, dst_output)
                # Copy to original folder
                shutil.copy2(src, dst_original)
                # Copy to tree structure
                os.makedirs(os.path.dirname(dst_tree), exist_ok=True)
                shutil.copy2(src, dst_tree)
            except Exception as e:
                pass
                print(f"Error copying {src}: {e}")

        print(f"Pair {idx} ({name_A} vs {name_B}):")
        print(f"  - Copied {len(diff_docs)} different DOC files")
        print(f"    * To output: {fd_doc}")
        print(f"    * To original: {folder_A_doc}")
        print(f"    * To tree structure: {tree_structure}")
        print(f"  - Copied {len(diff_pdfs)} different PDF files")
        print(f"    * To output: {fd_pdf}")
        print(f"    * To original: {folder_A_pdf}")
        print(f"    * To tree structure: {tree_structure}")

    return os.path.abspath(updated_fd)

# Example usage
exclusion_folder = r"F:\file_can_pass_or_loi"
exclusive_path = (r"D:\Vector Cloud\Procurement\Procurement12",)
set1 = [
    r"F:\docu\Sale_docu",
    r"F:\docu\CT_in_Procur_docu",
    r"F:\docu\Proposal_docu"
]
set2 = [
    r"D:\Vector Cloud\Sales\3. SI Sales\1. SI", 
    r"D:\Vector Cloud\Procurement\Procurement",
    r"D:\Vector Cloud\Proposal\1. Project", 
]
result_path = compare_fd_then_create_updatedfd(set1, set2, exclusion_folder, exclusive_path)
print(f"Comparison completed. Files stored in: {result_path}")

Pair 1 (Sale_docu vs 1. SI):
  - Copied 0 different DOC files
    * To output: ./updated_fd_all_all_test2\Sale_docu_vs_1. SI\fd_doc
    * To original: F:\docu\Sale_docu\doc_files
    * To tree structure: ./updated_fd_all_all_test2\Sale_docu_vs_1. SI\tree_structure
  - Copied 0 different PDF files
    * To output: ./updated_fd_all_all_test2\Sale_docu_vs_1. SI\fd_pdf
    * To original: F:\docu\Sale_docu\pdf_files
    * To tree structure: ./updated_fd_all_all_test2\Sale_docu_vs_1. SI\tree_structure
Error copying D:\Vector Cloud\Procurement\Procurement\2. Contract\2020\CT2005026_SI_KN - Yokogawa for ISD\Service\Service Oct-2024\Draft Invoice - Vector-cmt.pdf: [WinError 362] The cloud file provider is not running
Pair 2 (CT_in_Procur_docu vs Procurement):
  - Copied 0 different DOC files
    * To output: ./updated_fd_all_all_test2\CT_in_Procur_docu_vs_Procurement\fd_doc
    * To original: F:\docu\CT_in_Procur_docu\doc_files
    * To tree structure: ./updated_fd_all_all_test2\CT_in_Procur_do

In [28]:
import os
import shutil
import sys
import subprocess

def find_soffice_path():
    """Find the LibreOffice executable path on different operating systems."""
    if sys.platform == "win32":
        possible_paths = [
            r"C:\Program Files\LibreOffice\program\soffice.exe",
            r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
        ]
        for path in possible_paths:
            if os.path.exists(path):
                print(f"Found LibreOffice at: {path}")
                return path
        print("LibreOffice not found in standard locations!")
    elif sys.platform == "darwin":  # macOS
        return "/Applications/LibreOffice.app/Contents/MacOS/soffice"
    else:  # Linux
        return "soffice"
    return None

def doc_to_pdf(doc_path, output_dir):
    """Converts a Word document to PDF using LibreOffice command-line tools."""
    soffice_path = find_soffice_path()
    if not soffice_path:
        raise RuntimeError("LibreOffice not found. Please verify it's installed correctly.")
    
    doc_path = os.path.abspath(doc_path)
    output_dir = os.path.abspath(output_dir)
    
    if not os.path.exists(doc_path):
        raise FileNotFoundError(f"Input file not found: {doc_path}")
    
    print(f"Converting file: {doc_path}")
    print(f"Output directory: {output_dir}")
    
    try:
        result = subprocess.run([
            soffice_path,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            doc_path
        ], capture_output=True, text=True)
        
        if result.returncode == 0:
            expected_pdf = os.path.join(output_dir, 
                                      os.path.splitext(os.path.basename(doc_path))[0] + '.pdf')
            if os.path.exists(expected_pdf):
                print(f"Successfully converted: {doc_path}")
                return True
            else:
                print(f"Conversion seemed to succeed but PDF not found: {expected_pdf}")
                return False
        else:
            print(f"Conversion failed for {doc_path}")
            print(f"Error output: {result.stderr}")
            return False
            
    except Exception as e:
        print(f"Error converting {doc_path}: {str(e)}")
        return False

def compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path):
    """
    Compare doc_files and pdf_files subfolders between set1 and set2, copying different files
    to respective folders and converting doc files to PDF.
    """
    # Define the main output folder
    # updated_fd = "./updated_fd_all_all_test2"
    if os.path.exists(updated_fd):
        shutil.rmtree(updated_fd)
    os.makedirs(updated_fd)

    # Define the file extensions to consider
    doc_extensions = ('.doc', '.docx', '.DOC', '.DOCX')
    pdf_extensions = ('.pdf', '.PDF')

    # Define substrings to exclude in doc filenames
    excluded_substrings = ['F32', 'F60', 'phieu bao hanh', 'livery note']

    # Ensure exclusive_path is a tuple or list
    if isinstance(exclusive_path, str):
        exclusive_path = (exclusive_path,)

    if len(set1) != len(set2):
        raise ValueError("Both sets of folders must have the same number of folders.")

    def get_files_from_folder(folder_path, extensions):
        files = {}
        if os.path.exists(folder_path):
            for root, _, filenames in os.walk(folder_path):
                for filename in filenames:
                    if filename.lower().endswith(extensions):
                        files[filename.lower()] = os.path.join(root, filename)
        return files

    # Get excluded files
    excluded_files = set()
    if os.path.exists(exclusion_folder):
        excluded_files = {f.lower() for f in os.listdir(exclusion_folder) 
                        if f.lower().endswith(doc_extensions + pdf_extensions)}

    # Track overall conversion statistics
    total_successful_conversions = 0
    total_failed_conversions = 0

    # Process each pair of folders
    for idx, (folder_A, folder_B) in enumerate(zip(set1, set2), start=1):
        name_A = os.path.basename(os.path.normpath(folder_A))
        name_B = os.path.basename(os.path.normpath(folder_B))
        pair_folder_name = f"{name_A}_vs_{name_B}"
        output_folder = os.path.join(updated_fd, pair_folder_name)
        os.makedirs(output_folder, exist_ok=True)

        # Create all output subfolders
        fd_doc = os.path.join(output_folder, 'fd_doc')
        fd_pdf = os.path.join(output_folder, 'fd_pdf')
        tree_structure = os.path.join(output_folder, 'tree_structure')
        doc_to_pdf_folder = os.path.join(output_folder, 'doc_to_pdf')  # New folder for converted PDFs
        
        for folder in [fd_doc, fd_pdf, tree_structure, doc_to_pdf_folder]:
            os.makedirs(folder, exist_ok=True)

        # Ensure doc_files and pdf_files exist in folder_A
        folder_A_doc = os.path.join(folder_A, 'doc_files')
        folder_A_pdf = os.path.join(folder_A, 'pdf_files')
        os.makedirs(folder_A_doc, exist_ok=True)
        os.makedirs(folder_A_pdf, exist_ok=True)

        # Process files (similar to before)
        doc_files_A = get_files_from_folder(folder_A_doc, doc_extensions)
        doc_files_B = {}
        doc_files_B_full_paths = {}
        
        for root, dirs, files in os.walk(folder_B):
            if folder_B in exclusive_path and root == folder_B:
                if '2. Contract' in dirs:
                    dirs.remove('2. Contract')
            
            for file in files:
                if file.lower().endswith(doc_extensions):
                    if not any(substr.lower() in file.lower() for substr in excluded_substrings):
                        doc_files_B[file.lower()] = os.path.join(root, file)
                        doc_files_B_full_paths[file.lower()] = (root, file)

        pdf_files_A = get_files_from_folder(folder_A_pdf, pdf_extensions)
        pdf_files_B = {}
        pdf_files_B_full_paths = {}
        
        for root, dirs, files in os.walk(folder_B):
            if folder_B in exclusive_path and root == folder_B:
                if '2. Contract' in dirs:
                    dirs.remove('2. Contract')
            
            for file in files:
                if file.lower().endswith(pdf_extensions):
                    pdf_files_B[file.lower()] = os.path.join(root, file)
                    pdf_files_B_full_paths[file.lower()] = (root, file)

        diff_docs = set(doc_files_B.keys()) - set(doc_files_A.keys()) - excluded_files
        diff_pdfs = set(pdf_files_B.keys()) - set(pdf_files_A.keys()) - excluded_files

        # Copy and convert files
        successful_conversions = 0
        failed_conversions = 0

        # Copy and convert DOC files
        for doc in diff_docs:
            src = doc_files_B[doc]
            dst_output = os.path.join(fd_doc, os.path.basename(src))
            dst_original = os.path.join(folder_A_doc, os.path.basename(src))
            root, filename = doc_files_B_full_paths[doc]
            rel_path = os.path.relpath(root, folder_B)
            dst_tree = os.path.join(tree_structure, rel_path, filename)
            
            try:
                # Copy files to all locations
                for dst in [dst_output, dst_original, dst_tree]:
                    os.makedirs(os.path.dirname(dst), exist_ok=True)
                    shutil.copy2(src, dst)
                
                # Convert to PDF
                if doc_to_pdf(dst_output, doc_to_pdf_folder):
                    successful_conversions += 1
                else:
                    failed_conversions += 1
                    
            except Exception as e:
                print(f"Error processing {src}: {e}")
                failed_conversions += 1

        # Copy PDF files (no conversion needed)
        for pdf in diff_pdfs:
            src = pdf_files_B[pdf]
            dst_output = os.path.join(fd_pdf, os.path.basename(src))
            dst_original = os.path.join(folder_A_pdf, os.path.basename(src))
            root, filename = pdf_files_B_full_paths[pdf]
            rel_path = os.path.relpath(root, folder_B)
            dst_tree = os.path.join(tree_structure, rel_path, filename)
            
            try:
                for dst in [dst_output, dst_original, dst_tree]:
                    os.makedirs(os.path.dirname(dst), exist_ok=True)
                    shutil.copy2(src, dst)
            except Exception as e:
                print(f"Error copying {src}: {e}")

        # Update total statistics
        total_successful_conversions += successful_conversions
        total_failed_conversions += failed_conversions

        # Print results for this pair
        print(f"\nResults for pair {idx} ({name_A} vs {name_B}):")
        print(f"  - Copied {len(diff_docs)} different DOC files")
        print(f"  - Copied {len(diff_pdfs)} different PDF files")
        print(f"  - Successfully converted to PDF: {successful_conversions} files")
        print(f"  - Failed conversions: {failed_conversions} files")

    # Print overall results
    print(f"\nOverall results:")
    print(f"Total files successfully converted to PDF: {total_successful_conversions}")
    print(f"Total failed conversions: {total_failed_conversions}")

    return os.path.abspath(updated_fd)

# Example usage
exclusion_folder = r"F:\file_can_pass_or_loi"
exclusive_path = (r"D:\Vector Cloud\Procurement\Procurement12",)
updated_fd = "./updated_fd_all_all_test1"
set1 = [
    r"F:\docu\Sale_docu",
]
set2 = [
    r"D:\Vector Cloud\Sales\3. SI Sales\1. SI", 
]

result_path = compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path)
print(f"Comparison and conversion completed. Files stored in: {result_path}")


Results for pair 1 (Sale_docu vs 1. SI):
  - Copied 0 different DOC files
  - Copied 0 different PDF files
  - Successfully converted to PDF: 0 files
  - Failed conversions: 0 files

Overall results:
Total files successfully converted to PDF: 0
Total failed conversions: 0
Comparison and conversion completed. Files stored in: d:\second_docu\updated_fd_all_all_test1


In [33]:
import os
import shutil
import sys
import subprocess

def find_soffice_path():
    """Find the LibreOffice executable path on different operating systems."""
    if sys.platform == "win32":
        possible_paths = [
            r"C:\Program Files\LibreOffice\program\soffice.exe",
            r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
        ]
        for path in possible_paths:
            if os.path.exists(path):
                print(f"Found LibreOffice at: {path}")
                return path
        print("LibreOffice not found in standard locations!")
    elif sys.platform == "darwin":  # macOS
        return "/Applications/LibreOffice.app/Contents/MacOS/soffice"
    else:  # Linux
        return "soffice"
    return None

def doc_to_pdf(doc_path, output_dir):
    """Converts a Word document to PDF using LibreOffice command-line tools."""
    soffice_path = find_soffice_path()
    if not soffice_path:
        raise RuntimeError("LibreOffice not found. Please verify it's installed correctly.")
    
    doc_path = os.path.abspath(doc_path)
    output_dir = os.path.abspath(output_dir)
    
    if not os.path.exists(doc_path):
        raise FileNotFoundError(f"Input file not found: {doc_path}")
    
    print(f"Converting file: {doc_path}")
    print(f"Output directory: {output_dir}")
    
    try:
        result = subprocess.run([
            soffice_path,
            '--headless',
            '--convert-to', 'pdf',
            '--outdir', output_dir,
            doc_path
        ], capture_output=True, text=True)
        
        if result.returncode == 0:
            expected_pdf = os.path.join(output_dir, 
                                      os.path.splitext(os.path.basename(doc_path))[0] + '.pdf')
            if os.path.exists(expected_pdf):
                print(f"Successfully converted: {doc_path}")
                return True
            else:
                print(f"Conversion seemed to succeed but PDF not found: {expected_pdf}")
                return False
        else:
            print(f"Conversion failed for {doc_path}")
            print(f"Error output: {result.stderr}")
            return False
            
    except Exception as e:
        print(f"Error converting {doc_path}: {str(e)}")
        return False

def compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path):
    """
    Compare doc_files and pdf_files subfolders between set1 and set2, copying different files
    to respective folders, converting doc files to PDF, and merging all PDFs into a single folder.
    
    Args:
        set1 (list): List of folder paths representing the first set
        set2 (list): List of folder paths representing the second set
        exclusion_folder (str): Path to the folder containing files to exclude
        exclusive_path (str or tuple): Path(s) where specific subfolders should be excluded
    
    Returns:
        dict: Dictionary with pair folder names as keys and absolute path of merge_pdf folder as value
    """
    # Define the main output folder
    # updated_fd = "./updated_fd_all_all_test2"
    if os.path.exists(updated_fd):
        shutil.rmtree(updated_fd)
    os.makedirs(updated_fd)

    # Define the file extensions to consider
    doc_extensions = ('.doc', '.docx', '.DOC', '.DOCX')
    pdf_extensions = ('.pdf', '.PDF')

    # Define substrings to exclude in doc filenames
    excluded_substrings = ['F32', 'F60', 'phieu bao hanh', 'livery note']

    # Dictionary to store merge_pdf folder paths
    merge_pdf_paths = {}

    # Ensure exclusive_path is a tuple or list
    if isinstance(exclusive_path, str):
        exclusive_path = (exclusive_path,)

    # Check that both sets have the same number of folders
    if len(set1) != len(set2):
        raise ValueError("Both sets of folders must have the same number of folders.")

    def get_files_from_folder(folder_path, extensions):
        """Helper function to get all files with specific extensions from a folder"""
        files = {}
        if os.path.exists(folder_path):
            for root, _, filenames in os.walk(folder_path):
                for filename in filenames:
                    if filename.lower().endswith(extensions):
                        files[filename.lower()] = os.path.join(root, filename)
        return files

    # Get excluded files
    excluded_files = set()
    if os.path.exists(exclusion_folder):
        excluded_files = {f.lower() for f in os.listdir(exclusion_folder) 
                        if f.lower().endswith(doc_extensions + pdf_extensions)}

    # Track overall conversion statistics
    total_successful_conversions = 0
    total_failed_conversions = 0

    # Process each pair of folders
    for idx, (folder_A, folder_B) in enumerate(zip(set1, set2), start=1):
        name_A = os.path.basename(os.path.normpath(folder_A))
        name_B = os.path.basename(os.path.normpath(folder_B))
        pair_folder_name = f"{name_A}_vs_{name_B}"
        output_folder = os.path.join(updated_fd, pair_folder_name)
        os.makedirs(output_folder, exist_ok=True)

        # Create all output subfolders
        fd_doc = os.path.join(output_folder, 'fd_doc')
        fd_pdf = os.path.join(output_folder, 'fd_pdf')
        tree_structure = os.path.join(output_folder, 'tree_structure')
        doc_to_pdf_folder = os.path.join(output_folder, 'doc_to_pdf')
        merge_pdf_folder = os.path.join(output_folder, 'merge_pdf')
        
        for folder in [fd_doc, fd_pdf, tree_structure, doc_to_pdf_folder, merge_pdf_folder]:
            os.makedirs(folder, exist_ok=True)

        # Store absolute path of merge_pdf folder in the dictionary
        merge_pdf_paths[pair_folder_name] = os.path.abspath(merge_pdf_folder)

        # Ensure doc_files and pdf_files exist in folder_A
        folder_A_doc = os.path.join(folder_A, 'doc_files')
        folder_A_pdf = os.path.join(folder_A, 'pdf_files')
        os.makedirs(folder_A_doc, exist_ok=True)
        os.makedirs(folder_A_pdf, exist_ok=True)

        # Process DOC files
        doc_files_A = get_files_from_folder(folder_A_doc, doc_extensions)
        doc_files_B = {}
        doc_files_B_full_paths = {}
        
        # Walk through folder B to find all doc files
        for root, dirs, files in os.walk(folder_B):
            # Skip '2. Contract' subfolder in exclusive_path
            if folder_B in exclusive_path and root == folder_B:
                if '2. Contract' in dirs:
                    dirs.remove('2. Contract')
            
            for file in files:
                if file.lower().endswith(doc_extensions):
                    # Check for excluded substrings
                    if not any(substr.lower() in file.lower() for substr in excluded_substrings):
                        doc_files_B[file.lower()] = os.path.join(root, file)
                        doc_files_B_full_paths[file.lower()] = (root, file)

        # Process PDF files
        pdf_files_A = get_files_from_folder(folder_A_pdf, pdf_extensions)
        pdf_files_B = {}
        pdf_files_B_full_paths = {}
        
        # Walk through folder B to find all pdf files
        for root, dirs, files in os.walk(folder_B):
            # Skip '2. Contract' subfolder in exclusive_path
            if folder_B in exclusive_path and root == folder_B:
                if '2. Contract' in dirs:
                    dirs.remove('2. Contract')
            
            for file in files:
                if file.lower().endswith(pdf_extensions):
                    pdf_files_B[file.lower()] = os.path.join(root, file)
                    pdf_files_B_full_paths[file.lower()] = (root, file)

        # Find different files
        diff_docs = set(doc_files_B.keys()) - set(doc_files_A.keys()) - excluded_files
        diff_pdfs = set(pdf_files_B.keys()) - set(pdf_files_A.keys()) - excluded_files

        # Copy and convert files
        successful_conversions = 0
        failed_conversions = 0

        # Copy and convert DOC files
        for doc in diff_docs:
            src = doc_files_B[doc]
            dst_output = os.path.join(fd_doc, os.path.basename(src))
            dst_original = os.path.join(folder_A_doc, os.path.basename(src))
            root, filename = doc_files_B_full_paths[doc]
            rel_path = os.path.relpath(root, folder_B)
            dst_tree = os.path.join(tree_structure, rel_path, filename)
            
            try:
                # Copy files to all locations
                for dst in [dst_output, dst_original, dst_tree]:
                    os.makedirs(os.path.dirname(dst), exist_ok=True)
                    shutil.copy2(src, dst)
                
                # Convert to PDF
                if doc_to_pdf(dst_output, doc_to_pdf_folder):
                    successful_conversions += 1
                else:
                    failed_conversions += 1
                    
            except Exception as e:
                print(f"Error processing {src}: {e}")
                failed_conversions += 1

        # Copy PDF files
        for pdf in diff_pdfs:
            src = pdf_files_B[pdf]
            dst_output = os.path.join(fd_pdf, os.path.basename(src))
            dst_original = os.path.join(folder_A_pdf, os.path.basename(src))
            root, filename = pdf_files_B_full_paths[pdf]
            rel_path = os.path.relpath(root, folder_B)
            dst_tree = os.path.join(tree_structure, rel_path, filename)
            
            try:
                for dst in [dst_output, dst_original, dst_tree]:
                    os.makedirs(os.path.dirname(dst), exist_ok=True)
                    shutil.copy2(src, dst)
            except Exception as e:
                print(f"Error copying {src}: {e}")

        # Update total statistics
        total_successful_conversions += successful_conversions
        total_failed_conversions += failed_conversions

        # Copy all PDFs to merge_pdf folder
        print(f"\nMerging PDFs for pair {idx} ({name_A} vs {name_B}):")
        
        # Copy PDFs from fd_pdf
        pdf_count = 0
        for filename in os.listdir(fd_pdf):
            if filename.lower().endswith('.pdf'):
                src = os.path.join(fd_pdf, filename)
                dst = os.path.join(merge_pdf_folder, filename)
                try:
                    shutil.copy2(src, dst)
                    pdf_count += 1
                except Exception as e:
                    print(f"Error copying {src} to merge_pdf: {e}")

        # Copy PDFs from doc_to_pdf
        for filename in os.listdir(doc_to_pdf_folder):
            if filename.lower().endswith('.pdf'):
                src = os.path.join(doc_to_pdf_folder, filename)
                dst = os.path.join(merge_pdf_folder, filename)
                try:
                    shutil.copy2(src, dst)
                    pdf_count += 1
                except Exception as e:
                    print(f"Error copying {src} to merge_pdf: {e}")

        print(f"  - Total PDFs merged: {pdf_count}")
        print(f"  - Merge PDF location: {merge_pdf_paths[pair_folder_name]}")

        # Print results for this pair
        print(f"\nResults for pair {idx} ({name_A} vs {name_B}):")
        print(f"  - Copied {len(diff_docs)} different DOC files")
        print(f"  - Copied {len(diff_pdfs)} different PDF files")
        print(f"  - Successfully converted to PDF: {successful_conversions} files")
        print(f"  - Failed conversions: {failed_conversions} files")

    # Print overall results
    print(f"\nOverall results:")
    print(f"Total files successfully converted to PDF: {total_successful_conversions}")
    print(f"Total failed conversions: {total_failed_conversions}")

    return merge_pdf_paths


# Example usage
exclusion_folder = r"F:\file_can_pass_or_loi"
exclusive_path = (r"D:\Vector Cloud\Procurement\Procurement12",)
updated_fd = "./updated_fd_all_all_test1"
# set1 = [
#     r"F:\docu\Sale_docu",
# ]
# set2 = [
#     r"D:\Vector Cloud\Sales\3. SI Sales\1. SI", 
# ]

set1 = [
    r"F:\docu\Sale_docu",
    r"F:\docu\CT_in_Procur_docu",
    r"F:\docu\Proposal_docu"
]
set2 = [
    r"D:\Vector Cloud\Sales\3. SI Sales\1. SI", 
    r"D:\Vector Cloud\Procurement\Procurement",
    r"D:\Vector Cloud\Proposal\1. Project", 
]


result_path = compare_fd_then_create_updatedfd(set1, set2, updated_fd, exclusion_folder, exclusive_path)
print(f"Comparison and conversion completed. Files stored in: {result_path}")
import os
import json

def count_files_in_directories(base_paths_dict):
    """
    Count files in fd_doc, fd_pdf, and merge_pdf folders for each pair directory
    and save results to JSON.
    
    Args:
        base_paths_dict (dict): Dictionary with pair names and merge_pdf paths
    """
    # Function to count files in a directory
    def count_files(directory):
        if not os.path.exists(directory):
            print(f"Warning: Directory does not exist: {directory}")
            return 0
        return len([f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))])

    # Dictionary to store all counts
    results = {}

    for pair_name, merge_pdf_path in base_paths_dict.items():
        # Get the base directory for this pair
        base_dir = os.path.dirname(merge_pdf_path)
        
        # Define paths for each folder type
        fd_doc_path = os.path.join(base_dir, 'fd_doc')
        fd_pdf_path = os.path.join(base_dir, 'fd_pdf')
        
        # Count files in each directory
        doc_count = count_files(fd_doc_path)
        pdf_count = count_files(fd_pdf_path)
        merge_pdf_count = count_files(merge_pdf_path)

        # Store counts in results dictionary
        results[pair_name] = {
            'fd_doc_count': doc_count,
            'fd_pdf_count': pdf_count,
            'merge_pdf_count': merge_pdf_count,
            'paths': {
                'fd_doc': fd_doc_path,
                'fd_pdf': fd_pdf_path,
                'merge_pdf': merge_pdf_path
            }
        }

        # Print counts for this pair
        # print(f"\nCounts for {pair_name}:")
        # print(f"Number of files in fd_doc: {doc_count}")
        # print(f"Number of files in fd_pdf: {pdf_count}")
        # print(f"Number of files in merge_pdf: {merge_pdf_count}")
        # print('*' * 50)

    # Save results to JSON file
    output_file = 'file_counts.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4, ensure_ascii=False)
    
    print(f"\nResults have been saved to {output_file}")
    return results

# Example usage
# base_paths_dict = {
#     'Sale_docu_vs_1. SI': r'd:\second_docu\updated_fd_all_all_test1\Sale_docu_vs_1. SI\merge_pdf'
# }

# Run the counting function
results = count_files_in_directories(result_path)


Merging PDFs for pair 1 (Sale_docu vs 1. SI):
  - Total PDFs merged: 0
  - Merge PDF location: d:\second_docu\updated_fd_all_all_test1\Sale_docu_vs_1. SI\merge_pdf

Results for pair 1 (Sale_docu vs 1. SI):
  - Copied 0 different DOC files
  - Copied 0 different PDF files
  - Successfully converted to PDF: 0 files
  - Failed conversions: 0 files
Error copying D:\Vector Cloud\Procurement\Procurement\2. Contract\2020\CT2005026_SI_KN - Yokogawa for ISD\Service\Service Oct-2024\Draft Invoice - Vector-cmt.pdf: [WinError 362] The cloud file provider is not running

Merging PDFs for pair 2 (CT_in_Procur_docu vs Procurement):
  - Total PDFs merged: 0
  - Merge PDF location: d:\second_docu\updated_fd_all_all_test1\CT_in_Procur_docu_vs_Procurement\merge_pdf

Results for pair 2 (CT_in_Procur_docu vs Procurement):
  - Copied 0 different DOC files
  - Copied 1 different PDF files
  - Successfully converted to PDF: 0 files
  - Failed conversions: 0 files
Error processing D:\Vector Cloud\Proposal\1. 