### Copy files from folder A to folder B

In [7]:
import shutil
import os

def copy_files(source_folder, destination_folder):
    """
    Copy every regular file located directly in ``source_folder`` into
    ``destination_folder``.

    Subfolders of the source are skipped (the copy is not recursive). The
    destination folder is created when it does not exist yet. If the source
    folder does not exist, a message is printed and nothing is copied.

    Parameters:
        source_folder (str): Folder whose files are copied.
        destination_folder (str): Folder that receives the copies.
    """
    # Nothing to do without a source; report it and bail out early
    if not os.path.exists(source_folder):
        print(f"Source folder '{source_folder}' does not exist.")
        return

    # Make sure there is somewhere to copy into
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)

    for entry in os.listdir(source_folder):
        entry_path = os.path.join(source_folder, entry)

        # Only plain files are copied; directories are left alone
        if not os.path.isfile(entry_path):
            continue

        target_path = os.path.join(destination_folder, entry)
        shutil.copy(entry_path, target_path)
        print(f"Copied: {entry} to {destination_folder}")

# Folders used for this run: copy the top-level files of the 2016 project folder
source = r"D:\Vector Cloud\Proposal\1. Project\2016"
destination = r"F:\Proposal_16"

copy_files(source_folder=source, destination_folder=destination)


### Copy DOC files

In [None]:
import os
import shutil

def copy_doc_files_flat(source_folder, destination_folder):
    """
    Recursively copy all .doc and .docx files from the source folder and its
    subdirectories into the destination folder, without preserving the
    directory structure.

    To keep flattened names distinguishable, each file found in a subdirectory
    is renamed to ``<name>_<relative directory><ext>``, where path separators
    and dots in the relative directory are replaced by underscores. Files found
    directly in ``source_folder`` keep their original name. If the resulting
    name already exists in the destination, ``_1``, ``_2``, ... is appended
    until a free name is found, so existing files are never overwritten.

    Copy failures are reported with ``print`` and do not stop the run.

    Parameters:
        source_folder (str): The path to the folder containing the .doc and .docx files.
        destination_folder (str): The path to the folder where the files will be copied.
    """
    # Create the destination folder if it doesn't exist
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
        print(f"Created destination folder: {destination_folder}")

    for root, _dirs, files in os.walk(source_folder):
        # The directory-derived suffix is the same for every file in `root`,
        # so compute it once per directory.
        relative_path = os.path.relpath(root, source_folder)
        if relative_path == os.curdir:
            # Files in the source root have no directory part; without this
            # check "." would be sanitized to "_" and yield names like "a__.doc".
            path_suffix = ""
        else:
            path_suffix = "_" + relative_path.replace(os.sep, '_').replace('.', '_')

        for filename in files:
            # The name is lower-cased first, so lower-case suffixes cover all casings
            if not filename.lower().endswith(('.doc', '.docx')):
                continue

            source_file = os.path.join(root, filename)
            name, ext = os.path.splitext(filename)
            base_name = f"{name}{path_suffix}"
            dest_file = os.path.join(destination_folder, f"{base_name}{ext}")

            # Never overwrite: append an increasing index until the name is free
            count = 1
            while os.path.exists(dest_file):
                dest_file = os.path.join(destination_folder, f"{base_name}_{count}{ext}")
                count += 1

            try:
                shutil.copy2(source_file, dest_file)
                print(f"Copied: {source_file} -> {dest_file}")
            except OSError as e:
                # shutil.Error derives from OSError, so this covers copy failures
                print(f"Failed to copy {source_file}: {e}")

if __name__ == "__main__":
    # Edit these two paths to point at your own source and destination folders
    source_folder = r"D:\Vector Cloud\Procurement\Procurement"
    destination_folder = r"F:\doc_files_in_Procur"

    # Flatten every Word document under the source tree into the destination
    copy_doc_files_flat(
        source_folder=source_folder,
        destination_folder=destination_folder,
    )


### Build a list of PDF filenames

In [None]:
import os

def get_store_pdf_filenames(folder_path):
    """
    Return the names of all PDF files found under ``folder_path``, including
    its subfolders.

    Only the bare filenames are returned (no directory part), in the order
    ``os.walk`` yields them. The ``.pdf`` extension is matched
    case-insensitively. The same name appears once for every file carrying it.
    """
    return [
        entry
        for _root, _subdirs, entries in os.walk(folder_path)
        for entry in entries
        if entry.lower().endswith('.pdf')
    ]

# Example usage: point folder_path at the folder tree you want to scan
folder_path = r"F:\Proposal_20_1587" # Change this to the folder you want to search
pdf_filenames_list = get_store_pdf_filenames(folder_path=folder_path)

# Show how many PDFs were found, then display the names (notebook cell output)
print(len(pdf_filenames_list))
pdf_filenames_list

In [None]:
import os
import shutil

# Folder pair to compare. folder_A is a flat folder of already-collected PDFs;
# folder_B is the original tree that is searched recursively.
# Alternative pairs are kept below, commented out -- enable one A/B pair at a time.
# folder_A = r"F:\CT_in_Procur_merge_11186"
# folder_A = r"F:\Sale_merge_3807"
folder_A = r"F:\Proposal_merge_20352"

# folder_B = r"D:\Vector Cloud\Procurement\Procurement\2. Contract"
# folder_B = r"D:\Vector Cloud\Sales\3. SI Sales\1. SI"
folder_B = r"D:\Vector Cloud\Proposal\1. Project"


output_folder = './yeah'

# The output folder must exist before anything is copied into it
os.makedirs(output_folder, exist_ok=True)

# Lower-cased PDF names present directly in folder A
files_in_A = {
    entry.lower()
    for entry in os.listdir(folder_A)
    if entry.lower().endswith('.pdf')
}

# Full paths of every PDF anywhere below folder B
files_in_B = []
for root, dirs, files in os.walk(folder_B):
    for file in files:
        if not file.lower().endswith('.pdf'):
            continue
        files_in_B.append(os.path.join(root, file))

# Lower-cased basenames of the PDFs found in B
filenames_in_B = {os.path.basename(path).lower() for path in files_in_B}

# Names that occur in B but are missing from A
diff_filenames = filenames_in_B.difference(files_in_A)

# Every B path whose basename is one of the missing names
diff_files = [
    path
    for path in files_in_B
    if os.path.basename(path).lower() in diff_filenames
]

# Copy the missing files into the output folder, mirroring their location below B
for src in diff_files:
    rel_path = os.path.relpath(src, folder_B)
    print(rel_path)
    dst = os.path.join(output_folder, rel_path)

    # Create the mirrored subfolder on demand, then copy
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    shutil.copy2(src, dst)

print(f"Copied {len(diff_files)} files to {output_folder}")


In [32]:
import os
import shutil

# Folder pair to compare. folder_A is a flat folder of already-collected PDFs;
# folder_B is the original tree that is searched recursively.
# Alternative pairs are kept below, commented out -- enable one A/B pair at a time.
# folder_A = r"F:\CT_in_Procur_merge_11186"
# folder_A = r"F:\Sale_merge_3807"
folder_A = r"F:\Proposal_merge_20352"

# folder_B = r"D:\Vector Cloud\Procurement\Procurement\2. Contract"
# folder_B = r"D:\Vector Cloud\Sales\3. SI Sales\1. SI"
folder_B = r"D:\Vector Cloud\Proposal\1. Project"


output_folder = './yeah'
exclusion_folder = r"F:\file_can_pass_or_loi" # Files listed here are never copied

# The output folder must exist before anything is copied into it
os.makedirs(output_folder, exist_ok=True)

# Lower-cased PDF names present directly in folder A
files_in_A = {
    entry.lower()
    for entry in os.listdir(folder_A)
    if entry.lower().endswith('.pdf')
}
# Lower-cased names of everything in the exclusion folder (any extension)
excluded_filenames = {entry.lower() for entry in os.listdir(exclusion_folder)}

# Full paths of every PDF anywhere below folder B
files_in_B = []
for root, dirs, files in os.walk(folder_B):
    for file in files:
        if not file.lower().endswith('.pdf'):
            continue
        files_in_B.append(os.path.join(root, file))

# Lower-cased basenames of the PDFs found in B
filenames_in_B = {os.path.basename(path).lower() for path in files_in_B}

# Names that occur in B, are missing from A, and are not excluded
diff_filenames = filenames_in_B.difference(files_in_A)
diff_filenames = diff_filenames.difference(excluded_filenames)

# Every B path whose basename is one of the remaining names
diff_files = [
    path
    for path in files_in_B
    if os.path.basename(path).lower() in diff_filenames
]

# Copy the selected files into the output folder, mirroring their location below B
for src in diff_files:
    rel_path = os.path.relpath(src, folder_B)
    print(rel_path)
    dst = os.path.join(output_folder, rel_path)

    # Create the mirrored subfolder on demand, then copy
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    shutil.copy2(src, dst)

print(f"Copied {len(diff_files)} files to {output_folder}")


2024\15. McDermott - Block B - Offshore\5. BCM\RFQ\Vector Infotech\Attachments\CPP PAGA BLOCK DIAGRAM.pdf
2024\15. McDermott - Block B - Offshore\5. BCM\RFQ\Vector Infotech\Attachments\CPP_BEACON LOCATION.pdf
2024\15. McDermott - Block B - Offshore\5. BCM\RFQ\Vector Infotech\Attachments\LQ PAGA BLOCK DIAGRAM.pdf
2024\15. McDermott - Block B - Offshore\5. BCM\RFQ\Vector Infotech\Attachments\LQ_BEACON LOCATION.pdf
2024\15. McDermott - Block B - Offshore\5. BCM\RFQ\Vector Infotech\Attachments\PAGA REVISED QTY.pdf
2024\15. McDermott - Block B - Offshore\5. BCM\RFQ\Vector Infotech\Attachments\Palo Alto Firewall Specification.pdf
2024\15. McDermott - Block B - Offshore\5. BCM\RFQ\Vector Infotech\Attachments\PQ-CPC0-ELE-SPC-MPC-00008-00_N01_Electrical Distribution Board.pdf
2024\15. McDermott - Block B - Offshore\5. BCM\RFQ\Vector Infotech\Attachments\FIREWALL ARCHITECTURE\UPDATED FIREWALL SCHEMATIC.pdf
2024\15. McDermott - Block B - Offshore\5. BCM\RFQ\Vector Infotech\Attachments\HOTLINE SYS

In [None]:
import os
import shutil

# Folder pairs to compare, matched by position: set1[i] is a flat folder of
# already-collected PDFs, set2[i] is the original tree searched recursively.
set1 = [
    r"F:\CT_in_Procur_merge_11186",
    r"F:\Proposal_merge_20352",
    r"F:\Sale_merge_3807"  
]

set2 = [
    r"D:\Vector Cloud\Procurement\Procurement\2. Contract",
    r"D:\Vector Cloud\Proposal\1. Project",
    r"D:\Vector Cloud\Sales\3. SI Sales\1. SI"  
]
# Main output folder; each pair gets its own subfolder inside it
updated_fd = r"./updated_fd"
os.makedirs(updated_fd, exist_ok=True)
exclusion_folder = r"F:\file_can_pass_or_loi" # Files listed here are never copied


# zip() would silently drop unmatched folders, so insist on equal lengths
if len(set1) != len(set2):
    raise ValueError("Both sets of folders must have the same number of folders.")

# Loop over the pairs of folders
for idx, (folder_A, folder_B) in enumerate(zip(set1, set2)):
    # Create a subfolder for this pair within the main output folder
    name_A = os.path.basename(os.path.normpath(folder_A))
    name_B = os.path.basename(os.path.normpath(folder_B))
    pair_folder_name = f"{name_A}_vs_{name_B}"
    output_folder = os.path.join(updated_fd, pair_folder_name)
    os.makedirs(output_folder, exist_ok=True)

    # Skip the pair (with a message) when either side is missing
    if not os.path.exists(folder_A):
        print(f"Folder A does not exist: {folder_A}")
        continue
    if not os.path.exists(folder_B):
        print(f"Folder B does not exist: {folder_B}")
        continue

    # Lower-cased PDF names present directly in folder A
    files_in_A = {f.lower() for f in os.listdir(folder_A) if f.lower().endswith('.pdf')}
    # Lower-cased names of everything in the exclusion folder (any extension)
    excluded_filenames = {f.lower() for f in os.listdir(exclusion_folder)}

    # Get all PDF files in folder B and its subfolders
    files_in_B = []
    for root, dirs, files in os.walk(folder_B):
        for file in files:
            if file.lower().endswith('.pdf'):
                files_in_B.append(os.path.join(root, file))

    # Collect filenames (without path) of files in B
    filenames_in_B = {os.path.basename(f).lower() for f in files_in_B}

    # Names in B that are missing from A and not excluded. The exclusion must be
    # applied BEFORE diff_files is built, otherwise excluded files still get copied.
    diff_filenames = filenames_in_B - files_in_A - excluded_filenames

    # Now, from files_in_B, select the files whose basename is in diff_filenames
    diff_files = [f for f in files_in_B if os.path.basename(f).lower() in diff_filenames]

    # Copy the selected files from B to the output folder, preserving relative paths
    copied_count = 0
    for src in diff_files:
        # Get the relative path of the file from folder B
        rel_path = os.path.relpath(src, folder_B)
        dst = os.path.join(output_folder, rel_path)

        # Ensure the destination subfolder exists
        os.makedirs(os.path.dirname(dst), exist_ok=True)

        # The file may have disappeared since the walk; report and move on
        if not os.path.exists(src):
            print(f"Source file does not exist: {src}")
            continue

        # Copy the file; a failure is reported but does not abort the pair
        try:
            shutil.copy2(src, dst)
        except Exception as e:
            print(f"Error copying {src} to {dst}: {e}")
        else:
            copied_count += 1

    # Report only the files that were actually copied, not merely selected
    print(f"Copied {copied_count} files for pair {idx} ({name_A} vs {name_B}) to {output_folder}")

In [None]:
from byaldi import RAGMultiModalModel
from dotenv import load_dotenv
import PIL.Image

# Load variables from a .env file (if any) into the process environment
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")

# os.getenv returns None for an unset variable, and assigning None into
# os.environ raises an opaque TypeError. Fail with a clear message instead.
missing_vars = [
    var_name
    for var_name, var_value in (
        ("HF_TOKEN", hf_token),
        ("ANTHROPIC_API_KEY", anthropic_api_key),
    )
    if var_value is None
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

os.environ["HF_TOKEN"] = hf_token
os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key

# Remove Pillow's pixel-count limit so very large page images can be opened
PIL.Image.MAX_IMAGE_PIXELS = None
attention_path = r"D:\first_docu\attn_fd\attn_CT_all_2032024_cp1"
# Load a previously built index from disk
RAG = RAGMultiModalModel.from_index(attention_path, verbose=1)
# Number of entries in the doc-id -> file-name mapping (notebook cell output)
len(RAG.get_doc_ids_to_file_names())