### Copy all PDF files into one flat folder

In [None]:
import os
import shutil
from glob import glob

def find_and_copy_pdfs(source_directory, destination_directory):
    """
    Recursively collects every PDF under a folder into one flat output folder.

    The destination folder is deleted first if it already exists and is then
    recreated empty, so each run starts from a clean output folder.

    PDFs that share a base name (for example two ``report.pdf`` files in
    different sub-folders) are all kept: later ones get a ``_1``, ``_2``, ...
    suffix instead of overwriting the earlier copy.

    Parameters:
        source_directory (str): Folder that is searched recursively for PDFs.
        destination_directory (str): Folder that receives the copies.

    Raises:
        ValueError: If the destination is the source folder itself or one of
            its parents, because wiping the destination would delete the source.
    """
    # Guard the rmtree below: never wipe a folder that holds the source tree.
    src_abs = os.path.normcase(os.path.abspath(source_directory))
    dst_abs = os.path.normcase(os.path.abspath(destination_directory))
    try:
        destination_holds_source = os.path.commonpath([src_abs, dst_abs]) == dst_abs
    except ValueError:
        # commonpath() rejects paths on different drives; they cannot overlap.
        destination_holds_source = False
    if destination_holds_source:
        raise ValueError(
            f"Destination '{destination_directory}' is the source folder or one of "
            f"its parents; refusing to delete it."
        )

    if os.path.exists(destination_directory):
        print(f"Output folder '{destination_directory}' already exists. Deleting it...")
        shutil.rmtree(destination_directory)
    os.makedirs(destination_directory)

    # Match the extension case-insensitively (.pdf, .PDF, ...) on every platform
    # and keep regular files only. Sorting makes the suffix numbering repeatable.
    candidates = glob(os.path.join(source_directory, '**', '*'), recursive=True)
    pdf_files = sorted(
        path for path in candidates
        if path.lower().endswith('.pdf') and os.path.isfile(path)
    )

    # Lower-cased names already used in the destination; lower-casing keeps the
    # check valid on case-insensitive file systems as well.
    used_names = set()

    for pdf in pdf_files:
        stem, ext = os.path.splitext(os.path.basename(pdf))
        target_name = f"{stem}{ext}"
        suffix = 1
        while target_name.lower() in used_names:
            target_name = f"{stem}_{suffix}{ext}"
            suffix += 1
        used_names.add(target_name.lower())

        try:
            shutil.copy(pdf, os.path.join(destination_directory, target_name))
            print(f"Copied: {pdf}")
        except Exception as e:
            print(f"Failed to copy {pdf}: {e}")

# Define source directory (where to search) and destination directory (where to store)
source_dir = r"D:\first_docu\updated_fd"
destination_dir = r"D:\first_docu\abcd"

# The destination is intentionally not created here: find_and_copy_pdfs() creates
# it itself. Pre-creating it only made the function delete and recreate an empty
# folder, and print its "already exists" message, on every run.
find_and_copy_pdfs(source_dir, destination_dir)


### Copy folder structure (levels 0 and 1 only) and PDF files

In [4]:
import os
import shutil

def copy_structure_and_pdfs(source_dir, target_dir):
    """
    Mirrors the first-level sub-folders of a tree and gathers their PDFs.

    Each immediate sub-folder of ``source_dir`` is recreated under ``target_dir``
    (empty ones included). Every PDF found at any depth below such a sub-folder
    is copied directly into the matching target sub-folder, so deeper levels are
    flattened. Files that sit directly in ``source_dir`` are not copied.

    PDFs with the same base name coming from different nested folders are all
    kept: later ones get a ``_1``, ``_2``, ... suffix instead of overwriting the
    earlier copy. The traversal is sorted so the suffixes are assigned the same
    way on every run, and re-running over an existing target overwrites the same
    files rather than adding duplicates.

    Parameters:
        source_dir (str or os.PathLike): Root folder to read from.
        target_dir (str or os.PathLike): Root folder to write to; created if missing.
    """
    os.makedirs(target_dir, exist_ok=True)

    # The first tuple produced by os.walk() lists the immediate sub-folders.
    # Taking it with next() avoids comparing `root` with `source_dir`, which
    # never matches when the caller passes a pathlib.Path.
    _, level_one_folders, _ = next(os.walk(source_dir), (None, [], []))

    for folder in level_one_folders:
        source_folder_path = os.path.join(source_dir, folder)
        target_folder_path = os.path.join(target_dir, folder)
        os.makedirs(target_folder_path, exist_ok=True)

        # Lower-cased names already written into this target sub-folder during
        # this run (lower-cased for case-insensitive file systems).
        used_names = set()

        for sub_root, sub_dirs, sub_files in os.walk(source_folder_path):
            sub_dirs.sort()  # in-place sort fixes the order os.walk descends in
            for file in sorted(sub_files):
                if not file.lower().endswith('.pdf'):
                    continue

                stem, ext = os.path.splitext(file)
                target_name = file
                suffix = 1
                while target_name.lower() in used_names:
                    target_name = f"{stem}_{suffix}{ext}"
                    suffix += 1
                used_names.add(target_name.lower())

                source_file_path = os.path.join(sub_root, file)
                target_file_path = os.path.join(target_folder_path, target_name)
                shutil.copy2(source_file_path, target_file_path)

# Root of the project folders to read, and the folder that receives the trimmed copy
source_directory = r"D:\Vector Cloud\Proposal\1. Project\2024"
target_directory = r"C:\Users\Admin\Downloads\copy_2024"

# Mirror the first-level folders and collect their PDFs
copy_structure_and_pdfs(source_dir=source_directory, target_dir=target_directory)


### Count pdf files

In [7]:
import os

def count_pdfs(directory):
    """
    Counts the PDF files in a folder and all of its sub-folders.

    A file counts as a PDF when its name ends with ``.pdf`` in any letter case.

    Parameters:
        directory (str): The folder to search recursively.

    Returns:
        int: Number of matching files.
    """
    return sum(
        1
        for _root, _dirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith('.pdf')
    )

# Folder whose PDFs (at any depth) should be counted
directory_path = r"C:\Users\Admin\Downloads\all_not81415_2024"

# Run the recursive count
total_pdfs = count_pdfs(directory=directory_path)

# Report the result
print(f'Total number of PDF files: {total_pdfs}')


Total number of PDF files: 319


### Map each sub-folder name to its number of PDF files

In [1]:
import os
import json
from collections import OrderedDict

def count_pdfs_in_subfolders(directory):
    """
    Counts the PDF files that sit directly inside each immediate sub-folder.

    Only the first level below ``directory`` is inspected. Entries nested deeper
    inside a sub-folder are not counted. A name counts as a PDF when it ends with
    ``.pdf`` in any letter case.

    Parameters:
        directory (str or os.PathLike): The folder whose sub-folders are analysed.

    Returns:
        OrderedDict: Sub-folder name -> number of PDF entries, in the order the
        sub-folders are reported by os.walk(). Empty if ``directory`` has no
        sub-folders or cannot be read.
    """
    pdf_count_dict = OrderedDict()

    # The first tuple produced by os.walk() lists the immediate sub-folders.
    # Taking it with next() avoids comparing `root` with `directory`, which
    # never matches (and silently yields an empty result) for a pathlib.Path.
    _, sub_folders, _ = next(os.walk(directory), (None, [], []))

    for sub_folder in sub_folders:
        sub_folder_path = os.path.join(directory, sub_folder)
        pdf_count_dict[sub_folder] = sum(
            1 for entry in os.listdir(sub_folder_path)
            if entry.lower().endswith('.pdf')
        )

    return pdf_count_dict

# Folder whose first-level sub-folders are analysed
directory_path = r"C:\Users\Admin\Downloads\copy_2024"

# Build the sub-folder -> PDF count mapping
pdf_counts = count_pdfs_in_subfolders(directory=directory_path)

# Show the mapping as indented JSON
print(json.dumps(pdf_counts, indent=4))


{
    "1. BSR - PAGA Upgrading": 29,
    "10. Calofic AI Camera": 9,
    "11. DVK-Voice Recording Extend (Draft)": 4,
    "12. Block B - Telemetry for WHPs": 8,
    "13. JVPC - VHF Radio budgetary": 9,
    "14. Long Thanh Airport - VSP": 168,
    "15. McDermott - Block B - Offshore": 255,
    "16. Digital Twin - Genco2": 36,
    "17. TL JOC - Router redundancy": 2,
    "18. Cuu Long JOC - Upgrade OTN to XTRANS": 13,
    "19. Long Thanh Airport - Hangar packet": 0,
    "2. VISG - Radio upgrade": 6,
    "20. JVPC - Motorola": 20,
    "21. TTCL - YHP - Radio": 42,
    "22. PVN Block 01 & 02 - Frame contract PM": 44,
    "23. VSP - Thien Nga Hai Au": 4,
    "3. HSIA - TETRA for T1 and T2": 5,
    "4. PPS - IPPhone and Telecom survey": 13,
    "5. JVPC - NDB upgrade": 28,
    "6. Long Thanh Airport - Aircraft fueling facility": 32,
    "7. TLJOC upgrade Router": 2,
    "8. PTSC - LDV": 463,
    "9. Murphy Oil - LDV - VSAT": 18
}


In [None]:
import os
import shutil

# Source folder containing the PDFs
# NOTE(review): "Propsals_2024" looks like a typo of "Proposals_2024" - confirm
# the folder name on disk before changing it.
source_folder = r"C:\Users\Admin\Downloads\Propsals_2024\Proposals_2024_split2"

# Target folders to copy the files
target_folder_1 = r"C:\Users\Admin\Downloads\Proposals_2024_split2_v1"
target_folder_2 = r"C:\Users\Admin\Downloads\Proposals_2024_split2_v2"
target_folder_3 = r"C:\Users\Admin\Downloads\Proposals_2024_split2_v3"
target_folders = [target_folder_1, target_folder_2, target_folder_3]

# Create target folders if they don't exist
for target_folder in target_folders:
    os.makedirs(target_folder, exist_ok=True)

# Sorted list of all PDF files in the source folder (sorted so the split is
# the same on every run)
pdf_files = sorted(
    file for file in os.listdir(source_folder) if file.lower().endswith('.pdf')
)

# Total number of files
total_files = len(pdf_files)

# Number of folders, derived from the list so the two can never disagree
num_folders = len(target_folders)

# Base number of files per folder and the remainder
files_per_folder, remainder_files = divmod(total_files, num_folders)

# The first `remainder_files` folders receive one extra file each
folder_distribution = [
    files_per_folder + (1 if i < remainder_files else 0)
    for i in range(num_folders)
]

# Copy consecutive slices of the sorted list into the target folders
current_index = 0
copied_per_folder = []

for target_folder, file_count in zip(target_folders, folder_distribution):
    batch = pdf_files[current_index:current_index + file_count]
    for file_name in batch:
        source_file = os.path.join(source_folder, file_name)
        destination_file = os.path.join(target_folder, file_name)
        shutil.copy(source_file, destination_file)
    current_index += file_count

    # Verify only the files of this batch. Counting everything in the folder
    # would include leftovers from earlier runs and report a false error.
    copied_per_folder.append(
        sum(1 for file_name in batch
            if os.path.isfile(os.path.join(target_folder, file_name)))
    )

# Number of files of this run that are present in each folder
files_copied_1, files_copied_2, files_copied_3 = copied_per_folder

total_copied = sum(copied_per_folder)

print("Files have been successfully copied into three folders with distribution:")
print(f"Folder 1: {files_copied_1} files")
print(f"Folder 2: {files_copied_2} files")
print(f"Folder 3: {files_copied_3} files")
print(f"Total files copied: {total_copied}")

# Check if the total copied files equal the total files in source
if total_copied == total_files:
    print("All files have been successfully copied.")
else:
    print("There was an error in copying files.")


### Convert doc to PDF

In [None]:
import os
import subprocess
import sys

def doc_to_pdf(doc_path, output_dir):
    """
    Converts a Word document to PDF using LibreOffice command-line tools.

    Runs ``soffice --headless --convert-to pdf`` for one file. Every failure is
    reported with a printed message instead of an exception, so a batch run can
    continue with the next file.

    Parameters:
        doc_path (str): The path to the input .doc or .docx file.
        output_dir (str): The directory where the output PDF will be saved.

    Returns:
        bool: True if soffice exited successfully and the expected PDF exists in
        ``output_dir``; False otherwise (soffice could not be started, non-zero
        exit status, or no PDF was produced).
    """
    command = [
        'soffice',
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', output_dir,
        doc_path,
    ]

    try:
        # stderr is captured (not discarded) so a failure can be explained.
        result = subprocess.run(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )
    except OSError as e:
        # Raised for example when the soffice executable is not on PATH.
        print(f"Failed to convert {doc_path}: {e}")
        return False

    if result.returncode != 0:
        reason = f"exit status {result.returncode}"
        detail = result.stderr.decode(errors='replace').strip()
        if detail:
            reason = f"{reason} - {detail}"
        print(f"Failed to convert {doc_path}: {reason}")
        return False

    # NOTE(review): this assumes LibreOffice names the output after the input's
    # base name with a .pdf extension - confirm against the installed version.
    # The check catches runs that exit with status 0 but write nothing. A stale
    # PDF with the same name from an earlier run would still pass it.
    stem = os.path.splitext(os.path.basename(doc_path))[0]
    expected_pdf = os.path.join(output_dir, stem + '.pdf')
    if not os.path.isfile(expected_pdf):
        print(f"Failed to convert {doc_path}: no PDF was produced at {expected_pdf}")
        return False

    print(f"Converted: {doc_path}")
    return True

def batch_convert_docs(folder_path, output_dir):
    """
    Converts every Word document found directly in a folder to PDF.

    Entries of ``folder_path`` whose names end with .doc or .docx (any letter
    case) are handed one by one to ``doc_to_pdf``. Sub-folders are not searched.
    The output directory is created, and announced, when it is missing.

    Parameters:
        folder_path (str): The folder that holds the Word documents.
        output_dir (str): The folder that receives the generated PDFs.
    """
    word_suffixes = ('.doc', '.docx')

    output_missing = not os.path.exists(output_dir)
    if output_missing:
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    word_documents = (
        entry for entry in os.listdir(folder_path)
        if entry.lower().endswith(word_suffixes)
    )
    for entry in word_documents:
        doc_to_pdf(os.path.join(folder_path, entry), output_dir)

if __name__ == "__main__":
    # Input: folder holding the .doc/.docx files (adjust to your machine)
    folder_path = r'D:\fd_doc_Prop_1885_split1'

    # Output: folder that will receive the generated PDFs
    output_dir = r'D:\fd_doc_Prop_1885_split1_to_pdf'

    batch_convert_docs(folder_path=folder_path, output_dir=output_dir)


### Copy all doc/docx files

In [None]:
import os
import shutil

def copy_doc_files_flat(source_folder, destination_folder):
    """
    Recursively copies all .doc and .docx files from the source folder and its subdirectories
    to the destination folder, without preserving directory structure.

    Every copy is renamed to ``<name>_<relative path><ext>``, where the relative
    path of the file's folder has its path separators and dots replaced by
    underscores. Files that sit directly in ``source_folder`` have the relative
    path ``.`` and therefore end up as ``<name>__<ext>``. If the resulting name
    already exists in the destination, ``_1``, ``_2``, ... is appended until the
    name is free, so an existing file is never overwritten.

    Copy errors are printed and do not stop the run.

    Parameters:
        source_folder (str): The path to the folder containing the .doc and .docx files.
        destination_folder (str): The path to the folder where the files will be copied.
    """
    # Create the destination folder if it doesn't exist
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
        print(f"Created destination folder: {destination_folder}")

    for root, dirs, files in os.walk(source_folder):
        # The relative path depends only on the folder, so compute it once per
        # folder instead of once per file.
        relative_path = os.path.relpath(root, source_folder)
        # Sanitize the relative path to create a safe filename
        safe_relative_path = relative_path.replace(os.sep, '_').replace('.', '_')

        for filename in files:
            # The name is lower-cased first, so lower-case suffixes cover every
            # letter case (.DOC, .Docx, ...).
            if not filename.lower().endswith(('.doc', '.docx')):
                continue

            source_file = os.path.join(root, filename)
            name, ext = os.path.splitext(filename)
            new_filename = f"{name}_{safe_relative_path}{ext}"
            dest_file = os.path.join(destination_folder, new_filename)

            # Handle duplicate filenames by appending an index if necessary
            count = 1
            while os.path.exists(dest_file):
                new_filename = f"{name}_{safe_relative_path}_{count}{ext}"
                dest_file = os.path.join(destination_folder, new_filename)
                count += 1

            try:
                shutil.copy2(source_file, dest_file)
                print(f"Copied: {source_file} -> {dest_file}")
            except Exception as e:
                print(f"Failed to copy {source_file}: {e}")

if __name__ == "__main__":
    # Tree to search for Word documents and the flat folder that receives them
    # (adjust both paths to your machine)
    source_folder = r'D:\Vector Cloud\Procurement\Procurement'
    destination_folder = r'F:\doc_in_Procur_check_2'

    copy_doc_files_flat(
        source_folder=source_folder,
        destination_folder=destination_folder,
    )


### Count all doc/docx files

In [24]:
import os

def count_doc_files(folder_path):
    """
    Tallies the entries of a folder whose names end with .doc or .docx.

    The extension check ignores letter case and only the folder itself is
    inspected (no recursion into sub-folders).

    Parameters:
        folder_path (str): The folder to inspect.

    Returns:
        int: How many entries have a .doc or .docx extension.
    """
    word_suffixes = ('.doc', '.docx')
    return sum(
        1 for entry in os.listdir(folder_path)
        if entry.lower().endswith(word_suffixes)
    )

if __name__ == "__main__":
    # Folder to inspect (adjust to your machine)
    folder_path = r'C:\Users\Admin\Downloads\colpali_train\Procur_all_docs_fd_1'

    total_files = count_doc_files(folder_path=folder_path)

    print(f"Total .doc and .docx files in '{folder_path}': {total_files}")


Total .doc and .docx files in 'C:\Users\Admin\Downloads\colpali_train\Procur_all_docs_fd_1': 6403


In [None]:
import os

# Define the paths to the folders
# NOTE(review): judging by the folder names, A holds the converted PDFs and B the
# Word originals - confirm.
folder_a = r'C:\Users\Admin\Downloads\colpali_train\Procur_all_doc_to_pdf'
folder_b =  r'C:\Users\Admin\Downloads\colpali_train\Procur_all_docs_fd_1'

# Get the file names in both folders
files_in_a = set(os.listdir(folder_a))
files_in_b = set(os.listdir(folder_b))

# Compare by base name (extension removed). Comparing full names would flag
# every file in B when the two folders hold different file types.
stems_in_a = {os.path.splitext(name)[0] for name in files_in_a}

# Files in B whose base name has no counterpart in A
files_only_in_b = {
    name for name in files_in_b
    if os.path.splitext(name)[0] not in stems_in_a
}

# Sorted list, so the printed order is the same on every run
files_only_in_b_list = sorted(files_only_in_b)

# Print the list
print("Files in folder B but not in folder A:")
for file in files_only_in_b_list:
    print(file)

print(f"Total files: {len(files_only_in_b_list)}")


In [23]:
import os

# Define the paths to folder A and folder B
folder_a =  r'C:\Users\Admin\Downloads\colpali_train\Procur_all_docs_fd_1'
folder_b = r'C:\Users\Admin\Downloads\colpali_train\Procur_all_doc_to_pdf'

# Filenames (without extensions) of the Word files in folder A. The name is
# lower-cased before the check so every letter case of .doc/.docx matches.
files_in_a = [
    os.path.splitext(f)[0]
    for f in os.listdir(folder_a)
    if f.lower().endswith(('.doc', '.docx'))
]

# Filenames (without extensions) of the PDF files in folder B
files_in_b = [
    os.path.splitext(f)[0]
    for f in os.listdir(folder_b)
    if f.lower().endswith('.pdf')
]

# A set makes each membership test constant-time; scanning the list for every
# PDF is quadratic when the folders hold thousands of files.
stems_in_a = set(files_in_a)

# Find files that are in folder B but not in folder A (folder B order is kept)
files_only_in_b = [f for f in files_in_b if f not in stems_in_a]

# Print the result
print("Files in folder B but not in folder A:")
for file in files_only_in_b:
    print(file)
counter = len(files_only_in_b)
print(f"Total files: {counter}")


Files in folder B but not in folder A:
Cam ket noi dung hang xuat_Eng_VNese V072014_2_ Contract_2022_CT2209052 - SI - ND Cung Cấp TTB cho Mông Dương_Outgoing POs_Ecom_Demo Autronica-Tam nhap tai xuat_Docs for Customs Clearance_Tai xuat
Supplier confirmation for the use of E Signature  Banten_2_ Contract_2015_OB150201-Dai Hung Gas Gathering Project_OB150201-Dai Hung Project_Logistics_Certificate, CO,CQ_Confirmation of E Signature
Supplier confirmation for the use of E Signature  Eltron_2_ Contract_2015_OB150201-Dai Hung Gas Gathering Project_OB150201-Dai Hung Project_Logistics_Certificate, CO,CQ_Confirmation of E Signature
Supplier confirmation for the use of E Signature  Samlex_2_ Contract_2015_OB150201-Dai Hung Gas Gathering Project_OB150201-Dai Hung Project_Logistics_Certificate, CO,CQ_Confirmation of E Signature
Total files: 4


### List the unique file extensions in a folder

In [21]:
import os
# Folder to inspect - replace with the actual path to folder C
folder_c = r"C:\Users\Admin\Downloads\colpali_train\Procur_all_docs_fd_1"

# Collect every non-empty extension exactly once (letter case is kept as found)
all_extensions = (os.path.splitext(entry)[1] for entry in os.listdir(folder_c))
extensions_set = {extension for extension in all_extensions if extension}

print("Unique file extensions in folder C:")
for ext in extensions_set:
    print(ext)

Unique file extensions in folder C:
.docx
.doc
.DOCX
.DOC
