### Copy Files from Secondary Dirs into Main Dir:

In [10]:
import os
import shutil

def copy_files(src_dir, dst_dir):
    """
    Copies HTML files from src_dir to dst_dir, without overwriting files that already exist.
    """
    # Ensure that the destination directory exists
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)
    
    # Iterate through all files in the source directory
    for file_name in os.listdir(src_dir):
        src_file_path = os.path.join(src_dir, file_name)
        
        # Only process HTML files
        if file_name.endswith('.html'):
            dst_file_path = os.path.join(dst_dir, file_name)
            
            # Check if the file already exists in the destination
            if not os.path.exists(dst_file_path):
                # Copy the file if it does not exist in the destination
                shutil.copy2(src_file_path, dst_file_path)
                print(f"Copied: {src_file_path} -> {dst_file_path}")
            else:
                print(f"Skipped (exists): {dst_file_path}")

def copy_from_secondary_dirs(main_dir, secondary_dirs):
    """
    Copies HTML files from the HomePage and DetailPage subfolders of secondary directories
    to the corresponding subfolders in the main directory.
    """
    main_home_page_dir = os.path.join(main_dir, 'HomePage')
    main_detail_page_dir = os.path.join(main_dir, 'DetailPage')

    # Ensure main HomePage and DetailPage subdirectories exist
    if not os.path.exists(main_home_page_dir):
        os.makedirs(main_home_page_dir)
    if not os.path.exists(main_detail_page_dir):
        os.makedirs(main_detail_page_dir)

    # Loop through each secondary directory
    for secondary_dir in secondary_dirs:
        sec_home_page_dir = os.path.join(secondary_dir, 'HomePage')
        sec_detail_page_dir = os.path.join(secondary_dir, 'DetailPage')

        print(f"\nProcessing directory: {secondary_dir}")
        
        # Copy HTML files from HomePage subfolder
        if os.path.exists(sec_home_page_dir):
            print(f"Copying from HomePage folder in {secondary_dir}")
            copy_files(sec_home_page_dir, main_home_page_dir)
        else:
            print(f"HomePage folder not found in {secondary_dir}")

        # Copy HTML files from DetailPage subfolder
        if os.path.exists(sec_detail_page_dir):
            print(f"Copying from DetailPage folder in {secondary_dir}")
            copy_files(sec_detail_page_dir, main_detail_page_dir)
        else:
            print(f"DetailPage folder not found in {secondary_dir}")


if __name__ == "__main__":
    # Set the main directory
    main_dir = input("Enter the path to the main directory: ")
    
    # List to hold the secondary directories
    secondary_dirs = []
    
    # Get user input for secondary directories
    while True:
        sec_dir = input("Enter a secondary directory path (or type 'done' when finished): ")
        if sec_dir.lower() == 'done':
            break
        elif os.path.isdir(sec_dir):
            secondary_dirs.append(sec_dir)
        else:
            print("Invalid directory. Please enter a valid path.")
    
    if not secondary_dirs:
        print("No secondary directories provided. Exiting.")
    else:
        # Copy the files from secondary dirs to main dir
        copy_from_secondary_dirs(main_dir, secondary_dirs)



Processing directory: C:\Users\hp\Desktop\freelance\Mouser\mouser_bot
Copying from HomePage folder in C:\Users\hp\Desktop\freelance\Mouser\mouser_bot
Skipped (exists): D:\mouser_data\chunk3\HomePage\0 048 71.html
Copied: C:\Users\hp\Desktop\freelance\Mouser\mouser_bot\HomePage\2-146140-4.html -> D:\mouser_data\chunk3\HomePage\2-146140-4.html
Copied: C:\Users\hp\Desktop\freelance\Mouser\mouser_bot\HomePage\2-146274-0.html -> D:\mouser_data\chunk3\HomePage\2-146274-0.html
Copied: C:\Users\hp\Desktop\freelance\Mouser\mouser_bot\HomePage\2-146285-8.html -> D:\mouser_data\chunk3\HomePage\2-146285-8.html
Copied: C:\Users\hp\Desktop\freelance\Mouser\mouser_bot\HomePage\2-146462-2.html -> D:\mouser_data\chunk3\HomePage\2-146462-2.html
Copied: C:\Users\hp\Desktop\freelance\Mouser\mouser_bot\HomePage\2-146490-5.html -> D:\mouser_data\chunk3\HomePage\2-146490-5.html
Copied: C:\Users\hp\Desktop\freelance\Mouser\mouser_bot\HomePage\2-1470212-2.html -> D:\mouser_data\chunk3\HomePage\2-1470212-2.htm

### Check Common File Names

In [9]:
dirA = r'D:\mouser_data\chunk3\Scraped TSV Records 1.8k+\HomePage'
dirB = r'D:\mouser_data\chunk3\HomePage'

import os

def get_file_names(dir_path):
    """
    Get the set of all file names in a directory.
    """
    try:
        # Return the set of file names in the directory
        return set(os.listdir(dir_path))
    except FileNotFoundError:
        print(f"Directory not found: {dir_path}")
        return set()

def find_common_files(dirA, dirB):
    """
    Find common file names between two directories.
    """
    # Get file names from both directories
    files_in_dirA = get_file_names(dirA)
    files_in_dirB = get_file_names(dirB)

    # Find common files
    common_files = files_in_dirA.intersection(files_in_dirB)

    return common_files

if __name__ == "__main__":
    # Input paths for dirA and dirB
    dirA = input("Enter the path to directory A: ")
    dirB = input("Enter the path to directory B: ")

    # Find common files
    common_files = find_common_files(dirA, dirB)

    if common_files:
        print("\nCommon files:")
        for file in common_files:
            print(file)
    else:
        print("\nNo common files found.")



Common files:
HomePage
DetailPage


In [5]:
len(common_files)

563

### Decompress and save .html file

In [7]:
import gzip
import os

def decompress_gzip_file(input_file, output_file):
    """
    Decompresses a gzip file and writes the decoded content to the output file.
    """
    try:
        # Open the gzip file in read binary mode
        with gzip.open(input_file, 'rb') as f_in:
            # Read the decompressed content
            decompressed_data = f_in.read()
        
        # Write the decompressed content to the output file
        with open(output_file, 'wb') as f_out:
            f_out.write(decompressed_data)
        
        print(f"Decompressed content has been saved to {output_file}")
    
    except OSError as e:
        print(f"Error decompressing the file: {e}")

if __name__ == "__main__":
    # Input and output files
    input_file = input("Enter the path to the gzip-encoded HTML file: ")
    output_file = input("Enter the path to save the decompressed HTML file: ")

    # Decompress the gzip file
    decompress_gzip_file(input_file, output_file)


Decompressed content has been saved to D:\mouser_data\chunk3\new.html
