In [4]:
import os

files = [f for f in os.listdir('../data/unified/specs') if not f.endswith('.json')]
files

['.DS_Store']

In [45]:
import os
import glob

def count_files_with_extension(directory, extension):
    # Construct the search pattern with the specified extension
    search_pattern = os.path.join(directory, f'*.{extension}')
    
    # Use glob to find all files that match the pattern, and return their count
    files = glob.glob(search_pattern)
    return len(files)

# Example usage
directory = '../data/unified/will_trash'
extension = 'png'  # for example, find all .txt files
num_files = count_files_with_extension(directory, extension)

print(f"Number of files with .{extension} extension: {num_files}")

Number of files with .png extension: 9


In [10]:
import os
import json

def find_nested_field(data, field_name, field_value):
    """
    Recursively searches for a field_name with the field_value in nested JSON data.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            if key == field_name and value == field_value:
                return True
            elif isinstance(value, (dict, list)):
                if find_nested_field(value, field_name, field_value):
                    return True
    elif isinstance(data, list):
        for item in data:
            if find_nested_field(item, field_name, field_value):
                return True
    return False

def list_json_files_with_nested_field(directory, field_name, field_value):
    matching_files = []
    
    # Loop over all files in the specified directory
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            # Construct the full path to the file
            file_path = os.path.join(directory, filename)

            try:
                # Open and parse the JSON file
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)
                    
                    # Check if the nested field and value exist in the JSON data
                    if find_nested_field(data, field_name, field_value):
                        matching_files.append(filename)

            except (json.JSONDecodeError, IOError) as e:
                print(f"Error reading {filename}: {e}")

    return matching_files

# Example usage
directory = '../data/unified/specs'
field_name = 'type'
field_value = 'vcf'
matching_json_files = list_json_files_with_nested_field(directory, field_name, field_value)

print("Files with the specified nested field:")
for filename in matching_json_files:
    print(filename)
print(len(matching_json_files))

Files with the specified nested field:
PBCA-DE-2009e5e7-1796-445b-8677-46b3804fe0bf.json
BOCA-UK-f86e2d80-911b-7a19-e040-11ac0d486900.json
PBCA-DE-0609b182-0396-4755-b463-10f949c5b8bb.json
BTCA-SG-4ef20a13-f107-4654-adca-fc60b8f5be47.json
BRCA-US-25c76a8f-77c0-4650-bddf-45ed0c10a2e6.json
BRCA-UK-f393bafe-7503-5c45-e040-11ac0d484511.json
BOCA-UK-f87d7c27-eeef-920e-e040-11ac0d48388d.json
PBCA-DE-18f9240e-7acd-41eb-a154-c2a67a1cbe9b.json
PBCA-DE-6d3d5531-16b4-46ec-bd7e-394946855d62.json
PBCA-DE-aa4bf69c-a1a6-48dc-96fc-3fe7d2680517.json
PBCA-DE-b16780b0-75ba-4304-87e0-33f1e321b622.json
BRCA-EU-fc8130df-897d-5404-e040-11ac0d485e0a.json
VCF_INDELS_sw_1_0_s_0_7_cc_2.json
LIRI-JP-bcc74a5e-c622-11e3-bf01-24c6515278c0.json
BRCA-UK-fc447d51-cdc2-a180-e040-11ac0c4846a4.json
BRCA-EU-fc8130df-3225-3f96-e040-11ac0d485dfe.json
VCF_INDELS_sw_0_7_s_1_2_oc.json
BRCA-US-fc6d77a9-121b-48ab-a899-713c3d1319a2.json
BRCA-EU-fc8130e0-ad66-b82e-e040-11ac0d485e0e.json
GBM-US-2c71d542-1653-4d97-8752-f2fd05c7e75d.j

In [3]:
import os
import json

def find_files_with_types(folder_path, file_types):
    matching_files = {file_type: [] for file_type in file_types}
    
    try:
        # Loop through all files in the folder
        for filename in os.listdir(folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(folder_path, filename)
                
                try:
                    # Read and parse each JSON file
                    with open(file_path, 'r') as file:
                        json_data = json.load(file)
                        
                        # Function to recursively search for specified file types in the JSON
                        def search_for_type(obj, file_types):
                            if isinstance(obj, dict):
                                for key, value in obj.items():
                                    if key == "type" and value in file_types:
                                        return value
                                    if isinstance(value, (dict, list)):
                                        result = search_for_type(value, file_types)
                                        if result:
                                            return result
                            elif isinstance(obj, list):
                                for item in obj:
                                    result = search_for_type(item, file_types)
                                    if result:
                                        return result
                            return None
                        
                        # Check if any of the file types are found in the JSON
                        file_type_found = search_for_type(json_data, file_types)
                        if file_type_found:
                            matching_files[file_type_found].append(filename)

                except json.JSONDecodeError as e:
                    print(f"Error reading JSON file {filename}: {e}")
                except Exception as e:
                    print(f"Error processing file {filename}: {e}")
                    
    except Exception as e:
        print(f"Error accessing folder: {e}")
        return {}, 0
    
    # Print results
    for file_type, files in matching_files.items():
        print(f"\nFiles containing 'type':'{file_type}':")
        for file in files:
            print(file)
        print(f"Total number of '{file_type}' files: {len(files)}")
    
    total_files = sum(len(files) for files in matching_files.values())
    return matching_files, total_files

# Example usage
folder_path = "../data/gallery_p1/specs"
file_types = ["vcf", "gff", "bed", "bam"]
files, total_count = find_files_with_types(folder_path, file_types)


Files containing 'type':'vcf':
VCF_INDELS_sw_0_7_s_1_2.json
VCF_POINT_MUTATIONS_sw_1_2_s_1_0.json
VCF_POINT_MUTATIONS_sw_0_7_s_1_0.json
VCF_INDELS_sw_1_2_s_1_2.json
VCF_INDELS_sw_1_2_s_0_7.json
VCF_INDELS_sw_0_7_s_0_7.json
VCF_POINT_MUTATIONS_sw_1_0_s_1_0.json
VCF_INDELS_sw_1_0_s_1_2.json
VCF_INDELS_sw_1_0_s_0_7.json
VCF_POINT_MUTATIONS_sw_1_0_s_1_2.json
VCF_INDELS_sw_1_0_s_1_0.json
VCF_POINT_MUTATIONS_sw_1_0_s_0_7.json
VCF_POINT_MUTATIONS_sw_1_2_s_1_2.json
VCF_INDELS_sw_0_7_s_1_0.json
VCF_INDELS_sw_1_2_s_1_0.json
VCF_POINT_MUTATIONS_sw_0_7_s_1_2.json
VCF_POINT_MUTATIONS_sw_0_7_s_0_7.json
VCF_POINT_MUTATIONS_sw_1_2_s_0_7.json
Total number of 'vcf' files: 18

Files containing 'type':'gff':
GFF_DEMO_sw_1_2_s_1_0.json
GFF_DEMO_sw_0_7_s_1_0.json
GFF_DEMO_sw_1_0_s_1_0.json
GFF_DEMO_sw_1_0_s_0_7.json
GFF_DEMO_sw_1_0_s_1_2.json
GFF_DEMO_sw_0_7_s_0_7.json
GFF_DEMO_sw_1_2_s_0_7.json
GFF_DEMO_sw_1_2_s_1_2.json
GFF_DEMO_sw_0_7_s_1_2.json
Total number of 'gff' files: 9

Files containing 'type':'b

In [1]:
import os
import shutil
import logging
from pathlib import Path
from typing import Set, Dict
from datetime import datetime

class FileMatchProcessor:
    def __init__(self, spec_path: str, imgs_path: str, unmatched_path: str = None):
        """Initialize with paths and set up logging."""
        self.spec_folder = Path(spec_path)
        self.imgs_folder = Path(imgs_path)
        self.unmatched_folder = Path(unmatched_path or f"unmatched_specs_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
        
        # Set up logging
        self._setup_logging()
        
        # Validate paths
        self._validate_paths()

    def _setup_logging(self):
        """Configure logging settings."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(f'file_matching_{datetime.now().strftime("%Y%m%d")}.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    def _validate_paths(self):
        """Validate that source directories exist."""
        if not self.spec_folder.exists():
            raise FileNotFoundError(f"Spec folder not found: {self.spec_folder}")
        if not self.imgs_folder.exists():
            raise FileNotFoundError(f"Images folder not found: {self.imgs_folder}")

    def _get_files(self, path: Path, extension: str) -> Set[str]:
        """Get set of filenames without extensions for given path and extension."""
        try:
            return {f.stem for f in path.glob(f"*.{extension}")}
        except Exception as e:
            self.logger.error(f"Error reading files from {path}: {str(e)}")
            return set()

    def _create_unmatched_folder(self):
        """Create unmatched folder if it doesn't exist."""
        try:
            self.unmatched_folder.mkdir(parents=True, exist_ok=True)
            self.logger.info(f"Created or verified unmatched folder: {self.unmatched_folder}")
        except Exception as e:
            self.logger.error(f"Error creating unmatched folder: {str(e)}")
            raise

    def _move_file(self, filename: str) -> bool:
        """Move a single file to unmatched folder."""
        src_path = self.spec_folder / f"{filename}.json"
        dst_path = self.unmatched_folder / f"{filename}.json"

        try:
            # Handle case where destination file already exists
            if dst_path.exists():
                new_name = f"{filename}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
                dst_path = self.unmatched_folder / new_name
                self.logger.warning(f"Destination file exists, renaming to {new_name}")

            shutil.move(str(src_path), str(dst_path))
            self.logger.info(f"Successfully moved {filename}.json to {self.unmatched_folder}")
            return True
        except Exception as e:
            self.logger.error(f"Error moving {filename}.json: {str(e)}")
            return False

    def process_files(self) -> Dict[str, int]:
        """Main method to process and move unmatched files."""
        stats = {"total_unmatched": 0, "successfully_moved": 0, "failed_moves": 0}
        
        try:
            # Get files from both directories
            spec_files = self._get_files(self.spec_folder, "json")
            imgs_files = self._get_files(self.imgs_folder, "png")

            # Find unmatched files
            unmatched_files = spec_files - imgs_files
            stats["total_unmatched"] = len(unmatched_files)

            if not unmatched_files:
                self.logger.info("No unmatched files found.")
                return stats

            # Create unmatched folder
            self._create_unmatched_folder()

            # Move unmatched files
            for filename in unmatched_files:
                if self._move_file(filename):
                    stats["successfully_moved"] += 1
                else:
                    stats["failed_moves"] += 1

        except Exception as e:
            self.logger.error(f"Error during file processing: {str(e)}")
            raise

        self.logger.info(f"Processing complete. Stats: {stats}")
        return stats

def main():
    """Main function to run the file matching process."""
    try:
        processor = FileMatchProcessor(
            spec_path="../data/unified/specs",
            imgs_path="../data/unified/imgs",
            unmatched_path="../data/unified/remaining-specs"
        )
        stats = processor.process_files()
        print(f"Processing completed with stats: {stats}")
    except Exception as e:
        print(f"Error: {str(e)}")

if __name__ == "__main__":
    main()

2025-02-28 20:22:52,038 - INFO - Created or verified unmatched folder: ../data/unified/remaining-specs
2025-02-28 20:22:52,039 - INFO - Successfully moved two_by_two_p_3_m_0_sw_1_2_s_0_7_cc_0.json to ../data/unified/remaining-specs
2025-02-28 20:22:52,039 - INFO - Successfully moved two_vertical_m_12_sw_0_7_s_0_5_cc_2.json to ../data/unified/remaining-specs
2025-02-28 20:22:52,040 - INFO - Successfully moved two_by_two_p_2_m_17_sw_1_0_s_1_0_cc_0.json to ../data/unified/remaining-specs
2025-02-28 20:22:52,040 - INFO - Successfully moved two_by_two_p_2_m_14_sw_1_2_s_1_0_oc.json to ../data/unified/remaining-specs
2025-02-28 20:22:52,041 - INFO - Successfully moved two_by_two_p_3_m_8_sw_1_0_s_0_7_oc.json to ../data/unified/remaining-specs
2025-02-28 20:22:52,042 - INFO - Successfully moved two_by_two_uneven_h_p_1_sw_1_2_s_1_0_oc.json to ../data/unified/remaining-specs
2025-02-28 20:22:52,042 - INFO - Successfully moved two_by_two_sq_uneven_h_p_0_sw_1_2_s_1_2_cc_1.json to ../data/unified/re

Processing completed with stats: {'total_unmatched': 10093, 'successfully_moved': 10093, 'failed_moves': 0}


In [14]:
# Compare contents of two folders

import os

# Specify the folder paths
specs_folder = '../data/unified_v3/specs'
imgs_folder = '../data/unified_v3/imgs'

# Extract base file names from the specs folder
specs_files = os.listdir(specs_folder)
specs_base_names = {os.path.splitext(file)[0] for file in specs_files if file.endswith('.json')}

# Extract base file names from the imgs folder
imgs_files = os.listdir(imgs_folder)
imgs_base_names = {os.path.splitext(file)[0] for file in imgs_files if file.endswith('.png')}

# Find overlaps and differences
overlap = specs_base_names.intersection(imgs_base_names)
specs_only = specs_base_names.difference(imgs_base_names)
imgs_only = imgs_base_names.difference(specs_base_names)

# Report results
# print("Overlap:", overlap)
print("Files in specs only:", specs_only)
print("Files in imgs only:", imgs_only)

Files in specs only: set()
Files in imgs only: set()


In [8]:
import os
import shutil
from math import ceil

def divide_files_into_folders(source_folder, dest_folder_base, num_folders=5):
    # Get list of files in the source folder
    files = [f for f in os.listdir(source_folder) if os.path.isfile(os.path.join(source_folder, f))]

    # Calculate the number of files per folder
    num_files = len(files)
    files_per_folder = ceil(num_files / num_folders)

    # Ensure destination folders exist
    dest_folders = []
    for i in range(num_folders):
        folder_name = f"{dest_folder_base}_{i+1}"
        dest_folder = os.path.join(source_folder, folder_name)
        os.makedirs(dest_folder, exist_ok=True)
        dest_folders.append(dest_folder)

    # Distribute files to each destination folder
    for index, file in enumerate(files):
        dest_folder = dest_folders[index // files_per_folder]
        shutil.move(os.path.join(source_folder, file), os.path.join(dest_folder, file))

# Example usage
source_folder = '../data/unified/remaining_specs'
dest_folder_base = 'partition'
divide_files_into_folders(source_folder, dest_folder_base)

In [37]:
import os

def delete_matching_files(folder_a, folder_b):
    # Get lists of files in both directories
    files_in_a = set(os.listdir(folder_a))
    files_in_b = set(os.listdir(folder_b))
    
    # Find the intersection of both sets to get files that exist in both directories
    files_to_delete = files_in_a.intersection(files_in_b)
    
    # Initialize a counter for the number of deletions
    delete_count = 0

    # Delete the files that are present in both directories
    for file_name in files_to_delete:
        file_path_a = os.path.join(folder_a, file_name)
        # Check again to ensure it's a file to avoid directories
        if os.path.isfile(file_path_a):
            os.remove(file_path_a)
            delete_count += 1

    return delete_count

# Example usage:
folder_a = '../data/unified/failed_imgs'
folder_b = '../data/chromoscope/imgs'
count_deleted = delete_matching_files(folder_a, folder_b)
print(f'{count_deleted} files deleted.')

481 files deleted.


In [42]:
# compare the file names in the two folders and copy the matching files to rerun

import os
import shutil

# Define the paths to the folders
folder_a_path = '../data/unified/A2_rerun_specs'  # Path to the folder containing JSON files
folder_b_path = '../data/unified/A2_rerun_imgs'   # Path to the folder containing PNG files
rerun_folder_path = '../data/unified/A3_rerun_specs'  # Path to the destination folder

# Create the rerun_specs folder if it doesn't exist
os.makedirs(rerun_folder_path, exist_ok=True)

# Get the list of files in each folder
files_in_a = set(os.listdir(folder_a_path))
files_in_b = set(os.listdir(folder_b_path))

# Find matching file names (ignoring extensions)
for file_a in files_in_a:
    file_name_without_ext, ext = os.path.splitext(file_a)
    if ext.lower() == '.json':
        # Corresponding PNG file name
        file_b = file_name_without_ext + '.png'
        if file_b in files_in_b:
            src_file_path = os.path.join(folder_a_path, file_a)
            dst_file_path = os.path.join(rerun_folder_path, file_a)
            # Copy the file from A_specs to rerun_specs
            shutil.copy(src_file_path, dst_file_path)
            print(f'Copied {file_a} to {rerun_folder_path}')

print('Finished processing.')

Copied breast_cancer_sw_2_0_s_2_0_cc_0.json to ../data/unified/A3_rerun_specs
Copied VCF_POINT_MUTATIONS_sw_0_7_s_0_7_cc_0.json to ../data/unified/A3_rerun_specs
Copied breast_cancer_sw_2_0_s_2_0_cc_2.json to ../data/unified/A3_rerun_specs
Copied VCF_POINT_MUTATIONS_sw_0_7_s_1_2_cc_0.json to ../data/unified/A3_rerun_specs
Copied breast_cancer_sw_2_0_s_2_0_oc.json to ../data/unified/A3_rerun_specs
Copied VCF_POINT_MUTATIONS_sw_0_7_s_0_7_oc.json to ../data/unified/A3_rerun_specs
Copied breast_cancer_sw_2_0_s_2_0_cc_1.json to ../data/unified/A3_rerun_specs
Copied VCF_POINT_MUTATIONS_sw_1_2_s_1_2_cc_0.json to ../data/unified/A3_rerun_specs
Copied VCF_POINT_MUTATIONS_sw_0_7_s_1_2_oc.json to ../data/unified/A3_rerun_specs
Finished processing.


In [8]:
import os

def check_file_type(directory, file_extension):
    """
    Check if all the files in the directory (including nested directories)
    have the specified file extension. If not, print the files with different extensions.

    """
    incorrect_files = []

    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith(file_extension):
                # Append the incorrect file path
                incorrect_files.append(os.path.join(root, file))

    if incorrect_files:
        print("Files with a different extension:")
        for incorrect_file in incorrect_files:
            print(incorrect_file)
        return False

    return True

# Example usage:
directory_path = '../data/unified_v2/gallery_p2/specs'  # Change to your folder path
file_extension = '.json'  # Change to your desired file extension

if check_file_type(directory_path, file_extension):
    print(f"All files in '{directory_path}' and its subdirectories are '{file_extension}' files.")
else:
    print(f"Not all files in '{directory_path}' are '{file_extension}' files.")

Files with a different extension:
../data/unified_v2/gallery_p2/specs/.DS_Store
../data/unified_v2/gallery_p2/specs/responsive-ideogram/.DS_Store
../data/unified_v2/gallery_p2/specs/responsive-circular/.DS_Store
../data/unified_v2/gallery_p2/specs/basic-link/.DS_Store
Not all files in '../data/unified_v2/gallery_p2/specs' are '.json' files.


In [14]:
import os

def get_all_files_in_directory(directory, file_extension=None):
    """
    Recursively get all files in the directory with the specified extension.
    If no extension is specified, retrieve all files.
    """
    matches = set()
    for root, _, files in os.walk(directory):
        for file in files:
            if file_extension:
                if file.lower().endswith(file_extension.lower()):
                    matches.add(os.path.relpath(os.path.join(root, file), start=directory))
            else:
                matches.add(os.path.relpath(os.path.join(root, file), start=directory))
    return matches

# Define the paths to the folders
folder_a_path = '../data/unified_v2/gallery_p2/specs'  # Path to the folder containing JSON files
folder_b_path = '../data/unified_v2/gallery_p2/imgs'   # Path to the folder containing PNG files

# Get all files with specified extensions
files_in_a = get_all_files_in_directory(folder_a_path, '.json')
files_in_b = get_all_files_in_directory(folder_b_path, '.png')

# Find matching file names (ignoring extensions)
matched = False
for file_a in files_in_a:
    file_name_without_ext = os.path.splitext(file_a)[0]
    file_b = file_name_without_ext + '.png'
    if file_b in files_in_b:
        # print(f"Matching files: {file_a} and {file_b}")
        matched = True
    else:
        print(f"No matching file for {file_a} in folder B")

# Additionally, check for PNGs in folder B without corresponding JSONs
for file_b in files_in_b:
    file_name_without_ext = os.path.splitext(file_b)[0]
    file_a = file_name_without_ext + '.json'
    if file_a not in files_in_a:
        print(f"No matching file for {file_b} in folder A")
        matched = True

if not matched:
    print('No matching files found.')

print('Finished processing.')

No matching file for basic-link/.ipynb_checkpoints/basic-link_p_0_sw_0_7_s_0_7_cc_1-checkpoint.json in folder B
Finished processing.


In [1]:
import os
import filecmp

def compare_directories(dir1, dir2):
    # Differ files
    differ_files = []

    # Compare the directories
    dir_cmp = filecmp.dircmp(dir1, dir2)

    # Files that are different
    differ_files.extend(dir_cmp.diff_files)
    
    # Files only present in one of the directories
    if dir_cmp.left_only or dir_cmp.right_only:
        differ_files.extend(dir_cmp.left_only)
        differ_files.extend(dir_cmp.right_only)

    # Recursively compare subdirectories
    for sub_dir in dir_cmp.subdirs:
        differ_files.extend(compare_directories(
            os.path.join(dir1, sub_dir),
            os.path.join(dir2, sub_dir)
        ))
        
    return differ_files

def main():
    folder1 = '../data/gallery_p1/specs_copy'
    folder2 = '../data/gallery_p1/specs'

    # Compare the folders
    diff_files = compare_directories(folder1, folder2)

    # Print out the differing files
    if diff_files:
        print("Differences found in the following files:")
        for file in diff_files:
            print(file)
    else:
        print("The folders are identical.")

if __name__ == "__main__":
    main()

Differences found in the following files:
DUMMY_TRACK_sw_0_7_s_0_7.json
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_0_7_s_0_7.json
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_0_7_s_1_0.json
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_0_7_s_1_2.json
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_0_s_0_7.json
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_0_s_1_0.json
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_0_s_1_2.json
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_2_s_0_7.json
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_2_s_1_0.json
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_2_s_1_2.json
EX_SPEC_GENE_ANNOTATION_sw_0_7_s_0_7.json
EX_SPEC_GIVE_sw_0_7_s_0_7.json
EX_SPEC_GIVE_sw_0_7_s_1_0.json
EX_SPEC_GIVE_sw_0_7_s_1_2.json
EX_SPEC_GIVE_sw_1_0_s_0_7.json
EX_SPEC_GIVE_sw_1_0_s_1_0.json
EX_SPEC_GIVE_sw_1_0_s_1_2.json
EX_SPEC_GIVE_sw_1_2_s_0_7.json
EX_SPEC_GIVE_sw_1_2_s_1_0.json
EX_SPEC_GIVE_sw_1_2_s_1_2.json
EX_SPEC_MATRIX_HFFC6_sw_0_7_s_0_7.json
EX_SPEC_MATRIX_HFFC6_sw_0_7_s_1_0.json
EX_SPEC_MATRIX_HFFC6_sw_0_7_s_1_2.json
EX_SPEC_MATRIX_HFFC6_sw_1_0_s_0

In [8]:
import os
import shutil

# Define paths to the folders
folder_a = './geranium_seeds/rerun/heat'
folder_b = '../data/unified/specs'
rerun_specs_folder = './rerun_specs'

# Create the rerun_specs folder if it doesn't exist
if not os.path.exists(rerun_specs_folder):
    os.makedirs(rerun_specs_folder)

def get_filenames_without_extension(directory, extension):
    """Retrieves a set of filenames without the extension from a given directory."""
    return {os.path.splitext(filename)[0] for filename in os.listdir(directory) 
            if os.path.isfile(os.path.join(directory, filename)) and filename.lower().endswith(extension)}

# Get filenames without extensions from folder A for PNG files
folder_a_filenames = get_filenames_without_extension(folder_a, '.png')

# Walk through folder B, find matching JSON files by name, and move them
for filename in os.listdir(folder_b):
    if os.path.isfile(os.path.join(folder_b, filename)) and filename.lower().endswith('.json'):
        # Get the file name without extension
        name_without_ext = os.path.splitext(filename)[0]
        
        # Check if this JSON file's name (excluding extension) matches any PNG file name from folder A
        if name_without_ext in folder_a_filenames:
            # Move the file to the rerun_specs directory
            shutil.move(os.path.join(folder_b, filename), os.path.join(rerun_specs_folder, filename))
            print(f'Moved: {filename} to rerun_specs')

Moved: heatmap_sw_1_0_s_1_0_oc.json to rerun_specs
Moved: hic_matrix_sw_1_0_s_1_0_oc.json to rerun_specs
Moved: gray_heatmap_sw_1_0_s_1_0_oc.json to rerun_specs


In [12]:
# check file type

import os

def check_folders(folders, file_type):
    """
    Checks if all files in the given folders and subfolders are of the specified file type.
    Prints exceptions if any folder contains other file types.
    
    :param folders: List of folder paths to check.
    :param file_type: Allowed file extension (e.g., 'json' or 'png').
    """
    allowed_ext = f".{file_type.lower()}"
    exceptions = {}
    
    for folder in folders:
        for root, _, files in os.walk(folder):
            for file in files:
                if not file.lower().endswith(allowed_ext):
                    if root not in exceptions:
                        exceptions[root] = []
                    exceptions[root].append(file)
    
    if exceptions:
        print("Exceptions found:")
        for folder, files in exceptions.items():
            print(f"Folder: {folder}")
            print("  Contains disallowed files:", ", ".join(files))
    else:
        print(f"All specified folders only contain .{file_type} files.")

# Example usage
if __name__ == "__main__":
    folders_to_check = ["../data/unified_v2/autogosling/imgs", "../data/unified_v2/gallery_p1/imgs", "../data/unified_v2/gallery_p2/imgs"]  # Replace with actual paths
    file_type = "png"  # Change to 'png' if needed
    check_folders(folders_to_check, file_type)


All specified folders only contain .png files.
