In [10]:
import os
from pathlib import Path

def load_paths(filename):
    """Load a set of relative paths from a text file, one path per line.

    Each line is stripped of surrounding whitespace and then of leading and
    trailing slashes. Lines that are empty after this normalisation (blank
    lines, or lines consisting only of slashes) are skipped: an empty entry
    joined onto a base directory would refer to the base directory itself.

    Args:
        filename: Path of the text file to read.

    Returns:
        A set of normalised, non-empty path strings.
    """
    with open(filename, "r", encoding="utf-8") as file:
        cleaned = (line.strip().strip("/") for line in file)
        return {path for path in cleaned if path}

def drop_directories(base_path, paths_to_remove):
    """Delete the entries listed in ``paths_to_remove`` from ``base_path``.

    Each relative path is joined onto ``base_path`` and removed recursively.
    Deletion is done in-process rather than through a shell command, so
    paths containing quotes or other shell metacharacters are handled
    correctly.

    Args:
        base_path: Directory the relative paths are resolved against.
        paths_to_remove: Iterable of paths relative to ``base_path``.

    Entries that normalise to ``base_path`` itself or to a location outside
    it (for example ``""`` or ``".."``) are refused, entries that do not
    exist are reported, and a failed deletion is reported without aborting
    the remaining entries.
    """
    # Local import: shutil is not part of this cell's import block.
    import shutil

    base_abs = os.path.abspath(base_path)
    # Trailing separator so that "/data/set-2" is not treated as inside "/data/set".
    inside_prefix = os.path.join(base_abs, "")

    for rel_path in paths_to_remove:
        full_path = os.path.join(base_path, rel_path)
        target_abs = os.path.abspath(full_path)

        if target_abs == base_abs or not target_abs.startswith(inside_prefix):
            print(f"Refusing to delete path outside dataset: {full_path}")
            continue

        # lexists also sees dangling symlinks, which exists() reports as absent.
        if not os.path.lexists(full_path):
            print(f"Path not found: {full_path}")
            continue

        print(f"Deleting: {full_path}")
        try:
            if os.path.isdir(full_path) and not os.path.islink(full_path):
                shutil.rmtree(full_path)
            else:
                # Plain files and symlinks: remove the entry itself.
                os.remove(full_path)
        except OSError as e:
            print(f"Failed to delete {full_path}: {e}")

def find_missing_difficulty(base_dir):
    """Collect every info.txt under ``base_dir`` lacking 'Graph difficulty'.

    The tree is searched recursively. A file that cannot be read is reported
    on stdout and left out of the result.

    Returns:
        A list of path strings, one per info.txt without the marker text.
    """
    marker = "Graph difficulty"
    flagged = []
    for candidate in Path(base_dir).rglob("info.txt"):
        try:
            text = candidate.read_text(encoding="utf-8")
        except Exception as err:
            print(f"Error reading {candidate}: {err}")
            continue
        if marker not in text:
            flagged.append(str(candidate))
    return flagged

if __name__ == "__main__":
    # Inputs: two exclusion lists and the dataset root their entries are joined to.
    invalid_file = "../dataset/invalid.txt"
    multistory_file = "../dataset/multistory.txt"
    base_dataset_dir = "../cubicasa5k-original"

    # Merge both exclusion lists into a single set of relative paths.
    invalid_paths = load_paths(invalid_file)
    multistory_paths = load_paths(multistory_file)
    paths_to_remove = invalid_paths | multistory_paths

    # Remove every listed directory from the dataset.
    drop_directories(base_dataset_dir, paths_to_remove)

    # Report the remaining info.txt files that lack a "Graph difficulty" entry.
    missing_files = find_missing_difficulty(base_dataset_dir)

    print("\nFiles missing 'Graph difficulty':")
    for missing_path in missing_files:
        print(missing_path)


Deleting: ../cubicasa5k-original/high_quality_architectural/3915
Deleting: ../cubicasa5k-original/high_quality_architectural/9465
Deleting: ../cubicasa5k-original/high_quality_architectural/5100
Deleting: ../cubicasa5k-original/high_quality/2554
Deleting: ../cubicasa5k-original/high_quality_architectural/1075
Deleting: ../cubicasa5k-original/high_quality_architectural/5533
Deleting: ../cubicasa5k-original/high_quality/5509
Deleting: ../cubicasa5k-original/high_quality_architectural/8517
Deleting: ../cubicasa5k-original/high_quality_architectural/8320
Deleting: ../cubicasa5k-original/high_quality_architectural/9796
Deleting: ../cubicasa5k-original/high_quality/7730
Deleting: ../cubicasa5k-original/high_quality_architectural/601
Deleting: ../cubicasa5k-original/high_quality_architectural/4697
Deleting: ../cubicasa5k-original/high_quality_architectural/8745
Deleting: ../cubicasa5k-original/high_quality_architectural/11438
Deleting: ../cubicasa5k-original/high_quality_architectural/5039
De

In [1]:
import os
from pathlib import Path
import shutil

def load_paths(filename):
    """Load a set of relative paths from a text file, one path per line.

    Each line is stripped of surrounding whitespace and then of leading and
    trailing slashes. Lines that are empty after this normalisation (blank
    lines, or lines consisting only of slashes) are skipped so the result
    never contains an empty path.

    Args:
        filename: Path of the text file to read.

    Returns:
        A set of normalised, non-empty path strings.
    """
    with open(filename, "r", encoding="utf-8") as file:
        cleaned = (line.strip().strip("/") for line in file)
        return {path for path in cleaned if path}

def check_and_drop_invalid_info_dirs(base_path):
    """Delete sample directories whose info.txt is missing or flagged.

    Walks two levels below ``base_path`` (``<base>/<category>/<number>``).
    A ``<number>`` directory is removed recursively when it has no
    ``info.txt`` or when its ``info.txt`` contains the text ``UNDEFINED``
    (the match is case-sensitive). Non-directory entries at either level
    are ignored.

    Args:
        base_path: Root directory containing the category directories.

    An ``info.txt`` that cannot be read or decoded is reported on stdout and
    its directory is left in place, so one bad file neither triggers a
    deletion nor aborts the pass.
    """
    base = Path(base_path)
    for category in base.iterdir():
        if not category.is_dir():
            continue

        # Snapshot the listing so deletions cannot disturb the iteration.
        for number_dir in list(category.iterdir()):
            if not number_dir.is_dir():
                continue

            info_path = number_dir / "info.txt"
            if not info_path.exists():
                print(f"Missing info.txt → Deleting: {number_dir}")
                shutil.rmtree(number_dir)
                continue

            # Read and close the file before deleting: removing a directory
            # that still holds an open file fails on some platforms.
            try:
                content = info_path.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error reading {info_path}: {e}")
                continue

            if "UNDEFINED" in content:
                print(f"'UNDEFINED' found in info.txt → Deleting: {number_dir}")
                shutil.rmtree(number_dir)

def find_missing_difficulty(base_dir):
    """Return the info.txt files under ``base_dir`` lacking 'Graph difficulty'.

    The tree is searched recursively for files named ``info.txt``.

    Args:
        base_dir: Root directory to search.

    Returns:
        A list of path strings, one per info.txt whose content does not
        contain the text ``Graph difficulty``.

    A file that cannot be read or decoded as UTF-8 is reported on stdout and
    skipped, so a single bad file does not abort the scan.
    """
    missing = []
    for info_file in Path(base_dir).rglob("info.txt"):
        try:
            content = info_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading {info_file}: {e}")
            continue
        if "Graph difficulty" not in content:
            missing.append(str(info_file))
    return missing

if __name__ == "__main__":
    # The two list-file paths are assigned but not referenced below in this block.
    invalid_file = "../dataset/invalid.txt"
    multistory_file = "../dataset/multistory.txt"
    dataset_dir = "../cubicasa5k-662"

    # Remove sample folders whose info.txt is absent or contains "UNDEFINED".
    check_and_drop_invalid_info_dirs(dataset_dir)

    # Scan what is left for info.txt files without a 'Graph difficulty' entry.
    missing = find_missing_difficulty(dataset_dir)

    print("\nFiles missing 'Graph difficulty':")
    for missing_path in missing:
        print(missing_path)



Files missing 'Graph difficulty':
