# Annotation class check and dataset folder comparison

In [2]:
import os
import glob

def validate_yolo_labels(folder_path):
    """Check that every YOLO label file in a folder uses only class ids 0, 1 or 2.

    Every ``*.txt`` file directly inside ``folder_path`` is read as UTF-8.
    For each non-blank line the first whitespace-separated token is taken as
    the class id (YOLO format: ``class_id x_center y_center width height``)
    and compared against the allowed set. Findings are printed; nothing is
    returned.

    Args:
        folder_path: Directory containing the label files. Sub-directories
            are not searched.

    Returns:
        None. The per-file findings and the final summary are printed.

    Notes:
        A file that cannot be read or decoded is reported and counted as
        unreadable. The success message is printed only when every file was
        read and contained valid classes, so an unreadable file can never be
        mistaken for a validated one.
    """
    # Class ids are compared as strings, so only the exact tokens below are
    # accepted; spellings such as '00' or '0.0' are reported as invalid.
    valid_classes = {'0', '1', '2'}

    # glob returns matches in arbitrary order; sort for a stable report.
    txt_files = sorted(glob.glob(os.path.join(folder_path, "*.txt")))

    if not txt_files:
        print(f"No .txt files found in {folder_path}")
        return

    print(f"Checking {len(txt_files)} files in: {folder_path}...\n")

    invalid_files_count = 0
    unreadable_files_count = 0

    for file_path in txt_files:
        filename = os.path.basename(file_path)
        invalid_lines = []

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                # Stream the file instead of loading every line up front.
                for line_number, line in enumerate(f, start=1):
                    parts = line.split()
                    if not parts:
                        continue  # Blank or whitespace-only line

                    class_id = parts[0]
                    if class_id not in valid_classes:
                        invalid_lines.append(
                            f"Line {line_number}: Found class '{class_id}'"
                        )
        except (OSError, UnicodeError) as e:
            # The file was NOT validated; remember that so the summary below
            # does not claim success for it.
            print(f"[Error] Could not read {filename}: {e}")
            unreadable_files_count += 1
            continue

        if invalid_lines:
            print(f"[!] Invalid classes found in: {filename}")
            for error in invalid_lines:
                print(f"    - {error}")
            invalid_files_count += 1

    print("-" * 30)
    if invalid_files_count == 0 and unreadable_files_count == 0:
        print("✅ Success: All files contain only classes 0, 1, or 2.")
    else:
        if invalid_files_count:
            print(f"⚠️  Found {invalid_files_count} files with invalid classes.")
        if unreadable_files_count:
            print(
                f"⚠️  Could not read {unreadable_files_count} files; "
                "their classes were not checked."
            )

# Label folder to validate; the check prints its report and returns nothing.
target_folder = "/media/holidayj/Documents/data/frames/chungmuro_hasun_10frame_1920_train_arrival/30_frames_crop/set_1"
validate_yolo_labels(folder_path=target_folder)

Checking 672 files in: /media/holidayj/Documents/data/frames/chungmuro_hasun_10frame_1920_train_arrival/30_frames_crop/set_1...

------------------------------
✅ Success: All files contain only classes 0, 1, or 2.


In [2]:
import os
import filecmp

def _print_name_list(names, limit=20):
    """Print up to ``limit`` entries of ``names`` in sorted order, then how many were omitted."""
    for name in sorted(names)[:limit]:
        print(f" - {name}")
    if len(names) > limit:
        print(f" ... and {len(names) - limit} more.")


def compare_all_files_deep(dir1, dir2):
    """Compare the top-level entries of two directories by name and by content.

    The function first reports how many entry names exist only in one
    directory and how many are shared. Each shared name that is a regular
    file in both directories is then compared byte-by-byte with
    ``filecmp.cmp(..., shallow=False)``. Results are printed; nothing is
    returned.

    Args:
        dir1: Path of the first directory.
        dir2: Path of the second directory.

    Returns:
        None. If either path is not an existing directory an error message
        is printed and the function returns early.

    Notes:
        Sub-directories are not descended into. Shared names that are not
        regular files on both sides are skipped, and the number skipped is
        reported so the totals add up to the number of common names.
    """
    # isdir (not exists): a path to a regular file would pass an existence
    # check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(dir1) or not os.path.isdir(dir2):
        print("Error: One or both directory paths are invalid.")
        return

    print(f"Comparing contents of:\n 1. {dir1}\n 2. {dir2}\n")

    # Entry names (files and sub-directories) directly inside each folder
    files_1 = set(os.listdir(dir1))
    files_2 = set(os.listdir(dir2))

    # 1. Check for missing/extra entries based on name
    common_files = files_1 & files_2
    only_in_1 = files_1 - files_2
    only_in_2 = files_2 - files_1

    print("--- Structure Summary ---")
    print(f"Files only in Folder 1: {len(only_in_1)}")
    print(f"Files only in Folder 2: {len(only_in_2)}")
    print(f"Common filenames:       {len(common_files)}")

    if not common_files:
        print("No common files to compare content.")
        return

    # 2. Deep content comparison
    print("\n--- Deep Content Check (Byte-by-Byte) ---")
    print("Reading files... (This may take time for large datasets)")

    match_files = []
    mismatch_files = []
    skipped_entries = []
    errors = []

    # Sorted so progress and any error order are reproducible between runs.
    for processed, filename in enumerate(sorted(common_files), start=1):
        path1 = os.path.join(dir1, filename)
        path2 = os.path.join(dir2, filename)

        if os.path.isfile(path1) and os.path.isfile(path2):
            try:
                # shallow=False forces a read of the file contents
                if filecmp.cmp(path1, path2, shallow=False):
                    match_files.append(filename)
                else:
                    mismatch_files.append(filename)
            except OSError as e:
                errors.append(f"{filename}: {e}")
        else:
            # Directory (or other non-regular entry) on at least one side
            skipped_entries.append(filename)

        # Progress heartbeat every 1000 entries
        if processed % 1000 == 0:
            print(f" ... processed {processed} files")

    # 3. Final report
    print("\n[Result]")
    print(f"✅ Identical Files: {len(match_files)}")
    print(f"❌ Different Content: {len(mismatch_files)}")

    if mismatch_files:
        print("\nThe following files have the same name but DIFFERENT content:")
        _print_name_list(mismatch_files)  # Show top 20

    if skipped_entries:
        print(f"\nSkipped (not a regular file in both folders): {len(skipped_entries)}")
        _print_name_list(skipped_entries)

    if errors:
        print(f"\nErrors encountered: {len(errors)}")
        _print_name_list(errors)

# The two dataset folders whose top-level contents are compared
folder_a = "/media/holidayj/Documents/Data/Platform/final_dataset/Euljiro/Euljiro_inner_20201128_f1038_t1519/Euljiro_off_peak_TrainVal_2sec_plus_addition_1sec_done_FINAL"
folder_b = "/media/holidayj/Documents/Data/Platform/final_dataset/Euljiro/Euljiro_inner_20201128_f1038_t1519/Euljiro_off_peak_2sec_plus_addition_1sec_done"

# Run the comparison only when this code executes as the main module
if __name__ == "__main__":
    compare_all_files_deep(dir1=folder_a, dir2=folder_b)

Comparing contents of:
 1. /media/holidayj/Documents/Data/Platform/final_dataset/Euljiro/Euljiro_inner_20201128_f1038_t1519/Euljiro_off_peak_TrainVal_2sec_plus_addition_1sec_done_FINAL
 2. /media/holidayj/Documents/Data/Platform/final_dataset/Euljiro/Euljiro_inner_20201128_f1038_t1519/Euljiro_off_peak_2sec_plus_addition_1sec_done

--- Structure Summary ---
Files only in Folder 1: 0
Files only in Folder 2: 0
Common filenames:       10800

--- Deep Content Check (Byte-by-Byte) ---
Reading files... (This may take time for large datasets)
 ... processed 1000 files
 ... processed 2000 files
 ... processed 3000 files
 ... processed 4000 files
 ... processed 5000 files
 ... processed 6000 files
 ... processed 7000 files
 ... processed 8000 files
 ... processed 9000 files
 ... processed 10000 files

[Result]
✅ Identical Files: 10800
❌ Different Content: 0
