In [None]:
import os
import random
import shutil
from pathlib import Path
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing as mp
import time
import threading
import hashlib
import json
from tqdm import tqdm
from collections import defaultdict, Counter
import uuid
from datetime import datetime

random.seed(42)

class ThreadSafeFileTracker:
    """Lock-guarded bookkeeping shared by the file-copy worker threads.

    Tracks which source paths were processed, how often each was attempted,
    the success/failure records produced, and which destination file names
    are currently reserved.
    """

    def __init__(self):
        # Reentrant locks: one for the general bookkeeping, one dedicated
        # to destination-name reservation.
        self._lock = threading.RLock()
        self._filename_counter_lock = threading.RLock()
        self._processed_files = set()
        self._successful_files = []
        self._failed_files = []
        self._attempts = defaultdict(int)
        self._filename_counters = defaultdict(int)  # not read by any method here
        self._reserved_filenames = set()

    def reserve_unique_filename(self, base_path, destination_dir):
        """Pick and reserve a file name that is free in ``destination_dir``.

        The name of ``base_path`` is tried first, then ``stem_1``,
        ``stem_2``, ... until a candidate neither exists on disk nor is
        already reserved.  Returns ``(full_path, file_name)``.
        """
        source = Path(base_path)
        stem, extension = source.stem, source.suffix

        with self._filename_counter_lock:
            index = 0
            while True:
                if index == 0:
                    candidate = f"{stem}{extension}"
                else:
                    candidate = f"{stem}_{index}{extension}"
                target = destination_dir / candidate

                # Free only if absent from disk AND not reserved by a peer.
                if not target.exists() and candidate not in self._reserved_filenames:
                    break
                index += 1

            self._reserved_filenames.add(candidate)
            return target, candidate

    def release_filename_reservation(self, filename):
        """Drop a reservation (no-op if the name is not reserved)."""
        with self._filename_counter_lock:
            self._reserved_filenames.discard(filename)

    def mark_attempt(self, filepath):
        """Count one more attempt for ``filepath`` and return the new total."""
        with self._lock:
            total = self._attempts[filepath] + 1
            self._attempts[filepath] = total
            return total

    def is_processed(self, filepath):
        """Return True if ``filepath`` was already marked as processed."""
        with self._lock:
            already_done = filepath in self._processed_files
        return already_done

    def mark_processed(self, filepath):
        """Remember ``filepath`` as processed."""
        with self._lock:
            self._processed_files.add(filepath)

    def add_success(self, result):
        """Append a success record."""
        with self._lock:
            self._successful_files.append(result)

    def add_failure(self, result):
        """Append a failure record."""
        with self._lock:
            self._failed_files.append(result)

    def get_results(self):
        """Return shallow copies of ``(successes, failures)``."""
        with self._lock:
            return list(self._successful_files), list(self._failed_files)

    def get_stats(self):
        """Return a dict of counters describing the tracker's current state."""
        with self._lock:
            stats = {}
            stats['processed_count'] = len(self._processed_files)
            stats['success_count'] = len(self._successful_files)
            stats['failure_count'] = len(self._failed_files)
            stats['total_attempts'] = sum(self._attempts.values())
            stats['reserved_filenames'] = len(self._reserved_filenames)
            return stats

class SplitLogger:
    """Write per-split EXPECTED / ACTUAL / DIFF logs for auditing a copy run.

    For every ``(split, label)`` pair three artefacts are produced inside
    ``log_dir``: ``*_EXPECTED.json`` (what should be copied),
    ``*_ACTUAL.json`` (what the workers reported) and ``*_DIFF.json`` /
    ``*_DIFF.txt`` (the comparison of the two).
    """

    def __init__(self, log_dir):
        """Create ``log_dir`` (including parents) if it does not exist yet."""
        self.log_dir = Path(log_dir)
        self.log_dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.RLock()

    def log_expected_files(self, split_name, label, file_list):
        """Record every source file that should be copied for a split/label.

        Returns the path of the written ``*_EXPECTED.json`` file.
        """
        log_file = self.log_dir / f"{split_name}_{label}_EXPECTED.json"

        expected_data = {
            'timestamp': datetime.now().isoformat(),
            'split': split_name,
            'label': label,
            'total_count': len(file_list),
            'files': [
                {
                    'index': i,
                    'source_path': str(filepath),
                    'source_name': Path(filepath).name,
                    'source_size': self._safe_get_file_size(filepath),
                }
                for i, filepath in enumerate(file_list)
            ],
        }

        with open(log_file, 'w', encoding='utf-8') as f:
            json.dump(expected_data, f, indent=2, ensure_ascii=False)

        return log_file

    def log_actual_files(self, split_name, label, successful_copies, failed_copies):
        """Record the success and failure records produced for a split/label.

        Returns the path of the written ``*_ACTUAL.json`` file.
        """
        log_file = self.log_dir / f"{split_name}_{label}_ACTUAL.json"

        actual_data = {
            'timestamp': datetime.now().isoformat(),
            'split': split_name,
            'label': label,
            'successful_count': len(successful_copies),
            'failed_count': len(failed_copies),
            'successful_files': successful_copies,
            'failed_files': failed_copies
        }

        with open(log_file, 'w', encoding='utf-8') as f:
            json.dump(actual_data, f, indent=2, ensure_ascii=False)

        return log_file

    def create_diff_analysis(self, split_name, label):
        """Compare the EXPECTED and ACTUAL logs of a split/label.

        Files are identified by their full source path.  Comparing bare file
        names would merge identically named files that come from different
        source folders and under-report every total.

        Returns ``(diff_json_path, diff_txt_path, diff_analysis)``, or
        ``None`` when either input log is missing.
        """
        expected_file = self.log_dir / f"{split_name}_{label}_EXPECTED.json"
        actual_file = self.log_dir / f"{split_name}_{label}_ACTUAL.json"
        diff_file = self.log_dir / f"{split_name}_{label}_DIFF.json"

        if not (expected_file.exists() and actual_file.exists()):
            return None

        with open(expected_file, 'r', encoding='utf-8') as f:
            expected_data = json.load(f)

        with open(actual_file, 'r', encoding='utf-8') as f:
            actual_data = json.load(f)

        # Key every set by the complete source path so that same-named files
        # from different directories stay distinct.
        expected_paths = {str(entry['source_path']) for entry in expected_data['files']}
        # Success records carry the original location under 'source_path'.
        successful_paths = {
            str(success['source_path'])
            for success in actual_data['successful_files']
            if 'source_path' in success
        }
        # Failure records carry the original location under 'filepath'.
        failed_paths = {
            str(failure['filepath'])
            for failure in actual_data['failed_files']
            if 'filepath' in failure
        }

        attempted_paths = successful_paths | failed_paths
        missing_paths = expected_paths - attempted_paths

        summary = {
            'expected_total': len(expected_paths),
            'successful_total': len(successful_paths),
            'failed_total': len(failed_paths),
            'missing_total': len(missing_paths),
            'attempted_total': len(attempted_paths)
        }
        diff_analysis = {
            'timestamp': datetime.now().isoformat(),
            'split': split_name,
            'label': label,
            'summary': summary,
            'missing_files': sorted(missing_paths),
            'successful_files': sorted(successful_paths),
            'failed_files': sorted(failed_paths)
        }

        # Human-readable companion report.
        readable_diff_file = self.log_dir / f"{split_name}_{label}_DIFF.txt"
        with open(readable_diff_file, 'w', encoding='utf-8') as f:
            f.write(f"DIFF ANALYSIS: {split_name}/{label}\n")
            f.write("=" * 50 + "\n\n")

            f.write("SUMMARY:\n")
            f.write(f"  Expected files: {summary['expected_total']}\n")
            f.write(f"  Successful copies: {summary['successful_total']}\n")
            f.write(f"  Failed copies: {summary['failed_total']}\n")
            f.write(f"  Missing files: {summary['missing_total']}\n")
            f.write(f"  Files attempted: {summary['attempted_total']}\n\n")

            for title, paths in (("MISSING FILES", missing_paths),
                                 ("FAILED FILES", failed_paths)):
                if not paths:
                    continue
                f.write(f"{title} ({len(paths)}):\n")
                f.write("-" * 30 + "\n")
                for path in sorted(paths):
                    f.write(f"  {path}\n")
                f.write("\n")

            success_rate = (len(successful_paths) / len(expected_paths)) * 100 if expected_paths else 0
            f.write(f"SUCCESS RATE: {success_rate:.2f}%\n")

        with open(diff_file, 'w', encoding='utf-8') as f:
            json.dump(diff_analysis, f, indent=2, ensure_ascii=False)

        return diff_file, readable_diff_file, diff_analysis

    def _safe_get_file_size(self, filepath):
        """Return the size of ``filepath`` in bytes, or -1 if it cannot be read."""
        try:
            return Path(filepath).stat().st_size
        except Exception:
            # Best-effort: the size is informational only and must never
            # abort logging.
            return -1

def get_files_from_directory(folder, ext=".wav"):
    """Recursively list the files under ``folder`` whose name ends in ``ext``.

    Returns a list of ``Path`` objects; an absent folder yields ``[]``.
    """
    root = Path(folder)
    if not root.exists():
        return []

    pattern = f"*{ext}"
    return [*root.rglob(pattern)]

def get_all_audio_files_parallel(folder_list, ext=".wav", max_workers=None):
    """Scan several folders concurrently and return all matching file paths.

    Args:
        folder_list: Folders to scan (each is searched recursively).
        ext: File-name suffix to match.
        max_workers: Thread count; defaults to one per folder, capped at
            the CPU count.

    Returns:
        A list of path strings.  Folders appear in the order given and the
        files of each folder are sorted, so the result does not depend on
        which scan thread finishes first.  A stable order is what makes a
        later seeded shuffle reproducible.  Folders whose scan raised are
        reported and skipped.
    """
    folders = list(folder_list)
    if not folders:
        # ThreadPoolExecutor rejects max_workers=0, so bail out early.
        return []

    if max_workers is None:
        max_workers = min(len(folders), mp.cpu_count())

    files_by_index = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(get_files_from_directory, folder, ext): index
            for index, folder in enumerate(folders)
        }

        for future in as_completed(future_to_index):
            index = future_to_index[future]
            folder = folders[index]
            try:
                files = future.result()
            except Exception as e:
                print(f"Error scanning {folder}: {e}")
                continue
            files_by_index[index] = sorted(str(p) for p in files)
            print(f"Scanned {folder}: {len(files)} files")

    # Reassemble in input order rather than completion order.
    all_files = []
    for index in range(len(folders)):
        all_files.extend(files_by_index.get(index, []))

    return all_files

def _record_copy_failure(file_tracker, src_path, dst_dir, error, status, **extra):
    """Build a failure record, register it with the tracker and return it."""
    result = {
        'filepath': src_path,
        'label': dst_dir.name,
        'success': False,
        'error': error,
        'status': status,
    }
    result.update(extra)
    file_tracker.add_failure(result)
    return result


def _remove_quietly(path):
    """Best-effort removal of a leftover file; a missing file is fine."""
    try:
        path.unlink()
    except OSError:
        # Cleanup must never mask the original copy error.
        pass


def copy_file_atomic(src_path, dst_dir, file_tracker, max_retries=3, retry_delay=1):
    """Copy ``src_path`` into ``dst_dir`` under a uniquely reserved name.

    The file is first copied to a temporary sibling, size-checked, and then
    renamed onto the reserved destination name.  A failed attempt is retried
    up to ``max_retries`` times, waiting ``retry_delay`` seconds before the
    second attempt and 1.5x longer before each subsequent one.

    Args:
        src_path: Source file path (as passed around by the caller).
        dst_dir: Destination directory as a ``Path``.
        file_tracker: Tracker providing ``is_processed``, ``mark_attempt``,
            ``mark_processed``, ``reserve_unique_filename``,
            ``release_filename_reservation``, ``add_success`` and
            ``add_failure``.
        max_retries: Maximum number of copy attempts.
        retry_delay: Initial delay between attempts, in seconds.

    Returns:
        A result dict.  ``success`` is True for a verified copy; otherwise
        ``status`` names the failure (``duplicate_attempt``,
        ``source_missing``, ``empty_file``, ``access_error``,
        ``copy_failed`` or ``unexpected_error``).  Every outcome, including
        the early exits, is registered with ``file_tracker``; duplicate
        attempts additionally carry ``is_duplicate=True``.
    """
    # Early duplicate check
    if file_tracker.is_processed(src_path):
        return _record_copy_failure(
            file_tracker, src_path, dst_dir,
            'Already processed by another thread', 'duplicate_attempt',
            is_duplicate=True,
        )

    attempt_num = file_tracker.mark_attempt(src_path)
    reserved_filename = None

    try:
        src = Path(src_path)

        # Verify source file
        if not src.exists():
            return _record_copy_failure(
                file_tracker, src_path, dst_dir,
                f"Source file does not exist: {src_path}", 'source_missing',
                attempt=attempt_num,
            )

        try:
            file_size = src.stat().st_size
        except Exception as e:
            return _record_copy_failure(
                file_tracker, src_path, dst_dir,
                f"Cannot access source file: {e}", 'access_error',
                attempt=attempt_num,
            )

        if file_size == 0:
            return _record_copy_failure(
                file_tracker, src_path, dst_dir,
                f"Source file is empty: {src_path}", 'empty_file',
                attempt=attempt_num,
            )

        # Reserve unique filename atomically
        dst_path, reserved_filename = file_tracker.reserve_unique_filename(src, dst_dir)

        # Mark as processed BEFORE the copy so a concurrent worker given the
        # same source backs off instead of copying it a second time.
        file_tracker.mark_processed(src_path)

        last_error = None
        delay = retry_delay  # local copy; the parameter itself is left untouched
        for retry in range(max_retries):
            temp_path = dst_path.with_suffix(f'.tmp_{uuid.uuid4().hex[:8]}')
            renamed = False
            try:
                # Copy to a temporary file first so a partial copy never
                # appears under the final name.
                shutil.copy2(src, temp_path)

                if not temp_path.exists():
                    raise Exception("Temporary file creation failed")

                temp_size = temp_path.stat().st_size
                if temp_size != file_size:
                    raise Exception(f"Size mismatch: expected {file_size}, got {temp_size}")

                # Move the verified temporary file onto the reserved name.
                temp_path.rename(dst_path)
                renamed = True

                if not dst_path.exists():
                    raise Exception("Final file does not exist after rename")

                final_size = dst_path.stat().st_size
                if final_size != file_size:
                    raise Exception(f"Final size mismatch: expected {file_size}, got {final_size}")

                result = {
                    # Relative to the directory two levels above dst_dir.
                    'filepath': str(dst_path.relative_to(dst_dir.parent.parent)),
                    'label': dst_dir.name,
                    'success': True,
                    'source_path': src_path,
                    'destination_path': str(dst_path),
                    'source_size': file_size,
                    'destination_size': final_size,
                    'attempt': attempt_num,
                    'retry': retry + 1,
                    'verified': True
                }
                file_tracker.add_success(result)
                return result

            except Exception as e:
                last_error = str(e)

                _remove_quietly(temp_path)
                if renamed:
                    # The final name was created by this failed attempt;
                    # remove it so the next retry starts from a clean slate.
                    _remove_quietly(dst_path)

                if retry < max_retries - 1:
                    time.sleep(delay)
                    delay *= 1.5  # back off a little longer each time

        # All retries failed - release filename reservation
        if reserved_filename:
            file_tracker.release_filename_reservation(reserved_filename)

        return _record_copy_failure(
            file_tracker, src_path, dst_dir,
            f"Failed after {max_retries} attempts. Last error: {last_error}", 'copy_failed',
            attempt=attempt_num, max_retries_reached=True,
        )

    except Exception as e:
        # Release reservation on unexpected error
        if reserved_filename:
            file_tracker.release_filename_reservation(reserved_filename)

        return _record_copy_failure(
            file_tracker, src_path, dst_dir,
            f"Unexpected error: {str(e)}", 'unexpected_error',
            attempt=attempt_num,
        )

def copy_files_with_comprehensive_logging(file_paths, destination_dir, split_name, label, 
                                        split_logger, max_workers=None):
    """Copy ``file_paths`` into ``destination_dir`` in parallel and log the outcome.

    Every worker's returned record is collected directly, so outcomes that
    were not registered with the tracker still reach the ACTUAL log instead
    of being reported as "never attempted".

    Args:
        file_paths: Source paths to copy.
        destination_dir: Target directory (``Path``); created if missing.
        split_name: Split identifier used in log file names.
        label: Class label used in log file names.
        split_logger: Logger providing ``log_expected_files``,
            ``log_actual_files`` and ``create_diff_analysis``.
        max_workers: Thread count; defaults to ``min(32, 2 * cpu_count)``.

    Returns:
        ``(manifest_rows, failures, stats)`` where ``manifest_rows`` is a
        list of ``{'filepath', 'label'}`` dicts for the successful copies,
        ``failures`` the non-duplicate failure records, and ``stats`` the
        tracker statistics extended with ``duplicate_count`` and
        ``actual_failure_count``.
    """
    if max_workers is None:
        max_workers = min(32, mp.cpu_count() * 2)

    destination_dir.mkdir(parents=True, exist_ok=True)

    # Initialize fresh tracker for this batch
    file_tracker = ThreadSafeFileTracker()

    # Log expected files
    expected_log_file = split_logger.log_expected_files(split_name, label, file_paths)
    print(f"    Expected files logged to: {expected_log_file.name}")

    print(f"    Starting copy of {len(file_paths)} files with {max_workers} workers...")

    # Progress-bar label: show the path relative to 'dataset' when possible,
    # otherwise fall back to the directory as given.
    try:
        shown_dir = destination_dir.relative_to(Path('dataset'))
    except ValueError:
        shown_dir = destination_dir
    desc = f"Copying to {shown_dir}"

    worker_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(copy_file_atomic, src_path, destination_dir, file_tracker)
            for src_path in file_paths
        ]

        # Keep every worker's record as it completes
        for future in tqdm(as_completed(futures), total=len(futures), desc=desc):
            worker_results.append(future.result())

    successful_results = [r for r in worker_results if r.get('success')]
    failed_results = [r for r in worker_results if not r.get('success')]

    # Separate duplicates from real failures
    def _is_duplicate(record):
        return bool(record.get('is_duplicate', False)) or record.get('status') == 'duplicate_attempt'

    duplicate_results = [r for r in failed_results if _is_duplicate(r)]
    actual_failed_results = [r for r in failed_results if not _is_duplicate(r)]

    # Log actual files (excluding duplicates from failures)
    actual_log_file = split_logger.log_actual_files(split_name, label, successful_results, actual_failed_results)
    print(f"    Actual files logged to: {actual_log_file.name}")

    # Create diff analysis (using actual failures, not duplicates)
    diff_files = split_logger.create_diff_analysis(split_name, label)
    if diff_files:
        diff_json, diff_txt, diff_data = diff_files
        print(f"    Diff analysis created: {diff_txt.name}")

        summary = diff_data['summary']
        if summary['missing_total'] > 0:
            print(f"    🚨 {summary['missing_total']} files went MISSING (never attempted)")
        if summary['failed_total'] > 0:
            print(f"    ❌ {summary['failed_total']} files FAILED during copy")

        # Show duplicate information
        if duplicate_results:
            print(f"    🔄 {len(duplicate_results)} duplicate attempts (expected in multithreading)")

        print(f"    ✅ {summary['successful_total']}/{summary['expected_total']} files copied successfully")

        # Verify against actual disk count.  The reference is the number of
        # success records (one per file written), not a de-duplicated
        # summary figure.
        actual_files_on_disk = len(list(destination_dir.glob('*.wav')))
        expected_on_disk = len(successful_results)

        if actual_files_on_disk != expected_on_disk:
            print("    🚨 DISK VERIFICATION MISMATCH!")
            print(f"       Tracked successful: {expected_on_disk}")
            print(f"       Actually on disk: {actual_files_on_disk}")
        else:
            print(f"    ✅ DISK VERIFICATION PASSED: {actual_files_on_disk} files on disk")

    # Return clean results for manifest
    clean_successful = [
        {'filepath': r['filepath'], 'label': r['label']}
        for r in successful_results
    ]

    # Update statistics to reflect actual situation
    stats = file_tracker.get_stats()
    stats['duplicate_count'] = len(duplicate_results)
    stats['actual_failure_count'] = len(actual_failed_results)

    return clean_successful, actual_failed_results, stats

def check_existing_splits(base_dir):
    """Return the names of the splits that are already present in ``base_dir``.

    A split counts as present when its directory and its ``<split>.csv``
    manifest both exist and both the ``fake`` and ``real`` sub-folders
    contain at least one ``.wav`` file.
    """
    root = Path(base_dir)
    finished = []

    for split_name in ('train', 'val', 'test'):
        split_dir = root / split_name
        manifest_file = root / f'{split_name}.csv'

        if not split_dir.exists() or not manifest_file.exists():
            continue

        fake_count = sum(1 for _ in (split_dir / 'fake').glob('*.wav'))
        real_count = sum(1 for _ in (split_dir / 'real').glob('*.wav'))

        if fake_count == 0 or real_count == 0:
            continue

        finished.append(split_name)
        print(f"Found existing {split_name} split: {fake_count} fake, {real_count} real files")

    return finished

def split_balanced_dataset(fake_files, real_files, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """Split two file collections into class-balanced train/val/test sets.

    Both collections are shuffled with the module-level ``random`` state and
    truncated to the size of the smaller one, so every split holds the same
    number of fake and real files.  The shuffling works on copies; the
    caller's sequences are left untouched.

    Args:
        fake_files: Paths of the fake samples.
        real_files: Paths of the real samples.
        train_ratio: Fraction of the balanced per-class count used for train.
        val_ratio: Fraction of the balanced per-class count used for val.
        test_ratio: Not used by the computation; the test split receives
            whatever remains after train and val.

    Returns:
        ``{'train': {...}, 'val': {...}, 'test': {...}}`` where each inner
        dict maps ``'fake'`` and ``'real'`` to a list of paths.
    """
    # Shuffle private copies (same RNG consumption as an in-place shuffle).
    fake_files = list(fake_files)
    real_files = list(real_files)
    random.shuffle(fake_files)
    random.shuffle(real_files)

    min_count = min(len(fake_files), len(real_files))

    fake_files = fake_files[:min_count]
    real_files = real_files[:min_count]

    print(f"Using {min_count} files per class for balanced dataset")

    train_size = int(train_ratio * min_count)
    val_size = int(val_ratio * min_count)
    val_end = train_size + val_size

    print(f"Split sizes - Train: {train_size}, Val: {val_size}, Test: {min_count - train_size - val_size}")

    def _partition(files):
        """Cut one class's list into its train / val / test slices."""
        return files[:train_size], files[train_size:val_end], files[val_end:]

    fake_train, fake_val, fake_test = _partition(fake_files)
    real_train, real_val, real_test = _partition(real_files)

    return {
        'train': {'fake': fake_train, 'real': real_train},
        'val': {'fake': fake_val, 'real': real_val},
        'test': {'fake': fake_test, 'real': real_test},
    }

# --- Main Execution ---
# Gathers fake/real audio files, builds balanced train/val/test splits, copies
# them under 'dataset/<split>/<label>/', writes one CSV manifest per split and
# prints a final summary.  Splits reported as already existing are skipped.
if __name__ == "__main__":
    start_time = time.time()
    
    # Directory configuration
    fake_dirs = [
        r"for-2seconds\testing\fake",
        r"for-2seconds\training\fake",
        r"for-2seconds\validation\fake",
        r"release_in_the_wild\fake",
        r"generated_audio\fake\common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech",
        r"generated_audio\fake\jsut_multi_band_melgan",
        r"generated_audio\fake\jsut_parallel_wavegan",
        r"generated_audio\fake\ljspeech_full_band_melgan",
        r"generated_audio\fake\ljspeech_hifiGAN",
        r"generated_audio\fake\ljspeech_melgan",
        r"generated_audio\fake\ljspeech_melgan_large",
        r"generated_audio\fake\ljspeech_multi_band_melgan",
        r"generated_audio\fake\ljspeech_parallel_wavegan",
        r"generated_audio\fake\ljspeech_waveglow",
    ]
    
    real_dirs = [
        r"for-2seconds\testing\real",
        r"for-2seconds\training\real",
        r"for-2seconds\validation\real",
        r"common-voices-mozilla\cv-valid-train\wav-files"
    ]
    
    print("Gathering files in parallel...")
    fake_files = get_all_audio_files_parallel(fake_dirs)
    real_files = get_all_audio_files_parallel(real_dirs)
    
    print(f"\nTotal fake files found: {len(fake_files)}")
    print(f"Total real files found: {len(real_files)}")
    
    print("\nCreating balanced splits...")
    splits = split_balanced_dataset(fake_files, real_files)
    
    base_dir = Path('dataset')
    log_dir = base_dir / 'logs'
    
    # Initialize split logger
    split_logger = SplitLogger(log_dir)
    
    print(f"\nChecking for existing splits... (Detailed logs will be saved to '{log_dir}')")
    completed_splits = check_existing_splits(base_dir)
    
    if completed_splits:
        print(f"Found existing splits: {', '.join(completed_splits)}")
    
    print("\nCreating directory structure and copying files...")
    print(f"Using up to {min(32, mp.cpu_count() * 2)} threads for file copying...")
    
    # Overall tracking
    overall_stats = {
        'total_expected': 0,
        'total_successful': 0,
        'total_failed': 0,
        'total_missing': 0
    }
    
    # Splits copied during THIS run; skipped splits contribute files on disk
    # but nothing to overall_stats, so the final check must ignore them.
    processed_splits = []
    
    for split_name, classes in splits.items():
        if split_name in completed_splits:
            print(f"\nSkipping {split_name} split (already exists)...")
            continue
            
        processed_splits.append(split_name)
        print(f"\nProcessing {split_name} split...")
        all_manifest_rows = []
        
        for label, files in classes.items():
            out_dir = base_dir / split_name / label
            
            print(f"\n  Processing {split_name}/{label} ({len(files)} files)...")
            
            # Copy files with comprehensive logging
            successful_copies, failed_copies, copy_stats = copy_files_with_comprehensive_logging(
                files, out_dir, split_name, label, split_logger
            )
            
            all_manifest_rows.extend(successful_copies)
            
            # Update overall statistics
            overall_stats['total_expected'] += len(files)
            overall_stats['total_successful'] += len(successful_copies)
            overall_stats['total_failed'] += len(failed_copies)
            # Whatever is neither a success nor a recorded failure is "missing".
            overall_stats['total_missing'] += len(files) - len(successful_copies) - len(failed_copies)
        
        # Create manifest
        manifest_path = base_dir / f'{split_name}.csv'
        with open(manifest_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['filepath', 'label'])
            writer.writeheader()
            writer.writerows(all_manifest_rows)
        
        print(f"  -> Created manifest: {manifest_path}")
        print(f"  -> Total files in {split_name}: {len(all_manifest_rows)}")
    
    end_time = time.time()
    total_time = end_time - start_time
    
    print("\n" + "="*60)
    print("DATASET CREATION COMPLETE!")
    print("="*60)
    print(f"Total time: {total_time:.2f} seconds")
    
    # Final comprehensive summary
    print("\nOVERALL STATISTICS:")
    print(f"Expected files: {overall_stats['total_expected']}")
    print(f"Successful copies: {overall_stats['total_successful']}")
    print(f"Failed copies: {overall_stats['total_failed']}")
    print(f"Missing files: {overall_stats['total_missing']}")
    
    # Verify against actual disk
    print("\nFINAL DISK VERIFICATION:")
    actual_total_on_disk = 0
    processed_total_on_disk = 0
    for split_name in ['train', 'val', 'test']:
        split_dir = base_dir / split_name
        if not split_dir.exists(): 
            continue
        
        fake_count = len(list((split_dir / 'fake').glob('*.wav')))
        real_count = len(list((split_dir / 'real').glob('*.wav')))
        split_total = fake_count + real_count
        actual_total_on_disk += split_total
        if split_name in processed_splits:
            processed_total_on_disk += split_total
        balance_status = "✅" if fake_count == real_count else "⚠️"
        print(f"  {split_name}: {fake_count} fake, {real_count} real = {split_total} total {balance_status}")
    
    print(f"  GRAND TOTAL ON DISK: {actual_total_on_disk}")
    if completed_splits:
        print(f"  ON DISK FOR SPLITS COPIED IN THIS RUN: {processed_total_on_disk}")
    
    # Compare like with like: only the splits copied in this run are counted
    # in overall_stats, so only their files are checked against it.
    if processed_total_on_disk != overall_stats['total_successful']:
        print("🚨 CRITICAL DISCREPANCY!")
        print(f"   Tracked successful: {overall_stats['total_successful']}")
        print(f"   Actually on disk: {processed_total_on_disk}")
    
    # Performance metrics
    if total_time > 0:
        print(f"\nPerformance: {overall_stats['total_successful'] / total_time:.1f} files/second")
    if overall_stats['total_expected'] > 0:
        print(f"Success rate: {(overall_stats['total_successful'] / overall_stats['total_expected'] * 100):.2f}%")
    
    print(f"\n📋 Comprehensive per-split logs available in: {log_dir.resolve()}")
    print("   - *_EXPECTED.json: All files that should have been copied")
    print("   - *_ACTUAL.json: All files that were actually processed")
    print("   - *_DIFF.json/.txt: Diff analysis showing missing/failed files")
    print("\n🔍 Use the DIFF files to identify exactly which files went missing!")

Gathering files in parallel...
Scanned for-2seconds\testing\fake: 544 files
Scanned for-2seconds\validation\fake: 1413 files
Scanned release_in_the_wild\fake: 31779 files
Scanned generated_audio\fake\jsut_multi_band_melgan: 5000 files
Scanned for-2seconds\training\fake: 6978 files
Scanned generated_audio\fake\common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech: 16283 files
Scanned generated_audio\fake\jsut_parallel_wavegan: 5000 files
Scanned generated_audio\fake\ljspeech_full_band_melgan: 13100 files
Scanned generated_audio\fake\ljspeech_hifiGAN: 13100 files
Scanned generated_audio\fake\ljspeech_melgan: 13100 files
Scanned generated_audio\fake\ljspeech_melgan_large: 13100 files
Scanned generated_audio\fake\ljspeech_multi_band_melgan: 13100 files
Scanned generated_audio\fake\ljspeech_parallel_wavegan: 13100 files
Scanned generated_audio\fake\ljspeech_waveglow: 13100 files
Scanned for-2seconds\testing\real: 544 files
Scanned for-2seconds\validation\real: 1413 files
Scanned for

Copying to train\fake: 100%|██████████| 126957/126957 [08:07<00:00, 260.56it/s]


    Actual files logged to: train_fake_ACTUAL.json
    Diff analysis created: train_fake_DIFF.txt
    ✅ 84488/84488 files copied successfully
    🚨 DISK VERIFICATION MISMATCH!
       Tracked successful: 84488
       Actually on disk: 126957

  Processing train/real (126957 files)...
    Expected files logged to: train_real_EXPECTED.json
    Starting copy of 126957 files with 8 workers...


Copying to train\real: 100%|██████████| 126957/126957 [08:46<00:00, 241.28it/s]


    Actual files logged to: train_real_ACTUAL.json
    Diff analysis created: train_real_DIFF.txt
    ✅ 126875/126875 files copied successfully
    🚨 DISK VERIFICATION MISMATCH!
       Tracked successful: 126875
       Actually on disk: 126957
  -> Created manifest: dataset\train.csv
  -> Total files in train: 253914

Processing val split...

  Processing val/fake (15869 files)...
    Expected files logged to: val_fake_EXPECTED.json
    Starting copy of 15869 files with 8 workers...


Copying to val\fake: 100%|██████████| 15869/15869 [00:53<00:00, 297.88it/s]


    Actual files logged to: val_fake_ACTUAL.json
    Diff analysis created: val_fake_DIFF.txt
    ✅ 14674/14674 files copied successfully
    🚨 DISK VERIFICATION MISMATCH!
       Tracked successful: 14674
       Actually on disk: 15869

  Processing val/real (15869 files)...
    Expected files logged to: val_real_EXPECTED.json
    Starting copy of 15869 files with 8 workers...


Copying to val\real: 100%|██████████| 15869/15869 [01:03<00:00, 249.09it/s]


    Actual files logged to: val_real_ACTUAL.json
    Diff analysis created: val_real_DIFF.txt
    ✅ 15866/15866 files copied successfully
    🚨 DISK VERIFICATION MISMATCH!
       Tracked successful: 15866
       Actually on disk: 15869
  -> Created manifest: dataset\val.csv
  -> Total files in val: 31738

Processing test split...

  Processing test/fake (15871 files)...
    Expected files logged to: test_fake_EXPECTED.json
    Starting copy of 15871 files with 8 workers...


Copying to test\fake: 100%|██████████| 15871/15871 [00:51<00:00, 310.23it/s]


    Actual files logged to: test_fake_ACTUAL.json
    Diff analysis created: test_fake_DIFF.txt
    ✅ 14590/14590 files copied successfully
    🚨 DISK VERIFICATION MISMATCH!
       Tracked successful: 14590
       Actually on disk: 15871

  Processing test/real (15871 files)...
    Expected files logged to: test_real_EXPECTED.json
    Starting copy of 15871 files with 8 workers...


Copying to test\real: 100%|██████████| 15871/15871 [01:01<00:00, 259.53it/s]


    Actual files logged to: test_real_ACTUAL.json
    Diff analysis created: test_real_DIFF.txt
    ✅ 15871/15871 files copied successfully
  -> Created manifest: dataset\test.csv
  -> Total files in test: 31742

DATASET CREATION COMPLETE!
Total time: 1451.96 seconds

OVERALL STATISTICS:
Expected files: 317394
Successful copies: 317394
Failed copies: 0
Missing files: 0

FINAL DISK VERIFICATION:
  train: 126957 fake, 126957 real = 253914 total ✅
  val: 15869 fake, 15869 real = 31738 total ✅
  test: 15871 fake, 15871 real = 31742 total ✅
  GRAND TOTAL ON DISK: 317394

Performance: 218.6 files/second
Success rate: 100.00%

📋 Comprehensive per-split logs available in: C:\Users\Crumbz\Desktop\4TH-YEAR-1ST-SEM\THESIS\thesis-testing\dataset\logs
   - *_EXPECTED.json: All files that should have been copied
   - *_ACTUAL.json: All files that were actually processed
   - *_DIFF.json/.txt: Diff analysis showing missing/failed files

🔍 Use the DIFF files to identify exactly which files went missin