In [1]:
# Rewrite the stale project root stored in the split CSVs so the paths resolve on this machine
import pandas as pd

# Load the train/val/test split tables (paths are relative to the working directory)
train_csv, val_csv, test_csv = (
    pd.read_csv(f'derived/unified_dualtask/{split}.csv')
    for split in ('train', 'val', 'test')
)

# Function to fix file paths
def fix_file_paths(
    df,
    old_root='/Users/chufal/projects/DHAI-Brain-Segmentation',
    new_root='/home/qarc/projects/DHAI-Brain-Segmentation',
    columns=('image_path', 'label_path'),
):
    """Return a copy of ``df`` with ``old_root`` replaced by ``new_root`` in its path columns.

    Args:
        df: DataFrame holding string path columns.
        old_root: Literal text to look for (defaults to the previous machine's
            project root).
        new_root: Literal text substituted for every occurrence of ``old_root``.
        columns: Names of the columns to rewrite.

    Returns:
        A new DataFrame; the frame passed in is left unmodified so re-running
        the cell on the same input always gives the same result.

    Raises:
        KeyError: If one of ``columns`` is not present in ``df``.
    """
    fixed = df.copy()
    for column in columns:
        # regex=False: the roots are plain text. Without it, older pandas
        # releases interpret the pattern as a regular expression, so a "."
        # or "+" in a directory name would match the wrong thing.
        fixed[column] = fixed[column].str.replace(old_root, new_root, regex=False)
    return fixed

# Rewrite the paths in each split, keeping the same variable names
train_csv, val_csv, test_csv = map(fix_file_paths, (train_csv, val_csv, test_csv))

# Persist each corrected split next to its source as <split>_fixed.csv
# (relative to the working directory)
for split_name, split_frame in (('train', train_csv), ('val', val_csv), ('test', test_csv)):
    split_frame.to_csv(f'derived/unified_dualtask/{split_name}_fixed.csv', index=False)

print("CSV files updated with correct paths")

CSV files updated with correct paths


In [4]:
from pathlib import Path


# Project layout: the corrected split CSVs live under <project root>/derived/unified_dualtask
PROJ_ROOT = Path('/home/qarc/projects/DHAI-Brain-Segmentation')
DUALTASK_ROOT = PROJ_ROOT.joinpath('derived', 'unified_dualtask')
TRAIN_CSV, VAL_CSV, TEST_CSV = (
    DUALTASK_ROOT / f'{split}_fixed.csv' for split in ('train', 'val', 'test')
)

In [5]:
# Validation cell - check file existence
import os

def validate_file_paths(csv_path):
    """Check that every image and label file listed in a split CSV exists on disk.

    Reads ``csv_path`` with pandas, tests each row's ``image_path`` and
    ``label_path``, and prints a summary showing at most the first five
    missing entries.

    Args:
        csv_path: Location of a CSV that has ``image_path`` and ``label_path``
            columns.

    Returns:
        True when no listed file is missing, False otherwise.

    Raises:
        KeyError: If either path column is absent from the CSV.
    """
    df = pd.read_csv(csv_path)
    print(f"Checking {csv_path}:")

    def _is_missing(path):
        # pd.read_csv loads a blank cell as NaN; os.path.exists raises
        # TypeError on a float, so a blank path is reported as missing
        # instead of aborting the whole check.
        return pd.isna(path) or not os.path.exists(path)

    missing_files = []
    # Iterate the two columns directly rather than building a Series per row.
    for image_path, label_path in zip(df['image_path'], df['label_path']):
        if _is_missing(image_path):
            missing_files.append(f"Image: {image_path}")
        if _is_missing(label_path):
            missing_files.append(f"Label: {label_path}")

    if not missing_files:
        print("All files exist!")
        return True

    print(f"Missing {len(missing_files)} files:")
    for missing in missing_files[:5]:  # Show first 5
        print(f"  - {missing}")
    if len(missing_files) > 5:
        print(f"  ... and {len(missing_files) - 5} more")
    return False

# Validate the three splits in order: train, val, test
train_valid, val_valid, test_valid = (
    validate_file_paths(split_csv) for split_csv in (TRAIN_CSV, VAL_CSV, TEST_CSV)
)

# Overall verdict across the three splits
if not all((train_valid, val_valid, test_valid)):
    print("\n❌ Some files are missing. Please fix the paths.")
else:
    print("\n✅ All files validated successfully!")

Checking /home/qarc/projects/DHAI-Brain-Segmentation/derived/unified_dualtask/train_fixed.csv:
All files exist!
Checking /home/qarc/projects/DHAI-Brain-Segmentation/derived/unified_dualtask/val_fixed.csv:
All files exist!
Checking /home/qarc/projects/DHAI-Brain-Segmentation/derived/unified_dualtask/test_fixed.csv:
All files exist!

✅ All files validated successfully!


In [6]:
# Check class distribution in your dataset
def analyze_class_distribution(train_csv=None, val_csv=None, imbalance_threshold=0.1):
    """Print per-class sample counts for the training and validation splits.

    Args:
        train_csv: CSV with a ``class_label`` column; defaults to ``TRAIN_CSV``.
        val_csv: CSV with a ``class_label`` column; defaults to ``VAL_CSV``.
        imbalance_threshold: The imbalance warning is printed when the
            training positive ratio is below this fraction.

    Returns:
        None. All results are written to stdout.
    """
    train_csv = TRAIN_CSV if train_csv is None else train_csv
    val_csv = VAL_CSV if val_csv is None else val_csv

    def _report_split(title, csv_path):
        """Print the counts for one split; return its positive ratio, or None if it has no rows."""
        split_df = pd.read_csv(csv_path)
        counts = split_df['class_label'].value_counts().sort_index()
        total = len(split_df)
        positives = counts.get(1, 0)
        print(title)
        print(f"  Class 0 (No tumor): {counts.get(0, 0)} samples")
        print(f"  Class 1 (Tumor): {positives} samples")
        print(f"  Total: {total} samples")
        if total == 0:
            # A split without rows has no defined ratio; dividing would raise
            # ZeroDivisionError.
            print("  Positive ratio: n/a (no samples)")
            return None
        ratio = positives / total
        print(f"  Positive ratio: {ratio*100:.1f}%")
        return ratio

    print("=== CLASS DISTRIBUTION ANALYSIS ===")

    train_ratio = _report_split("Training set:", train_csv)
    _report_split("\nValidation set:", val_csv)

    # Only the training split drives the imbalance warning.
    if train_ratio is not None and train_ratio < imbalance_threshold:
        print("\n⚠️  SEVERE CLASS IMBALANCE DETECTED!")
        print("   This will cause training issues. Consider:")
        print("   1. Class weighting in loss function")
        print("   2. Data augmentation for minority class")
        print("   3. Balanced sampling strategies")

# Print the class balance report for the splits at TRAIN_CSV and VAL_CSV
analyze_class_distribution()

=== CLASS DISTRIBUTION ANALYSIS ===
Training set:
  Class 0 (No tumor): 647 samples
  Class 1 (Tumor): 325 samples
  Total: 972 samples
  Positive ratio: 33.4%

Validation set:
  Class 0 (No tumor): 139 samples
  Class 1 (Tumor): 70 samples
  Total: 209 samples
  Positive ratio: 33.5%
