# Dataset Splitting for CT-MRI Fusion

This notebook splits the CT-MRI dataset into train, validation, and test sets (roughly 70% / 10% / 20%) using a fixed random seed, and copies each split into its own permanent folder so that all methods are evaluated on the same images.

In [1]:
# Import Required Libraries
import json
import os
import random
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# Seed both the `random` module and NumPy's legacy global generator with the
# same fixed value for reproducibility.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(42)

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load and Analyze Dataset
print("Loading CT-MRI dataset...")

# Define paths
dataset_root = Path("Dataset/CT-MRI")
ct_dir = dataset_root / "CT"
mri_dir = dataset_root / "MRI"

# A missing folder would otherwise be indistinguishable from an empty dataset.
for modality_dir in (ct_dir, mri_dir):
    if not modality_dir.is_dir():
        print(f"Warning: image folder not found: {modality_dir}")


def partition_by_partner(files, partner_dir):
    """Split image paths by whether a same-named file exists in another folder.

    Args:
        files: Iterable of ``Path`` objects to check (e.g. every CT image).
        partner_dir: Folder expected to hold a file with the same name for
            each entry of ``files`` (e.g. the MRI folder).

    Returns:
        ``(matched, unmatched)``: two lists of paths, each keeping the order
        of ``files``.
    """
    partner_dir = Path(partner_dir)
    matched, unmatched = [], []
    for path in files:
        bucket = matched if (partner_dir / path.name).exists() else unmatched
        bucket.append(path)
    return matched, unmatched


# Get all image files (sorted so the pair order is stable between runs)
ct_files = sorted(ct_dir.glob("*.png"))
mri_files = sorted(mri_dir.glob("*.png"))

print(f"Found {len(ct_files)} CT images")
print(f"Found {len(mri_files)} MRI images")

# Create pairs list. Orphans are reported in both directions so that an MRI
# image without a CT partner is not dropped silently.
ct_paired, ct_orphans = partition_by_partner(ct_files, mri_dir)
mri_paired, mri_orphans = partition_by_partner(mri_files, ct_dir)

for ct_file in ct_orphans:
    print(f"Warning: No matching MRI for {ct_file.name}")
for mri_file in mri_orphans:
    print(f"Warning: No matching CT for {mri_file.name}")

image_pairs = [
    (str(ct_file), str(mri_dir / ct_file.name)) for ct_file in ct_paired
]

print(f"Total valid pairs: {len(image_pairs)}")

# Extract filenames for splitting (CT and MRI share the same name per pair)
filenames = [os.path.basename(pair[0]) for pair in image_pairs]
print(f"Sample filenames: {filenames[:5]}")

Loading CT-MRI dataset...
Found 573 CT images
Found 573 MRI images
Total valid pairs: 573
Sample filenames: ['16003.png', '16004.png', '16005.png', '16006.png', '16007.png']


In [3]:
# Split Dataset into Train, Validation and Test
print("Splitting dataset into train and test sets...")

# Define split ratios -- both are fractions of the FULL dataset
test_size = 0.2  # 20% of all images for testing
val_size = 0.1   # 10% of all images for validation (the remaining 70% train)
split_seed = 42  # used for both splits below

# The second split divides by (1 - test_size) and must leave a training share.
assert test_size > 0 and val_size > 0 and test_size + val_size < 1, (
    "test_size and val_size must be positive and sum to less than 1"
)

# First split: train+val and test
train_val_files, test_files = train_test_split(
    filenames,
    test_size=test_size,
    random_state=split_seed,
    shuffle=True
)

# Second split: train and val from train+val.
# val_size is relative to the full dataset, but this call only sees the
# (1 - test_size) share that is left, so the fraction is rescaled.
train_files, val_files = train_test_split(
    train_val_files,
    test_size=val_size / (1 - test_size),
    random_state=split_seed,
    shuffle=True
)

print(f"Total images: {len(filenames)}")
print(f"Train images: {len(train_files)} ({len(train_files)/len(filenames)*100:.1f}%)")
print(f"Validation images: {len(val_files)} ({len(val_files)/len(filenames)*100:.1f}%)")
print(f"Test images: {len(test_files)} ({len(test_files)/len(filenames)*100:.1f}%)")

# Verify no overlap
train_set = set(train_files)
val_set = set(val_files)
test_set = set(test_files)

print(f"Train-Val overlap: {len(train_set & val_set)}")
print(f"Train-Test overlap: {len(train_set & test_set)}")
print(f"Val-Test overlap: {len(val_set & test_set)}")

# Stop here rather than write a leaking split to disk: the three subsets must
# be pairwise disjoint and together cover every input filename.
assert not (train_set & val_set or train_set & test_set or val_set & test_set), (
    "the same filename appears in more than one split"
)
assert train_set | val_set | test_set == set(filenames), (
    "the splits do not cover the full list of filenames"
)

Splitting dataset into train and test sets...
Total images: 573
Train images: 400 (69.8%)
Validation images: 58 (10.1%)
Test images: 115 (20.1%)
Train-Val overlap: 0
Train-Test overlap: 0
Val-Test overlap: 0


In [4]:
# Create Folder Structure and Copy Files
print("Creating folder structure and copying files...")

# One folder per split, all directly under Dataset/
base_dir = Path("Dataset")
train_dir, val_dir, test_dir = (base_dir / name for name in ("train", "val", "test"))

# Every split gets a CT and an MRI subfolder; folders that already exist are reused
for split_dir in (train_dir, val_dir, test_dir):
    for modality in ("CT", "MRI"):
        (split_dir / modality).mkdir(parents=True, exist_ok=True)

# Function to copy files for a split
def copy_files(file_list, split_name, split_dir, src_ct_dir=None, src_mri_dir=None):
    """Copy CT/MRI image pairs into the folders of one dataset split.

    A filename is copied only when both its CT and its MRI source file exist,
    so a split folder never receives one half of a pair without the other.
    Skipped filenames are reported with a warning instead of being ignored
    silently.

    Args:
        file_list: Filenames (e.g. "16003.png") expected in both source folders.
        split_name: Label used in the progress message.
        split_dir: Destination folder; images are written to its "CT" and
            "MRI" subfolders, which are created if they do not exist.
        src_ct_dir: Folder to read CT images from. Defaults to the
            notebook-level ``ct_dir``.
        src_mri_dir: Folder to read MRI images from. Defaults to the
            notebook-level ``mri_dir``.

    Returns:
        The number of complete pairs that were copied.
    """
    # Fall back to the notebook-level source folders when none are passed in.
    src_ct_dir = ct_dir if src_ct_dir is None else Path(src_ct_dir)
    src_mri_dir = mri_dir if src_mri_dir is None else Path(src_mri_dir)

    dst_ct_dir = Path(split_dir) / "CT"
    dst_mri_dir = Path(split_dir) / "MRI"
    dst_ct_dir.mkdir(parents=True, exist_ok=True)
    dst_mri_dir.mkdir(parents=True, exist_ok=True)

    print(f"Copying {len(file_list)} files to {split_name} set...")
    copied = 0
    for filename in file_list:
        src_ct = src_ct_dir / filename
        src_mri = src_mri_dir / filename

        # The dataset is built from CT/MRI pairs, so an incomplete pair is
        # skipped as a whole rather than copied half-way.
        if not (src_ct.exists() and src_mri.exists()):
            print(f"Warning: skipping {filename}: CT or MRI source file is missing")
            continue

        # copy2 also carries over file metadata such as timestamps
        shutil.copy2(src_ct, dst_ct_dir / filename)
        shutil.copy2(src_mri, dst_mri_dir / filename)
        copied += 1

    return copied

# Copy files
copy_files(train_files, "train", train_dir)
copy_files(val_files, "validation", val_dir)
copy_files(test_files, "test", test_dir)

print("File copying completed!")

# Verify counts and contents.
# The split folders are reused between runs and never emptied, so images left
# over from an earlier run with a different seed or ratio would otherwise go
# unnoticed and could end up in two splits at once.
for split_name, split_dir, expected_files in [
    ("train", train_dir, train_files),
    ("val", val_dir, val_files),
    ("test", test_dir, test_files),
]:
    found = {
        modality: {p.name for p in (split_dir / modality).glob("*.png")}
        for modality in ("CT", "MRI")
    }
    ct_count = len(found["CT"])
    mri_count = len(found["MRI"])
    print(f"{split_name}: {ct_count} CT, {mri_count} MRI images")

    expected = set(expected_files)
    for modality, names in found.items():
        missing = expected - names
        unexpected = names - expected
        if missing or unexpected:
            print(
                f"Warning: {split_name}/{modality} has {len(missing)} missing and "
                f"{len(unexpected)} unexpected images compared with the current split"
            )

Creating folder structure and copying files...
Copying 400 files to train set...
Copying 58 files to validation set...
Copying 115 files to test set...
File copying completed!
train: 400 CT, 400 MRI images
val: 58 CT, 58 MRI images
test: 115 CT, 115 MRI images


In [5]:
# Save Split Information
print("Saving split information for reproducibility...")

# Everything needed to audit or rebuild the split: subset sizes, their share
# of the dataset, the seed, and the sorted filenames of each subset.
total_images = len(filenames)
split_info = {
    "total_images": total_images,
    "train_count": len(train_files),
    "val_count": len(val_files),
    "test_count": len(test_files),
    "train_ratio": len(train_files) / total_images,
    "val_ratio": len(val_files) / total_images,
    "test_ratio": len(test_files) / total_images,
    "random_seed": 42,  # must match the random_state used when splitting
    "train_files": sorted(train_files),
    "val_files": sorted(val_files),
    "test_files": sorted(test_files)
}

# Save to JSON. The path is defined once so the file that is written and the
# message below cannot drift apart; the encoding is explicit so the file does
# not depend on the platform default.
split_info_path = "Dataset/split_info.json"
with open(split_info_path, "w", encoding="utf-8") as f:
    json.dump(split_info, f, indent=2)

print(f"Split information saved to {split_info_path}")

# Display summary
print("\n=== SPLIT SUMMARY ===")
print(f"Total images: {split_info['total_images']}")
for label, key in (("Train", "train"), ("Validation", "val"), ("Test", "test")):
    print(f"{label}: {split_info[key + '_count']} ({split_info[key + '_ratio']*100:.1f}%)")

print("\nNew folder structure:")
for tree_line in (
    "Dataset/",
    "├── train/CT/     (CT training images)",
    "├── train/MRI/    (MRI training images)",
    "├── val/CT/       (CT validation images)",
    "├── val/MRI/      (MRI validation images)",
    "├── test/CT/      (CT test images)",
    "├── test/MRI/     (MRI test images)",
    "└── split_info.json (split metadata)",
):
    print(tree_line)

Saving split information for reproducibility...
Split information saved to Dataset/split_info.json

=== SPLIT SUMMARY ===
Total images: 573
Train: 400 (69.8%)
Validation: 58 (10.1%)
Test: 115 (20.1%)

New folder structure:
Dataset/
├── train/CT/     (CT training images)
├── train/MRI/    (MRI training images)
├── val/CT/       (CT validation images)
├── val/MRI/      (MRI validation images)
├── test/CT/      (CT test images)
├── test/MRI/     (MRI test images)
└── split_info.json (split metadata)
