In [None]:
import os
import pandas as pd
from shutil import copyfile
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Define categories and paths
# Finding labels to keep; each one is also used as a sub-folder name under
# train/ and val/.
CATEGORIES = ["Atelectasis", "Infiltration", "Effusion", "Nodule", "Mass"]
# Candidate source directories, images_001 through images_012; an image is
# searched for in each of them in this order.
IMAGE_FOLDERS = [f"/kaggle/input/data/images_{i:03d}/images" for i in range(1, 13)]
# Metadata table; the script reads its 'Image Index' and 'Finding Labels' columns.
CSV_FILE = "/kaggle/input/data/Data_Entry_2017.csv"
# Root of the reorganized dataset; train/ and val/ are created beneath it.
OUTPUT_PATH = "./output/"
# CSV log that receives one line per copied image.
LOG_FILE = "./file_distribution_log.csv"

# Maximum counts
# Per-category caps on how many images are copied into train/ and val/.
MAX_TRAIN = 1100
MAX_VAL = 200

# Create folders for each category
def create_folders(base_path, categories):
    """Ensure a train and a val directory exist for every category.

    For each entry of `categories`, `<base_path>/train/<category>` and
    `<base_path>/val/<category>` are created (including missing parents).
    Directories that already exist are left as they are.
    """
    for category in categories:
        for split in ("train", "val"):
            split_dir = os.path.join(base_path, split, category)
            os.makedirs(split_dir, exist_ok=True)

# Locate an image in the distributed image folders
def locate_image(image_name):
    """Return the path of `image_name` inside the first folder that holds it.

    The folders listed in IMAGE_FOLDERS are probed in order and the search
    stops at the first hit. Returns None when no folder contains the file.
    """
    candidates = (os.path.join(folder, image_name) for folder in IMAGE_FOLDERS)
    # The generators are lazy, so no folder after the first match is probed.
    return next((path for path in candidates if os.path.exists(path)), None)

# Logging function
def log_file(image_name, label, folder_type):
    """Append one `image_name,label,folder_type` line to LOG_FILE."""
    record = f"{image_name},{label},{folder_type}\n"
    with open(LOG_FILE, mode='a') as log_handle:
        log_handle.write(record)

# Function to distribute files with count limits, ensuring no duplicates by base name
def distribute_files(row, train_folder, val_folder, train_count, val_count, used_base_names):
    """Copy one image into the train or val tree, honouring caps and de-duplication.

    Args:
        row: A record providing 'Image Index' (file name), 'Labels' (a sequence
            whose first element is the category) and 'is_train' (truthy for
            the training split).
        train_folder: Root directory holding one sub-folder per category for
            training images.
        val_folder: Same as `train_folder`, for validation images.
        train_count: Mapping label -> number of training images copied so far.
            Updated in place.
        val_count: Mapping label -> number of validation images copied so far.
            Updated in place.
        used_base_names: Mapping label -> set of base names already copied for
            that label. Updated in place.

    The row is skipped (nothing copied, nothing logged) when its base name was
    already used for the label, when the split's cap (MAX_TRAIN / MAX_VAL) has
    been reached for the label, or when the image cannot be found in any
    source folder.
    """
    image_name = row['Image Index']
    label = row['Labels'][0]  # Since we only consider single-labeled images

    # Base name is everything before the last underscore,
    # e.g. "00000001_000.png" -> "00000001".
    base_name = image_name.rsplit('_', 1)[0]

    # If the base name has already been used for this category, skip it
    if base_name in used_base_names[label]:
        return

    if row['is_train']:
        split, dest_root, counts, limit = "train", train_folder, train_count, MAX_TRAIN
    else:
        split, dest_root, counts, limit = "val", val_folder, val_count, MAX_VAL

    # Check the cap before searching the source folders: once a category is
    # full there is no reason to probe the filesystem for its remaining rows.
    if counts[label] >= limit:
        return

    source = locate_image(image_name)
    if not source:
        return

    copyfile(source, os.path.join(dest_root, label, image_name))
    counts[label] += 1
    # Record the base name for BOTH splits so that validation images are
    # de-duplicated by base name too, not only training images.
    used_base_names[label].add(base_name)
    log_file(image_name, label, split)

# Load the CSV file
data = pd.read_csv(CSV_FILE)

# Filter relevant rows and columns: keep only images that carry exactly one
# finding label, and only when that label is one of CATEGORIES.
data['Labels'] = data['Finding Labels'].apply(lambda x: x.split('|'))
single_label_mask = data['Labels'].map(
    lambda labels: len(labels) == 1 and labels[0] in CATEGORIES
)
data = data[single_label_mask]

# Train-validation split (80/20, fixed seed so the split is reproducible)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)
# .assign returns new frames, so the flag is never written onto a frame derived
# from `data` (in-place column assignment there raises SettingWithCopyWarning).
train_data = train_data.assign(is_train=True)
val_data = val_data.assign(is_train=False)
# Training rows come first, so they are distributed before validation rows.
split_data = pd.concat([train_data, val_data])

# Create output folders
create_folders(OUTPUT_PATH, CATEGORIES)

# Initialize log file (truncates any previous log and writes the header)
with open(LOG_FILE, 'w') as f:
    f.write("Image Name,Category,Folder Type\n")

# Initialize counters for each category
train_count = defaultdict(int)
val_count = defaultdict(int)

# Dictionary to track base image names to avoid duplicates
used_base_names = defaultdict(set)

# Distribute files. A plain loop is used because the work is pure side effect;
# DataFrame.apply would build and discard a Series of None.
train_folder = os.path.join(OUTPUT_PATH, "train")
val_folder = os.path.join(OUTPUT_PATH, "val")
for _row_label, row in split_data.iterrows():
    distribute_files(row, train_folder, val_folder, train_count, val_count, used_base_names)

print(f"Dataset reorganization completed! File distribution logged in {LOG_FILE}")

In [None]:
import shutil

# Directory to archive, and the archive path reported to the user
output_folder = "./output"
zip_file = "./output.zip"

# Compress the folder: the 'zip' format appends ".zip" to the base name, and
# the archive is rooted at the output folder itself.
shutil.make_archive(base_name=output_folder, format='zip', root_dir=output_folder)

print(f"Zipped output saved as {zip_file}")