In [1]:
import csv
import os
import random
import shutil

import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Seed Python's built-in random module.
random.seed(42)

# Class folder names; each one is a sub-folder of base_dir and of every split folder.
labels = ["A", "F", "C"]

base_dir = "../data/processed_png"     # source images, one folder per label (A, F, C)
output_dir = "../data/dataset_split"   # destination of the train/val/test folders
csv_dir = "../data/csv_split"          # destination of the CSV files

# Create both output locations up front; existing folders are left as they are.
for _dir in (output_dir, csv_dir):
    os.makedirs(_dir, exist_ok=True)

# Read participants.tsv to obtain the list of subject IDs.
participants = pd.read_csv("../data/raw/participants.tsv", sep="\t")
subjects = participants["participant_id"].tolist()
print("Tổng số subject:", len(subjects))

Tổng số subject: 88


In [6]:
# Split train/val/test at subject level: 30% of the subjects are held out,
# then that held-out part is halved into validation and test.
# NOTE(review): no `stratify` argument is passed, so the label proportions of
# the three splits are not controlled — confirm this is intended.
train_subjects, temp_subjects = train_test_split(
    subjects,
    test_size=0.3,
    random_state=42,
)
val_subjects, test_subjects = train_test_split(
    temp_subjects,
    test_size=0.5,
    random_state=42,
)

print("Train:", len(train_subjects), "Val:", len(val_subjects), "Test:", len(test_subjects))

Train: 61 Val: 13 Test: 14


In [7]:
# Copy each subject's images into the train/val/test folder tree
def move_subject_images(subjects, split, src_root=None, dst_root=None, class_labels=None):
    """Copy every image belonging to the given subjects into one split folder.

    Despite the name, files are copied with ``shutil.copy``, not moved; the
    source tree is left untouched.

    For each label, files in ``<src_root>/<label>`` whose name starts with one
    of the subject IDs are copied to ``<dst_root>/<split>/<label>``. The
    destination folder is created even when no file matches. Files already
    present in the destination (e.g. from an earlier run with a different
    split) are not removed.

    Args:
        subjects: Iterable of subject-ID strings, used as filename prefixes.
        split: Name of the split sub-folder (the notebook uses "train",
            "val" and "test").
        src_root: Folder holding one sub-folder per label. Defaults to the
            notebook-level ``base_dir``.
        dst_root: Folder under which the split folders are created. Defaults
            to the notebook-level ``output_dir``.
        class_labels: Label folder names to process. Defaults to the
            notebook-level ``labels``.

    Raises:
        FileNotFoundError: If a label folder does not exist under
            ``src_root`` (raised by ``os.listdir``).
    """
    # Resolve the defaults at call time so the function follows the current
    # notebook configuration when called with two arguments, as before.
    if src_root is None:
        src_root = base_dir
    if dst_root is None:
        dst_root = output_dir
    if class_labels is None:
        class_labels = labels

    # str.startswith accepts a tuple of prefixes: build it once instead of
    # looping over every subject for every file. Materialising it also makes
    # one-shot iterables (generators) safe to pass.
    prefixes = tuple(subjects)

    for label in class_labels:
        src_folder = os.path.join(src_root, label)
        dst_folder = os.path.join(dst_root, split, label)
        os.makedirs(dst_folder, exist_ok=True)

        for fname in os.listdir(src_folder):
            # NOTE(review): this is a plain prefix match, so an ID that is a
            # prefix of another ID would match both — verify that the
            # subject IDs all have the same width.
            if fname.startswith(prefixes):
                shutil.copy(os.path.join(src_folder, fname), dst_folder)

# Run the copy once per split
for _split_name, _split_subjects in (
    ("train", train_subjects),
    ("val", val_subjects),
    ("test", test_subjects),
):
    move_subject_images(_split_subjects, _split_name)

print("Đã copy ảnh xong theo subject split")


Đã copy ảnh xong theo subject split


In [8]:
# Export one CSV per split mapping each image file path to its label
for split in ["train", "val", "test"]:
    rows = []
    for label in labels:
        folder = os.path.join(output_dir, split, label)
        # os.listdir returns entries in arbitrary order; sort them so the CSV
        # row order is the same from run to run.
        rows.extend(
            [os.path.join(folder, fname), label]
            for fname in sorted(os.listdir(folder))
        )

    csv_file = os.path.join(csv_dir, f"{split}.csv")
    # newline="" is what the csv module expects for files it writes. UTF-8 is
    # set explicitly because the files are read back with pd.read_csv, whose
    # default encoding is UTF-8, while open() defaults to the platform locale.
    with open(csv_file, "w", newline="", encoding="utf-8") as csv_handle:
        writer = csv.writer(csv_handle)
        writer.writerow(["filepath", "label"])
        writer.writerows(rows)

    print(f"Saved {csv_file} with {len(rows)} samples")

Saved ../data/csv_split\train.csv with 145179 samples
Saved ../data/csv_split\val.csv with 24662 samples
Saved ../data/csv_split\test.csv with 25840 samples


In [9]:
# Sanity check: number of images per label in each split
for split in ["train", "val", "test"]:
    # Read from csv_dir (the folder the export cell writes to) instead of a
    # second hard-coded copy of the path, so the two cannot drift apart.
    df = pd.read_csv(os.path.join(csv_dir, f"{split}.csv"))
    print(f"\n{split.upper()} set:")
    print(df["label"].value_counts())


TRAIN set:
label
C    70034
A    49058
F    26087
Name: count, dtype: int64

VAL set:
label
A    11191
C     7182
F     6289
Name: count, dtype: int64

TEST set:
label
A    11989
F    10545
C     3306
Name: count, dtype: int64


In [11]:
# Check which subjects appear under each label in every split
for split in ["train", "val", "test"]:
    # Read from csv_dir (the folder the export cell writes to) instead of a
    # second hard-coded copy of the path.
    df = pd.read_csv(os.path.join(csv_dir, f"{split}.csv"))

    # Extract the subject ID (sub-xxx) from the file path; expand=False
    # returns a Series, which is what a single new column needs.
    df["subject"] = df["filepath"].str.extract(r"(sub-\d+)", expand=False)

    print(f"\n===== {split.upper()} set =====")
    for label, group in df.groupby("label"):
        # Use a dedicated name here: assigning to `subjects` would overwrite
        # the full participant list that the train/val/test split cell reads,
        # so re-running that cell afterwards would split the wrong list.
        label_subjects = sorted(group["subject"].unique())
        print(f"Label {label}: {len(label_subjects)} subjects -> {label_subjects}")
    print("-" * 60)



===== TRAIN set =====
Label A: 23 subjects -> ['sub-002', 'sub-003', 'sub-004', 'sub-007', 'sub-008', 'sub-009', 'sub-014', 'sub-015', 'sub-016', 'sub-017', 'sub-018', 'sub-020', 'sub-021', 'sub-022', 'sub-024', 'sub-025', 'sub-026', 'sub-028', 'sub-030', 'sub-032', 'sub-033', 'sub-035', 'sub-036']
Label C: 25 subjects -> ['sub-037', 'sub-038', 'sub-039', 'sub-040', 'sub-042', 'sub-043', 'sub-044', 'sub-045', 'sub-046', 'sub-047', 'sub-048', 'sub-049', 'sub-050', 'sub-051', 'sub-052', 'sub-053', 'sub-055', 'sub-057', 'sub-058', 'sub-059', 'sub-060', 'sub-061', 'sub-062', 'sub-063', 'sub-064']
Label F: 13 subjects -> ['sub-070', 'sub-071', 'sub-072', 'sub-073', 'sub-075', 'sub-076', 'sub-078', 'sub-079', 'sub-082', 'sub-083', 'sub-084', 'sub-087', 'sub-088']
------------------------------------------------------------

===== VAL set =====
Label A: 6 subjects -> ['sub-006', 'sub-011', 'sub-012', 'sub-019', 'sub-023', 'sub-029']
Label C: 3 subjects -> ['sub-041', 'sub-056', 'sub-065']
La