## Data

In [17]:
import os
import pandas as pd

# Input directory that is scanned for slides, and the label table to write.
# NOTE(review): "DATA_DIRECTORY" looks like a placeholder -- point it at the real slide folder.
svs_dir = "DATA_DIRECTORY"
output_csv = "dataset_csv/clam_slide_labels.csv"

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the label CSV (and anything derived from it) stable from run to run.
svs_files = sorted(f for f in os.listdir(svs_dir) if f.endswith(".svs"))

def get_label_from_sample_code(filename):
    """Derive a binary label from the sample code embedded in a slide file name.

    The name is split on "-" and the first two characters of the fourth field
    are used as the sample code.
    NOTE(review): this presumably targets TCGA-style barcodes such as
    ``TCGA-XX-XXXX-01A-...`` -- confirm against the actual slide names.

    Args:
        filename: Slide file name.

    Returns:
        1 when the sample code is "01" or "02" (reported as "cancer" by the
        notebook), 0 when it is "11" (reported as "normal"), and None when the
        code is anything else or the name cannot be parsed (not a string, or
        fewer than four "-"-separated fields).
    """
    # Explicit guards instead of a bare ``except:`` so that unrelated errors
    # (and KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(filename, str):
        return None

    fields = filename.split("-")
    if len(fields) < 4:
        return None

    sample_code = fields[3][:2]
    if sample_code in ("01", "02"):
        return 1
    if sample_code == "11":
        return 0
    return None

# Keep only the slides whose file name yields a label; the rest are skipped.
records = []
for f in svs_files:
    label = get_label_from_sample_code(f)
    if label is not None:
        records.append({"slide_id": f, "label": label})

# Naming the columns explicitly keeps the frame well-formed even when no slide
# matched; otherwise the "label" lookups below raise KeyError on an empty frame.
df = pd.DataFrame(records, columns=["slide_id", "label"])

# to_csv does not create missing parent directories, so make sure it exists.
output_dir = os.path.dirname(output_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_csv, index=False)

# Slide-level summary of what was written.
num_total = len(df)
num_cancer = int((df["label"] == 1).sum())
num_normal = int((df["label"] == 0).sum())

print(" clam_slide_labels.csv save")
print(f"[ slide ]: {num_total}")
print(f"[ cancer (label=1)]: {num_cancer}")
print(f"[ normal (label=0)]: {num_normal}")

 clam_slide_labels.csv save
[ slide ]: 784
[ cancer (label=1)]: 540
[ normal (label=0)]: 244


In [19]:
# Load the slide-level label table from disk (same path the label-building
# cell writes to), so the case counts below are computed from the saved file.
label_csv = "dataset_csv/clam_slide_labels.csv"
df = pd.read_csv(filepath_or_buffer=label_csv)

def get_submitter_id(filename):
    """Return the first three "-"-separated fields of ``filename`` joined by "-".

    Names with fewer than three fields are returned unchanged.
    NOTE(review): presumably this is the case/patient part of a TCGA-style
    barcode -- confirm against the slide naming scheme.
    """
    # Splitting at most three times is enough: only the leading fields are kept.
    leading_fields = filename.split("-", 3)[:3]
    return "-".join(leading_fields)

# Attach the per-slide submitter id derived from the file name.
df["submitter_id"] = df["slide_id"].map(get_submitter_id)

# Count distinct submitter ids per label. An id that has slides with both
# labels is counted once in each group.
is_cancer = df["label"].eq(1)
is_normal = df["label"].eq(0)
case_cancer = df.loc[is_cancer, "submitter_id"].nunique()
case_normal = df.loc[is_normal, "submitter_id"].nunique()

print(f"[ cancer case (label=1)]: {case_cancer}")
print(f"[ normal case (label=0)]: {case_normal}")

[ cancer case (label=1)]: 477
[ normal case (label=0)]: 198


## Build the training label CSV (train_data)

In [None]:
import pandas as pd
import os

# Input label table, directory holding the per-slide feature files, and output table.
csv_input_path = 'slide_labels.csv'
feature_dir = 'features'
csv_output_path = 'train_slide_labels.csv'

# Read from the configured path (the variable, not the literal string 'csv_input_path').
df = pd.read_csv(csv_input_path)

# Each slide's feature file is expected at <feature_dir>/<slide_id>.pt.
# NOTE(review): if slide_id still carries a file extension (e.g. ".svs"), the
# resulting name becomes "<name>.svs.pt" -- confirm this matches the files on disk.
df['slide_path'] = df['slide_id'].apply(lambda x: os.path.join(feature_dir, f"{x}.pt"))

# Keep only the columns needed downstream, in a fixed order.
df = df[['slide_id', 'slide_path', 'label']]
df.to_csv(csv_output_path, index=False)

print(f"transform finish~~ : {csv_output_path}")
print(df.head())

## Split data with stratified K-fold (K=10)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import os

# --- Configuration -----------------------------------------------------------
label_csv_path = "dataset_csv/slide_labels.csv"
save_dir = "dataset_csv/splits/task1"
N_SPLITS = 10  # number of outer folds; each fold's held-out part is the test set
SEED = 220     # shared by the outer and inner splitters for reproducibility
os.makedirs(save_dir, exist_ok=True)

df = pd.read_csv(label_csv_path)
assert "slide_id" in df.columns and "label" in df.columns, "slide_labels.csv에 'slide_id', 'label' 컬럼 있어함요"
# The boolean table below is indexed by slide_id; a duplicated id would be
# flagged in several splits at once, so refuse to continue in that case.
assert df["slide_id"].is_unique, "slide_id values must be unique to build non-overlapping splits"

X = df["slide_id"].values
y = df["label"].values

# NOTE(review): stratification here is per slide. If several slides can share
# one case (another cell derives a submitter_id per slide), slides of the same
# case may land in both train and test -- consider a grouped split; confirm.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
# Splitting the remaining N_SPLITS-1 parts once more makes the validation set
# roughly the same size as the test set. An integer seed yields the same
# shuffling on every split() call, so one splitter object serves all folds.
skf_inner = StratifiedKFold(n_splits=N_SPLITS - 1, shuffle=True, random_state=SEED)


def count_class(indices):
    """Return (normal, tumor) counts, i.e. how many labels at `indices` are 0 and 1."""
    y_sub = y[indices]
    return np.sum(y_sub == 0), np.sum(y_sub == 1)


def _pad_to(arr, length):
    """Right-pad `arr` with empty strings up to `length` entries."""
    return np.pad(arr, (0, length - len(arr)), constant_values="")


for i, (train_val_idx, test_idx) in enumerate(skf.split(X, y)):
    # Carve the validation set out of the non-test part, using only the first inner fold.
    X_train_val, y_train_val = X[train_val_idx], y[train_val_idx]
    train_idx_inner, val_idx_inner = next(skf_inner.split(X_train_val, y_train_val))

    # Inner indices are relative to the train+val subset; map them back to rows of df.
    train_idx = train_val_idx[train_idx_inner]
    val_idx = train_val_idx[val_idx_inner]

    # 1) Boolean membership table: one row per slide, one column per split.
    bool_df = pd.DataFrame(index=X, columns=["train", "val", "test"], data=False)
    bool_df.loc[X[train_idx], "train"] = True
    bool_df.loc[X[val_idx], "val"] = True
    bool_df.loc[X[test_idx], "test"] = True
    bool_df.index.name = "slide_id"
    bool_df.to_csv(f"{save_dir}/splits_{i}_bool.csv")

    # 2) Descriptor table: class counts per split.
    train_n, train_t = count_class(train_idx)
    val_n, val_t = count_class(val_idx)
    test_n, test_t = count_class(test_idx)

    desc_df = pd.DataFrame({
        "train": [train_n, train_t],
        "val": [val_n, val_t],
        "test": [test_n, test_t],
    }, index=["normal_tissue", "tumor_tissue"])
    desc_df.to_csv(f"{save_dir}/splits_{i}_descriptor.csv")

    # 3) Slide-id lists per split; shorter columns are padded with "" so that
    #    all three columns have the same length.
    max_len = max(len(train_idx), len(val_idx), len(test_idx))
    split_df = pd.DataFrame({
        "train": _pad_to(X[train_idx], max_len),
        "val": _pad_to(X[val_idx], max_len),
        "test": _pad_to(X[test_idx], max_len),
    })
    split_df.to_csv(f"{save_dir}/splits_{i}.csv", index_label="index")

print(f"{N_SPLITS} split finish")