# In [2]:
import pandas as pd
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import torch


# In [None]:
import pandas as pd
import numpy as np

# ==========================
# 1) Load data
# ==========================
# Full image-level table, including the predictions.
whole_df = pd.read_csv("image_level_predictions.csv")
# Patient split that was handed to us as the validation fold.
val_fold_df = pd.read_csv("_val_fold_0.csv")

# ==========================
# 2) Separate train vs evaluation patients
# ==========================
whole_ids = set(whole_df["patient_id"])
val_fold_ids = set(val_fold_df["patient_id"])

# Every patient that is not listed in the validation fold stays in TRAIN.
train_ids = whole_ids.difference(val_fold_ids)

# Rows belonging to the validation-fold patients; this frame is only the
# starting point for the val/test split, not a final split itself.
eval_df = (
    whole_df
    .loc[lambda frame: frame["patient_id"].isin(val_fold_ids)]
    .reset_index(drop=True)
)

print("Train patients:", len(train_ids))
print("Eval candidates:", len(val_fold_ids))

# ==========================
# 3) Patient-level cancer label aggregation
# ==========================
# Collapse to one row per patient, keeping the maximum "cancer" value found
# across that patient's rows.
patient_level = (
    eval_df
    .groupby("patient_id")["cancer"]
    .max()
    .reset_index()
)

# Patient IDs per class, as arrays, ready for the stratified split.
positive_patients = patient_level.loc[patient_level["cancer"] == 1, "patient_id"].values
negative_patients = patient_level.loc[patient_level["cancer"] == 0, "patient_id"].values

print("Positive patients:", len(positive_patients))
print("Negative patients:", len(negative_patients))

# ==========================
# 4) Stratified 50/50 split → val / test
# ==========================
# Seed the global NumPy RNG so the split is reproducible between runs.
np.random.seed(42)

# Shuffle each class separately so both halves keep the class balance.
# np.random.permutation returns a shuffled *copy*; np.random.shuffle would
# write into the arrays obtained from `.values`, which can be read-only when
# pandas Copy-on-Write is active (ValueError: assignment destination is
# read-only). For 1-D input and the same seed the resulting order is the same
# as with the in-place shuffle, so existing splits are preserved.
positive_patients = np.random.permutation(positive_patients)
negative_patients = np.random.permutation(negative_patients)

# Integer division: when a class has an odd number of patients, the extra
# patient ends up in the test half.
pos_mid = len(positive_patients) // 2
neg_mid = len(negative_patients) // 2

# First half of each class -> validation, second half -> test.
val_patients = np.concatenate([positive_patients[:pos_mid], negative_patients[:neg_mid]])
test_patients = np.concatenate([positive_patients[pos_mid:], negative_patients[neg_mid:]])

print("Val patients:", len(val_patients))
print("Test patients:", len(test_patients))

# ==========================
# 5) Map back to full image-level DF
# ==========================
# The split was decided per patient; select every row of whole_df whose
# patient falls into the corresponding group.
train_df = (
    whole_df
    .loc[lambda frame: frame["patient_id"].isin(train_ids)]
    .reset_index(drop=True)
)
final_val_df = (
    whole_df
    .loc[lambda frame: frame["patient_id"].isin(val_patients)]
    .reset_index(drop=True)
)
final_test_df = (
    whole_df
    .loc[lambda frame: frame["patient_id"].isin(test_patients)]
    .reset_index(drop=True)
)

# ==========================
# 6) Summary
# ==========================
print("\n===== Image-Level Count =====")
print("Total:", len(whole_df))
print("Train:", len(train_df))
print("Val:", len(final_val_df))
print("Test:", len(final_test_df))
# Printed next to "Total" so that rows missing from every split show up as a
# mismatch between the two numbers.
print("Check:", sum(map(len, (train_df, final_val_df, final_test_df))))

# ==========================
# 7) (Optional) Save results
# ==========================
# train_df.to_csv("train_split.csv", index=False)
# final_val_df.to_csv("val_split.csv", index=False)
# final_test_df.to_csv("test_split.csv", index=False)
# print("Saved train/val/test splits!")
