In [1]:
# Importing necessary libraries
import os
import numpy as np
import pandas as pd
import regex as re

In [2]:
# ----- paths -----
# Root of the CheXphoto workspace; every other location is derived from it.
base_dir = "/net/scratch/kmballantyne/msds_final/chexphoto_FL"

# train_csv   -> CheXphoto train metadata
# meta_dir    -> folder holding the per-client metadata CSVs
# images_root -> folder that local image paths are resolved against
train_csv, meta_dir, images_root = (
    os.path.join(base_dir, leaf)
    for leaf in ("train.csv", "metadata", "images")
)

# Make sure the metadata folder is there (no-op when it already exists).
os.makedirs(meta_dir, exist_ok=True)

print("Loading train.csv metadata ...")
df_train = pd.read_csv(train_csv)

Loading train.csv metadata ...


In [3]:
clients = [1, 2, 3, 4, 5]
splits = ["train", "valid"]


def to_local_path(path: str) -> str:
    """Translate a CSV ``Path`` value into a path under ``images_root``.

    A leading ``"CheXphoto-v1.0/"`` component is dropped when present and the
    remainder is joined onto ``images_root``.

    Args:
        path: Value taken from the ``Path`` column.

    Returns:
        The joined local path. A non-string input (e.g. the NaN pandas uses
        for an empty cell) is returned unchanged so that a single bad row
        cannot abort the whole cleaning run.
    """
    if not isinstance(path, str):
        return path
    prefix = "CheXphoto-v1.0/"
    rel = path[len(prefix):] if path.startswith(prefix) else path
    return os.path.join(images_root, rel)


# Clean every (client, split) metadata CSV found in meta_dir and write a
# "*_clean.csv" copy next to it. Missing input files are reported and skipped.
for cid in clients:
    for split in splits:
        fname = f"client{cid}_{split}.csv"
        fpath = os.path.join(meta_dir, fname)
        if not os.path.exists(fpath):
            print(f"[WARN] {fpath} not found, skipping.")
            continue

        print(f"Loading {fpath} ...")
        df = pd.read_csv(fpath)

        # 1) patient_id from Path (first "patient<digits>" token; NaN if absent)
        df["patient_id"] = df["Path"].str.extract(r"(patient\d+)", expand=False)

        # quick sanity: how many unique patients?
        n_patients = df["patient_id"].nunique()
        print(f"  client{cid} {split}: {n_patients} unique patients")

        # nunique() ignores NaN, so surface rows whose Path had no patient token.
        n_no_patient = int(df["patient_id"].isna().sum())
        if n_no_patient:
            print(f"  [WARN] client{cid} {split}: {n_no_patient} rows have no patient id in Path")

        # 2) binary Pleural Effusion label
        # Non-numeric entries are coerced to NaN; NaN and every value <= 0 map
        # to 0, strictly positive values map to 1.
        # NOTE(review): a -1 code (presumably "uncertain") therefore counts as
        # negative -- confirm this is the intended policy.
        pe_col = "Pleural_Effusion"
        df[pe_col] = pd.to_numeric(df[pe_col], errors="coerce")
        df["PE_label"] = (df[pe_col].fillna(0) > 0).astype(int)

        print(f"  client{cid} {split}: PE+ count = {int(df['PE_label'].sum())}, total = {len(df)}")

        # 3) build local_path to images under images_root
        # Path values are expected to start with "CheXphoto-v1.0/"; the on-disk
        # layout presumably has no such folder, so the prefix is stripped.
        df["local_path"] = df["Path"].apply(to_local_path)

        n_no_path = int(df["local_path"].isna().sum())
        if n_no_path:
            print(f"  [WARN] client{cid} {split}: {n_no_path} rows have a missing Path")

        # Spot-check (first five usable rows only) that the files actually exist.
        sample_paths = df["local_path"].dropna().head(5).tolist()
        missing = [p for p in sample_paths if not os.path.exists(p)]
        if missing:
            print("  [WARN] Some sample local_paths do not exist, e.g.:")
            for p in missing[:3]:
                print("    MISSING:", p)
        else:
            print("  Sample local_paths look good.")

        # 4) save cleaned version
        out_name = f"client{cid}_{split}_clean.csv"
        out_path = os.path.join(meta_dir, out_name)
        df.to_csv(out_path, index=False)
        print(f"  Saved cleaned CSV: {out_path}\n")

print("Done cleaning all client CSVs.")

Loading /net/scratch/kmballantyne/msds_final/chexphoto_FL/metadata/client1_train.csv ...
  client1 train: 600 unique patients
  client1 train: PE+ count = 810, total = 2118
  Sample local_paths look good.
  Saved cleaned CSV: /net/scratch/kmballantyne/msds_final/chexphoto_FL/metadata/client1_train_clean.csv

Loading /net/scratch/kmballantyne/msds_final/chexphoto_FL/metadata/client1_valid.csv ...
  client1 valid: 40 unique patients
  client1 valid: PE+ count = 10, total = 43
  Sample local_paths look good.
  Saved cleaned CSV: /net/scratch/kmballantyne/msds_final/chexphoto_FL/metadata/client1_valid_clean.csv

Loading /net/scratch/kmballantyne/msds_final/chexphoto_FL/metadata/client2_train.csv ...
  client2 train: 600 unique patients
  client2 train: PE+ count = 761, total = 1990
  Sample local_paths look good.
  Saved cleaned CSV: /net/scratch/kmballantyne/msds_final/chexphoto_FL/metadata/client2_train_clean.csv

Loading /net/scratch/kmballantyne/msds_final/chexphoto_FL/metadata/client2