In [None]:
import os
import re
import pydicom
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange, tqdm

In [None]:
# Root directory of the DICOM files; a later cell resolves each file as
# <dcm_dir>/<series_uid>/<fname>.
dcm_dir = '/opt/gpudata/midrc-sift/dcm'

# Annotation table; a later cell reads its "series_uid", "ann_id" and
# "annotation" columns.
df = pd.read_csv('/opt/gpudata/midrc-sift/obj_ids.csv')

In [None]:
# Keep only the three columns used downstream and rename two of them:
# "ann_id" -> "obj_id" and "annotation" -> "fname".
obj_ids = (
    df.loc[:, ["series_uid", "ann_id", "annotation"]]
    .rename(columns={"ann_id": "obj_id", "annotation": "fname"})
)

In [None]:
# Read every DICOM file referenced by obj_ids and extract the UID embedded in
# its file name (the "<uid>" part of "...__<uid>__seg.dcm").
#
# Produces three row-aligned results:
#   dcms        - the pydicom datasets that were read
#   common_uids - the UID parsed from each corresponding file name
#   obj_ids     - filtered to the rows whose file exists on disk
dcms = []
common_uids = []
loaded_rows = []  # positional indices of the obj_ids rows that were loaded

# Iterate the two columns directly; obj_ids.iloc[i] would build a Series per row.
for i, (series_uid, fname) in enumerate(
    tqdm(zip(obj_ids["series_uid"], obj_ids["fname"]), total=len(obj_ids))
):
    fpath = os.path.join(dcm_dir, series_uid, fname)
    if not os.path.exists(fpath):
        # Skip only this annotation; stopping the loop here would silently
        # discard every row that follows the first missing file.
        continue

    # Check the file name before paying for the DICOM read. The "." before
    # "dcm" is escaped so that it only matches a literal dot.
    matches = re.findall(r"__([\d\.]*?)__seg\.dcm", fname)
    assert len(matches) == 1, f"cannot extract exactly one UID from {fname!r}"
    common_uid = matches[0]

    dcm = pydicom.dcmread(fpath)
    dcms.append(dcm)
    common_uids.append(common_uid)
    loaded_rows.append(i)

n_missing = len(obj_ids) - len(loaded_rows)
if n_missing:
    print(f"Skipped {n_missing} of {len(obj_ids)} annotations: DICOM file not found")

# A later cell zips obj_ids["series_uid"] against common_uids and the labels
# derived from dcms, so drop the skipped rows to keep all three aligned.
# Re-running this cell is safe: every remaining row has an existing file.
obj_ids = obj_ids.iloc[loaded_rows].reset_index(drop=True)

In [None]:
def _segment_labels(ds):
    """Return the SegmentLabel of each item in ``ds.SegmentSequence``.

    Raises AssertionError when the dataset has no SegmentSequence attribute.
    """
    assert hasattr(ds, "SegmentSequence")
    return [segment.SegmentLabel for segment in ds.SegmentSequence]


# One list of segment labels per loaded dataset, in the same order as dcms.
labels = [_segment_labels(ds) for ds in tqdm(dcms)]

In [None]:
# Flatten to one (series_uid, image_uid, label) record per segment label.
label_records = []
for series_uid, common_uid, ls in zip(obj_ids["series_uid"], common_uids, labels):
    for label in ls:
        label_records.append((series_uid, common_uid, label))

# Long format: one row per distinct (series, image, label) triple, with a
# constant 1 that becomes the cell value after pivoting.
label_long = (
    pd.DataFrame(label_records, columns=["series_uid", "image_uid", "label"])
    .assign(values=1)
    .drop_duplicates()
)

# Wide format: one row per (series_uid, image_uid), one 0/1 column per label.
label_df = (
    label_long
    .pivot(index=["series_uid", "image_uid"], columns="label", values="values")
    .fillna(0)              # label absent for this image -> 0
    .astype(int)
    .rename_axis(columns=None)  # drop the leftover "label" name on the column axis
    .reset_index()
)

In [None]:
# Write the label table to disk without the DataFrame index.
label_df.to_csv(
    path_or_buf="/opt/gpudata/midrc-sift/labels.csv",
    index=False,
)

In [None]:
# Reload the label table from the CSV on disk (replacing the in-memory frame)
# and display it.
label_df = pd.read_csv(
    filepath_or_buffer="/opt/gpudata/midrc-sift/labels.csv",
)
label_df

In [None]:
# Show the column index of label_df (as the cell's last expression).
label_df.columns