In [1]:
import pandas as pd

In [21]:
# Location of the MIMIC-CXR metadata table; the row count is reported as the image count.
metadata_file = "/Users/james/Documents/dataset/MIMIC-CXR/metadata.csv"

# Load the whole table; the exploration cells below all read from `df`.
df = pd.read_csv(filepath_or_buffer=metadata_file)
print(f'image count: {len(df)}')

image count: 359972


In [23]:
# Frequency of each distinct "Procedure" value, most common first.
procedure_counts = df["Procedure"].value_counts()
print(procedure_counts)

Procedure
CHEST (PA AND LAT)                                             234970
CHEST (PORTABLE AP)                                            123571
DX CHEST PORTABLE PICC LINE PLACEMENT                             328
DX CHEST PORT LINE/TUBE PLCMT 1 EXAM                              255
DX CHEST PORT LINE/TUBE PLCMT 2 EXAMS                             165
DX CHEST PORT LINE/TUBE PLCMT 3 EXAMS                             157
DX CHEST & RIBS                                                   121
DX CHEST WITH DECUB                                                93
CHEST (SINGLE VIEW)                                                83
DX CHEST 2 VIEW PICC LINE PLACEMENT                                76
DX CHEST PORT LINE/TUBE PLCMT 4 EXAMS                              60
CHEST PORT LINE PLACEMENT                                          20
DX CHEST PORT LINE/TUBE PLCMT 5 EXAMS                              17
TRAUMA #3 (PORT CHEST ONLY)                                        14
CHEST PORT

# View

In [3]:
# Frequency of each distinct "View" value, most common first.
view_counts = df["View"].value_counts()
print(view_counts)

View
antero-posterior         146448
postero-anterior          95858
lateral                   82612
left lateral              35033
left anterior oblique        21
Name: count, dtype: int64


# StudyID Null check

In [4]:
# How many rows have no StudyID at all.
print(f'\"StudyID\".isna count: {df["StudyID"].isna().sum()}')

# Keep those rows in `nan_study_df` and show how they are spread across views.
print('\n\"StudyID\".isna value count:')
missing_study_rows = df["StudyID"].isna()
nan_study_df = df.loc[missing_study_rows]
print(nan_study_df["View"].value_counts())

"StudyID".isna count: 11582

"StudyID".isna value count:
View
antero-posterior         4541
postero-anterior         3161
lateral                  2790
left lateral             1088
left anterior oblique       2
Name: count, dtype: int64


# Studies per PatientID

In [5]:
# Number of distinct StudyID values for each patient.
# `nunique` ignores NaN, so a patient whose rows all lack a StudyID gets 0.
rows_by_patient = df.groupby("PatientID")
study_counts = rows_by_patient["StudyID"].nunique()
print(study_counts)

PatientID
10000032    4
10000764    1
10000898    2
10000935    6
10000980    9
           ..
19999287    7
19999376    1
19999442    2
19999733    1
19999987    3
Name: StudyID, Length: 64011, dtype: int64


In [6]:
# For each "studies per patient" value, count how many patients have it,
# then order the result by that value rather than by frequency.
patients_per_count = study_counts.value_counts()
patient_counts = patients_per_count.sort_index()
print(patient_counts)

StudyID
0       1507
1      32013
2      10247
3       5264
4       3316
       ...  
106        1
116        1
123        1
127        1
158        1
Name: count, Length: 89, dtype: int64


# StudyID & View

In [15]:
# Distinct StudyID values, in order of first appearance in `df`.
study_ids = df.loc[:, "StudyID"].unique()

In [19]:
# List the views recorded for the first entry of `study_ids`.
in_first_study = df["StudyID"] == study_ids[0]
print(df.loc[in_first_study, "View"].tolist())

['lateral', 'postero-anterior']


# Export CSV (data selection)

In [31]:
# Keep only the columns needed for the export.
selected_columns = ["ImageID", "StudyID", "View"]
df2 = df.loc[:, selected_columns]
print(len(df2))

# Discard every row in which any of the selected columns is missing.
df2 = df2.dropna(axis=0, how="any")
print(len(df2))


359972
348390


In [40]:
# Restrict the selection to the two views kept for the export.
kept_views = ["postero-anterior", "antero-posterior"]
is_kept_view = df2["View"].isin(kept_views)
df2 = df2[is_kept_view]
print(len(df2))

# Last expression of the cell: per-view row counts after filtering.
df2.value_counts("View")

234604


View
antero-posterior    141907
postero-anterior     92697
Name: count, dtype: int64

In [44]:
# Strip the leading "s" from StudyID so it can later be cast to an integer.
#
# `assign` builds a new frame instead of writing a column into `df2`, which was
# produced by boolean filtering; writing into it triggers pandas'
# SettingWithCopyWarning. The `astype(str)` makes this cell safe to re-run
# after StudyID has already been converted to integers: the `.str` accessor
# would otherwise raise on a non-string column. Values without a leading "s"
# pass through unchanged.
df2 = df2.assign(
    StudyID=df2["StudyID"].astype(str).str.replace("^s", "", regex=True)
)

# Merge with CheXpert labels

In [56]:
import sys
import os

# Make the project's preprocessing code importable from this notebook.
preprocess_dir = os.path.abspath("./../src/preprocess")
sys.path.append(preprocess_dir)
from preprocess import label_preprocess

# Label table; it is joined to the image selection on `study_id` further down.
csv_file = "/Users/james/Documents/dataset/MIMIC-CXR/labels/mimic-cxr-2.0.0-chexpert.csv"

# Drop the patient identifier, then map both -1 and missing entries to 0.
# NOTE(review): -1 presumably marks an "uncertain" label and NaN an
# unmentioned one — confirm against the label file's documentation.
df_label = (
    pd.read_csv(csv_file)
    .drop(columns=["subject_id"])
    .replace(-1, 0)
    .fillna(0)
)

In [57]:
# Check the dtype of each join key (study_id first, then StudyID) before merging.
for key_column in (df_label["study_id"], df2["StudyID"]):
    print(key_column.dtype)

int64
int64


In [58]:
# Cast StudyID to a 64-bit integer so it has the same dtype as
# `df_label["study_id"]` (int64) when the two are joined.
#
# `assign` returns a new frame rather than writing a column into `df2`, which
# was produced by boolean filtering; writing into it triggers pandas'
# SettingWithCopyWarning. The explicit "int64" avoids the platform-dependent
# width of the builtin `int`.
df2 = df2.assign(StudyID=df2["StudyID"].astype("int64"))

In [62]:
# Join the label rows to the selected image rows on the study identifier.
# The inner join drops any study that is missing from either side.
merged_df = (
    df_label
    .merge(df2, left_on="study_id", right_on="StudyID", how="inner")
    .drop(columns=["study_id"])  # same values as StudyID after the join
)
print(merged_df.head())

   Atelectasis  Cardiomegaly  Consolidation  Edema  \
0          0.0           0.0            0.0    0.0   
1          0.0           0.0            0.0    0.0   
2          0.0           0.0            0.0    0.0   
3          0.0           0.0            0.0    0.0   
4          0.0           0.0            0.0    0.0   

   Enlarged Cardiomediastinum  Fracture  Lung Lesion  Lung Opacity  \
0                         0.0       0.0          0.0           0.0   
1                         0.0       0.0          0.0           0.0   
2                         0.0       0.0          0.0           0.0   
3                         0.0       0.0          0.0           0.0   
4                         0.0       0.0          0.0           0.0   

   No Finding  Pleural Effusion  Pleural Other  Pneumonia  Pneumothorax  \
0         1.0               0.0            0.0        0.0           0.0   
1         1.0               0.0            0.0        0.0           0.0   
2         1.0               0

In [63]:
# Write the merged selection to disk, leaving out the positional index column.
merged_df.to_csv(
    path_or_buf="/Users/james/Documents/dataset/MIMIC-CXR/data_selection-241201.csv",
    index=False,
)