In [9]:
from pathlib import Path

# Base directory of the mounted storage and the CXR8 dataset folder inside it.
root = Path("/mnt/jbrockma")
cxr8_root = root.joinpath("CXR8")

In [10]:
# Read the two split lists shipped with the dataset; every line is stripped of
# surrounding whitespace and kept as one entry.
with open(cxr8_root / "test_list.txt") as test_list:
    test_files = [entry.strip() for entry in test_list]

with open(cxr8_root / "train_val_list.txt") as train_val_list:
    train_val_files = [entry.strip() for entry in train_val_list]

# Peek at the first few entries of each list.
print(test_files[:5])
print(train_val_files[:5])

['00000003_000.png', '00000003_001.png', '00000003_002.png', '00000003_003.png', '00000003_004.png']
['00000001_000.png', '00000001_001.png', '00000001_002.png', '00000002_000.png', '00000004_000.png']


In [11]:
# The integer prefix before the first underscore of a file name is treated as
# the patient id.
test_ids = {int(name.split("_")[0]) for name in test_files}
train_val_ids = {int(name.split("_")[0]) for name in train_val_files}

# Is the split done on patient level? Each line prints False when no id is
# shared between the test list and the train/val list.
print(not test_ids.isdisjoint(train_val_ids))
print(not train_val_ids.isdisjoint(test_ids))

False
False


In [12]:
# List every entry of the image directory and collect the bare file names in a
# set for fast membership checks.
cxr8_images = [entry for entry in (cxr8_root / "images" / "images").iterdir()]
image_files = {entry.name for entry in cxr8_images}
image_files

{'00002602_000.png',
 '00001992_010.png',
 '00017546_006.png',
 '00004833_009.png',
 '00030111_002.png',
 '00020156_006.png',
 '00006745_000.png',
 '00027994_000.png',
 '00018693_000.png',
 '00008888_020.png',
 '00012644_001.png',
 '00029813_018.png',
 '00000445_000.png',
 '00004506_002.png',
 '00018448_002.png',
 '00017841_002.png',
 '00009680_000.png',
 '00010563_018.png',
 '00020540_005.png',
 '00027725_053.png',
 '00004832_034.png',
 '00026185_002.png',
 '00002771_000.png',
 '00004850_003.png',
 '00015313_013.png',
 '00011190_002.png',
 '00016205_001.png',
 '00019967_029.png',
 '00013695_001.png',
 '00028795_007.png',
 '00006926_003.png',
 '00019507_000.png',
 '00002524_020.png',
 '00012598_000.png',
 '00007046_005.png',
 '00018009_000.png',
 '00013937_003.png',
 '00007624_050.png',
 '00025849_002.png',
 '00002769_000.png',
 '00004507_000.png',
 '00003639_003.png',
 '00013077_017.png',
 '00000116_030.png',
 '00018360_009.png',
 '00011156_004.png',
 '00014933_002.png',
 '00017494_00

In [13]:
# Every file named in either split list must be present in the image directory.
print(image_files.issuperset(test_files))
print(image_files.issuperset(train_val_files))

True
True


In [14]:
# Number of distinct image file names found in the image directory.
image_count = len(image_files)
print(image_count)

112120


In [16]:
# Sizes of the two lists; together they should account for every image on disk.
n_test = len(test_files)
n_train_val = len(train_val_files)
print(n_test)
print(n_train_val)
print(n_test + n_train_val == len(image_files))

25596
86524
True


In [34]:
import pandas as pd
# Load the table that maps each image file name to its patient id.
data_root = root.joinpath("bachelor-thesis-data")
df = pd.read_csv(data_root.joinpath("chest", "image-patients.csv"))
df

Unnamed: 0,file_name,patient_id
0,00000001_000.png,1
1,00000001_001.png,1
2,00000001_002.png,1
3,00000002_000.png,2
4,00000003_001.png,3
...,...,...
112115,00030801_001.png,30801
112116,00030802_000.png,30802
112117,00030803_000.png,30803
112118,00030804_000.png,30804


In [35]:
# Keep only the rows whose image is named in the train/val list.
is_train_val = df["file_name"].isin(train_val_files)
filtered_df = df[is_train_val]
filtered_df

Unnamed: 0,file_name,patient_id
0,00000001_000.png,1
1,00000001_001.png,1
2,00000001_002.png,1
3,00000002_000.png,2
12,00000004_000.png,4
...,...,...
112100,00030789_000.png,30789
112106,00030793_000.png,30793
112108,00030795_000.png,30795
112114,00030801_000.png,30801


In [36]:
# Draw the training patients at random from the unique train/val patient ids.
# 70 / (70 + 10) puts train and validation patients in a 70:10 ratio; the fixed
# random_state makes the draw repeatable.
train_fraction = 70 / (70 + 10)
patient_ids = pd.Series(filtered_df["patient_id"].unique())
train_patients = patient_ids.sample(frac=train_fraction, random_state=183)
train_patients

24124    25173
15497    16204
6612      6856
18183    19019
16322    17070
         ...  
16570    17335
21280    22267
22386    23427
2138      2230
26593    27751
Length: 24507, dtype: int64

In [37]:
# Validation patients are all patient ids that were not drawn for training.
in_train = patient_ids.isin(train_patients)
val_patients = patient_ids[~in_train]
val_patients

1            2
9           11
10          12
17          20
20          23
         ...  
27981    30726
27984    30733
27997    30763
27998    30764
28000    30772
Length: 3501, dtype: int64

In [42]:
# Sanity check: count training patient ids that also occur among the
# validation patient ids (only False is expected).
train_in_val = train_patients.isin(val_patients)
train_in_val.value_counts()

False    24507
dtype: int64

In [43]:
# Sanity check in the other direction: count validation patient ids that also
# occur among the training patient ids (only False is expected).
val_in_train = val_patients.isin(train_patients)
val_in_train.value_counts()

False    3501
dtype: int64

In [44]:
# Image rows that belong to a training patient.
train_rows = filtered_df["patient_id"].isin(train_patients)
train_df = filtered_df[train_rows]
train_df

Unnamed: 0,file_name,patient_id
0,00000001_000.png,1
1,00000001_001.png,1
2,00000001_002.png,1
12,00000004_000.png,4
13,00000005_000.png,5
...,...,...
112100,00030789_000.png,30789
112106,00030793_000.png,30793
112108,00030795_000.png,30795
112114,00030801_000.png,30801


In [45]:
# Image rows that belong to a validation patient.
val_rows = filtered_df["patient_id"].isin(val_patients)
val_df = filtered_df[val_rows]
val_df

Unnamed: 0,file_name,patient_id
3,00000002_000.png,2
28,00000011_000.png,11
29,00000011_001.png,11
30,00000011_002.png,11
31,00000011_003.png,11
...,...,...
112062,00030763_000.png,30763
112063,00030764_000.png,30764
112073,00030772_000.png,30772
112074,00030772_001.png,30772


In [46]:
# Sanity check: no file name or patient id of the training frame may occur in
# the validation frame.
# DataFrame.isin(<DataFrame>) only reports a match where the index AND column
# labels agree. train_df and val_df are disjoint row subsets of filtered_df, so
# that form is all-False no matter what the values are. Passing a
# {column: values} dict compares by value within each column instead.
train_df.isin(val_df.to_dict(orient="list")).value_counts()

file_name  patient_id
False      False         75948
dtype: int64

In [47]:
# Sanity check in the other direction: no file name or patient id of the
# validation frame may occur in the training frame.
# DataFrame.isin(<DataFrame>) only reports a match where the index AND column
# labels agree. val_df and train_df are disjoint row subsets of filtered_df, so
# that form is all-False no matter what the values are. Passing a
# {column: values} dict compares by value within each column instead.
val_df.isin(train_df.to_dict(orient="list")).value_counts()

file_name  patient_id
False      False         10576
dtype: int64

In [49]:
# File names of all training images.
train_file_names = train_df["file_name"]

# The export is switched off. When enabled it writes the file names,
# newline-separated, to a file called "txt" in the working directory.
write_train_list = False
if write_train_list:
    with open("txt", "w") as out_file:
        out_file.write("\n".join(train_file_names))