In [1]:
import os

In [23]:
data_path = os.path.join(os.environ['HOME'], 'VisualSearch')
version = 'mmc-amd-splitA'  # zero-eye overlap between train/val/test
version = 'mmc-amd-splitAP' # zero-subject overlap between train/val/test
datasets = ['%s-%s' % (version, x) for x in 'train val test'.split()]
print (datasets)

modalities = 'cfp oct'.split()
id_types = 'Image Eye Subject'.split()

['mmc-amd-splitAP-train', 'mmc-amd-splitAP-val', 'mmc-amd-splitAP-test']


## Read image/eye/subject ids per dataset

In [24]:
id_sets = dict([(m,{}) for m in modalities])

for m in modalities:
    for id_t in id_types:
        id_sets[m][id_t] = {}
        print (m, id_t)
        for dataset in datasets:
            id_file = os.path.join(data_path, dataset, '%sSets' % id_t, '%s.txt' % m)
            ids = [x.strip() for x in open(id_file).readlines() if x.strip()]
            id_sets[m][id_t][dataset] = ids
            print (dataset, len(ids))
        print ('')

cfp Image
mmc-amd-splitAP-train 934
mmc-amd-splitAP-val 80
mmc-amd-splitAP-test 80

cfp Eye
mmc-amd-splitAP-train 933
mmc-amd-splitAP-val 80
mmc-amd-splitAP-test 80

cfp Subject
mmc-amd-splitAP-train 707
mmc-amd-splitAP-val 62
mmc-amd-splitAP-test 60

oct Image
mmc-amd-splitAP-train 1024
mmc-amd-splitAP-val 134
mmc-amd-splitAP-test 131

oct Eye
mmc-amd-splitAP-train 659
mmc-amd-splitAP-val 79
mmc-amd-splitAP-test 79

oct Subject
mmc-amd-splitAP-train 516
mmc-amd-splitAP-val 62
mmc-amd-splitAP-test 60



## Check image overlap between datasets

In [25]:
id_t = 'Image'
for m in modalities:
    for i in range(len(datasets)-1):
        for j in range(i+1, len(datasets)):
            n_i = len(set(id_sets[m][id_t][datasets[i]]))
            n_j = len(set(id_sets[m][id_t][datasets[j]]))
            overlap = set(id_sets[m][id_t][datasets[i]]).intersection(set(id_sets[m][id_t][datasets[j]]))
            print (m, datasets[i], datasets[j], n_i, n_j, '->', len(overlap))

cfp mmc-amd-splitAP-train mmc-amd-splitAP-val 934 80 -> 0
cfp mmc-amd-splitAP-train mmc-amd-splitAP-test 934 80 -> 0
cfp mmc-amd-splitAP-val mmc-amd-splitAP-test 80 80 -> 0
oct mmc-amd-splitAP-train mmc-amd-splitAP-val 1024 134 -> 0
oct mmc-amd-splitAP-train mmc-amd-splitAP-test 1024 131 -> 0
oct mmc-amd-splitAP-val mmc-amd-splitAP-test 134 131 -> 0


## Check eye overlap between datasets

In [26]:
id_t = 'Eye'
for m in modalities:
    for i in range(len(datasets)-1):
        for j in range(i+1, len(datasets)):
            n_i = len(set(id_sets[m][id_t][datasets[i]]))
            n_j = len(set(id_sets[m][id_t][datasets[j]]))
            overlap = set(id_sets[m][id_t][datasets[i]]).intersection(set(id_sets[m][id_t][datasets[j]]))
            print (m, datasets[i], datasets[j], n_i, n_j, '->', len(overlap))

cfp mmc-amd-splitAP-train mmc-amd-splitAP-val 933 80 -> 0
cfp mmc-amd-splitAP-train mmc-amd-splitAP-test 933 80 -> 0
cfp mmc-amd-splitAP-val mmc-amd-splitAP-test 80 80 -> 0
oct mmc-amd-splitAP-train mmc-amd-splitAP-val 659 79 -> 0
oct mmc-amd-splitAP-train mmc-amd-splitAP-test 659 79 -> 0
oct mmc-amd-splitAP-val mmc-amd-splitAP-test 79 79 -> 0


## Check subject overlap between datasets

In [27]:
id_t = 'Subject'
for m in modalities:
    for i in range(len(datasets)-1):
        for j in range(i+1, len(datasets)):
            n_i = len(set(id_sets[m][id_t][datasets[i]]))
            n_j = len(set(id_sets[m][id_t][datasets[j]]))
            overlap = set(id_sets[m][id_t][datasets[i]]).intersection(set(id_sets[m][id_t][datasets[j]]))
            print (m, datasets[i], datasets[j], n_i, n_j, '->', len(overlap))

cfp mmc-amd-splitAP-train mmc-amd-splitAP-val 707 62 -> 0
cfp mmc-amd-splitAP-train mmc-amd-splitAP-test 707 60 -> 0
cfp mmc-amd-splitAP-val mmc-amd-splitAP-test 62 60 -> 0
oct mmc-amd-splitAP-train mmc-amd-splitAP-val 516 62 -> 0
oct mmc-amd-splitAP-train mmc-amd-splitAP-test 516 60 -> 0
oct mmc-amd-splitAP-val mmc-amd-splitAP-test 62 60 -> 0


## Counter labels per dataset

In [28]:
labels = 'h d p w'.split()
count = dict([(m,{'Image':{}}) for m in modalities])

for dataset in datasets:
    for m in modalities:
        count[m]['Image'][dataset] = dict([(y,0) for y in labels])
        imset = id_sets[m]['Image'][dataset]
        for x in imset:
            y = x.split('-')[1]
            count[m]['Image'][dataset][y] += 1

print(' '.join(datasets))
print('(cfp, oct) '*len(datasets))
for y in labels:
    line = []
    for dataset in datasets:
        n_cfp = count['cfp']['Image'][dataset][y]
        n_oct = count['oct']['Image'][dataset][y]
        line.append((n_cfp, n_oct))
    print (y, line)    


mmc-amd-splitAP-train mmc-amd-splitAP-val mmc-amd-splitAP-test
(cfp, oct) (cfp, oct) (cfp, oct) 
h [(155, 156), (20, 20), (20, 20)]
d [(67, 42), (20, 36), (20, 28)]
p [(259, 294), (20, 43), (20, 43)]
w [(453, 532), (20, 35), (20, 40)]


## Count CFP images per dataset

In [29]:
all_set = set()
for dataset in datasets:
    img_set =  id_sets['cfp']['Image'][dataset]
    all_set = all_set.union(img_set)
    print (dataset, 'cfp', len(img_set))
print ('%d cfp images in total' % len(all_set))

mmc-amd-splitAP-train cfp 934
mmc-amd-splitAP-val cfp 80
mmc-amd-splitAP-test cfp 80
1094 cfp images in total


## Count OCT images per dataset

In [30]:
all_set = set()
for dataset in datasets:
    img_set =  id_sets['oct']['Image'][dataset]
    all_set = all_set.union(img_set)
    print (dataset, 'oct', len(img_set))
print ('%d oct images in total' % len(all_set))

mmc-amd-splitAP-train oct 1024
mmc-amd-splitAP-val oct 134
mmc-amd-splitAP-test oct 131
1289 oct images in total


## Count eyes per dataset

In [31]:
all_cfp_set = set()
all_oct_set = set()

for dataset in datasets:
    cfp_set =  id_sets['cfp']['Eye'][dataset]
    oct_set =  id_sets['oct']['Eye'][dataset]
    common_set = set(cfp_set).union(set(oct_set))
    all_cfp_set = all_cfp_set.union(set(cfp_set))
    all_oct_set = all_oct_set.union(set(oct_set))   
    print (dataset, 'cfp', len(cfp_set), 'oct', len(oct_set), 'cfp&oct', len(common_set))
print ('%d eyes have cfp images' % len(all_cfp_set))
print ('%d eyes have oct images' % len(all_oct_set))
print ('%d eyes have both cfp and oct images' % len(all_oct_set.intersection(all_cfp_set)))

mmc-amd-splitAP-train cfp 933 oct 659 cfp&oct 982
mmc-amd-splitAP-val cfp 80 oct 79 cfp&oct 80
mmc-amd-splitAP-test cfp 80 oct 79 cfp&oct 80
1093 eyes have cfp images
817 eyes have oct images
768 eyes have both cfp and oct images


## Count subjects per dataset

In [32]:
all_cfp_set = set()
all_oct_set = set()
for dataset in datasets:
    cfp_set =  id_sets['cfp']['Subject'][dataset]
    oct_set =  id_sets['oct']['Subject'][dataset]
    common_set = set(cfp_set).union(set(oct_set))
    all_cfp_set = all_cfp_set.union(set(cfp_set))
    all_oct_set = all_oct_set.union(set(oct_set))   
    print (dataset, 'cfp', len(cfp_set), 'oct', len(oct_set), 'cfp&oct', len(common_set))

print ('%d subjects have cfp images' % len(all_cfp_set))
print ('%d subjects have oct images' % len(all_oct_set))
print ('%d subjects have both cfp and oct images' % len(all_oct_set.intersection(all_cfp_set)))

mmc-amd-splitAP-train cfp 707 oct 516 cfp&oct 740
mmc-amd-splitAP-val cfp 62 oct 62 cfp&oct 62
mmc-amd-splitAP-test cfp 60 oct 60 cfp&oct 60
829 subjects have cfp images
638 subjects have oct images
605 subjects have both cfp and oct images
